diff mbox

[v2] vfio/type1: Remove locked page accounting workqueue

Message ID 20170406145250.16956.95264.stgit@gimli.home (mailing list archive)
State New, archived
Headers show

Commit Message

Alex Williamson April 6, 2017, 2:53 p.m. UTC
If the mmap_sem is contented then the vfio type1 IOMMU backend will
defer locked page accounting updates to a workqueue task.  This has
a few problems and depending on which side the user tries to play,
they might be over-penalized for unmaps that haven't yet been
accounted, or able to race the workqueue to enter more mappings
than they're allowed.  It's not entirely clear what motivated this
workqueue mechanism in the original vfio design, but it seems to
introduce more problems than it solves, so remove it and update the
callers to allow for failure.  We can also now recheck the limit
under write lock to make sure we don't exceed it.

Cc: stable@vger.kernel.org
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric

 drivers/vfio/vfio_iommu_type1.c |  101 ++++++++++++++++++---------------------
 1 file changed, 46 insertions(+), 55 deletions(-)

Comments

Eric Auger April 6, 2017, 4:53 p.m. UTC | #1
Hi Alex,

On 06/04/2017 16:53, Alex Williamson wrote:
> If the mmap_sem is contented then the vfio type1 IOMMU backend will
> defer locked page accounting updates to a workqueue task.  This has
> a few problems and depending on which side the user tries to play,
> they might be over-penalized for unmaps that haven't yet been
> accounted, or able to race the workqueue to enter more mappings
> than they're allowed.  It's not entirely clear what motivated this
> workqueue mechanism in the original vfio design, but it seems to
> introduce more problems than it solves, so remove it and update the
> callers to allow for failure.  We can also now recheck the limit
> under write lock to make sure we don't exceed it.
> 
> Cc: stable@vger.kernel.org
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Looks good to me.

Reviewed-by: Eric Auger <eric.auger@redhat.com>

Thanks

Eric

> ---
> 
> v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric
> 
>  drivers/vfio/vfio_iommu_type1.c |  101 ++++++++++++++++++---------------------
>  1 file changed, 46 insertions(+), 55 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 32d2633092a3..b799edbb8c4f 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
>  	return ret;
>  }
>  
> -struct vwork {
> -	struct mm_struct	*mm;
> -	long			npage;
> -	struct work_struct	work;
> -};
> -
> -/* delayed decrement/increment for locked_vm */
> -static void vfio_lock_acct_bg(struct work_struct *work)
> +static int vfio_lock_acct(struct task_struct *task, long npage)
>  {
> -	struct vwork *vwork = container_of(work, struct vwork, work);
> -	struct mm_struct *mm;
> -
> -	mm = vwork->mm;
> -	down_write(&mm->mmap_sem);
> -	mm->locked_vm += vwork->npage;
> -	up_write(&mm->mmap_sem);
> -	mmput(mm);
> -	kfree(vwork);
> -}
> -
> -static void vfio_lock_acct(struct task_struct *task, long npage)
> -{
> -	struct vwork *vwork;
>  	struct mm_struct *mm;
>  	bool is_current;
> +	int ret;
>  
>  	if (!npage)
> -		return;
> +		return 0;
>  
>  	is_current = (task->mm == current->mm);
>  
>  	mm = is_current ? task->mm : get_task_mm(task);
>  	if (!mm)
> -		return; /* process exited */
> +		return -ESRCH; /* process exited */
>  
> -	if (down_write_trylock(&mm->mmap_sem)) {
> -		mm->locked_vm += npage;
> -		up_write(&mm->mmap_sem);
> -		if (!is_current)
> -			mmput(mm);
> -		return;
> -	}
> +	ret = down_write_killable(&mm->mmap_sem);
> +	if (!ret) {
> +		if (npage < 0) {
> +			mm->locked_vm += npage;
> +		} else {
> +			unsigned long limit;
> +
> +			limit = is_current ?
> +				rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT :
> +				task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +			if (mm->locked_vm + npage <= limit)
> +				mm->locked_vm += npage;
> +			else
> +				ret = -ENOMEM;
> +		}
>  
> -	if (is_current) {
> -		mm = get_task_mm(task);
> -		if (!mm)
> -			return;
> +		up_write(&mm->mmap_sem);
>  	}
>  
> -	/*
> -	 * Couldn't get mmap_sem lock, so must setup to update
> -	 * mm->locked_vm later. If locked_vm were atomic, we
> -	 * wouldn't need this silliness
> -	 */
> -	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> -	if (WARN_ON(!vwork)) {
> +	if (!is_current)
>  		mmput(mm);
> -		return;
> -	}
> -	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
> -	vwork->mm = mm;
> -	vwork->npage = npage;
> -	schedule_work(&vwork->work);
> +
> +	return ret;
>  }
>  
>  /*
> @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  				  long npage, unsigned long *pfn_base)
>  {
> -	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	bool lock_cap = capable(CAP_IPC_LOCK);
>  	long ret, pinned = 0, lock_acct = 0;
>  	bool rsvd;
> @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  	/* Lock all the consecutive pages from pfn_base */
>  	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
>  	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> -		unsigned long pfn = 0;
> -
>  		ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
>  		if (ret)
>  			break;
> @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  				put_pfn(pfn, dma->prot);
>  				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>  					__func__, limit << PAGE_SHIFT);
> -				break;
> +				ret = -ENOMEM;
> +				goto unpin_out;
>  			}
>  			lock_acct++;
>  		}
>  	}
>  
>  out:
> -	vfio_lock_acct(current, lock_acct);
> +	ret = vfio_lock_acct(current, lock_acct);
> +
> +unpin_out:
> +	if (ret) {
> +		if (!rsvd) {
> +			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> +				put_pfn(pfn, dma->prot);
> +		}
> +
> +		return ret;
> +	}
>  
>  	return pinned;
>  }
> @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
>  		goto pin_page_exit;
>  	}
>  
> -	if (!rsvd && do_accounting)
> -		vfio_lock_acct(dma->task, 1);
> +	if (!rsvd && do_accounting) {
> +		ret = vfio_lock_acct(dma->task, 1);
> +		if (ret) {
> +			put_pfn(*pfn_base, dma->prot);
> +			goto pin_page_exit;
> +		}
> +	}
> +
>  	ret = 1;
>  
>  pin_page_exit:
>
Alex Williamson April 6, 2017, 5:05 p.m. UTC | #2
On Thu, 6 Apr 2017 18:53:04 +0200
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Alex,
> 
> On 06/04/2017 16:53, Alex Williamson wrote:
> > If the mmap_sem is contented then the vfio type1 IOMMU backend will
> > defer locked page accounting updates to a workqueue task.  This has
> > a few problems and depending on which side the user tries to play,
> > they might be over-penalized for unmaps that haven't yet been
> > accounted, or able to race the workqueue to enter more mappings
> > than they're allowed.  It's not entirely clear what motivated this
> > workqueue mechanism in the original vfio design, but it seems to
> > introduce more problems than it solves, so remove it and update the
> > callers to allow for failure.  We can also now recheck the limit
> > under write lock to make sure we don't exceed it.
> > 
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>  
> Looks good to me.
> 
> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> 

One possible enhancement below.

> 
> > ---
> > 
> > v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric
> > 
> >  drivers/vfio/vfio_iommu_type1.c |  101 ++++++++++++++++++---------------------
> >  1 file changed, 46 insertions(+), 55 deletions(-)
> > 
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 32d2633092a3..b799edbb8c4f 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
> >  	return ret;
> >  }
> >  
> > -struct vwork {
> > -	struct mm_struct	*mm;
> > -	long			npage;
> > -	struct work_struct	work;
> > -};
> > -
> > -/* delayed decrement/increment for locked_vm */
> > -static void vfio_lock_acct_bg(struct work_struct *work)
> > +static int vfio_lock_acct(struct task_struct *task, long npage)
> >  {
> > -	struct vwork *vwork = container_of(work, struct vwork, work);
> > -	struct mm_struct *mm;
> > -
> > -	mm = vwork->mm;
> > -	down_write(&mm->mmap_sem);
> > -	mm->locked_vm += vwork->npage;
> > -	up_write(&mm->mmap_sem);
> > -	mmput(mm);
> > -	kfree(vwork);
> > -}
> > -
> > -static void vfio_lock_acct(struct task_struct *task, long npage)
> > -{
> > -	struct vwork *vwork;
> >  	struct mm_struct *mm;
> >  	bool is_current;
> > +	int ret;
> >  
> >  	if (!npage)
> > -		return;
> > +		return 0;
> >  
> >  	is_current = (task->mm == current->mm);
> >  
> >  	mm = is_current ? task->mm : get_task_mm(task);
> >  	if (!mm)
> > -		return; /* process exited */
> > +		return -ESRCH; /* process exited */
> >  
> > -	if (down_write_trylock(&mm->mmap_sem)) {
> > -		mm->locked_vm += npage;
> > -		up_write(&mm->mmap_sem);
> > -		if (!is_current)
> > -			mmput(mm);
> > -		return;
> > -	}
> > +	ret = down_write_killable(&mm->mmap_sem);


Hmm, since we're already testing current, I wonder if it makes sense to
have this bimodal, killable if current, straight down_write()
otherwise.  I'm not too sure how important it is to use killable
regardless, but mlock does, which seemed like a good model to follow.
Thanks,

Alex

> > +	if (!ret) {
> > +		if (npage < 0) {
> > +			mm->locked_vm += npage;
> > +		} else {
> > +			unsigned long limit;
> > +
> > +			limit = is_current ?
> > +				rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT :
> > +				task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > +
> > +			if (mm->locked_vm + npage <= limit)
> > +				mm->locked_vm += npage;
> > +			else
> > +				ret = -ENOMEM;
> > +		}
> >  
> > -	if (is_current) {
> > -		mm = get_task_mm(task);
> > -		if (!mm)
> > -			return;
> > +		up_write(&mm->mmap_sem);
> >  	}
> >  
> > -	/*
> > -	 * Couldn't get mmap_sem lock, so must setup to update
> > -	 * mm->locked_vm later. If locked_vm were atomic, we
> > -	 * wouldn't need this silliness
> > -	 */
> > -	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> > -	if (WARN_ON(!vwork)) {
> > +	if (!is_current)
> >  		mmput(mm);
> > -		return;
> > -	}
> > -	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
> > -	vwork->mm = mm;
> > -	vwork->npage = npage;
> > -	schedule_work(&vwork->work);
> > +
> > +	return ret;
> >  }
> >  
> >  /*
> > @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> >  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> >  				  long npage, unsigned long *pfn_base)
> >  {
> > -	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > +	unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >  	bool lock_cap = capable(CAP_IPC_LOCK);
> >  	long ret, pinned = 0, lock_acct = 0;
> >  	bool rsvd;
> > @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> >  	/* Lock all the consecutive pages from pfn_base */
> >  	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
> >  	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> > -		unsigned long pfn = 0;
> > -
> >  		ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
> >  		if (ret)
> >  			break;
> > @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> >  				put_pfn(pfn, dma->prot);
> >  				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> >  					__func__, limit << PAGE_SHIFT);
> > -				break;
> > +				ret = -ENOMEM;
> > +				goto unpin_out;
> >  			}
> >  			lock_acct++;
> >  		}
> >  	}
> >  
> >  out:
> > -	vfio_lock_acct(current, lock_acct);
> > +	ret = vfio_lock_acct(current, lock_acct);
> > +
> > +unpin_out:
> > +	if (ret) {
> > +		if (!rsvd) {
> > +			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> > +				put_pfn(pfn, dma->prot);
> > +		}
> > +
> > +		return ret;
> > +	}
> >  
> >  	return pinned;
> >  }
> > @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> >  		goto pin_page_exit;
> >  	}
> >  
> > -	if (!rsvd && do_accounting)
> > -		vfio_lock_acct(dma->task, 1);
> > +	if (!rsvd && do_accounting) {
> > +		ret = vfio_lock_acct(dma->task, 1);
> > +		if (ret) {
> > +			put_pfn(*pfn_base, dma->prot);
> > +			goto pin_page_exit;
> > +		}
> > +	}
> > +
> >  	ret = 1;
> >  
> >  pin_page_exit:
> >
Alex Williamson April 6, 2017, 7:32 p.m. UTC | #3
On Thu, 6 Apr 2017 11:05:31 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Thu, 6 Apr 2017 18:53:04 +0200
> Auger Eric <eric.auger@redhat.com> wrote:
> 
> > Hi Alex,
> > 
> > On 06/04/2017 16:53, Alex Williamson wrote:  
> > > If the mmap_sem is contented then the vfio type1 IOMMU backend will
> > > defer locked page accounting updates to a workqueue task.  This has
> > > a few problems and depending on which side the user tries to play,
> > > they might be over-penalized for unmaps that haven't yet been
> > > accounted, or able to race the workqueue to enter more mappings
> > > than they're allowed.  It's not entirely clear what motivated this
> > > workqueue mechanism in the original vfio design, but it seems to
> > > introduce more problems than it solves, so remove it and update the
> > > callers to allow for failure.  We can also now recheck the limit
> > > under write lock to make sure we don't exceed it.
> > > 
> > > Cc: stable@vger.kernel.org
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>    
> > Looks good to me.
> > 
> > Reviewed-by: Eric Auger <eric.auger@redhat.com>
> >   
> 
> One possible enhancement below.
> 
> >   
> > > ---
> > > 
> > > v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric
> > > 
> > >  drivers/vfio/vfio_iommu_type1.c |  101 ++++++++++++++++++---------------------
> > >  1 file changed, 46 insertions(+), 55 deletions(-)
> > > 
> > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > index 32d2633092a3..b799edbb8c4f 100644
> > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
> > >  	return ret;
> > >  }
> > >  
> > > -struct vwork {
> > > -	struct mm_struct	*mm;
> > > -	long			npage;
> > > -	struct work_struct	work;
> > > -};
> > > -
> > > -/* delayed decrement/increment for locked_vm */
> > > -static void vfio_lock_acct_bg(struct work_struct *work)
> > > +static int vfio_lock_acct(struct task_struct *task, long npage)
> > >  {
> > > -	struct vwork *vwork = container_of(work, struct vwork, work);
> > > -	struct mm_struct *mm;
> > > -
> > > -	mm = vwork->mm;
> > > -	down_write(&mm->mmap_sem);
> > > -	mm->locked_vm += vwork->npage;
> > > -	up_write(&mm->mmap_sem);
> > > -	mmput(mm);
> > > -	kfree(vwork);
> > > -}
> > > -
> > > -static void vfio_lock_acct(struct task_struct *task, long npage)
> > > -{
> > > -	struct vwork *vwork;
> > >  	struct mm_struct *mm;
> > >  	bool is_current;
> > > +	int ret;
> > >  
> > >  	if (!npage)
> > > -		return;
> > > +		return 0;
> > >  
> > >  	is_current = (task->mm == current->mm);
> > >  
> > >  	mm = is_current ? task->mm : get_task_mm(task);
> > >  	if (!mm)
> > > -		return; /* process exited */
> > > +		return -ESRCH; /* process exited */
> > >  
> > > -	if (down_write_trylock(&mm->mmap_sem)) {
> > > -		mm->locked_vm += npage;
> > > -		up_write(&mm->mmap_sem);
> > > -		if (!is_current)
> > > -			mmput(mm);
> > > -		return;
> > > -	}
> > > +	ret = down_write_killable(&mm->mmap_sem);  
> 
> 
> Hmm, since we're already testing current, I wonder if it makes sense to
> have this bimodal, killable if current, straight down_write()
> otherwise.  I'm not too sure how important it is to use killable
> regardless, but mlock does, which seemed like a good model to follow.

I see other examples in the kernel where killable is used even for
remote mm, which would match the external page pinning usage, so I think
I'll keep this as is unless there are other comments for v2.  Thanks
for the review.

Alex

> > > +	if (!ret) {
> > > +		if (npage < 0) {
> > > +			mm->locked_vm += npage;
> > > +		} else {
> > > +			unsigned long limit;
> > > +
> > > +			limit = is_current ?
> > > +				rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT :
> > > +				task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > +
> > > +			if (mm->locked_vm + npage <= limit)
> > > +				mm->locked_vm += npage;
> > > +			else
> > > +				ret = -ENOMEM;
> > > +		}
> > >  
> > > -	if (is_current) {
> > > -		mm = get_task_mm(task);
> > > -		if (!mm)
> > > -			return;
> > > +		up_write(&mm->mmap_sem);
> > >  	}
> > >  
> > > -	/*
> > > -	 * Couldn't get mmap_sem lock, so must setup to update
> > > -	 * mm->locked_vm later. If locked_vm were atomic, we
> > > -	 * wouldn't need this silliness
> > > -	 */
> > > -	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> > > -	if (WARN_ON(!vwork)) {
> > > +	if (!is_current)
> > >  		mmput(mm);
> > > -		return;
> > > -	}
> > > -	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
> > > -	vwork->mm = mm;
> > > -	vwork->npage = npage;
> > > -	schedule_work(&vwork->work);
> > > +
> > > +	return ret;
> > >  }
> > >  
> > >  /*
> > > @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> > >  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> > >  				  long npage, unsigned long *pfn_base)
> > >  {
> > > -	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > +	unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > >  	bool lock_cap = capable(CAP_IPC_LOCK);
> > >  	long ret, pinned = 0, lock_acct = 0;
> > >  	bool rsvd;
> > > @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> > >  	/* Lock all the consecutive pages from pfn_base */
> > >  	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
> > >  	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> > > -		unsigned long pfn = 0;
> > > -
> > >  		ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
> > >  		if (ret)
> > >  			break;
> > > @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> > >  				put_pfn(pfn, dma->prot);
> > >  				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> > >  					__func__, limit << PAGE_SHIFT);
> > > -				break;
> > > +				ret = -ENOMEM;
> > > +				goto unpin_out;
> > >  			}
> > >  			lock_acct++;
> > >  		}
> > >  	}
> > >  
> > >  out:
> > > -	vfio_lock_acct(current, lock_acct);
> > > +	ret = vfio_lock_acct(current, lock_acct);
> > > +
> > > +unpin_out:
> > > +	if (ret) {
> > > +		if (!rsvd) {
> > > +			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> > > +				put_pfn(pfn, dma->prot);
> > > +		}
> > > +
> > > +		return ret;
> > > +	}
> > >  
> > >  	return pinned;
> > >  }
> > > @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> > >  		goto pin_page_exit;
> > >  	}
> > >  
> > > -	if (!rsvd && do_accounting)
> > > -		vfio_lock_acct(dma->task, 1);
> > > +	if (!rsvd && do_accounting) {
> > > +		ret = vfio_lock_acct(dma->task, 1);
> > > +		if (ret) {
> > > +			put_pfn(*pfn_base, dma->prot);
> > > +			goto pin_page_exit;
> > > +		}
> > > +	}
> > > +
> > >  	ret = 1;
> > >  
> > >  pin_page_exit:
> > >     
>
Peter Xu April 11, 2017, 11:03 a.m. UTC | #4
On Thu, Apr 06, 2017 at 08:53:43AM -0600, Alex Williamson wrote:
> If the mmap_sem is contented then the vfio type1 IOMMU backend will
> defer locked page accounting updates to a workqueue task.  This has
> a few problems and depending on which side the user tries to play,
> they might be over-penalized for unmaps that haven't yet been
> accounted, or able to race the workqueue to enter more mappings
> than they're allowed.  It's not entirely clear what motivated this
> workqueue mechanism in the original vfio design, but it seems to
> introduce more problems than it solves, so remove it and update the
> callers to allow for failure.  We can also now recheck the limit
> under write lock to make sure we don't exceed it.
> 
> Cc: stable@vger.kernel.org
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
> v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric
> 
>  drivers/vfio/vfio_iommu_type1.c |  101 ++++++++++++++++++---------------------
>  1 file changed, 46 insertions(+), 55 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 32d2633092a3..b799edbb8c4f 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
>  	return ret;
>  }
>  
> -struct vwork {
> -	struct mm_struct	*mm;
> -	long			npage;
> -	struct work_struct	work;
> -};
> -
> -/* delayed decrement/increment for locked_vm */
> -static void vfio_lock_acct_bg(struct work_struct *work)
> +static int vfio_lock_acct(struct task_struct *task, long npage)
>  {
> -	struct vwork *vwork = container_of(work, struct vwork, work);
> -	struct mm_struct *mm;
> -
> -	mm = vwork->mm;
> -	down_write(&mm->mmap_sem);
> -	mm->locked_vm += vwork->npage;
> -	up_write(&mm->mmap_sem);
> -	mmput(mm);
> -	kfree(vwork);
> -}
> -
> -static void vfio_lock_acct(struct task_struct *task, long npage)
> -{
> -	struct vwork *vwork;
>  	struct mm_struct *mm;
>  	bool is_current;
> +	int ret;
>  
>  	if (!npage)
> -		return;
> +		return 0;
>  
>  	is_current = (task->mm == current->mm);
>  
>  	mm = is_current ? task->mm : get_task_mm(task);

A question besides current patch: could I ask why we need to take
special care for is_current? I see that is only used to only try avoid
get_task_mm() when proper, but is get_task_mm() a heavy operation?

>  	if (!mm)
> -		return; /* process exited */
> +		return -ESRCH; /* process exited */
>  
> -	if (down_write_trylock(&mm->mmap_sem)) {
> -		mm->locked_vm += npage;
> -		up_write(&mm->mmap_sem);
> -		if (!is_current)
> -			mmput(mm);
> -		return;
> -	}
> +	ret = down_write_killable(&mm->mmap_sem);
> +	if (!ret) {
> +		if (npage < 0) {
> +			mm->locked_vm += npage;
> +		} else {
> +			unsigned long limit;
> +
> +			limit = is_current ?
> +				rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT :
> +				task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;

Maybe we can directly use task_rlimit() here? Since looks like
rlimit() is calling it as well, with "current".

> +
> +			if (mm->locked_vm + npage <= limit)
> +				mm->locked_vm += npage;
> +			else
> +				ret = -ENOMEM;
> +		}
>  
> -	if (is_current) {
> -		mm = get_task_mm(task);
> -		if (!mm)
> -			return;
> +		up_write(&mm->mmap_sem);
>  	}
>  
> -	/*
> -	 * Couldn't get mmap_sem lock, so must setup to update
> -	 * mm->locked_vm later. If locked_vm were atomic, we
> -	 * wouldn't need this silliness
> -	 */
> -	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> -	if (WARN_ON(!vwork)) {
> +	if (!is_current)
>  		mmput(mm);
> -		return;
> -	}
> -	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
> -	vwork->mm = mm;
> -	vwork->npage = npage;
> -	schedule_work(&vwork->work);
> +
> +	return ret;
>  }
>  
>  /*
> @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  				  long npage, unsigned long *pfn_base)
>  {
> -	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	bool lock_cap = capable(CAP_IPC_LOCK);
>  	long ret, pinned = 0, lock_acct = 0;
>  	bool rsvd;
> @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  	/* Lock all the consecutive pages from pfn_base */
>  	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
>  	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> -		unsigned long pfn = 0;
> -
>  		ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
>  		if (ret)
>  			break;
> @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  				put_pfn(pfn, dma->prot);
>  				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>  					__func__, limit << PAGE_SHIFT);
> -				break;
> +				ret = -ENOMEM;
> +				goto unpin_out;
>  			}
>  			lock_acct++;
>  		}
>  	}
>  
>  out:
> -	vfio_lock_acct(current, lock_acct);
> +	ret = vfio_lock_acct(current, lock_acct);
> +
> +unpin_out:
> +	if (ret) {
> +		if (!rsvd) {
> +			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> +				put_pfn(pfn, dma->prot);
> +		}
> +
> +		return ret;
> +	}

The change in vfio_pin_pages_remote() seems to contain a functional
change totally not related to the subject (IIUC, we are going to unpin
those pages if the huge page can only be pinned partially, and we are
not doing that before)? If so, would it be nice to split current patch
into two, or at least mention this behavior change in commit log of
this patch?

Thanks,

>  
>  	return pinned;
>  }
> @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
>  		goto pin_page_exit;
>  	}
>  
> -	if (!rsvd && do_accounting)
> -		vfio_lock_acct(dma->task, 1);
> +	if (!rsvd && do_accounting) {
> +		ret = vfio_lock_acct(dma->task, 1);
> +		if (ret) {
> +			put_pfn(*pfn_base, dma->prot);
> +			goto pin_page_exit;
> +		}
> +	}
> +
>  	ret = 1;
>  
>  pin_page_exit:
>
Alex Williamson April 11, 2017, 6:27 p.m. UTC | #5
On Tue, 11 Apr 2017 19:03:14 +0800
Peter Xu <peterx@redhat.com> wrote:

> On Thu, Apr 06, 2017 at 08:53:43AM -0600, Alex Williamson wrote:
> > If the mmap_sem is contented then the vfio type1 IOMMU backend will
> > defer locked page accounting updates to a workqueue task.  This has
> > a few problems and depending on which side the user tries to play,
> > they might be over-penalized for unmaps that haven't yet been
> > accounted, or able to race the workqueue to enter more mappings
> > than they're allowed.  It's not entirely clear what motivated this
> > workqueue mechanism in the original vfio design, but it seems to
> > introduce more problems than it solves, so remove it and update the
> > callers to allow for failure.  We can also now recheck the limit
> > under write lock to make sure we don't exceed it.
> > 
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> > 
> > v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric
> > 
> >  drivers/vfio/vfio_iommu_type1.c |  101 ++++++++++++++++++---------------------
> >  1 file changed, 46 insertions(+), 55 deletions(-)
> > 
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 32d2633092a3..b799edbb8c4f 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
> >  	return ret;
> >  }
> >  
> > -struct vwork {
> > -	struct mm_struct	*mm;
> > -	long			npage;
> > -	struct work_struct	work;
> > -};
> > -
> > -/* delayed decrement/increment for locked_vm */
> > -static void vfio_lock_acct_bg(struct work_struct *work)
> > +static int vfio_lock_acct(struct task_struct *task, long npage)
> >  {
> > -	struct vwork *vwork = container_of(work, struct vwork, work);
> > -	struct mm_struct *mm;
> > -
> > -	mm = vwork->mm;
> > -	down_write(&mm->mmap_sem);
> > -	mm->locked_vm += vwork->npage;
> > -	up_write(&mm->mmap_sem);
> > -	mmput(mm);
> > -	kfree(vwork);
> > -}
> > -
> > -static void vfio_lock_acct(struct task_struct *task, long npage)
> > -{
> > -	struct vwork *vwork;
> >  	struct mm_struct *mm;
> >  	bool is_current;
> > +	int ret;
> >  
> >  	if (!npage)
> > -		return;
> > +		return 0;
> >  
> >  	is_current = (task->mm == current->mm);
> >  
> >  	mm = is_current ? task->mm : get_task_mm(task);  
> 
> A question besides current patch: could I ask why we need to take
> special care for is_current? I see that is only used to only try avoid
> get_task_mm() when proper, but is get_task_mm() a heavy operation?

Yes, it's slower, performance was significantly degraded when mdev
support was introduced and imposed get_task_mm() on all calling paths.
 
> >  	if (!mm)
> > -		return; /* process exited */
> > +		return -ESRCH; /* process exited */
> >  
> > -	if (down_write_trylock(&mm->mmap_sem)) {
> > -		mm->locked_vm += npage;
> > -		up_write(&mm->mmap_sem);
> > -		if (!is_current)
> > -			mmput(mm);
> > -		return;
> > -	}
> > +	ret = down_write_killable(&mm->mmap_sem);
> > +	if (!ret) {
> > +		if (npage < 0) {
> > +			mm->locked_vm += npage;
> > +		} else {
> > +			unsigned long limit;
> > +
> > +			limit = is_current ?
> > +				rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT :
> > +				task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;  
> 
> Maybe we can directly use task_rlimit() here? Since looks like
> rlimit() is calling it as well, with "current".

We could, but does it actually change anything?  rlimit() is static
inline, so using task_rlimit() for both just moves the is_current
ternary into the task_rlimit() argument.  Is this:

			limit = task_rlimit(is_current ? current : task,
					    RLIMIT_MEMLOCK) >> PAGE_SHIFT);

notably cleaner than above?

> > +
> > +			if (mm->locked_vm + npage <= limit)
> > +				mm->locked_vm += npage;
> > +			else
> > +				ret = -ENOMEM;
> > +		}
> >  
> > -	if (is_current) {
> > -		mm = get_task_mm(task);
> > -		if (!mm)
> > -			return;
> > +		up_write(&mm->mmap_sem);
> >  	}
> >  
> > -	/*
> > -	 * Couldn't get mmap_sem lock, so must setup to update
> > -	 * mm->locked_vm later. If locked_vm were atomic, we
> > -	 * wouldn't need this silliness
> > -	 */
> > -	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> > -	if (WARN_ON(!vwork)) {
> > +	if (!is_current)
> >  		mmput(mm);
> > -		return;
> > -	}
> > -	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
> > -	vwork->mm = mm;
> > -	vwork->npage = npage;
> > -	schedule_work(&vwork->work);
> > +
> > +	return ret;
> >  }
> >  
> >  /*
> > @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> >  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> >  				  long npage, unsigned long *pfn_base)
> >  {
> > -	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > +	unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >  	bool lock_cap = capable(CAP_IPC_LOCK);
> >  	long ret, pinned = 0, lock_acct = 0;
> >  	bool rsvd;
> > @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> >  	/* Lock all the consecutive pages from pfn_base */
> >  	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
> >  	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> > -		unsigned long pfn = 0;
> > -
> >  		ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
> >  		if (ret)
> >  			break;
> > @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> >  				put_pfn(pfn, dma->prot);
> >  				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> >  					__func__, limit << PAGE_SHIFT);
> > -				break;
> > +				ret = -ENOMEM;
> > +				goto unpin_out;
> >  			}
> >  			lock_acct++;
> >  		}
> >  	}
> >  
> >  out:
> > -	vfio_lock_acct(current, lock_acct);
> > +	ret = vfio_lock_acct(current, lock_acct);
> > +
> > +unpin_out:
> > +	if (ret) {
> > +		if (!rsvd) {
> > +			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> > +				put_pfn(pfn, dma->prot);
> > +		}
> > +
> > +		return ret;
> > +	}  
> 
> The change in vfio_pin_pages_remote() seems to contain a functional
> change totally not related to the subject (IIUC, we are going to unpin
> those pages if the huge page can only be pinned partially, and we are
> not doing that before)? If so, would it be nice to split current patch
> into two, or at least mention this behavior change in commit log of
> this patch?


This is only tangentially about hugepages, this loop is looking for
contiguous pages regardless of the processor or IOMMU page size
support.  We're trying to make as few calls to iommu_map() as we can
and thus we want the maximum range of IOVA to physical address we can
pump into the IOMMU driver.  It's up to the IOMMU driver to figure out
how it can optimize that range with hugepages or superpages in its page
tables.  So the caller of this function is looping over a range of
memory and each time this function returns, it maps that many pages
through the iommu.  On success we return <=npage.

The unpin_out loop here is absolutely related to the change proposed
here, vfio_lock_acct() can fail, we cannot return both an error and pin
pages, therefore we need to undo anything we've pinned this round.

Are you perhaps only referring to the exit path above going straight to
this loop rather than attempting to do the accounting for the pages
pinned so far?  Previously that was our only option because the unwind
path was to return a short count, invoking the page accounting and
iommu_mapping, while fully expecting the caller to again loop over the
excess page, return -ENOMEM, and teardown the entire mapping request.
So because we now require an unwind path for the vfio_lock_acct()
change, we can now do the teardown w/o the additional pinning here and
mapping by the caller.  In a very strict sense, we could consider that
a separate change and move those 3 lines to a follow-on patch but
ultimately the caller did request pinned pages beyond what we believe
their limit to be and making use of this new exit path here saves us a
useless accounting and mapping iteration.  I can note that in the
commit log.  Thanks,

Alex 

> >  
> >  	return pinned;
> >  }
> > @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> >  		goto pin_page_exit;
> >  	}
> >  
> > -	if (!rsvd && do_accounting)
> > -		vfio_lock_acct(dma->task, 1);
> > +	if (!rsvd && do_accounting) {
> > +		ret = vfio_lock_acct(dma->task, 1);
> > +		if (ret) {
> > +			put_pfn(*pfn_base, dma->prot);
> > +			goto pin_page_exit;
> > +		}
> > +	}
> > +
> >  	ret = 1;
> >  
> >  pin_page_exit:
> >   
>
Alex Williamson April 11, 2017, 6:50 p.m. UTC | #6
On Tue, 11 Apr 2017 12:27:55 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Tue, 11 Apr 2017 19:03:14 +0800
> Peter Xu <peterx@redhat.com> wrote:
> 
> > On Thu, Apr 06, 2017 at 08:53:43AM -0600, Alex Williamson wrote:  
> > > If the mmap_sem is contented then the vfio type1 IOMMU backend will
> > > defer locked page accounting updates to a workqueue task.  This has
> > > a few problems and depending on which side the user tries to play,
> > > they might be over-penalized for unmaps that haven't yet been
> > > accounted, or able to race the workqueue to enter more mappings
> > > than they're allowed.  It's not entirely clear what motivated this
> > > workqueue mechanism in the original vfio design, but it seems to
> > > introduce more problems than it solves, so remove it and update the
> > > callers to allow for failure.  We can also now recheck the limit
> > > under write lock to make sure we don't exceed it.
> > > 
> > > Cc: stable@vger.kernel.org
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > ---
> > > 
> > > v2: Fixed missed mmput on failure to acquire mmap_sem as noted by Eric
> > > 
> > >  drivers/vfio/vfio_iommu_type1.c |  101 ++++++++++++++++++---------------------
> > >  1 file changed, 46 insertions(+), 55 deletions(-)
> > > 
> > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > index 32d2633092a3..b799edbb8c4f 100644
> > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
> > >  	return ret;
> > >  }
> > >  
> > > -struct vwork {
> > > -	struct mm_struct	*mm;
> > > -	long			npage;
> > > -	struct work_struct	work;
> > > -};
> > > -
> > > -/* delayed decrement/increment for locked_vm */
> > > -static void vfio_lock_acct_bg(struct work_struct *work)
> > > +static int vfio_lock_acct(struct task_struct *task, long npage)
> > >  {
> > > -	struct vwork *vwork = container_of(work, struct vwork, work);
> > > -	struct mm_struct *mm;
> > > -
> > > -	mm = vwork->mm;
> > > -	down_write(&mm->mmap_sem);
> > > -	mm->locked_vm += vwork->npage;
> > > -	up_write(&mm->mmap_sem);
> > > -	mmput(mm);
> > > -	kfree(vwork);
> > > -}
> > > -
> > > -static void vfio_lock_acct(struct task_struct *task, long npage)
> > > -{
> > > -	struct vwork *vwork;
> > >  	struct mm_struct *mm;
> > >  	bool is_current;
> > > +	int ret;
> > >  
> > >  	if (!npage)
> > > -		return;
> > > +		return 0;
> > >  
> > >  	is_current = (task->mm == current->mm);
> > >  
> > >  	mm = is_current ? task->mm : get_task_mm(task);    
> > 
> > A question besides current patch: could I ask why we need to take
> > special care for is_current? I see that is only used to only try avoid
> > get_task_mm() when proper, but is get_task_mm() a heavy operation?  
> 
> Yes, it's slower, performance was significantly degraded when mdev
> support was introduced and imposed get_task_mm() on all calling paths.
>  
> > >  	if (!mm)
> > > -		return; /* process exited */
> > > +		return -ESRCH; /* process exited */
> > >  
> > > -	if (down_write_trylock(&mm->mmap_sem)) {
> > > -		mm->locked_vm += npage;
> > > -		up_write(&mm->mmap_sem);
> > > -		if (!is_current)
> > > -			mmput(mm);
> > > -		return;
> > > -	}
> > > +	ret = down_write_killable(&mm->mmap_sem);
> > > +	if (!ret) {
> > > +		if (npage < 0) {
> > > +			mm->locked_vm += npage;
> > > +		} else {
> > > +			unsigned long limit;
> > > +
> > > +			limit = is_current ?
> > > +				rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT :
> > > +				task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;    
> > 
> > Maybe we can directly use task_rlimit() here? Since looks like
> > rlimit() is calling it as well, with "current".  
> 
> We could, but does it actually change anything?  rlimit() is static
> inline, so using task_rlimit() for both just moves the is_current
> ternary into the task_rlimit() argument.  Is this:
> 
> 			limit = task_rlimit(is_current ? current : task,
> 					    RLIMIT_MEMLOCK) >> PAGE_SHIFT);
> 
> notably cleaner than above?

Ah, maybe you were suggesting that we can just use "task" here for
both since it's always correct.  Thanks,

Alex


> > > +
> > > +			if (mm->locked_vm + npage <= limit)
> > > +				mm->locked_vm += npage;
> > > +			else
> > > +				ret = -ENOMEM;
> > > +		}
> > >  
> > > -	if (is_current) {
> > > -		mm = get_task_mm(task);
> > > -		if (!mm)
> > > -			return;
> > > +		up_write(&mm->mmap_sem);
> > >  	}
> > >  
> > > -	/*
> > > -	 * Couldn't get mmap_sem lock, so must setup to update
> > > -	 * mm->locked_vm later. If locked_vm were atomic, we
> > > -	 * wouldn't need this silliness
> > > -	 */
> > > -	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> > > -	if (WARN_ON(!vwork)) {
> > > +	if (!is_current)
> > >  		mmput(mm);
> > > -		return;
> > > -	}
> > > -	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
> > > -	vwork->mm = mm;
> > > -	vwork->npage = npage;
> > > -	schedule_work(&vwork->work);
> > > +
> > > +	return ret;
> > >  }
> > >  
> > >  /*
> > > @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> > >  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> > >  				  long npage, unsigned long *pfn_base)
> > >  {
> > > -	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > +	unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > >  	bool lock_cap = capable(CAP_IPC_LOCK);
> > >  	long ret, pinned = 0, lock_acct = 0;
> > >  	bool rsvd;
> > > @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> > >  	/* Lock all the consecutive pages from pfn_base */
> > >  	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
> > >  	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> > > -		unsigned long pfn = 0;
> > > -
> > >  		ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
> > >  		if (ret)
> > >  			break;
> > > @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> > >  				put_pfn(pfn, dma->prot);
> > >  				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> > >  					__func__, limit << PAGE_SHIFT);
> > > -				break;
> > > +				ret = -ENOMEM;
> > > +				goto unpin_out;
> > >  			}
> > >  			lock_acct++;
> > >  		}
> > >  	}
> > >  
> > >  out:
> > > -	vfio_lock_acct(current, lock_acct);
> > > +	ret = vfio_lock_acct(current, lock_acct);
> > > +
> > > +unpin_out:
> > > +	if (ret) {
> > > +		if (!rsvd) {
> > > +			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> > > +				put_pfn(pfn, dma->prot);
> > > +		}
> > > +
> > > +		return ret;
> > > +	}    
> > 
> > The change in vfio_pin_pages_remote() seems to contain a functional
> > change totally not related to the subject (IIUC, we are going to unpin
> > those pages if the huge page can only be pinned partially, and we are
> > not doing that before)? If so, would it be nice to split current patch
> > into two, or at least mention this behavior change in commit log of
> > this patch?  
> 
> 
> This is only tangentially about hugepages, this loop is looking for
> contiguous pages regardless of the processor or IOMMU page size
> support.  We're trying to make as few calls to iommu_map() as we can
> and thus we want the maximum range of IOVA to physical address we can
> pump into the IOMMU driver.  It's up to the IOMMU driver to figure out
> how it can optimize that range with hugepages or superpages in its page
> tables.  So the caller of this function is looping over a range of
> memory and each time this function returns, it maps that many pages
> through the iommu.  On success we return <=npage.
> 
> The unpin_out loop here is absolutely related to the change proposed
> here, vfio_lock_acct() can fail, we cannot return both an error and pin
> pages, therefore we need to undo anything we've pinned this round.
> 
> Are you perhaps only referring to the exit path above going straight to
> this loop rather than attempting to do the accounting for the pages
> pinned so far?  Previously that was our only option because the unwind
> path was to return a short count, invoking the page accounting and
> iommu_mapping, while fully expecting the caller to again loop over the
> excess page, return -ENOMEM, and teardown the entire mapping request.
> So because we now require an unwind path for the vfio_lock_acct()
> change, we can now do the teardown w/o the additional pinning here and
> mapping by the caller.  In a very strict sense, we could consider that
> a separate change and move those 3 lines to a follow-on patch but
> ultimately the caller did request pinned pages beyond what we believe
> their limit to be and making use of this new exit path here saves us a
> useless accounting and mapping iteration.  I can note that in the
> commit log.  Thanks,
> 
> Alex 
> 
> > >  
> > >  	return pinned;
> > >  }
> > > @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> > >  		goto pin_page_exit;
> > >  	}
> > >  
> > > -	if (!rsvd && do_accounting)
> > > -		vfio_lock_acct(dma->task, 1);
> > > +	if (!rsvd && do_accounting) {
> > > +		ret = vfio_lock_acct(dma->task, 1);
> > > +		if (ret) {
> > > +			put_pfn(*pfn_base, dma->prot);
> > > +			goto pin_page_exit;
> > > +		}
> > > +	}
> > > +
> > >  	ret = 1;
> > >  
> > >  pin_page_exit:
> > >     
> >   
>
Peter Xu April 12, 2017, 4:13 a.m. UTC | #7
On Tue, Apr 11, 2017 at 12:50:11PM -0600, Alex Williamson wrote:
> On Tue, 11 Apr 2017 12:27:55 -0600
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> > On Tue, 11 Apr 2017 19:03:14 +0800
> > Peter Xu <peterx@redhat.com> wrote:
> > 
> > > On Thu, Apr 06, 2017 at 08:53:43AM -0600, Alex Williamson wrote:  

[...]

> > > > -static void vfio_lock_acct(struct task_struct *task, long npage)
> > > > -{
> > > > -	struct vwork *vwork;
> > > >  	struct mm_struct *mm;
> > > >  	bool is_current;
> > > > +	int ret;
> > > >  
> > > >  	if (!npage)
> > > > -		return;
> > > > +		return 0;
> > > >  
> > > >  	is_current = (task->mm == current->mm);
> > > >  
> > > >  	mm = is_current ? task->mm : get_task_mm(task);    
> > > 
> > > A question besides current patch: could I ask why we need to take
> > > special care for is_current? I see that is only used to only try avoid
> > > get_task_mm() when proper, but is get_task_mm() a heavy operation?  
> > 
> > Yes, it's slower, performance was significantly degraded when mdev
> > support was introduced and imposed get_task_mm() on all calling paths.

I see. Thanks.

> >  
> > > >  	if (!mm)
> > > > -		return; /* process exited */
> > > > +		return -ESRCH; /* process exited */
> > > >  
> > > > -	if (down_write_trylock(&mm->mmap_sem)) {
> > > > -		mm->locked_vm += npage;
> > > > -		up_write(&mm->mmap_sem);
> > > > -		if (!is_current)
> > > > -			mmput(mm);
> > > > -		return;
> > > > -	}
> > > > +	ret = down_write_killable(&mm->mmap_sem);
> > > > +	if (!ret) {
> > > > +		if (npage < 0) {
> > > > +			mm->locked_vm += npage;
> > > > +		} else {
> > > > +			unsigned long limit;
> > > > +
> > > > +			limit = is_current ?
> > > > +				rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT :
> > > > +				task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;    
> > > 
> > > Maybe we can directly use task_rlimit() here? Since looks like
> > > rlimit() is calling it as well, with "current".  
> > 
> > We could, but does it actually change anything?  rlimit() is static
> > inline, so using task_rlimit() for both just moves the is_current
> > ternary into the task_rlimit() argument.  Is this:
> > 
> > 			limit = task_rlimit(is_current ? current : task,
> > 					    RLIMIT_MEMLOCK) >> PAGE_SHIFT);
> > 
> > notably cleaner than above?
> 
> Ah, maybe you were suggesting that we can just use "task" here for
> both since it's always correct.  Thanks,

Yes it is.

[...]

> > > >  out:
> > > > -	vfio_lock_acct(current, lock_acct);
> > > > +	ret = vfio_lock_acct(current, lock_acct);
> > > > +
> > > > +unpin_out:
> > > > +	if (ret) {
> > > > +		if (!rsvd) {
> > > > +			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> > > > +				put_pfn(pfn, dma->prot);
> > > > +		}
> > > > +
> > > > +		return ret;
> > > > +	}    
> > > 
> > > The change in vfio_pin_pages_remote() seems to contain a functional
> > > change totally not related to the subject (IIUC, we are going to unpin
> > > those pages if the huge page can only be pinned partially, and we are
> > > not doing that before)? If so, would it be nice to split current patch
> > > into two, or at least mention this behavior change in commit log of
> > > this patch?  
> > 
> > 
> > This is only tangentially about hugepages, this loop is looking for
> > contiguous pages regardless of the processor or IOMMU page size
> > support.

It should somewhat related to huge pages? At least we have
disable_hugepages parameter, and as well in vfio_pin_pages_remote() we
have:

	if (unlikely(disable_hugepages))
		goto out;

So the loop will be skipped if that is specified.

> > We're trying to make as few calls to iommu_map() as we can
> > and thus we want the maximum range of IOVA to physical address we can
> > pump into the IOMMU driver.  It's up to the IOMMU driver to figure out
> > how it can optimize that range with hugepages or superpages in its page
> > tables.  So the caller of this function is looping over a range of
> > memory and each time this function returns, it maps that many pages
> > through the iommu.  On success we return <=npage.
> > 
> > The unpin_out loop here is absolutely related to the change proposed
> > here, vfio_lock_acct() can fail, we cannot return both an error and pin
> > pages, therefore we need to undo anything we've pinned this round.

Yes you are right. It's related.

> > 
> > Are you perhaps only referring to the exit path above going straight to
> > this loop rather than attempting to do the accounting for the pages
> > pinned so far?  Previously that was our only option because the unwind
> > path was to return a short count, invoking the page accounting and
> > iommu_mapping, while fully expecting the caller to again loop over the
> > excess page, return -ENOMEM, and teardown the entire mapping request.
> > So because we now require an unwind path for the vfio_lock_acct()
> > change, we can now do the teardown w/o the additional pinning here and
> > mapping by the caller.  In a very strict sense, we could consider that
> > a separate change and move those 3 lines to a follow-on patch but
> > ultimately the caller did request pinned pages beyond what we believe
> > their limit to be and making use of this new exit path here saves us a
> > useless accounting and mapping iteration.  I can note that in the
> > commit log.  Thanks,

I agree that in all cases it's a corner case and trivial, and the
userspace caller should anyway do something to release its memory
accounting.

Thanks for addressing my comments and replied with full detail!
Alex Williamson April 12, 2017, 6:24 p.m. UTC | #8
On Wed, 12 Apr 2017 12:13:43 +0800
Peter Xu <peterx@redhat.com> wrote:

> On Tue, Apr 11, 2017 at 12:50:11PM -0600, Alex Williamson wrote:
> > On Tue, 11 Apr 2017 12:27:55 -0600
> > Alex Williamson <alex.williamson@redhat.com> wrote:
> >   
> > > On Tue, 11 Apr 2017 19:03:14 +0800
> > > Peter Xu <peterx@redhat.com> wrote:
> > >   
> > > > On Thu, Apr 06, 2017 at 08:53:43AM -0600, Alex Williamson wrote:    
> 
> [...]
> 
> > > > > -static void vfio_lock_acct(struct task_struct *task, long npage)
> > > > > -{
> > > > > -	struct vwork *vwork;
> > > > >  	struct mm_struct *mm;
> > > > >  	bool is_current;
> > > > > +	int ret;
> > > > >  
> > > > >  	if (!npage)
> > > > > -		return;
> > > > > +		return 0;
> > > > >  
> > > > >  	is_current = (task->mm == current->mm);
> > > > >  
> > > > >  	mm = is_current ? task->mm : get_task_mm(task);      
> > > > 
> > > > A question besides current patch: could I ask why we need to take
> > > > special care for is_current? I see that is only used to only try avoid
> > > > get_task_mm() when proper, but is get_task_mm() a heavy operation?    
> > > 
> > > Yes, it's slower, performance was significantly degraded when mdev
> > > support was introduced and imposed get_task_mm() on all calling paths.  
> 
> I see. Thanks.
> 
> > >    
> > > > >  	if (!mm)
> > > > > -		return; /* process exited */
> > > > > +		return -ESRCH; /* process exited */
> > > > >  
> > > > > -	if (down_write_trylock(&mm->mmap_sem)) {
> > > > > -		mm->locked_vm += npage;
> > > > > -		up_write(&mm->mmap_sem);
> > > > > -		if (!is_current)
> > > > > -			mmput(mm);
> > > > > -		return;
> > > > > -	}
> > > > > +	ret = down_write_killable(&mm->mmap_sem);
> > > > > +	if (!ret) {
> > > > > +		if (npage < 0) {
> > > > > +			mm->locked_vm += npage;
> > > > > +		} else {
> > > > > +			unsigned long limit;
> > > > > +
> > > > > +			limit = is_current ?
> > > > > +				rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT :
> > > > > +				task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;      
> > > > 
> > > > Maybe we can directly use task_rlimit() here? Since looks like
> > > > rlimit() is calling it as well, with "current".    
> > > 
> > > We could, but does it actually change anything?  rlimit() is static
> > > inline, so using task_rlimit() for both just moves the is_current
> > > ternary into the task_rlimit() argument.  Is this:
> > > 
> > > 			limit = task_rlimit(is_current ? current : task,
> > > 					    RLIMIT_MEMLOCK) >> PAGE_SHIFT);
> > > 
> > > notably cleaner than above?  
> > 
> > Ah, maybe you were suggesting that we can just use "task" here for
> > both since it's always correct.  Thanks,  
> 
> Yes it is.
> 
> [...]
> 
> > > > >  out:
> > > > > -	vfio_lock_acct(current, lock_acct);
> > > > > +	ret = vfio_lock_acct(current, lock_acct);
> > > > > +
> > > > > +unpin_out:
> > > > > +	if (ret) {
> > > > > +		if (!rsvd) {
> > > > > +			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> > > > > +				put_pfn(pfn, dma->prot);
> > > > > +		}
> > > > > +
> > > > > +		return ret;
> > > > > +	}      
> > > > 
> > > > The change in vfio_pin_pages_remote() seems to contain a functional
> > > > change totally not related to the subject (IIUC, we are going to unpin
> > > > those pages if the huge page can only be pinned partially, and we are
> > > > not doing that before)? If so, would it be nice to split current patch
> > > > into two, or at least mention this behavior change in commit log of
> > > > this patch?    
> > > 
> > > 
> > > This is only tangentially about hugepages, this loop is looking for
> > > contiguous pages regardless of the processor or IOMMU page size
> > > support.  
> 
> It should somewhat related to huge pages? At least we have
> disable_hugepages parameter, and as well in vfio_pin_pages_remote() we
> have:
> 
> 	if (unlikely(disable_hugepages))
> 		goto out;
> 
> So the loop will be skipped if that is specified.

Right, that's why I say tangentially, the IOMMU driver cannot make use
of any sort of superpages if we map at PAGE_SIZE granularity, at least
not until they implement something like transparent hugepage support
for the IOMMU.  But the point is that the type1 vfio IOMMU backend
isn't directly aware of hugepages.
 
> > > We're trying to make as few calls to iommu_map() as we can
> > > and thus we want the maximum range of IOVA to physical address we can
> > > pump into the IOMMU driver.  It's up to the IOMMU driver to figure out
> > > how it can optimize that range with hugepages or superpages in its page
> > > tables.  So the caller of this function is looping over a range of
> > > memory and each time this function returns, it maps that many pages
> > > through the iommu.  On success we return <=npage.
> > > 
> > > The unpin_out loop here is absolutely related to the change proposed
> > > here, vfio_lock_acct() can fail, we cannot return both an error and pin
> > > pages, therefore we need to undo anything we've pinned this round.  
> 
> Yes you are right. It's related.
> 
> > > 
> > > Are you perhaps only referring to the exit path above going straight to
> > > this loop rather than attempting to do the accounting for the pages
> > > pinned so far?  Previously that was our only option because the unwind
> > > path was to return a short count, invoking the page accounting and
> > > iommu_mapping, while fully expecting the caller to again loop over the
> > > excess page, return -ENOMEM, and teardown the entire mapping request.
> > > So because we now require an unwind path for the vfio_lock_acct()
> > > change, we can now do the teardown w/o the additional pinning here and
> > > mapping by the caller.  In a very strict sense, we could consider that
> > > a separate change and move those 3 lines to a follow-on patch but
> > > ultimately the caller did request pinned pages beyond what we believe
> > > their limit to be and making use of this new exit path here saves us a
> > > useless accounting and mapping iteration.  I can note that in the
> > > commit log.  Thanks,  
> 
> I agree that in all cases it's a corner case and trivial, and the
> userspace caller should anyway do something to release its memory
> accounting.

There's actually no userspace visible change.  Before or after, if the
user attempts to map more pages than their locked memory limit, they
get -ENOMEM.  The difference is only whether we handle the condition
lazily by completing pinning and mapping up to the limit, before
failing and tearing down at the next pass, or undo the work of the
current pass and return error to teardown any previous work.  Either
path all happens within the mapping ioctl.
 
> Thanks for addressing my comments and replied with full detail!

Thanks for reviewing!

Alex
diff mbox

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 32d2633092a3..b799edbb8c4f 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -246,69 +246,45 @@  static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
 	return ret;
 }
 
-struct vwork {
-	struct mm_struct	*mm;
-	long			npage;
-	struct work_struct	work;
-};
-
-/* delayed decrement/increment for locked_vm */
-static void vfio_lock_acct_bg(struct work_struct *work)
+static int vfio_lock_acct(struct task_struct *task, long npage)
 {
-	struct vwork *vwork = container_of(work, struct vwork, work);
-	struct mm_struct *mm;
-
-	mm = vwork->mm;
-	down_write(&mm->mmap_sem);
-	mm->locked_vm += vwork->npage;
-	up_write(&mm->mmap_sem);
-	mmput(mm);
-	kfree(vwork);
-}
-
-static void vfio_lock_acct(struct task_struct *task, long npage)
-{
-	struct vwork *vwork;
 	struct mm_struct *mm;
 	bool is_current;
+	int ret;
 
 	if (!npage)
-		return;
+		return 0;
 
 	is_current = (task->mm == current->mm);
 
 	mm = is_current ? task->mm : get_task_mm(task);
 	if (!mm)
-		return; /* process exited */
+		return -ESRCH; /* process exited */
 
-	if (down_write_trylock(&mm->mmap_sem)) {
-		mm->locked_vm += npage;
-		up_write(&mm->mmap_sem);
-		if (!is_current)
-			mmput(mm);
-		return;
-	}
+	ret = down_write_killable(&mm->mmap_sem);
+	if (!ret) {
+		if (npage < 0) {
+			mm->locked_vm += npage;
+		} else {
+			unsigned long limit;
+
+			limit = is_current ?
+				rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT :
+				task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+			if (mm->locked_vm + npage <= limit)
+				mm->locked_vm += npage;
+			else
+				ret = -ENOMEM;
+		}
 
-	if (is_current) {
-		mm = get_task_mm(task);
-		if (!mm)
-			return;
+		up_write(&mm->mmap_sem);
 	}
 
-	/*
-	 * Couldn't get mmap_sem lock, so must setup to update
-	 * mm->locked_vm later. If locked_vm were atomic, we
-	 * wouldn't need this silliness
-	 */
-	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
-	if (WARN_ON(!vwork)) {
+	if (!is_current)
 		mmput(mm);
-		return;
-	}
-	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
-	vwork->mm = mm;
-	vwork->npage = npage;
-	schedule_work(&vwork->work);
+
+	return ret;
 }
 
 /*
@@ -405,7 +381,7 @@  static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 				  long npage, unsigned long *pfn_base)
 {
-	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	bool lock_cap = capable(CAP_IPC_LOCK);
 	long ret, pinned = 0, lock_acct = 0;
 	bool rsvd;
@@ -442,8 +418,6 @@  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	/* Lock all the consecutive pages from pfn_base */
 	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
 	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
-		unsigned long pfn = 0;
-
 		ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
 		if (ret)
 			break;
@@ -460,14 +434,25 @@  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 				put_pfn(pfn, dma->prot);
 				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 					__func__, limit << PAGE_SHIFT);
-				break;
+				ret = -ENOMEM;
+				goto unpin_out;
 			}
 			lock_acct++;
 		}
 	}
 
 out:
-	vfio_lock_acct(current, lock_acct);
+	ret = vfio_lock_acct(current, lock_acct);
+
+unpin_out:
+	if (ret) {
+		if (!rsvd) {
+			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
+				put_pfn(pfn, dma->prot);
+		}
+
+		return ret;
+	}
 
 	return pinned;
 }
@@ -522,8 +507,14 @@  static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
 		goto pin_page_exit;
 	}
 
-	if (!rsvd && do_accounting)
-		vfio_lock_acct(dma->task, 1);
+	if (!rsvd && do_accounting) {
+		ret = vfio_lock_acct(dma->task, 1);
+		if (ret) {
+			put_pfn(*pfn_base, dma->prot);
+			goto pin_page_exit;
+		}
+	}
+
 	ret = 1;
 
 pin_page_exit: