diff mbox series

[V2,9/9] vfio/type1: block on invalid vaddr

Message ID 1611078509-181959-10-git-send-email-steven.sistare@oracle.com (mailing list archive)
State New, archived
Headers show
Series vfio virtual address update | expand

Commit Message

Steve Sistare Jan. 19, 2021, 5:48 p.m. UTC
Block translation of host virtual address while an iova range has an
invalid vaddr.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 drivers/vfio/vfio_iommu_type1.c | 83 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 76 insertions(+), 7 deletions(-)

Comments

Alex Williamson Jan. 22, 2021, 10:59 p.m. UTC | #1
On Tue, 19 Jan 2021 09:48:29 -0800
Steve Sistare <steven.sistare@oracle.com> wrote:

> Block translation of host virtual address while an iova range has an
> invalid vaddr.
> 
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 83 +++++++++++++++++++++++++++++++++++++----
>  1 file changed, 76 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 0167996..c97573a 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -31,6 +31,7 @@
>  #include <linux/rbtree.h>
>  #include <linux/sched/signal.h>
>  #include <linux/sched/mm.h>
> +#include <linux/kthread.h>
>  #include <linux/slab.h>
>  #include <linux/uaccess.h>
>  #include <linux/vfio.h>
> @@ -75,6 +76,7 @@ struct vfio_iommu {
>  	bool			dirty_page_tracking;
>  	bool			pinned_page_dirty_scope;
>  	bool			controlled;
> +	wait_queue_head_t	vaddr_wait;
>  };
>  
>  struct vfio_domain {
> @@ -145,6 +147,8 @@ struct vfio_regions {
>  #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
>  #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
>  
> +#define WAITED 1
> +
>  static int put_pfn(unsigned long pfn, int prot);
>  
>  static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
> @@ -505,6 +509,52 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>  }
>  
>  /*
> + * Wait for vaddr of *dma_p to become valid.  iommu lock is dropped if the task
> + * waits, but is re-locked on return.  If the task waits, then return an updated
> + * dma struct in *dma_p.  Return 0 on success with no waiting, 1 on success if

s/1/WAITED/

> + * waited, and -errno on error.
> + */
> +static int vfio_vaddr_wait(struct vfio_iommu *iommu, struct vfio_dma **dma_p)
> +{
> +	struct vfio_dma *dma = *dma_p;
> +	unsigned long iova = dma->iova;
> +	size_t size = dma->size;
> +	int ret = 0;
> +	DEFINE_WAIT(wait);
> +
> +	while (!dma->vaddr_valid) {
> +		ret = WAITED;
> +		prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
> +		mutex_unlock(&iommu->lock);
> +		schedule();
> +		mutex_lock(&iommu->lock);
> +		finish_wait(&iommu->vaddr_wait, &wait);
> +		if (kthread_should_stop() || !iommu->controlled ||
> +		    fatal_signal_pending(current)) {
> +			return -EFAULT;
> +		}
> +		*dma_p = dma = vfio_find_dma(iommu, iova, size);
> +		if (!dma)
> +			return -EINVAL;
> +	}
> +	return ret;
> +}
> +
> +/*
> + * Find dma struct and wait for its vaddr to be valid.  iommu lock is dropped
> + * if the task waits, but is re-locked on return.  Return result in *dma_p.
> + * Return 0 on success, 1 on success if waited, and -errno on error.
> + */
> +static int vfio_find_vaddr(struct vfio_iommu *iommu, dma_addr_t start,
> +			   size_t size, struct vfio_dma **dma_p)

more of a vfio_find_dma_valid()

> +{
> +	*dma_p = vfio_find_dma(iommu, start, size);
> +	if (!*dma_p)
> +		return -EINVAL;
> +	return vfio_vaddr_wait(iommu, dma_p);
> +}
> +
> +/*
>   * Attempt to pin pages.  We really don't want to track all the pfns and
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>   * first page and all consecutive pages with the same locking.
> @@ -693,11 +743,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  		struct vfio_pfn *vpfn;
>  
>  		iova = user_pfn[i] << PAGE_SHIFT;
> -		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
> -		if (!dma) {
> -			ret = -EINVAL;
> +		ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma);
> +		if (ret < 0)
>  			goto pin_unwind;
> -		}
> +		else if (ret == WAITED)
> +			do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);

We're iterating through an array of pfns to provide translations, once
we've released the lock it's not just the current one that could be
invalid.  I'm afraid we need to unwind and try again, but I think it's
actually worse than that because if we've marked pages within a
vfio_dma's pfn_list as pinned, then during the locking gap it gets
unmapped, the unmap path will call the unmap notifier chain to release
the page that the vendor driver doesn't have yet.  Yikes!

>  
>  		if ((dma->prot & prot) != prot) {
>  			ret = -EPERM;
> @@ -1496,6 +1546,22 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	int ret;
>  
> +	/*
> +	 * Wait for all vaddrs to be valid so they can be used in the main loop.
> +	 * If we do wait, the lock was dropped and re-taken, so start over to
> +	 * ensure the dma list is consistent.
> +	 */
> +again:
> +	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> +
> +		ret = vfio_vaddr_wait(iommu, &dma);
> +		if (ret < 0)
> +			return ret;
> +		else if (ret == WAITED)
> +			goto again;
> +	}

This "do nothing until all the vaddrs are valid" approach could work
above too, but gosh is that a lot of cache unfriendly work for a rare
invalidation.  Thanks,

Alex

> +
>  	/* Arbitrarily pick the first domain in the list for lookups */
>  	if (!list_empty(&iommu->domain_list))
>  		d = list_first_entry(&iommu->domain_list,
> @@ -2522,6 +2588,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	iommu->controlled = true;
>  	mutex_init(&iommu->lock);
>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
> +	init_waitqueue_head(&iommu->vaddr_wait);
>  
>  	return iommu;
>  }
> @@ -2972,12 +3039,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>  	struct vfio_dma *dma;
>  	bool kthread = current->mm == NULL;
>  	size_t offset;
> +	int ret;
>  
>  	*copied = 0;
>  
> -	dma = vfio_find_dma(iommu, user_iova, 1);
> -	if (!dma)
> -		return -EINVAL;
> +	ret = vfio_find_vaddr(iommu, user_iova, 1, &dma);
> +	if (ret < 0)
> +		return ret;
>  
>  	if ((write && !(dma->prot & IOMMU_WRITE)) ||
>  			!(dma->prot & IOMMU_READ))
> @@ -3055,6 +3123,7 @@ static void vfio_iommu_type1_notify(void *iommu_data, unsigned int event,
>  	mutex_lock(&iommu->lock);
>  	iommu->controlled = false;
>  	mutex_unlock(&iommu->lock);
> +	wake_up_all(&iommu->vaddr_wait);
>  }
>  
>  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
Steve Sistare Jan. 27, 2021, 11:25 p.m. UTC | #2
On 1/22/2021 5:59 PM, Alex Williamson wrote:
> On Tue, 19 Jan 2021 09:48:29 -0800
> Steve Sistare <steven.sistare@oracle.com> wrote:
> 
>> Block translation of host virtual address while an iova range has an
>> invalid vaddr.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>>  drivers/vfio/vfio_iommu_type1.c | 83 +++++++++++++++++++++++++++++++++++++----
>>  1 file changed, 76 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 0167996..c97573a 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -31,6 +31,7 @@
>>  #include <linux/rbtree.h>
>>  #include <linux/sched/signal.h>
>>  #include <linux/sched/mm.h>
>> +#include <linux/kthread.h>
>>  #include <linux/slab.h>
>>  #include <linux/uaccess.h>
>>  #include <linux/vfio.h>
>> @@ -75,6 +76,7 @@ struct vfio_iommu {
>>  	bool			dirty_page_tracking;
>>  	bool			pinned_page_dirty_scope;
>>  	bool			controlled;
>> +	wait_queue_head_t	vaddr_wait;
>>  };
>>  
>>  struct vfio_domain {
>> @@ -145,6 +147,8 @@ struct vfio_regions {
>>  #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
>>  #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
>>  
>> +#define WAITED 1
>> +
>>  static int put_pfn(unsigned long pfn, int prot);
>>  
>>  static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
>> @@ -505,6 +509,52 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>>  }
>>  
>>  /*
>> + * Wait for vaddr of *dma_p to become valid.  iommu lock is dropped if the task
>> + * waits, but is re-locked on return.  If the task waits, then return an updated
>> + * dma struct in *dma_p.  Return 0 on success with no waiting, 1 on success if
> 
> s/1/WAITED/

OK, but the WAITED state will not need to be returned in the new scheme below.

>> + * waited, and -errno on error.
>> + */
>> +static int vfio_vaddr_wait(struct vfio_iommu *iommu, struct vfio_dma **dma_p)
>> +{
>> +	struct vfio_dma *dma = *dma_p;
>> +	unsigned long iova = dma->iova;
>> +	size_t size = dma->size;
>> +	int ret = 0;
>> +	DEFINE_WAIT(wait);
>> +
>> +	while (!dma->vaddr_valid) {
>> +		ret = WAITED;
>> +		prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
>> +		mutex_unlock(&iommu->lock);
>> +		schedule();
>> +		mutex_lock(&iommu->lock);
>> +		finish_wait(&iommu->vaddr_wait, &wait);
>> +		if (kthread_should_stop() || !iommu->controlled ||
>> +		    fatal_signal_pending(current)) {
>> +			return -EFAULT;
>> +		}
>> +		*dma_p = dma = vfio_find_dma(iommu, iova, size);
>> +		if (!dma)
>> +			return -EINVAL;
>> +	}
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Find dma struct and wait for its vaddr to be valid.  iommu lock is dropped
>> + * if the task waits, but is re-locked on return.  Return result in *dma_p.
>> + * Return 0 on success, 1 on success if waited, and -errno on error.
>> + */
>> +static int vfio_find_vaddr(struct vfio_iommu *iommu, dma_addr_t start,
>> +			   size_t size, struct vfio_dma **dma_p)
> 
> more of a vfio_find_dma_valid()

I will slightly modify and rename this with the new scheme I describe below.

>> +{
>> +	*dma_p = vfio_find_dma(iommu, start, size);
>> +	if (!*dma_p)
>> +		return -EINVAL;
>> +	return vfio_vaddr_wait(iommu, dma_p);
>> +}
>> +
>> +/*
>>   * Attempt to pin pages.  We really don't want to track all the pfns and
>>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>>   * first page and all consecutive pages with the same locking.
>> @@ -693,11 +743,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>  		struct vfio_pfn *vpfn;
>>  
>>  		iova = user_pfn[i] << PAGE_SHIFT;
>> -		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
>> -		if (!dma) {
>> -			ret = -EINVAL;
>> +		ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma);
>> +		if (ret < 0)
>>  			goto pin_unwind;
>> -		}
>> +		else if (ret == WAITED)
>> +			do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> 
> We're iterating through an array of pfns to provide translations, once
> we've released the lock it's not just the current one that could be
> invalid.  I'm afraid we need to unwind and try again, but I think it's
> actually worse than that because if we've marked pages within a
> vfio_dma's pfn_list as pinned, then during the locking gap it gets
> unmapped, the unmap path will call the unmap notifier chain to release
> the page that the vendor driver doesn't have yet.  Yikes!

Yikes indeed.  The fix is easy, though.  I will maintain a count in vfio_iommu of 
vfio_dma objects with invalid vaddr's, modified and tested while holding the lock, 
provide a function to wait for the count to become 0, and call that function at the 
start of vfio_iommu_type1_pin_pages and vfio_iommu_replay.  I will use iommu->vaddr_wait 
for wait and wake.

>>  		if ((dma->prot & prot) != prot) {
>>  			ret = -EPERM;
>> @@ -1496,6 +1546,22 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>>  	int ret;
>>  
>> +	/*
>> +	 * Wait for all vaddr to be valid so they can be used in the main loop.
>> +	 * If we do wait, the lock was dropped and re-taken, so start over to
>> +	 * ensure the dma list is consistent.
>> +	 */
>> +again:
>> +	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>> +
>> +		ret = vfio_vaddr_wait(iommu, &dma);
>> +		if (ret < 0)
>> +			return ret;
>> +		else if (ret == WAITED)
>> +			goto again;
>> +	}
> 
> This "do nothing until all the vaddrs are valid" approach could work
> above too, but gosh is that a lot of cache unfriendly work for a rare
> invalidation.  Thanks,

The new wait function described above is fast in the common case, just a check that
the invalid vaddr count is 0.

- Steve

>> +
>>  	/* Arbitrarily pick the first domain in the list for lookups */
>>  	if (!list_empty(&iommu->domain_list))
>>  		d = list_first_entry(&iommu->domain_list,
>> @@ -2522,6 +2588,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>>  	iommu->controlled = true;
>>  	mutex_init(&iommu->lock);
>>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
>> +	init_waitqueue_head(&iommu->vaddr_wait);
>>  
>>  	return iommu;
>>  }
>> @@ -2972,12 +3039,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>>  	struct vfio_dma *dma;
>>  	bool kthread = current->mm == NULL;
>>  	size_t offset;
>> +	int ret;
>>  
>>  	*copied = 0;
>>  
>> -	dma = vfio_find_dma(iommu, user_iova, 1);
>> -	if (!dma)
>> -		return -EINVAL;
>> +	ret = vfio_find_vaddr(iommu, user_iova, 1, &dma);
>> +	if (ret < 0)
>> +		return ret;
>>  
>>  	if ((write && !(dma->prot & IOMMU_WRITE)) ||
>>  			!(dma->prot & IOMMU_READ))
>> @@ -3055,6 +3123,7 @@ static void vfio_iommu_type1_notify(void *iommu_data, unsigned int event,
>>  	mutex_lock(&iommu->lock);
>>  	iommu->controlled = false;
>>  	mutex_unlock(&iommu->lock);
>> +	wake_up_all(&iommu->vaddr_wait);
>>  }
>>  
>>  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
>
Alex Williamson Jan. 28, 2021, 12:03 a.m. UTC | #3
On Wed, 27 Jan 2021 18:25:13 -0500
Steven Sistare <steven.sistare@oracle.com> wrote:

> On 1/22/2021 5:59 PM, Alex Williamson wrote:
> > On Tue, 19 Jan 2021 09:48:29 -0800
> > Steve Sistare <steven.sistare@oracle.com> wrote:
> >   
> >> Block translation of host virtual address while an iova range has an
> >> invalid vaddr.
> >>
> >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> >> ---
> >>  drivers/vfio/vfio_iommu_type1.c | 83 +++++++++++++++++++++++++++++++++++++----
> >>  1 file changed, 76 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 0167996..c97573a 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -31,6 +31,7 @@
> >>  #include <linux/rbtree.h>
> >>  #include <linux/sched/signal.h>
> >>  #include <linux/sched/mm.h>
> >> +#include <linux/kthread.h>
> >>  #include <linux/slab.h>
> >>  #include <linux/uaccess.h>
> >>  #include <linux/vfio.h>
> >> @@ -75,6 +76,7 @@ struct vfio_iommu {
> >>  	bool			dirty_page_tracking;
> >>  	bool			pinned_page_dirty_scope;
> >>  	bool			controlled;
> >> +	wait_queue_head_t	vaddr_wait;
> >>  };
> >>  
> >>  struct vfio_domain {
> >> @@ -145,6 +147,8 @@ struct vfio_regions {
> >>  #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
> >>  #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> >>  
> >> +#define WAITED 1
> >> +
> >>  static int put_pfn(unsigned long pfn, int prot);
> >>  
> >>  static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
> >> @@ -505,6 +509,52 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> >>  }
> >>  
> >>  /*
> >> + * Wait for vaddr of *dma_p to become valid.  iommu lock is dropped if the task
> >> + * waits, but is re-locked on return.  If the task waits, then return an updated
> >> + * dma struct in *dma_p.  Return 0 on success with no waiting, 1 on success if  
> > 
> > s/1/WAITED/  
> 
> OK, but the WAITED state will not need to be returned in the new scheme below.
> 
> >> + * waited, and -errno on error.
> >> + */
> >> +static int vfio_vaddr_wait(struct vfio_iommu *iommu, struct vfio_dma **dma_p)
> >> +{
> >> +	struct vfio_dma *dma = *dma_p;
> >> +	unsigned long iova = dma->iova;
> >> +	size_t size = dma->size;
> >> +	int ret = 0;
> >> +	DEFINE_WAIT(wait);
> >> +
> >> +	while (!dma->vaddr_valid) {
> >> +		ret = WAITED;
> >> +		prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
> >> +		mutex_unlock(&iommu->lock);
> >> +		schedule();
> >> +		mutex_lock(&iommu->lock);
> >> +		finish_wait(&iommu->vaddr_wait, &wait);
> >> +		if (kthread_should_stop() || !iommu->controlled ||
> >> +		    fatal_signal_pending(current)) {
> >> +			return -EFAULT;
> >> +		}
> >> +		*dma_p = dma = vfio_find_dma(iommu, iova, size);
> >> +		if (!dma)
> >> +			return -EINVAL;
> >> +	}
> >> +	return ret;
> >> +}
> >> +
> >> +/*
> >> + * Find dma struct and wait for its vaddr to be valid.  iommu lock is dropped
> >> + * if the task waits, but is re-locked on return.  Return result in *dma_p.
> >> + * Return 0 on success, 1 on success if waited, and -errno on error.
> >> + */
> >> +static int vfio_find_vaddr(struct vfio_iommu *iommu, dma_addr_t start,
> >> +			   size_t size, struct vfio_dma **dma_p)  
> > 
> > more of a vfio_find_dma_valid()  
> 
> I will slightly modify and rename this with the new scheme I describe below.
> 
> >> +{
> >> +	*dma_p = vfio_find_dma(iommu, start, size);
> >> +	if (!*dma_p)
> >> +		return -EINVAL;
> >> +	return vfio_vaddr_wait(iommu, dma_p);
> >> +}
> >> +
> >> +/*
> >>   * Attempt to pin pages.  We really don't want to track all the pfns and
> >>   * the iommu can only map chunks of consecutive pfns anyway, so get the
> >>   * first page and all consecutive pages with the same locking.
> >> @@ -693,11 +743,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>  		struct vfio_pfn *vpfn;
> >>  
> >>  		iova = user_pfn[i] << PAGE_SHIFT;
> >> -		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
> >> -		if (!dma) {
> >> -			ret = -EINVAL;
> >> +		ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma);
> >> +		if (ret < 0)
> >>  			goto pin_unwind;
> >> -		}
> >> +		else if (ret == WAITED)
> >> +			do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);  
> > 
> > We're iterating through an array of pfns to provide translations, once
> > we've released the lock it's not just the current one that could be
> > invalid.  I'm afraid we need to unwind and try again, but I think it's
> > actually worse than that because if we've marked pages within a
> > vfio_dma's pfn_list as pinned, then during the locking gap it gets
> > unmapped, the unmap path will call the unmap notifier chain to release
> > the page that the vendor driver doesn't have yet.  Yikes!  
> 
> Yikes indeed.  The fix is easy, though.  I will maintain a count in vfio_iommu of 
> vfio_dma objects with invalid vaddr's, modified and tested while holding the lock, 
> provide a function to wait for the count to become 0, and call that function at the 
> start of vfio_iommu_type1_pin_pages and vfio_iommu_replay.  I will use iommu->vaddr_wait 
> for wait and wake.

I prefer the overhead of this, but the resulting behavior seems pretty
non-intuitive.  Any invalidated vaddr blocks all vaddr use cases, which
almost suggests the unmap _VADDR flag should only be allowed with the
_ALL flag, but then the map _VADDR flag can only be per mapping, which
would make accounting and recovering from _VADDR + _ALL pretty
complicated.  Thanks,

Alex

> >>  		if ((dma->prot & prot) != prot) {
> >>  			ret = -EPERM;
> >> @@ -1496,6 +1546,22 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> >>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >>  	int ret;
> >>  
> >> +	/*
> >> +	 * Wait for all vaddr to be valid so they can be used in the main loop.
> >> +	 * If we do wait, the lock was dropped and re-taken, so start over to
> >> +	 * ensure the dma list is consistent.
> >> +	 */
> >> +again:
> >> +	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
> >> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >> +
> >> +		ret = vfio_vaddr_wait(iommu, &dma);
> >> +		if (ret < 0)
> >> +			return ret;
> >> +		else if (ret == WAITED)
> >> +			goto again;
> >> +	}  
> > 
> > This "do nothing until all the vaddrs are valid" approach could work
> > above too, but gosh is that a lot of cache unfriendly work for a rare
> > invalidation.  Thanks,  
> 
> The new wait function described above is fast in the common case, just a check that
> the invalid vaddr count is 0.
> 
> - Steve
> 
> >> +
> >>  	/* Arbitrarily pick the first domain in the list for lookups */
> >>  	if (!list_empty(&iommu->domain_list))
> >>  		d = list_first_entry(&iommu->domain_list,
> >> @@ -2522,6 +2588,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
> >>  	iommu->controlled = true;
> >>  	mutex_init(&iommu->lock);
> >>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
> >> +	init_waitqueue_head(&iommu->vaddr_wait);
> >>  
> >>  	return iommu;
> >>  }
> >> @@ -2972,12 +3039,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> >>  	struct vfio_dma *dma;
> >>  	bool kthread = current->mm == NULL;
> >>  	size_t offset;
> >> +	int ret;
> >>  
> >>  	*copied = 0;
> >>  
> >> -	dma = vfio_find_dma(iommu, user_iova, 1);
> >> -	if (!dma)
> >> -		return -EINVAL;
> >> +	ret = vfio_find_vaddr(iommu, user_iova, 1, &dma);
> >> +	if (ret < 0)
> >> +		return ret;
> >>  
> >>  	if ((write && !(dma->prot & IOMMU_WRITE)) ||
> >>  			!(dma->prot & IOMMU_READ))
> >> @@ -3055,6 +3123,7 @@ static void vfio_iommu_type1_notify(void *iommu_data, unsigned int event,
> >>  	mutex_lock(&iommu->lock);
> >>  	iommu->controlled = false;
> >>  	mutex_unlock(&iommu->lock);
> >> +	wake_up_all(&iommu->vaddr_wait);
> >>  }
> >>  
> >>  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {  
> >   
>
Alex Williamson Jan. 28, 2021, 5:07 p.m. UTC | #4
On Wed, 27 Jan 2021 17:03:25 -0700
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Wed, 27 Jan 2021 18:25:13 -0500
> Steven Sistare <steven.sistare@oracle.com> wrote:
> 
> > On 1/22/2021 5:59 PM, Alex Williamson wrote:  
> > > On Tue, 19 Jan 2021 09:48:29 -0800
> > > Steve Sistare <steven.sistare@oracle.com> wrote:
> > >     
> > >> Block translation of host virtual address while an iova range has an
> > >> invalid vaddr.
> > >>
> > >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> > >> ---
> > >>  drivers/vfio/vfio_iommu_type1.c | 83 +++++++++++++++++++++++++++++++++++++----
> > >>  1 file changed, 76 insertions(+), 7 deletions(-)
> > >>
> > >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > >> index 0167996..c97573a 100644
> > >> --- a/drivers/vfio/vfio_iommu_type1.c
> > >> +++ b/drivers/vfio/vfio_iommu_type1.c
> > >> @@ -31,6 +31,7 @@
> > >>  #include <linux/rbtree.h>
> > >>  #include <linux/sched/signal.h>
> > >>  #include <linux/sched/mm.h>
> > >> +#include <linux/kthread.h>
> > >>  #include <linux/slab.h>
> > >>  #include <linux/uaccess.h>
> > >>  #include <linux/vfio.h>
> > >> @@ -75,6 +76,7 @@ struct vfio_iommu {
> > >>  	bool			dirty_page_tracking;
> > >>  	bool			pinned_page_dirty_scope;
> > >>  	bool			controlled;
> > >> +	wait_queue_head_t	vaddr_wait;
> > >>  };
> > >>  
> > >>  struct vfio_domain {
> > >> @@ -145,6 +147,8 @@ struct vfio_regions {
> > >>  #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
> > >>  #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> > >>  
> > >> +#define WAITED 1
> > >> +
> > >>  static int put_pfn(unsigned long pfn, int prot);
> > >>  
> > >>  static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
> > >> @@ -505,6 +509,52 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> > >>  }
> > >>  
> > >>  /*
> > >> + * Wait for vaddr of *dma_p to become valid.  iommu lock is dropped if the task
> > >> + * waits, but is re-locked on return.  If the task waits, then return an updated
> > >> + * dma struct in *dma_p.  Return 0 on success with no waiting, 1 on success if    
> > > 
> > > s/1/WAITED/    
> > 
> > OK, but the WAITED state will not need to be returned in the new scheme below.
> >   
> > >> + * waited, and -errno on error.
> > >> + */
> > >> +static int vfio_vaddr_wait(struct vfio_iommu *iommu, struct vfio_dma **dma_p)
> > >> +{
> > >> +	struct vfio_dma *dma = *dma_p;
> > >> +	unsigned long iova = dma->iova;
> > >> +	size_t size = dma->size;
> > >> +	int ret = 0;
> > >> +	DEFINE_WAIT(wait);
> > >> +
> > >> +	while (!dma->vaddr_valid) {
> > >> +		ret = WAITED;
> > >> +		prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
> > >> +		mutex_unlock(&iommu->lock);
> > >> +		schedule();
> > >> +		mutex_lock(&iommu->lock);
> > >> +		finish_wait(&iommu->vaddr_wait, &wait);
> > >> +		if (kthread_should_stop() || !iommu->controlled ||
> > >> +		    fatal_signal_pending(current)) {
> > >> +			return -EFAULT;
> > >> +		}
> > >> +		*dma_p = dma = vfio_find_dma(iommu, iova, size);
> > >> +		if (!dma)
> > >> +			return -EINVAL;
> > >> +	}
> > >> +	return ret;
> > >> +}
> > >> +
> > >> +/*
> > >> + * Find dma struct and wait for its vaddr to be valid.  iommu lock is dropped
> > >> + * if the task waits, but is re-locked on return.  Return result in *dma_p.
> > >> + * Return 0 on success, 1 on success if waited, and -errno on error.
> > >> + */
> > >> +static int vfio_find_vaddr(struct vfio_iommu *iommu, dma_addr_t start,
> > >> +			   size_t size, struct vfio_dma **dma_p)    
> > > 
> > > more of a vfio_find_dma_valid()    
> > 
> > I will slightly modify and rename this with the new scheme I describe below.
> >   
> > >> +{
> > >> +	*dma_p = vfio_find_dma(iommu, start, size);
> > >> +	if (!*dma_p)
> > >> +		return -EINVAL;
> > >> +	return vfio_vaddr_wait(iommu, dma_p);
> > >> +}
> > >> +
> > >> +/*
> > >>   * Attempt to pin pages.  We really don't want to track all the pfns and
> > >>   * the iommu can only map chunks of consecutive pfns anyway, so get the
> > >>   * first page and all consecutive pages with the same locking.
> > >> @@ -693,11 +743,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> > >>  		struct vfio_pfn *vpfn;
> > >>  
> > >>  		iova = user_pfn[i] << PAGE_SHIFT;
> > >> -		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
> > >> -		if (!dma) {
> > >> -			ret = -EINVAL;
> > >> +		ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma);
> > >> +		if (ret < 0)
> > >>  			goto pin_unwind;
> > >> -		}
> > >> +		else if (ret == WAITED)
> > >> +			do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);    
> > > 
> > > We're iterating through an array of pfns to provide translations, once
> > > we've released the lock it's not just the current one that could be
> > > invalid.  I'm afraid we need to unwind and try again, but I think it's
> > > actually worse than that because if we've marked pages within a
> > > vfio_dma's pfn_list as pinned, then during the locking gap it gets
> > > unmapped, the unmap path will call the unmap notifier chain to release
> > > the page that the vendor driver doesn't have yet.  Yikes!    
> > 
> > Yikes indeed.  The fix is easy, though.  I will maintain a count in vfio_iommu of 
> > vfio_dma objects with invalid vaddr's, modified and tested while holding the lock, 
> > provide a function to wait for the count to become 0, and call that function at the 
> > start of vfio_iommu_type1_pin_pages and vfio_iommu_replay.  I will use iommu->vaddr_wait 
> > for wait and wake.  
> 
> I prefer the overhead of this, but the resulting behavior seems pretty
> non-intuitive.  Any invalidated vaddr blocks all vaddr use cases, which
> almost suggests the unmap _VADDR flag should only be allowed with the
> _ALL flag, but then the map _VADDR flag can only be per mapping, which
> would make accounting and recovering from _VADDR + _ALL pretty
> complicated.  Thanks,

I wonder if there's a hybrid approach, a counter on the vfio_iommu
which when non-zero causes pin pages to pre-test vaddr on all required
vfio_dma objects, waiting and being woken on counter decrement to check
again.  Thanks,

Alex

> > >>  		if ((dma->prot & prot) != prot) {
> > >>  			ret = -EPERM;
> > >> @@ -1496,6 +1546,22 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> > >>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > >>  	int ret;
> > >>  
> > >> +	/*
> > >> +	 * Wait for all vaddr to be valid so they can be used in the main loop.
> > >> +	 * If we do wait, the lock was dropped and re-taken, so start over to
> > >> +	 * ensure the dma list is consistent.
> > >> +	 */
> > >> +again:
> > >> +	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
> > >> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > >> +
> > >> +		ret = vfio_vaddr_wait(iommu, &dma);
> > >> +		if (ret < 0)
> > >> +			return ret;
> > >> +		else if (ret == WAITED)
> > >> +			goto again;
> > >> +	}    
> > > 
> > > This "do nothing until all the vaddrs are valid" approach could work
> > > above too, but gosh is that a lot of cache unfriendly work for a rare
> > > invalidation.  Thanks,    
> > 
> > The new wait function described above is fast in the common case, just a check that
> > the invalid vaddr count is 0.
> > 
> > - Steve
> >   
> > >> +
> > >>  	/* Arbitrarily pick the first domain in the list for lookups */
> > >>  	if (!list_empty(&iommu->domain_list))
> > >>  		d = list_first_entry(&iommu->domain_list,
> > >> @@ -2522,6 +2588,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
> > >>  	iommu->controlled = true;
> > >>  	mutex_init(&iommu->lock);
> > >>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
> > >> +	init_waitqueue_head(&iommu->vaddr_wait);
> > >>  
> > >>  	return iommu;
> > >>  }
> > >> @@ -2972,12 +3039,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> > >>  	struct vfio_dma *dma;
> > >>  	bool kthread = current->mm == NULL;
> > >>  	size_t offset;
> > >> +	int ret;
> > >>  
> > >>  	*copied = 0;
> > >>  
> > >> -	dma = vfio_find_dma(iommu, user_iova, 1);
> > >> -	if (!dma)
> > >> -		return -EINVAL;
> > >> +	ret = vfio_find_vaddr(iommu, user_iova, 1, &dma);
> > >> +	if (ret < 0)
> > >> +		return ret;
> > >>  
> > >>  	if ((write && !(dma->prot & IOMMU_WRITE)) ||
> > >>  			!(dma->prot & IOMMU_READ))
> > >> @@ -3055,6 +3123,7 @@ static void vfio_iommu_type1_notify(void *iommu_data, unsigned int event,
> > >>  	mutex_lock(&iommu->lock);
> > >>  	iommu->controlled = false;
> > >>  	mutex_unlock(&iommu->lock);
> > >> +	wake_up_all(&iommu->vaddr_wait);
> > >>  }
> > >>  
> > >>  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {    
> > >     
> >   
>
Steve Sistare Jan. 28, 2021, 5:18 p.m. UTC | #5
On 1/28/2021 12:07 PM, Alex Williamson wrote:
> On Wed, 27 Jan 2021 17:03:25 -0700
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
>> On Wed, 27 Jan 2021 18:25:13 -0500
>> Steven Sistare <steven.sistare@oracle.com> wrote:
>>
>>> On 1/22/2021 5:59 PM, Alex Williamson wrote:  
>>>> On Tue, 19 Jan 2021 09:48:29 -0800
>>>> Steve Sistare <steven.sistare@oracle.com> wrote:
>>>>     
>>>>> Block translation of host virtual address while an iova range has an
>>>>> invalid vaddr.
>>>>>
>>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>>>> ---
>>>>>  drivers/vfio/vfio_iommu_type1.c | 83 +++++++++++++++++++++++++++++++++++++----
>>>>>  1 file changed, 76 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>>>> index 0167996..c97573a 100644
>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>>> @@ -31,6 +31,7 @@
>>>>>  #include <linux/rbtree.h>
>>>>>  #include <linux/sched/signal.h>
>>>>>  #include <linux/sched/mm.h>
>>>>> +#include <linux/kthread.h>
>>>>>  #include <linux/slab.h>
>>>>>  #include <linux/uaccess.h>
>>>>>  #include <linux/vfio.h>
>>>>> @@ -75,6 +76,7 @@ struct vfio_iommu {
>>>>>  	bool			dirty_page_tracking;
>>>>>  	bool			pinned_page_dirty_scope;
>>>>>  	bool			controlled;
>>>>> +	wait_queue_head_t	vaddr_wait;
>>>>>  };
>>>>>  
>>>>>  struct vfio_domain {
>>>>> @@ -145,6 +147,8 @@ struct vfio_regions {
>>>>>  #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
>>>>>  #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
>>>>>  
>>>>> +#define WAITED 1
>>>>> +
>>>>>  static int put_pfn(unsigned long pfn, int prot);
>>>>>  
>>>>>  static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
>>>>> @@ -505,6 +509,52 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>>>>>  }
>>>>>  
>>>>>  /*
>>>>> + * Wait for vaddr of *dma_p to become valid.  iommu lock is dropped if the task
>>>>> + * waits, but is re-locked on return.  If the task waits, then return an updated
>>>>> + * dma struct in *dma_p.  Return 0 on success with no waiting, 1 on success if    
>>>>
>>>> s/1/WAITED/    
>>>
>>> OK, but the WAITED state will not need to be returned in the new scheme below.
>>>   
>>>>> + * waited, and -errno on error.
>>>>> + */
>>>>> +static int vfio_vaddr_wait(struct vfio_iommu *iommu, struct vfio_dma **dma_p)
>>>>> +{
>>>>> +	struct vfio_dma *dma = *dma_p;
>>>>> +	unsigned long iova = dma->iova;
>>>>> +	size_t size = dma->size;
>>>>> +	int ret = 0;
>>>>> +	DEFINE_WAIT(wait);
>>>>> +
>>>>> +	while (!dma->vaddr_valid) {
>>>>> +		ret = WAITED;
>>>>> +		prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
>>>>> +		mutex_unlock(&iommu->lock);
>>>>> +		schedule();
>>>>> +		mutex_lock(&iommu->lock);
>>>>> +		finish_wait(&iommu->vaddr_wait, &wait);
>>>>> +		if (kthread_should_stop() || !iommu->controlled ||
>>>>> +		    fatal_signal_pending(current)) {
>>>>> +			return -EFAULT;
>>>>> +		}
>>>>> +		*dma_p = dma = vfio_find_dma(iommu, iova, size);
>>>>> +		if (!dma)
>>>>> +			return -EINVAL;
>>>>> +	}
>>>>> +	return ret;
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * Find dma struct and wait for its vaddr to be valid.  iommu lock is dropped
>>>>> + * if the task waits, but is re-locked on return.  Return result in *dma_p.
>>>>> + * Return 0 on success, 1 on success if waited,  and -errno on error.
>>>>> + */
>>>>> +static int vfio_find_vaddr(struct vfio_iommu *iommu, dma_addr_t start,
>>>>> +			   size_t size, struct vfio_dma **dma_p)    
>>>>
>>>> more of a vfio_find_dma_valid()    
>>>
>>> I will slightly modify and rename this with the new scheme I describe below.
>>>   
>>>>> +{
>>>>> +	*dma_p = vfio_find_dma(iommu, start, size);
>>>>> +	if (!*dma_p)
>>>>> +		return -EINVAL;
>>>>> +	return vfio_vaddr_wait(iommu, dma_p);
>>>>> +}
>>>>> +
>>>>> +/*
>>>>>   * Attempt to pin pages.  We really don't want to track all the pfns and
>>>>>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>>>>>   * first page and all consecutive pages with the same locking.
>>>>> @@ -693,11 +743,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>>>>  		struct vfio_pfn *vpfn;
>>>>>  
>>>>>  		iova = user_pfn[i] << PAGE_SHIFT;
>>>>> -		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
>>>>> -		if (!dma) {
>>>>> -			ret = -EINVAL;
>>>>> +		ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma);
>>>>> +		if (ret < 0)
>>>>>  			goto pin_unwind;
>>>>> -		}
>>>>> +		else if (ret == WAITED)
>>>>> +			do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);    
>>>>
>>>> We're iterating through an array of pfns to provide translations, once
>>>> we've released the lock it's not just the current one that could be
>>>> invalid.  I'm afraid we need to unwind and try again, but I think it's
>>>> actually worse than that because if we've marked pages within a
>>>> vfio_dma's pfn_list as pinned, then during the locking gap it gets
>>>> unmapped, the unmap path will call the unmap notifier chain to release
>>>> the page that the vendor driver doesn't have yet.  Yikes!    
>>>
>>> Yikes indeed.  The fix is easy, though.  I will maintain a count in vfio_iommu of 
vfio_dma objects with invalid vaddrs, modified and tested while holding the lock, 
>>> provide a function to wait for the count to become 0, and call that function at the 
>>> start of vfio_iommu_type1_pin_pages and vfio_iommu_replay.  I will use iommu->vaddr_wait 
>>> for wait and wake.  
>>
>> I prefer the overhead of this, but the resulting behavior seems pretty
>> non-intuitive.  Any invalidated vaddr blocks all vaddr use cases, which
>> almost suggests the unmap _VADDR flag should only be allowed with the
>> _ALL flag, but then the map _VADDR flag can only be per mapping, which
>> would make accounting and recovering from _VADDR + _ALL pretty
>> complicated.  Thanks,
> 
> I wonder if there's a hybrid approach, a counter on the vfio_iommu
> which when non-zero causes pin pages to pre-test vaddr on all required
> vfio_dma objects, waiting and being woken on counter decrement to check
> again.  Thanks,

Sounds good, thanks.

- Steve

>>>>>  		if ((dma->prot & prot) != prot) {
>>>>>  			ret = -EPERM;
>>>>> @@ -1496,6 +1546,22 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>>>>>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>>>>>  	int ret;
>>>>>  
>>>>> +	/*
>>>>> +	 * Wait for all vaddr to be valid so they can be used in the main loop.
>>>>> +	 * If we do wait, the lock was dropped and re-taken, so start over to
>>>>> +	 * ensure the dma list is consistent.
>>>>> +	 */
>>>>> +again:
>>>>> +	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
>>>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>>>>> +
>>>>> +		ret = vfio_vaddr_wait(iommu, &dma);
>>>>> +		if (ret < 0)
>>>>> +			return ret;
>>>>> +		else if (ret == WAITED)
>>>>> +			goto again;
>>>>> +	}    
>>>>
>>>> This "do nothing until all the vaddrs are valid" approach could work
>>>> above too, but gosh is that a lot of cache unfriendly work for a rare
>>>> invalidation.  Thanks,    
>>>
>>> The new wait function described above is fast in the common case, just a check that
>>> the invalid vaddr count is 0.
>>>
>>> - Steve
>>>   
>>>>> +
>>>>>  	/* Arbitrarily pick the first domain in the list for lookups */
>>>>>  	if (!list_empty(&iommu->domain_list))
>>>>>  		d = list_first_entry(&iommu->domain_list,
>>>>> @@ -2522,6 +2588,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>>>>>  	iommu->controlled = true;
>>>>>  	mutex_init(&iommu->lock);
>>>>>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
>>>>> +	init_waitqueue_head(&iommu->vaddr_wait);
>>>>>  
>>>>>  	return iommu;
>>>>>  }
>>>>> @@ -2972,12 +3039,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>>>>>  	struct vfio_dma *dma;
>>>>>  	bool kthread = current->mm == NULL;
>>>>>  	size_t offset;
>>>>> +	int ret;
>>>>>  
>>>>>  	*copied = 0;
>>>>>  
>>>>> -	dma = vfio_find_dma(iommu, user_iova, 1);
>>>>> -	if (!dma)
>>>>> -		return -EINVAL;
>>>>> +	ret = vfio_find_vaddr(iommu, user_iova, 1, &dma);
>>>>> +	if (ret < 0)
>>>>> +		return ret;
>>>>>  
>>>>>  	if ((write && !(dma->prot & IOMMU_WRITE)) ||
>>>>>  			!(dma->prot & IOMMU_READ))
>>>>> @@ -3055,6 +3123,7 @@ static void vfio_iommu_type1_notify(void *iommu_data, unsigned int event,
>>>>>  	mutex_lock(&iommu->lock);
>>>>>  	iommu->controlled = false;
>>>>>  	mutex_unlock(&iommu->lock);
>>>>> +	wake_up_all(&iommu->vaddr_wait);
>>>>>  }
>>>>>  
>>>>>  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {    
>>>>     
>>>   
>>
>
diff mbox series

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0167996..c97573a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -31,6 +31,7 @@ 
 #include <linux/rbtree.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
+#include <linux/kthread.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
@@ -75,6 +76,7 @@  struct vfio_iommu {
 	bool			dirty_page_tracking;
 	bool			pinned_page_dirty_scope;
 	bool			controlled;
+	wait_queue_head_t	vaddr_wait;
 };
 
 struct vfio_domain {
@@ -145,6 +147,8 @@  struct vfio_regions {
 #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
 #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
 
+#define WAITED 1
+
 static int put_pfn(unsigned long pfn, int prot);
 
 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
@@ -505,6 +509,52 @@  static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
 }
 
 /*
+ * Wait for vaddr of *dma_p to become valid.  iommu lock is dropped if the task
+ * waits, but is re-locked on return.  If the task waits, then return an updated
+ * dma struct in *dma_p.  Return 0 on success with no waiting, 1 on success if
+ * waited, and -errno on error.
+ */
+static int vfio_vaddr_wait(struct vfio_iommu *iommu, struct vfio_dma **dma_p)
+{
+	struct vfio_dma *dma = *dma_p;
+	unsigned long iova = dma->iova;
+	size_t size = dma->size;
+	int ret = 0;
+	DEFINE_WAIT(wait);
+
+	while (!dma->vaddr_valid) {
+		ret = WAITED;
+		prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
+		mutex_unlock(&iommu->lock);
+		schedule();
+		mutex_lock(&iommu->lock);
+		finish_wait(&iommu->vaddr_wait, &wait);
+		if (kthread_should_stop() || !iommu->controlled ||
+		    fatal_signal_pending(current)) {
+			return -EFAULT;
+		}
+		*dma_p = dma = vfio_find_dma(iommu, iova, size);
+		if (!dma)
+			return -EINVAL;
+	}
+	return ret;
+}
+
+/*
+ * Find dma struct and wait for its vaddr to be valid.  iommu lock is dropped
+ * if the task waits, but is re-locked on return.  Return result in *dma_p.
+ * Return 0 on success, 1 on success if waited, and -errno on error.
+ */
+static int vfio_find_vaddr(struct vfio_iommu *iommu, dma_addr_t start,
+			   size_t size, struct vfio_dma **dma_p)
+{
+	*dma_p = vfio_find_dma(iommu, start, size);
+	if (!*dma_p)
+		return -EINVAL;
+	return vfio_vaddr_wait(iommu, dma_p);
+}
+
+/*
  * Attempt to pin pages.  We really don't want to track all the pfns and
  * the iommu can only map chunks of consecutive pfns anyway, so get the
  * first page and all consecutive pages with the same locking.
@@ -693,11 +743,11 @@  static int vfio_iommu_type1_pin_pages(void *iommu_data,
 		struct vfio_pfn *vpfn;
 
 		iova = user_pfn[i] << PAGE_SHIFT;
-		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
-		if (!dma) {
-			ret = -EINVAL;
+		ret = vfio_find_vaddr(iommu, iova, PAGE_SIZE, &dma);
+		if (ret < 0)
 			goto pin_unwind;
-		}
+		else if (ret == WAITED)
+			do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
 
 		if ((dma->prot & prot) != prot) {
 			ret = -EPERM;
@@ -1496,6 +1546,22 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	int ret;
 
+	/*
+	 * Wait for all vaddrs to be valid so they can be used in the main loop.
+	 * If we do wait, the lock was dropped and re-taken, so start over to
+	 * ensure the dma list is consistent.
+	 */
+again:
+	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+		ret = vfio_vaddr_wait(iommu, &dma);
+		if (ret < 0)
+			return ret;
+		else if (ret == WAITED)
+			goto again;
+	}
+
 	/* Arbitrarily pick the first domain in the list for lookups */
 	if (!list_empty(&iommu->domain_list))
 		d = list_first_entry(&iommu->domain_list,
@@ -2522,6 +2588,7 @@  static void *vfio_iommu_type1_open(unsigned long arg)
 	iommu->controlled = true;
 	mutex_init(&iommu->lock);
 	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
+	init_waitqueue_head(&iommu->vaddr_wait);
 
 	return iommu;
 }
@@ -2972,12 +3039,13 @@  static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
 	struct vfio_dma *dma;
 	bool kthread = current->mm == NULL;
 	size_t offset;
+	int ret;
 
 	*copied = 0;
 
-	dma = vfio_find_dma(iommu, user_iova, 1);
-	if (!dma)
-		return -EINVAL;
+	ret = vfio_find_vaddr(iommu, user_iova, 1, &dma);
+	if (ret < 0)
+		return ret;
 
 	if ((write && !(dma->prot & IOMMU_WRITE)) ||
 			!(dma->prot & IOMMU_READ))
@@ -3055,6 +3123,7 @@  static void vfio_iommu_type1_notify(void *iommu_data, unsigned int event,
 	mutex_lock(&iommu->lock);
 	iommu->controlled = false;
 	mutex_unlock(&iommu->lock);
+	wake_up_all(&iommu->vaddr_wait);
 }
 
 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {