
[3/3] VFIO Type1 IOMMU: Add support for mediated devices

Message ID 1466440308-4961-4-git-send-email-kwankhede@nvidia.com (mailing list archive)

Commit Message

Kirti Wankhede June 20, 2016, 4:31 p.m. UTC
VFIO Type1 IOMMU driver is designed for devices which are IOMMU
capable. A mediated device only uses the IOMMU Type1 API; the
underlying hardware can be managed by an IOMMU domain.

This change exports functions to pin and unpin pages for mediated
devices. It maintains data about the pinned pages for the mediated
domain; this data is used to verify unpinning requests and to unpin any
remaining pages from detach_group().

The aims of this change are:
- To reuse most of the IOMMU driver code for mediated devices
- To support directly assigned and mediated devices with a single module

Updated the change to keep the mediated domain structure out of
domain_list.

Tested by assigning the below combinations of devices to a single VM:
- GPU pass through only
- vGPU device only
- One GPU pass through and one vGPU device
- Two GPU pass through devices

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
---
 drivers/vfio/vfio_iommu_type1.c | 444 +++++++++++++++++++++++++++++++++++++---
 include/linux/vfio.h            |   6 +
 2 files changed, 418 insertions(+), 32 deletions(-)

Comments

Alex Williamson June 22, 2016, 3:46 a.m. UTC | #1
On Mon, 20 Jun 2016 22:01:48 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> VFIO Type1 IOMMU driver is designed for devices which are IOMMU
> capable. A mediated device only uses the IOMMU Type1 API; the
> underlying hardware can be managed by an IOMMU domain.
> 
> This change exports functions to pin and unpin pages for mediated
> devices. It maintains data about the pinned pages for the mediated
> domain; this data is used to verify unpinning requests and to unpin any
> remaining pages from detach_group().
> 
> The aims of this change are:
> - To reuse most of the IOMMU driver code for mediated devices
> - To support directly assigned and mediated devices with a single module
> 
> Updated the change to keep the mediated domain structure out of
> domain_list.
> 
> Tested by assigning the below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - Two GPU pass through devices
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> ---
>  drivers/vfio/vfio_iommu_type1.c | 444 +++++++++++++++++++++++++++++++++++++---
>  include/linux/vfio.h            |   6 +
>  2 files changed, 418 insertions(+), 32 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 75b24e93cedb..f17dd104fe27 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -36,6 +36,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/vfio.h>
>  #include <linux/workqueue.h>
> +#include <linux/mdev.h>
>  
>  #define DRIVER_VERSION  "0.2"
>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> @@ -55,6 +56,7 @@ MODULE_PARM_DESC(disable_hugepages,
>  
>  struct vfio_iommu {
>  	struct list_head	domain_list;
> +	struct vfio_domain	*mediated_domain;

I'm not really a fan of how this is so often used to special case the
code...

>  	struct mutex		lock;
>  	struct rb_root		dma_list;
>  	bool			v2;
> @@ -67,6 +69,13 @@ struct vfio_domain {
>  	struct list_head	group_list;
>  	int			prot;		/* IOMMU_CACHE */
>  	bool			fgsp;		/* Fine-grained super pages */
> +
> +	/* Domain for mediated device which is without physical IOMMU */
> +	bool			mediated_device;

But sometimes we use this to special case the code and other times we
use domain_list being empty.  I thought the argument against pulling
code out to a shared file was that this approach could be made
maintainable.

> +
> +	struct mm_struct	*mm;
> +	struct rb_root		pfn_list;	/* pinned Host pfn list */
> +	struct mutex		pfn_list_lock;	/* mutex for pfn_list */

Seems like we could reduce overhead for the existing use cases by just
adding a pointer here and making these last 3 entries part of the
structure that gets pointed to.  Existence of the pointer would replace
@mediated_device.
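
Just to illustrate, something along these lines (vfio_addr_space is a
made-up name):

struct vfio_addr_space {
	struct mm_struct	*mm;
	struct rb_root		pfn_list;	/* pinned Host pfn list */
	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
};

struct vfio_domain {
	struct iommu_domain	*domain;
	struct list_head	next;
	struct list_head	group_list;
	int			prot;		/* IOMMU_CACHE */
	bool			fgsp;		/* Fine-grained super pages */
	/* non-NULL only for the mediated case, replaces @mediated_device */
	struct vfio_addr_space	*addr_space;
};

Then a NULL test on the pointer takes the place of the flag.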

>  };
>  
>  struct vfio_dma {
> @@ -79,10 +88,26 @@ struct vfio_dma {
>  
>  struct vfio_group {
>  	struct iommu_group	*iommu_group;
> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)

Where does CONFIG_MDEV_MODULE come from?

Plus, all the #ifdefs... <cringe>

> +	struct mdev_device	*mdev;

This gets set on attach_group where we use the iommu_group to lookup
the mdev, so why can't we do that on the other paths that make use of
this?  I think this is just holding a reference.

> +#endif
>  	struct list_head	next;
>  };
>  
>  /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> +	struct rb_node		node;
> +	unsigned long		vaddr;		/* virtual addr */
> +	dma_addr_t		iova;		/* IOVA */
> +	unsigned long		npage;		/* number of pages */
> +	unsigned long		pfn;		/* Host pfn */
> +	size_t			prot;
> +	atomic_t		ref_count;
> +};
> +
> +/*
>   * This code handles mapping and unmapping of user data buffers
>   * into DMA'ble space using the IOMMU
>   */
> @@ -130,6 +155,64 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +/*
> + * Helper Functions for host pfn list
> + */
> +
> +static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
> +				      unsigned long pfn)
> +{
> +	struct rb_node *node;
> +	struct vfio_pfn *vpfn, *ret = NULL;
> +
> +	mutex_lock(&domain->pfn_list_lock);
> +	node = domain->pfn_list.rb_node;
> +
> +	while (node) {
> +		vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> +		if (pfn < vpfn->pfn)
> +			node = node->rb_left;
> +		else if (pfn > vpfn->pfn)
> +			node = node->rb_right;
> +		else {
> +			ret = vpfn;
> +			break;
> +		}
> +	}
> +
> +	mutex_unlock(&domain->pfn_list_lock);
> +	return ret;
> +}
> +
> +static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
> +{
> +	struct rb_node **link, *parent = NULL;
> +	struct vfio_pfn *vpfn;
> +
> +	mutex_lock(&domain->pfn_list_lock);
> +	link = &domain->pfn_list.rb_node;
> +	while (*link) {
> +		parent = *link;
> +		vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> +		if (new->pfn < vpfn->pfn)
> +			link = &(*link)->rb_left;
> +		else
> +			link = &(*link)->rb_right;
> +	}
> +
> +	rb_link_node(&new->node, parent, link);
> +	rb_insert_color(&new->node, &domain->pfn_list);
> +	mutex_unlock(&domain->pfn_list_lock);
> +}
> +
> +/* call by holding domain->pfn_list_lock */
> +static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
> +{
> +	rb_erase(&old->node, &domain->pfn_list);
> +}

Hmm, all the other pfn list interfaces lock themselves yet this one
requires a lock.  I think that should be called out by naming it
something like __vfio_unlink_pfn() rather than simply a comment.
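
Sketch of what I mean, which also lets lockdep enforce the rule:

/* caller must hold domain->pfn_list_lock */
static void __vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
{
	lockdep_assert_held(&domain->pfn_list_lock);
	rb_erase(&old->node, &domain->pfn_list);
}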

> +
>  struct vwork {
>  	struct mm_struct	*mm;
>  	long			npage;
> @@ -228,20 +311,29 @@ static int put_pfn(unsigned long pfn, int prot)
>  	return 0;
>  }
>  
> -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> +			 int prot, unsigned long *pfn)
>  {
>  	struct page *page[1];
>  	struct vm_area_struct *vma;
> +	struct mm_struct *local_mm = mm;
>  	int ret = -EFAULT;
>  
> -	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
> +	if (!local_mm && !current->mm)
> +		return -ENODEV;
> +
> +	if (!local_mm)
> +		local_mm = current->mm;

The above would be much more concise if we just initialized local_mm
as: mm ? mm : current->mm

> +
> +	down_read(&local_mm->mmap_sem);
> +	if (get_user_pages_remote(NULL, local_mm, vaddr, 1,
> +				!!(prot & IOMMU_WRITE), 0, page, NULL) == 1) {

Um, the comment for get_user_pages_remote says:

"See also get_user_pages_fast, for performance critical applications."

So what penalty are we imposing on the existing behavior of type1
here?  Previously we only needed to acquire mmap_sem if
get_user_pages_fast() didn't work, so the existing use case seems to be
compromised.

>  		*pfn = page_to_pfn(page[0]);
> -		return 0;
> +		ret = 0;
> +		goto done_pfn;
>  	}
>  
> -	down_read(&current->mm->mmap_sem);
> -
> -	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> +	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
>  
>  	if (vma && vma->vm_flags & VM_PFNMAP) {
>  		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> @@ -249,7 +341,8 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>  			ret = 0;
>  	}
>  
> -	up_read(&current->mm->mmap_sem);
> +done_pfn:
> +	up_read(&local_mm->mmap_sem);
>  
>  	return ret;
>  }
> @@ -259,18 +352,19 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>   * first page and all consecutive pages with the same locking.
>   */
> -static long vfio_pin_pages(unsigned long vaddr, long npage,
> -			   int prot, unsigned long *pfn_base)
> +static long vfio_pin_pages_internal(struct vfio_domain *domain,
> +				    unsigned long vaddr, long npage,
> +				    int prot, unsigned long *pfn_base)
>  {
>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	bool lock_cap = capable(CAP_IPC_LOCK);
>  	long ret, i;
>  	bool rsvd;
>  
> -	if (!current->mm)
> +	if (!domain)
>  		return -ENODEV;

This test doesn't make much sense to me.  The existing use case error
is again being deferred and the callers either seem like domain can't
be NULL or it's an exported function where we should be validating the
parameters before calling this function.

>  
> -	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
> +	ret = vaddr_get_pfn(domain->mm, vaddr, prot, pfn_base);
>  	if (ret)
>  		return ret;
>  
> @@ -293,7 +387,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
>  		unsigned long pfn = 0;
>  
> -		ret = vaddr_get_pfn(vaddr, prot, &pfn);
> +		ret = vaddr_get_pfn(domain->mm, vaddr, prot, &pfn);
>  		if (ret)
>  			break;
>  
> @@ -318,20 +412,165 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	return i;
>  }
>  
> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> -			     int prot, bool do_accounting)
> +static long vfio_unpin_pages_internal(struct vfio_domain *domain,
> +				      unsigned long pfn, long npage, int prot,
> +				      bool do_accounting)
>  {
>  	unsigned long unlocked = 0;
>  	long i;
>  
> +	if (!domain)
> +		return -ENODEV;
> +

Again, seems like validation of parameters should happen at the caller
in this case.

>  	for (i = 0; i < npage; i++)
>  		unlocked += put_pfn(pfn++, prot);
>  
>  	if (do_accounting)
>  		vfio_lock_acct(-unlocked);
> +	return unlocked;
> +}
> +
> +/*
> + * Pin a set of guest PFNs and return their associated host PFNs for API
> + * supported domain only.
> + * @vaddr [in]: array of guest PFNs

vfio is a userspace driver, never assume the userspace is a VM.  It's
also not really a vaddr since it's a frame number.  Please work on the
names.

> + * @npage [in]: count of array elements
> + * @prot [in] : protection flags
> + * @pfn_base[out] : array of host PFNs

phys_pfn maybe.

> + */
> +long vfio_pin_pages(void *iommu_data, dma_addr_t *vaddr, long npage,
> +		   int prot, dma_addr_t *pfn_base)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain = NULL;

Unnecessary initialization.

> +	int i = 0, ret = 0;

Same for these.

> +	long retpage;
> +	unsigned long remote_vaddr = 0;

And this.

> +	dma_addr_t *pfn = pfn_base;
> +	struct vfio_dma *dma;
> +
> +	if (!iommu || !vaddr || !pfn_base)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	if (!iommu->mediated_domain) {
> +		ret = -EINVAL;
> +		goto pin_done;
> +	}
> +
> +	domain = iommu->mediated_domain;

You're already validating domain here, which makes the test in
vfio_pin_pages_internal() really seem unnecessary.

> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p, *lpfn;
> +		unsigned long tpfn;
> +		dma_addr_t iova;
> +		long pg_cnt = 1;
> +
> +		iova = vaddr[i] << PAGE_SHIFT;
> +
> +		dma = vfio_find_dma(iommu, iova, 0 /*  size */);
> +		if (!dma) {
> +			ret = -EINVAL;
> +			goto pin_done;

All error paths need to unwind; if we return an error there should be
no state change, otherwise we're leaking pages.
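
For instance (rough sketch only, not the actual fix), the error paths
could jump to an unwind label instead of pin_done:

pin_unwind:
	/* drop everything pinned so far in this call before returning */
	while (i--) {
		struct vfio_pfn *p = vfio_find_pfn(domain, pfn[i]);

		if (p)
			vfio_unpin_pfn(domain, p, true);
	}
	mutex_unlock(&iommu->lock);
	return ret;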

> +		}
> +
> +		remote_vaddr = dma->vaddr + iova - dma->iova;
> +
> +		retpage = vfio_pin_pages_internal(domain, remote_vaddr,
> +						  pg_cnt, prot, &tpfn);
> +		if (retpage <= 0) {
> +			WARN_ON(!retpage);
> +			ret = (int)retpage;
> +			goto pin_done;

unwind

> +		}
> +
> +		pfn[i] = tpfn;
> +
> +		/* search if pfn exist */
> +		p = vfio_find_pfn(domain, tpfn);
> +		if (p) {
> +			atomic_inc(&p->ref_count);
> +			continue;
> +		}
> +
> +		/* add to pfn_list */
> +		lpfn = kzalloc(sizeof(*lpfn), GFP_KERNEL);
> +		if (!lpfn) {
> +			ret = -ENOMEM;
> +			goto pin_done;

unwind

> +		}
> +		lpfn->vaddr = remote_vaddr;
> +		lpfn->iova = iova;
> +		lpfn->pfn = pfn[i];
> +		lpfn->npage = 1;

Why do we need this variable if this can only track 1 page?

> +		lpfn->prot = prot;
> +		atomic_inc(&lpfn->ref_count);

atomic_set(), we want to set the ref count to 1, not increment.

> +		vfio_link_pfn(domain, lpfn);
> +	}
> +
> +	ret = i;
> +
> +pin_done:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfio_pin_pages);
> +
> +static int vfio_unpin_pfn(struct vfio_domain *domain,
> +			  struct vfio_pfn *vpfn, bool do_accounting)
> +{
> +	int ret;
> +
> +	ret = vfio_unpin_pages_internal(domain, vpfn->pfn, vpfn->npage,
> +					vpfn->prot, do_accounting);
> +
> +	if (ret > 0 && atomic_dec_and_test(&vpfn->ref_count)) {
> +		vfio_unlink_pfn(domain, vpfn);
> +		kfree(vpfn);
> +	}
> +
> +	return ret;
> +}
> +
> +/*
> + * Unpin set of host PFNs for API supported domain only.
> + * @pfn	[in] : array of host PFNs to be unpinned.
> + * @npage [in] :count of elements in array, that is number of pages.
> + * @prot [in] : protection flags

prot is unused; it's also saved in our pfn list if we did need it.  In
fact, should we compare prot after our vfio_find_pfn above to make sure
our existing pinned page has the right settings?
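
Something like this, perhaps (sketch):

		/* search if pfn exists; if so, its prot should match */
		p = vfio_find_pfn(domain, tpfn);
		if (p) {
			if ((p->prot & prot) != prot) {
				ret = -EPERM;
				goto pin_done;	/* plus the unwind above */
			}
			atomic_inc(&p->ref_count);
			continue;
		}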

> + */
> +long vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
> +		     int prot)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain = NULL;
> +	long unlocked = 0;
> +	int i;
> +
> +	if (!iommu || !pfn)
> +		return -EINVAL;
> +
> +	if (!iommu->mediated_domain)
> +		return -EINVAL;
> +
> +	domain = iommu->mediated_domain;

Again, domain is already validated here.

> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p;
> +
> +		/* verify if pfn exist in pfn_list */
> +		p = vfio_find_pfn(domain, *(pfn + i));

Why are we using array indexing above and array math here?  Were these
functions written by different people?

> +		if (!p)
> +			continue;

Hmm, this seems like more of a bad thing than a continue.

> +
> +		mutex_lock(&domain->pfn_list_lock);
> +		unlocked += vfio_unpin_pfn(domain, p, true);
> +		mutex_unlock(&domain->pfn_list_lock);

[huge red flag] the entire vfio_unpin_pfn path is called under
pfn_list_lock, but vfio_pin_pages only uses it sparsely.  Maybe someone
else did write this function.  I'll assume all locking here needs a
revisit.

> +	}
>  
>  	return unlocked;
>  }
> +EXPORT_SYMBOL(vfio_unpin_pages);
>  
>  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  {
> @@ -341,6 +580,10 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  
>  	if (!dma->size)
>  		return;
> +
> +	if (list_empty(&iommu->domain_list))
> +		return;

Huh?  This would be a serious consistency error if this happened for
the existing use case.

> +
>  	/*
>  	 * We use the IOMMU to track the physical addresses, otherwise we'd
>  	 * need a much more complicated tracking system.  Unfortunately that
> @@ -382,9 +625,10 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  		if (WARN_ON(!unmapped))
>  			break;
>  
> -		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
> -					     unmapped >> PAGE_SHIFT,
> -					     dma->prot, false);
> +		unlocked += vfio_unpin_pages_internal(domain,
> +						phys >> PAGE_SHIFT,
> +						unmapped >> PAGE_SHIFT,
> +						dma->prot, false);
>  		iova += unmapped;
>  
>  		cond_resched();
> @@ -517,6 +761,9 @@ static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
>  	long i;
>  	int ret;
>  
> +	if (domain->mediated_device)
> +		return -EINVAL;


Which is it going to be, mediated_device to flag these special domains
or domain_list empty, let's not use both.

> +
>  	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
>  		ret = iommu_map(domain->domain, iova,
>  				(phys_addr_t)pfn << PAGE_SHIFT,
> @@ -537,6 +784,9 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
>  	struct vfio_domain *d;
>  	int ret;
>  
> +	if (list_empty(&iommu->domain_list))
> +		return 0;
> +
>  	list_for_each_entry(d, &iommu->domain_list, next) {
>  		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
>  				npage << PAGE_SHIFT, prot | d->prot);
> @@ -569,6 +819,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	uint64_t mask;
>  	struct vfio_dma *dma;
>  	unsigned long pfn;
> +	struct vfio_domain *domain = NULL;
>  
>  	/* Verify that none of our __u64 fields overflow */
>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> @@ -611,10 +862,21 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	/* Insert zero-sized and grow as we map chunks of it */
>  	vfio_link_dma(iommu, dma);
>  
> +	/*
> +	 * Skip pin and map if and domain list is empty
> +	 */
> +	if (list_empty(&iommu->domain_list)) {
> +		dma->size = size;
> +		goto map_done;
> +	}

Again, this would be a serious consistency error for the existing use
case.  Let's use indicators that are explicit.

> +
> +	domain = list_first_entry(&iommu->domain_list,
> +				  struct vfio_domain, next);
> +
>  	while (size) {
>  		/* Pin a contiguous chunk of memory */
> -		npage = vfio_pin_pages(vaddr + dma->size,
> -				       size >> PAGE_SHIFT, prot, &pfn);
> +		npage = vfio_pin_pages_internal(domain, vaddr + dma->size,
> +						size >> PAGE_SHIFT, prot, &pfn);
>  		if (npage <= 0) {
>  			WARN_ON(!npage);
>  			ret = (int)npage;
> @@ -624,7 +886,8 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  		/* Map it! */
>  		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
>  		if (ret) {
> -			vfio_unpin_pages(pfn, npage, prot, true);
> +			vfio_unpin_pages_internal(domain, pfn, npage,
> +						  prot, true);
>  			break;
>  		}
>  
> @@ -635,6 +898,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	if (ret)
>  		vfio_remove_dma(iommu, dma);
>  
> +map_done:
>  	mutex_unlock(&iommu->lock);
>  	return ret;
>  }
> @@ -658,6 +922,9 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  	struct rb_node *n;
>  	int ret;
>  
> +	if (domain->mediated_device)
> +		return 0;

Though "mediated_device" is the user, not really the property of the
domain we're trying to support, which is more like track_and_pin_only.

> +
>  	/* Arbitrarily pick the first domain in the list for lookups */
>  	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
>  	n = rb_first(&iommu->dma_list);
> @@ -716,6 +983,9 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
>  	struct page *pages;
>  	int ret, order = get_order(PAGE_SIZE * 2);
>  
> +	if (domain->mediated_device)
> +		return;
> +
>  	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
>  	if (!pages)
>  		return;
> @@ -734,11 +1004,25 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
>  	__free_pages(pages, order);
>  }
>  
> +static struct vfio_group *is_iommu_group_present(struct vfio_domain *domain,
> +				   struct iommu_group *iommu_group)

is_foo is a yes/no answer, the return should be bool.  This is more of
a find_foo_from_bar

> +{
> +	struct vfio_group *g;
> +
> +	list_for_each_entry(g, &domain->group_list, next) {
> +		if (g->iommu_group != iommu_group)
> +			continue;
> +		return g;

Hmmm

if (g->iommu_group == iommu_group)
	return g;

> +	}
> +
> +	return NULL;
> +}
> +
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>  					 struct iommu_group *iommu_group)
>  {
>  	struct vfio_iommu *iommu = iommu_data;
> -	struct vfio_group *group, *g;
> +	struct vfio_group *group;
>  	struct vfio_domain *domain, *d;
>  	struct bus_type *bus = NULL;
>  	int ret;
> @@ -746,14 +1030,21 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	mutex_lock(&iommu->lock);
>  
>  	list_for_each_entry(d, &iommu->domain_list, next) {
> -		list_for_each_entry(g, &d->group_list, next) {
> -			if (g->iommu_group != iommu_group)
> -				continue;
> +		if (is_iommu_group_present(d, iommu_group)) {
> +			mutex_unlock(&iommu->lock);
> +			return -EINVAL;
> +		}
> +	}
>  
> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
> +	if (iommu->mediated_domain) {
> +		if (is_iommu_group_present(iommu->mediated_domain,
> +					   iommu_group)) {
>  			mutex_unlock(&iommu->lock);
>  			return -EINVAL;
>  		}
>  	}
> +#endif
>  
>  	group = kzalloc(sizeof(*group), GFP_KERNEL);
>  	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
> @@ -769,6 +1060,36 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_free;
>  
> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
> +	if (!iommu_present(bus) && (bus == &mdev_bus_type)) {
> +		struct mdev_device *mdev = NULL;

Unnecessary initialization.

> +
> +		mdev = mdev_get_device_by_group(iommu_group);
> +		if (!mdev)
> +			goto out_free;
> +
> +		mdev->iommu_data = iommu;

This looks rather sketchy to me, we don't have a mediated driver in
this series, but presumably the driver blindly calls vfio_pin_pages
passing mdev->iommu_data and hoping that it's either NULL to generate
an error or relevant to this iommu backend.  How would we add a second
mediated driver iommu backend?  We're currently assuming the user
configured this backend.  Should vfio_pin_pages instead have a struct
device* parameter from which we would lookup the iommu_group and get to
the vfio_domain?  That's a bit heavy weight, but we need something
along those lines.


> +		group->mdev = mdev;
> +
> +		if (iommu->mediated_domain) {
> +			list_add(&group->next,
> +				 &iommu->mediated_domain->group_list);
> +			kfree(domain);
> +			mutex_unlock(&iommu->lock);
> +			return 0;
> +		}
> +		domain->mediated_device = true;
> +		domain->mm = current->mm;
> +		INIT_LIST_HEAD(&domain->group_list);
> +		list_add(&group->next, &domain->group_list);
> +		domain->pfn_list = RB_ROOT;
> +		mutex_init(&domain->pfn_list_lock);
> +		iommu->mediated_domain = domain;
> +		mutex_unlock(&iommu->lock);
> +		return 0;
> +	}
> +#endif
> +
>  	domain->domain = iommu_domain_alloc(bus);
>  	if (!domain->domain) {
>  		ret = -EIO;
> @@ -859,6 +1180,20 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
>  		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
>  }
>  
> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
> +static void vfio_iommu_unpin_api_domain(struct vfio_domain *domain)
> +{
> +	struct rb_node *node;
> +
> +	mutex_lock(&domain->pfn_list_lock);
> +	while ((node = rb_first(&domain->pfn_list))) {
> +		vfio_unpin_pfn(domain,
> +				rb_entry(node, struct vfio_pfn, node), false);
> +	}
> +	mutex_unlock(&domain->pfn_list_lock);
> +}
> +#endif
> +
>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>  					  struct iommu_group *iommu_group)
>  {
> @@ -868,31 +1203,55 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  
>  	mutex_lock(&iommu->lock);
>  
> -	list_for_each_entry(domain, &iommu->domain_list, next) {
> -		list_for_each_entry(group, &domain->group_list, next) {
> -			if (group->iommu_group != iommu_group)
> -				continue;
> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
> +	if (iommu->mediated_domain) {
> +		domain = iommu->mediated_domain;
> +		group = is_iommu_group_present(domain, iommu_group);
> +		if (group) {
> +			if (group->mdev) {
> +				group->mdev->iommu_data = NULL;
> +				mdev_put_device(group->mdev);
> +			}
> +			list_del(&group->next);
> +			kfree(group);
> +
> +			if (list_empty(&domain->group_list)) {
> +				vfio_iommu_unpin_api_domain(domain);
> +
> +				if (list_empty(&iommu->domain_list))
> +					vfio_iommu_unmap_unpin_all(iommu);
> +
> +				kfree(domain);
> +				iommu->mediated_domain = NULL;
> +			}
> +		}
> +	}
> +#endif
>  
> +	list_for_each_entry(domain, &iommu->domain_list, next) {
> +		group = is_iommu_group_present(domain, iommu_group);
> +		if (group) {
>  			iommu_detach_group(domain->domain, iommu_group);
>  			list_del(&group->next);
>  			kfree(group);
>  			/*
>  			 * Group ownership provides privilege, if the group
>  			 * list is empty, the domain goes away.  If it's the
> -			 * last domain, then all the mappings go away too.
> +			 * last domain with iommu and API-only domain doesn't
> +			 * exist, the all the mappings go away too.
>  			 */
>  			if (list_empty(&domain->group_list)) {
> -				if (list_is_singular(&iommu->domain_list))
> +				if (list_is_singular(&iommu->domain_list) &&
> +				    !iommu->mediated_domain)
>  					vfio_iommu_unmap_unpin_all(iommu);
>  				iommu_domain_free(domain->domain);
>  				list_del(&domain->next);
>  				kfree(domain);
>  			}
> -			goto done;
> +			break;
>  		}
>  	}
>  
> -done:
>  	mutex_unlock(&iommu->lock);
>  }
>  
> @@ -930,8 +1289,28 @@ static void vfio_iommu_type1_release(void *iommu_data)
>  	struct vfio_domain *domain, *domain_tmp;
>  	struct vfio_group *group, *group_tmp;
>  
> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
> +	if (iommu->mediated_domain) {
> +		domain = iommu->mediated_domain;
> +		list_for_each_entry_safe(group, group_tmp,
> +					 &domain->group_list, next) {
> +			if (group->mdev) {
> +				group->mdev->iommu_data = NULL;
> +				mdev_put_device(group->mdev);
> +			}
> +			list_del(&group->next);
> +			kfree(group);
> +		}
> +		vfio_iommu_unpin_api_domain(domain);
> +		kfree(domain);
> +		iommu->mediated_domain = NULL;
> +	}
> +#endif

I'm not really seeing how this is all that much more maintainable than
what was proposed previously, has this aspect been worked on since last
I reviewed this patch?

>  	vfio_iommu_unmap_unpin_all(iommu);
>  
> +	if (list_empty(&iommu->domain_list))
> +		goto release_exit;
> +
>  	list_for_each_entry_safe(domain, domain_tmp,
>  				 &iommu->domain_list, next) {
>  		list_for_each_entry_safe(group, group_tmp,
> @@ -945,6 +1324,7 @@ static void vfio_iommu_type1_release(void *iommu_data)
>  		kfree(domain);
>  	}
>  
> +release_exit:
>  	kfree(iommu);
>  }
>  
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 431b824b0d3e..0a907bb33426 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -134,6 +134,12 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
>  }
>  #endif /* CONFIG_EEH */
>  
> +extern long vfio_pin_pages(void *iommu_data, dma_addr_t *vaddr, long npage,
> +			   int prot, dma_addr_t *pfn_base);
> +
> +extern long vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
> +			     int prot);
> +
>  /*
>   * IRQfd - generic
>   */

Kirti Wankhede June 28, 2016, 1:02 p.m. UTC | #2
On 6/22/2016 9:16 AM, Alex Williamson wrote:
> On Mon, 20 Jun 2016 22:01:48 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>>  
>>  struct vfio_iommu {
>>  	struct list_head	domain_list;
>> +	struct vfio_domain	*mediated_domain;
> 
> I'm not really a fan of how this is so often used to special case the
> code...
> 
>>  	struct mutex		lock;
>>  	struct rb_root		dma_list;
>>  	bool			v2;
>> @@ -67,6 +69,13 @@ struct vfio_domain {
>>  	struct list_head	group_list;
>>  	int			prot;		/* IOMMU_CACHE */
>>  	bool			fgsp;		/* Fine-grained super pages */
>> +
>> +	/* Domain for mediated device which is without physical IOMMU */
>> +	bool			mediated_device;
> 
> But sometimes we use this to special case the code and other times we
> use domain_list being empty.  I thought the argument against pulling
> code out to a shared file was that this approach could be made
> maintainable.
> 

Functions that take a struct vfio_domain *domain argument and are
intended to operate on that domain only check
(domain->mediated_device), like map_try_harder(), vfio_iommu_replay()
and vfio_test_domain_fgsp(). The checks in these functions can be
removed, but then it would be the caller's responsibility to make sure
they don't call these functions for the mediated_domain.
Whereas functions that take a struct vfio_iommu *iommu argument and
traverse domain_list to find a domain, or operate on each domain in
domain_list, check (list_empty(&iommu->domain_list)), like
vfio_unmap_unpin(), vfio_iommu_map() and vfio_dma_do_map().


>> +
>> +	struct mm_struct	*mm;
>> +	struct rb_root		pfn_list;	/* pinned Host pfn list */
>> +	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
> 
> Seems like we could reduce overhead for the existing use cases by just
> adding a pointer here and making these last 3 entries part of the
> structure that gets pointed to.  Existence of the pointer would replace
> @mediated_device.
>

Ok.

>>  };
>>  
>>  struct vfio_dma {
>> @@ -79,10 +88,26 @@ struct vfio_dma {
>>  
>>  struct vfio_group {
>>  	struct iommu_group	*iommu_group;
>> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
> 
> Where does CONFIG_MDEV_MODULE come from?
> 
> Plus, all the #ifdefs... <cringe>
> 

Config option MDEV is tristate and when selected as module
CONFIG_MDEV_MODULE is set in include/generated/autoconf.h.
Symbols mdev_bus_type, mdev_get_device_by_group() and mdev_put_device()
are only available when MDEV option is selected as built-in or modular.
If MDEV option is not selected, vfio_iommu_type1 modules should still
work for device direct assignment. If these #ifdefs are not there
vfio_iommu_type1 module fails to load with undefined symbols when MDEV
is not selected.
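
(For what it's worth, the IS_ENABLED() macro covers both the built-in
and modular cases, so the guards could also be spelled as below; just an
illustration.)

#if IS_ENABLED(CONFIG_MDEV)	/* true for CONFIG_MDEV=y or =m */
	struct mdev_device	*mdev;
#endif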

>> +	struct mdev_device	*mdev;
> 
> This gets set on attach_group where we use the iommu_group to lookup
> the mdev, so why can't we do that on the other paths that make use of
> this?  I think this is just holding a reference.
> 

mdev is retrieved from attach_group for 2 reasons:
1. To increase the ref count of the mdev, via mdev_get_device_by_group(),
when its iommu_group is attached. That reference is dropped, via
mdev_put_device(), when its iommu_group is detached. This makes sure
that the mdev is not freed until its iommu_group is detached from the
container.

2. To save a reference to iommu_data that the vendor driver can use to
call vfio_pin_pages() and vfio_unpin_pages(). More details below.



>> -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>> +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>> +			 int prot, unsigned long *pfn)
>>  {
>>  	struct page *page[1];
>>  	struct vm_area_struct *vma;
>> +	struct mm_struct *local_mm = mm;
>>  	int ret = -EFAULT;
>>  
>> -	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
>> +	if (!local_mm && !current->mm)
>> +		return -ENODEV;
>> +
>> +	if (!local_mm)
>> +		local_mm = current->mm;
> 
> The above would be much more concise if we just initialized local_mm
> as: mm ? mm : current->mm
> 
>> +
>> +	down_read(&local_mm->mmap_sem);
>> +	if (get_user_pages_remote(NULL, local_mm, vaddr, 1,
>> +				!!(prot & IOMMU_WRITE), 0, page, NULL) == 1) {
> 
> Um, the comment for get_user_pages_remote says:
> 
> "See also get_user_pages_fast, for performance critical applications."
> 
> So what penalty are we imposing on the existing behavior of type1
> here?  Previously we only needed to acquire mmap_sem if
> get_user_pages_fast() didn't work, so the existing use case seems to be
> compromised.
>

Yes.
get_user_pages_fast() pins pages from current->mm, but for mediated
device mm could be different than current->mm.

This penalty for existing behavior could be avoided by:
if (!mm && current->mm)
    get_user_pages_fast(); //take fast path
else
    get_user_pages_remote(); // take slow path
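
In full that would look roughly like this (sketch only; the VM_PFNMAP
fallback of the fast path is elided here):

static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
			 int prot, unsigned long *pfn)
{
	struct page *page[1];
	int ret = -EFAULT;

	if (!mm) {
		/* existing use case: pin from current->mm, keep the fast path */
		if (!current->mm)
			return -ENODEV;
		if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE),
					page) == 1) {
			*pfn = page_to_pfn(page[0]);
			ret = 0;
		}
		/* (VM_PFNMAP fallback as in the current code) */
		return ret;
	}

	/* mediated case: pin from the mm saved at attach time */
	down_read(&mm->mmap_sem);
	if (get_user_pages_remote(NULL, mm, vaddr, 1,
				  !!(prot & IOMMU_WRITE), 0, page, NULL) == 1) {
		*pfn = page_to_pfn(page[0]);
		ret = 0;
	}
	up_read(&mm->mmap_sem);
	return ret;
}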


>> +long vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
>> +		     int prot)
>> +{
>> +	struct vfio_iommu *iommu = iommu_data;
>> +	struct vfio_domain *domain = NULL;
>> +	long unlocked = 0;
>> +	int i;
>> +
>> +	if (!iommu || !pfn)
>> +		return -EINVAL;
>> +
>> +	if (!iommu->mediated_domain)
>> +		return -EINVAL;
>> +
>> +	domain = iommu->mediated_domain;
> 
> Again, domain is already validated here.
> 
>> +
>> +	for (i = 0; i < npage; i++) {
>> +		struct vfio_pfn *p;
>> +
>> +		/* verify if pfn exist in pfn_list */
>> +		p = vfio_find_pfn(domain, *(pfn + i));
> 
> Why are we using array indexing above and array math here?  Were these
> functions written by different people?
> 

No, input argument to vfio_unpin_pages() was always array of pfns to be
unpinned.

>> +		if (!p)
>> +			continue;
> 
> Hmm, this seems like more of a bad thing than a continue.
> 

Callers of vfio_unpin_pages() are other modules. I feel it's better to
do a sanity check than to crash.


>>  
>>  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>>  {
>> @@ -341,6 +580,10 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>>  
>>  	if (!dma->size)
>>  		return;
>> +
>> +	if (list_empty(&iommu->domain_list))
>> +		return;
> 
> Huh?  This would be a serious consistency error if this happened for
> the existing use case.
>

This will not happen for the existing use case, i.e. direct device
assignment. This case only occurs when only mediated devices are
assigned and there are no directly assigned devices.


>> @@ -569,6 +819,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>  	uint64_t mask;
>>  	struct vfio_dma *dma;
>>  	unsigned long pfn;
>> +	struct vfio_domain *domain = NULL;
>>  
>>  	/* Verify that none of our __u64 fields overflow */
>>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
>> @@ -611,10 +862,21 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>  	/* Insert zero-sized and grow as we map chunks of it */
>>  	vfio_link_dma(iommu, dma);
>>  
>> +	/*
>> +	 * Skip pin and map if and domain list is empty
>> +	 */
>> +	if (list_empty(&iommu->domain_list)) {
>> +		dma->size = size;
>> +		goto map_done;
>> +	}
> 
> Again, this would be a serious consistency error for the existing use
> case.  Let's use indicators that are explicit.
>

Why? For the existing use case (i.e. direct device assignment)
domain_list will not be empty; domain_list will only be empty when a
mediated device is assigned and no direct device is assigned.

>>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  					 struct iommu_group *iommu_group)
>>  {
>>  	struct vfio_iommu *iommu = iommu_data;
>> -	struct vfio_group *group, *g;
>> +	struct vfio_group *group;
>>  	struct vfio_domain *domain, *d;
>>  	struct bus_type *bus = NULL;
>>  	int ret;
>> @@ -746,14 +1030,21 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	mutex_lock(&iommu->lock);
>>  
>>  	list_for_each_entry(d, &iommu->domain_list, next) {
>> -		list_for_each_entry(g, &d->group_list, next) {
>> -			if (g->iommu_group != iommu_group)
>> -				continue;
>> +		if (is_iommu_group_present(d, iommu_group)) {
>> +			mutex_unlock(&iommu->lock);
>> +			return -EINVAL;
>> +		}
>> +	}
>>  
>> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
>> +	if (iommu->mediated_domain) {
>> +		if (is_iommu_group_present(iommu->mediated_domain,
>> +					   iommu_group)) {
>>  			mutex_unlock(&iommu->lock);
>>  			return -EINVAL;
>>  		}
>>  	}
>> +#endif
>>  
>>  	group = kzalloc(sizeof(*group), GFP_KERNEL);
>>  	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
>> @@ -769,6 +1060,36 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	if (ret)
>>  		goto out_free;
>>  
>> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
>> +	if (!iommu_present(bus) && (bus == &mdev_bus_type)) {
>> +		struct mdev_device *mdev = NULL;
> 
> Unnecessary initialization.
> 
>> +
>> +		mdev = mdev_get_device_by_group(iommu_group);
>> +		if (!mdev)
>> +			goto out_free;
>> +
>> +		mdev->iommu_data = iommu;
> 
> This looks rather sketchy to me, we don't have a mediated driver in
> this series, but presumably the driver blindly calls vfio_pin_pages
> passing mdev->iommu_data and hoping that it's either NULL to generate
> an error or relevant to this iommu backend.  How would we add a second
> mediated driver iommu backend?  We're currently assuming the user
> configured this backend.  

If I understand correctly, your question is whether two different
mediated devices can be assigned to the same container. In that case,
the two mediated devices will have different iommu_groups and will be
added to the mediated_domain's group_list
(iommu->mediated_domain->group_list).

> Should vfio_pin_pages instead have a struct
> device* parameter from which we would lookup the iommu_group and get to
> the vfio_domain?  That's a bit heavy weight, but we need something
> along those lines.
> 

There could be multiple mdev devices from the same mediated vendor
driver in one container. In that case, that vendor driver needs a
reference to the container or container->iommu_data to pin and unpin
pages. Similarly, there could be multiple mdev devices from different
mediated vendor drivers in one container; in that case both vendor
drivers need a reference to the container or container->iommu_data to
pin and unpin pages in their drivers.

>> +		mdev->iommu_data = iommu;
With the above line, a reference to container->iommu_data is kept in
the mdev structure when the iommu_group is attached to a container, so
that vendor drivers can find the reference needed to pin and unpin pages.

If struct device* is passed as an argument to vfio_pin_pages, then to
find the container of a struct device *dev we have to find the
vfio_device/vfio_group from dev, which means traversing vfio.group_list
for each pin and unpin call. This list would be long when there are
many mdev devices in the system.

Is there any better way to find a reference to the container from a
struct device*?


>>  
>> @@ -930,8 +1289,28 @@ static void vfio_iommu_type1_release(void *iommu_data)
>>  	struct vfio_domain *domain, *domain_tmp;
>>  	struct vfio_group *group, *group_tmp;
>>  
>> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
>> +	if (iommu->mediated_domain) {
>> +		domain = iommu->mediated_domain;
>> +		list_for_each_entry_safe(group, group_tmp,
>> +					 &domain->group_list, next) {
>> +			if (group->mdev) {
>> +				group->mdev->iommu_data = NULL;
>> +				mdev_put_device(group->mdev);
>> +			}
>> +			list_del(&group->next);
>> +			kfree(group);
>> +		}
>> +		vfio_iommu_unpin_api_domain(domain);
>> +		kfree(domain);
>> +		iommu->mediated_domain = NULL;
>> +	}
>> +#endif
> 
> I'm not really seeing how this is all that much more maintainable than
> what was proposed previously, has this aspect been worked on since last
> I reviewed this patch?
> 

There aren't many changes from the v4 to the v5 version of this patch.
Can you be more specific about your concerns regarding maintainability?
I'll definitely address them.

Thanks,
Kirti
Alex Williamson June 29, 2016, 2:46 a.m. UTC | #3
On Tue, 28 Jun 2016 18:32:44 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 6/22/2016 9:16 AM, Alex Williamson wrote:
> > On Mon, 20 Jun 2016 22:01:48 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >>  
> >>  struct vfio_iommu {
> >>  	struct list_head	domain_list;
> >> +	struct vfio_domain	*mediated_domain;  
> > 
> > I'm not really a fan of how this is so often used to special case the
> > code...
> >   
> >>  	struct mutex		lock;
> >>  	struct rb_root		dma_list;
> >>  	bool			v2;
> >> @@ -67,6 +69,13 @@ struct vfio_domain {
> >>  	struct list_head	group_list;
> >>  	int			prot;		/* IOMMU_CACHE */
> >>  	bool			fgsp;		/* Fine-grained super pages */
> >> +
> >> +	/* Domain for mediated device which is without physical IOMMU */
> >> +	bool			mediated_device;  
> > 
> > But sometimes we use this to special case the code and other times we
> > use domain_list being empty.  I thought the argument against pulling
> > code out to a shared file was that this approach could be made
> > maintainable.
> >   
> 
> Functions that take a struct vfio_domain *domain argument and are
> intended to operate on that domain only check
> (domain->mediated_device), like map_try_harder(), vfio_iommu_replay()
> and vfio_test_domain_fgsp(). The checks in these functions can be
> removed, but then it would be the caller's responsibility to make sure
> they don't call these functions for the mediated_domain.
> Whereas functions that take a struct vfio_iommu *iommu argument and
> traverse domain_list to find a domain, or operate on each domain in
> domain_list, check (list_empty(&iommu->domain_list)), like
> vfio_unmap_unpin(), vfio_iommu_map() and vfio_dma_do_map().

My point is that we have different test elements at different points in
the data structures and they all need to be kept in sync and the right
one used at the right place, which makes the code all that much more
complex versus the alternative approach of finding commonality,
extracting it into a shared file, and creating a mediated version of
the type1 iommu that doesn't try to overload dual functionality into a
single code block. 

> >> +
> >> +	struct mm_struct	*mm;
> >> +	struct rb_root		pfn_list;	/* pinned Host pfn list */
> >> +	struct mutex		pfn_list_lock;	/* mutex for pfn_list */  
> > 
> > Seems like we could reduce overhead for the existing use cases by just
> > adding a pointer here and making these last 3 entries part of the
> > structure that gets pointed to.  Existence of the pointer would replace
> > @mediated_device.
> >  
> 
> Ok.
> 
> >>  };
> >>  
> >>  struct vfio_dma {
> >> @@ -79,10 +88,26 @@ struct vfio_dma {
> >>  
> >>  struct vfio_group {
> >>  	struct iommu_group	*iommu_group;
> >> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)  
> > 
> > Where does CONFIG_MDEV_MODULE come from?
> > 
> > Plus, all the #ifdefs... <cringe>
> >   
> 
> Config option MDEV is tristate and when selected as module
> CONFIG_MDEV_MODULE is set in include/generated/autoconf.h.
> Symbols mdev_bus_type, mdev_get_device_by_group() and mdev_put_device()
> are only available when MDEV option is selected as built-in or modular.
> If MDEV option is not selected, vfio_iommu_type1 modules should still
> work for device direct assignment. If these #ifdefs are not there
> vfio_iommu_type1 module fails to load with undefined symbols when MDEV
> is not selected.

I guess I just hadn't seen the _MODULE define used before, but it does
appear to be fairly common.  Another option might be to provide stubs
or static inline abstractions in a header file so the #ifdefs can be
isolated.  It also seems like this is going to mean that type1 now
depends on and will autoload the mdev module even for physical
assignment.  That's not terribly desirable.
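
E.g. something along these lines in a header (sketch only):

/* e.g. in include/linux/mdev.h */
#if IS_ENABLED(CONFIG_MDEV)
struct mdev_device *mdev_get_device_by_group(struct iommu_group *group);
void mdev_put_device(struct mdev_device *mdev);
#else
static inline struct mdev_device *
mdev_get_device_by_group(struct iommu_group *group)
{
	return NULL;
}

static inline void mdev_put_device(struct mdev_device *mdev)
{
}
#endif

That would keep the #ifdefs out of vfio_iommu_type1.c, though it doesn't
by itself address the module dependency concern above.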

> >> +	struct mdev_device	*mdev;  
> > 
> > This gets set on attach_group where we use the iommu_group to lookup
> > the mdev, so why can't we do that on the other paths that make use of
> > this?  I think this is just holding a reference.
> >   
> 
> mdev is retrieved from attach_group for 2 reasons:
> 1. To increase the ref count of the mdev, via mdev_get_device_by_group(),
> when its iommu_group is attached. That reference is dropped, via
> mdev_put_device(), when its iommu_group is detached. This makes sure
> that the mdev is not freed until its iommu_group is detached from the
> container.
> 
> 2. To save a reference to iommu_data that the vendor driver can use to
> call vfio_pin_pages() and vfio_unpin_pages(). More details below.
> 
> 
> 
> >> -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> >> +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> >> +			 int prot, unsigned long *pfn)
> >>  {
> >>  	struct page *page[1];
> >>  	struct vm_area_struct *vma;
> >> +	struct mm_struct *local_mm = mm;
> >>  	int ret = -EFAULT;
> >>  
> >> -	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
> >> +	if (!local_mm && !current->mm)
> >> +		return -ENODEV;
> >> +
> >> +	if (!local_mm)
> >> +		local_mm = current->mm;  
> > 
> > The above would be much more concise if we just initialized local_mm
> > as: mm ? mm : current->mm
> >   
> >> +
> >> +	down_read(&local_mm->mmap_sem);
> >> +	if (get_user_pages_remote(NULL, local_mm, vaddr, 1,
> >> +				!!(prot & IOMMU_WRITE), 0, page, NULL) == 1) {  
> > 
> > Um, the comment for get_user_pages_remote says:
> > 
> > "See also get_user_pages_fast, for performance critical applications."
> > 
> > So what penalty are we imposing on the existing behavior of type1
> > here?  Previously we only needed to acquire mmap_sem if
> > get_user_pages_fast() didn't work, so the existing use case seems to be
> > compromised.
> >  
> 
> Yes.
> get_user_pages_fast() pins pages from current->mm, but for mediated
> device mm could be different than current->mm.
> 
> This penalty for existing behavior could be avoided by:
> if (!mm && current->mm)
>     get_user_pages_fast(); //take fast path
> else
>     get_user_pages_remote(); // take slow path


How to avoid it is pretty obvious; the concern is that the overhead of
the existing use case isn't being prioritized.

> >> +long vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
> >> +		     int prot)
> >> +{
> >> +	struct vfio_iommu *iommu = iommu_data;
> >> +	struct vfio_domain *domain = NULL;
> >> +	long unlocked = 0;
> >> +	int i;
> >> +
> >> +	if (!iommu || !pfn)
> >> +		return -EINVAL;
> >> +
> >> +	if (!iommu->mediated_domain)
> >> +		return -EINVAL;
> >> +
> >> +	domain = iommu->mediated_domain;  
> > 
> > Again, domain is already validated here.
> >   
> >> +
> >> +	for (i = 0; i < npage; i++) {
> >> +		struct vfio_pfn *p;
> >> +
> >> +		/* verify if pfn exist in pfn_list */
> >> +		p = vfio_find_pfn(domain, *(pfn + i));  
> > 
> > Why are we using array indexing above and array math here?  Were these
> > functions written by different people?
> >   
> 
> No, input argument to vfio_unpin_pages() was always array of pfns to be
> unpinned.

My comment is in regard to how the code added for vfio_pin_pages() uses
array indexing (ie, pfn[i]) while here we use array math (ie, *(pfn +
i)).  Don't arbitrarily mix them, be consistent.

> >> +		if (!p)
> >> +			continue;  
> > 
> > Hmm, this seems like more of a bad thing than a continue.
> >   
> 
> Callers of vfio_unpin_pages() are other modules. I feel it's better to
> do a sanity check than to crash.

Who said anything about crashing?  We have a return value for a reason,
right?

> >>  
> >>  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> >>  {
> >> @@ -341,6 +580,10 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> >>  
> >>  	if (!dma->size)
> >>  		return;
> >> +
> >> +	if (list_empty(&iommu->domain_list))
> >> +		return;  
> > 
> > Huh?  This would be a serious consistency error if this happened for
> > the existing use case.
> >  
> 
> This will not happen for the existing use case, i.e. direct device
> assignment. This case only occurs when only mediated devices are
> assigned and there are no directly assigned devices.

Which is sort of my point, you're using an arbitrary property to
identify a mediated device vfio_iommu vs a directed assigned one.  This
fits in with my complaint of how many different special cases are being
thrown into the code which increases the overall code complexity.

> >> @@ -569,6 +819,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>  	uint64_t mask;
> >>  	struct vfio_dma *dma;
> >>  	unsigned long pfn;
> >> +	struct vfio_domain *domain = NULL;
> >>  
> >>  	/* Verify that none of our __u64 fields overflow */
> >>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> >> @@ -611,10 +862,21 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>  	/* Insert zero-sized and grow as we map chunks of it */
> >>  	vfio_link_dma(iommu, dma);
> >>  
> >> +	/*
> >> +	 * Skip pin and map if and domain list is empty
> >> +	 */
> >> +	if (list_empty(&iommu->domain_list)) {
> >> +		dma->size = size;
> >> +		goto map_done;
> >> +	}  
> > 
> > Again, this would be a serious consistency error for the existing use
> > case.  Let's use indicators that are explicit.
> >  
> 
> Why? For the existing use case (i.e. direct device assignment)
> domain_list will not be empty; domain_list will only be empty when a
> mediated device is assigned and no direct device is assigned.

I'm trying to be cautious whether it actually makes sense to
dual-purpose the existing type1 iommu backend.  What's the benefit of
putting an exit path in the middle of a function versus splitting it
into two separate functions with two separate callers, one of which only
calls the first function?  What's the benefit of a struct vfio_iommu
that hosts both mediated and directly assigned devices?  Is the
benefit to the functionality or to the code base?  Should the fact that
they use the same iommu API dictate that they're also managed in the
same data structures?  When we have too many special case exits or
branches, the code complexity increases, bugs are harder to flush out,
and possible exploits are more likely.  Convince me that this is the
right approach.
 
> >>  static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>  					 struct iommu_group *iommu_group)
> >>  {
> >>  	struct vfio_iommu *iommu = iommu_data;
> >> -	struct vfio_group *group, *g;
> >> +	struct vfio_group *group;
> >>  	struct vfio_domain *domain, *d;
> >>  	struct bus_type *bus = NULL;
> >>  	int ret;
> >> @@ -746,14 +1030,21 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>  	mutex_lock(&iommu->lock);
> >>  
> >>  	list_for_each_entry(d, &iommu->domain_list, next) {
> >> -		list_for_each_entry(g, &d->group_list, next) {
> >> -			if (g->iommu_group != iommu_group)
> >> -				continue;
> >> +		if (is_iommu_group_present(d, iommu_group)) {
> >> +			mutex_unlock(&iommu->lock);
> >> +			return -EINVAL;
> >> +		}
> >> +	}
> >>  
> >> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
> >> +	if (iommu->mediated_domain) {
> >> +		if (is_iommu_group_present(iommu->mediated_domain,
> >> +					   iommu_group)) {
> >>  			mutex_unlock(&iommu->lock);
> >>  			return -EINVAL;
> >>  		}
> >>  	}
> >> +#endif
> >>  
> >>  	group = kzalloc(sizeof(*group), GFP_KERNEL);
> >>  	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
> >> @@ -769,6 +1060,36 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>  	if (ret)
> >>  		goto out_free;
> >>  
> >> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
> >> +	if (!iommu_present(bus) && (bus == &mdev_bus_type)) {
> >> +		struct mdev_device *mdev = NULL;  
> > 
> > Unnecessary initialization.
> >   
> >> +
> >> +		mdev = mdev_get_device_by_group(iommu_group);
> >> +		if (!mdev)
> >> +			goto out_free;
> >> +
> >> +		mdev->iommu_data = iommu;  
> > 
> > This looks rather sketchy to me, we don't have a mediated driver in
> > this series, but presumably the driver blindly calls vfio_pin_pages
> > passing mdev->iommu_data and hoping that it's either NULL to generate
> > an error or relevant to this iommu backend.  How would we add a second
> > mediated driver iommu backend?  We're currently assuming the user
> > configured this backend.    
> 
> If I understand correctly, your question is whether two different
> mediated devices can be assigned to the same container. In that case,
> the two mediated devices will have different iommu_groups and will be
> added to the mediated_domain's group_list
> (iommu->mediated_domain->group_list).

No, my concern is that mdev->iommu_data is opaque data specific to the
type1 extensions here, but vfio_pin_pages() is effectively a completely
separate API.  Mediated devices end up with sort of a side channel
into this one iommu, which breaks the modularity of vfio iommus.  So
let's say we create a type2 interface that also supports mediated
devices, do the mediated drivers still call vfio_pin_pages()?  Do we
need to export new interfaces for every iommu backend to support this?
The interface should probably be through the vfio container, IOW
extensions to the vfio_iommu_driver_ops or maybe direct use of the
ioctl callback within that interface such that the pinning is actually
paired with the correct driver and extensible.
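
Roughly what I have in mind (sketch only, the exact signatures are not a
proposal):

struct vfio_iommu_driver_ops {
	/* existing callbacks (open, release, ioctl, attach_group,
	 * detach_group, ...) plus, for example:
	 */
	long	(*pin_pages)(void *iommu_data, unsigned long *user_pfn,
			     long npage, int prot, unsigned long *phys_pfn);
	long	(*unpin_pages)(void *iommu_data, unsigned long *pfn,
			       long npage);
};

vfio.c would then export a wrapper that resolves the caller's device to
its group and container and dispatches to that container's iommu driver,
so a mediated vendor driver never holds iommu_data directly.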
 
> > Should vfio_pin_pages instead have a struct
> > device* parameter from which we would lookup the iommu_group and get to
> > the vfio_domain?  That's a bit heavy weight, but we need something
> > along those lines.
> >   
> 
> There could be multiple mdev devices from the same mediated vendor
> driver in one container. In that case, that vendor driver needs a
> reference to the container or container->iommu_data to pin and unpin
> pages. Similarly, there could be multiple mdev devices from different
> mediated vendor drivers in one container; in that case both vendor
> drivers need a reference to the container or container->iommu_data to
> pin and unpin pages in their drivers.

As above, I think something like this is necessary, the proposed
interface here is a bit of a side-channel hack.

> >> +		mdev->iommu_data = iommu;  
> With the above line, a reference to container->iommu_data is kept in
> the mdev structure when the iommu_group is attached to a container, so
> that vendor drivers can find the reference needed to pin and unpin pages.
> 
> If struct device* is passed as an argument to vfio_pin_pages, then to
> find the container of a struct device *dev we have to find the
> vfio_device/vfio_group from dev, which means traversing vfio.group_list
> for each pin and unpin call. This list would be long when there are
> many mdev devices in the system.
> 
> Is there any better way to find a reference to the container from a
> struct device*?

struct vfio_device *device = dev_get_drvdata(mdev->dev);

device->group->container

Note of course that vfio_device and vfio_group are private to vfio.c.
We do already have a vfio_device_get_from_dev() function though, so the
vendor could call into a vfio.c function with a reference to the
vfio_device.
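
i.e. roughly something like this inside vfio.c (sketch only;
vfio_group_get_container() is a made-up name for whatever internal
lookup we settle on, and pin_pages assumes the driver-ops extension
suggested above):

long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
		    long npage, int prot, unsigned long *phys_pfn)
{
	struct vfio_device *device = vfio_device_get_from_dev(dev);
	struct vfio_container *container;
	long ret;

	if (!device)
		return -ENODEV;

	/* vfio.c internal: device->group->container */
	container = vfio_group_get_container(device->group);
	ret = container->iommu_driver->ops->pin_pages(container->iommu_data,
						      user_pfn, npage, prot,
						      phys_pfn);
	vfio_device_put(device);
	return ret;
}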

> >>  
> >> @@ -930,8 +1289,28 @@ static void vfio_iommu_type1_release(void *iommu_data)
> >>  	struct vfio_domain *domain, *domain_tmp;
> >>  	struct vfio_group *group, *group_tmp;
> >>  
> >> +#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
> >> +	if (iommu->mediated_domain) {
> >> +		domain = iommu->mediated_domain;
> >> +		list_for_each_entry_safe(group, group_tmp,
> >> +					 &domain->group_list, next) {
> >> +			if (group->mdev) {
> >> +				group->mdev->iommu_data = NULL;
> >> +				mdev_put_device(group->mdev);
> >> +			}
> >> +			list_del(&group->next);
> >> +			kfree(group);
> >> +		}
> >> +		vfio_iommu_unpin_api_domain(domain);
> >> +		kfree(domain);
> >> +		iommu->mediated_domain = NULL;
> >> +	}
> >> +#endif  
> > 
> > I'm not really seeing how this is all that much more maintainable than
> > what was proposed previously, has this aspect been worked on since last
> > I reviewed this patch?
> >   
> 
> There aren't many changes from the v4 to the v5 version of this patch.
> Can you be more specific about your concerns regarding maintainability?
> I'll definitely address them.

In reply to comments on v3 of the series you said you'd prefer to work
on making the bimodal approach more maintainable rather than splitting
out common code and creating a separate module for type1-direct vs
type1-mediated.  To make the code more maintainable, I think we need to
see fewer special cases, clean data paths for each time with some
attention paid to how we detect one type vs another.  The killer
feature that would that would really justify the complexity of this
approach would be if locked page accounting avoid duplicate counts
between interfaces.  As it is, vfio_pin_pages() calls
vfio_pin_pages_internal() which adds to the user's locked_vm regardless
of whether those pages are already locked by a direct assigned device,
or even a previous call to vfio_pin_pages() for the same range, perhaps
by a different mediated device.  So we end up with userspace needing to
set the locked memory limit once for any number of direct assigned
devices, but then bump it up for each mediated device, by a value which
may depend on the type of mediated device.  That's clearly a rat's nest
for userspace to guess what it needs to do and pretty strongly
justifies such tight integration.  Thanks,

Alex
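
For context, a minimal sketch of the kind of de-duplicated accounting asked
for above, assuming the container kept a single pinned-pfn tree shared by
all interfaces; vfio_find_pfn_container() and vfio_add_pfn_container() are
hypothetical helpers, only vfio_lock_acct() exists in the type1 backend
today:

static int vfio_account_pin(struct vfio_iommu *iommu, unsigned long pfn)
{
	struct vfio_pfn *vpfn = vfio_find_pfn_container(iommu, pfn);

	if (vpfn) {
		/* already charged to this container, just take a reference */
		atomic_inc(&vpfn->ref_count);
		return 0;
	}

	vpfn = vfio_add_pfn_container(iommu, pfn);
	if (!vpfn)
		return -ENOMEM;

	/* first pin of this pfn in the container: charge locked_vm once */
	vfio_lock_acct(1);
	return 0;
}
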
Tian, Kevin June 30, 2016, 8:28 a.m. UTC | #4
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Wednesday, June 29, 2016 10:46 AM
> 
> On Tue, 28 Jun 2016 18:32:44 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > On 6/22/2016 9:16 AM, Alex Williamson wrote:
> > > On Mon, 20 Jun 2016 22:01:48 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >
> > >>
> > >>  struct vfio_iommu {
> > >>  	struct list_head	domain_list;
> > >> +	struct vfio_domain	*mediated_domain;
> > >
> > > I'm not really a fan of how this is so often used to special case the
> > > code...

Better to remove this field and treat the mediated domain the same as other
domains (within vfio_domain you already have additional fields to mark the
mediated case).

> 
> 
> > >> +	struct mdev_device	*mdev;
> > >
> > > This gets set on attach_group where we use the iommu_group to lookup
> > > the mdev, so why can't we do that on the other paths that make use of
> > > this?  I think this is just holding a reference.
> > >
> >
> > mdev is retrieved from attach_group for 2 reasons:
> > 1. to increase the ref count of the mdev, via mdev_get_device_by_group(),
> > when its iommu_group is attached. That should be decremented, by
> > mdev_put_device(), while detaching its iommu_group. This makes sure
> > that the mdev is not freed until its iommu_group is detached from
> > the container.
> >
> > 2. to save a reference to iommu_data that the vendor driver would use
> > to call vfio_pin_pages() and vfio_unpin_pages(). More details below.

Can't we retrieve mdev_device from iommu_group once the mdev is attached
to an iommu_group?
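
A rough sketch of such an on-demand lookup, using the existing
iommu_group_for_each_dev() API; to_mdev_device() is assumed to come from the
proposed mdev core, and taking a reference on the mdev is left out:

static int __find_mdev_device(struct device *dev, void *data)
{
	struct mdev_device **mdev = data;

	*mdev = to_mdev_device(dev);
	return 1;	/* stop after the first (only) device in the group */
}

static struct mdev_device *vfio_group_to_mdev(struct iommu_group *group)
{
	struct mdev_device *mdev = NULL;

	iommu_group_for_each_dev(group, &mdev, __find_mdev_device);
	return mdev;
}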

> > >> @@ -569,6 +819,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> > >>  	uint64_t mask;
> > >>  	struct vfio_dma *dma;
> > >>  	unsigned long pfn;
> > >> +	struct vfio_domain *domain = NULL;
> > >>
> > >>  	/* Verify that none of our __u64 fields overflow */
> > >>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> > >> @@ -611,10 +862,21 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> > >>  	/* Insert zero-sized and grow as we map chunks of it */
> > >>  	vfio_link_dma(iommu, dma);
> > >>
> > >> +	/*
> > >> +	 * Skip pin and map if the domain list is empty
> > >> +	 */
> > >> +	if (list_empty(&iommu->domain_list)) {
> > >> +		dma->size = size;
> > >> +		goto map_done;
> > >> +	}
> > >
> > > Again, this would be a serious consistency error for the existing use
> > > case.  Let's use indicators that are explicit.
> > >
> >
> > Why? For the existing use case (i.e. direct device assignment) domain_list
> > will not be empty; domain_list will only be empty when there is a mediated
> > device assigned and no direct device assigned.
> 
> I'm trying to be cautious about whether it actually makes sense to
> dual-purpose the existing type1 iommu backend.  What's the benefit of
> putting an exit path in the middle of a function versus splitting it into
> two separate functions with two separate callers, one of which only
> calls the first function?  What's the benefit of a struct vfio_iommu
> that hosts both mediated and directly assigned devices?  Is the
> benefit to the functionality or to the code base?  Should the fact that
> they use the same iommu API dictate that they're also managed in the
> same data structures?  When we have too many special case exits or
> branches, the code complexity increases, bugs are harder to flush out,
> and possible exploits are more likely.  Convince me that this is the
> right approach.

If we have the mediated domain as a normal vfio_domain added to the
domain_list of vfio_iommu, then no such special case would be needed.
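
As a sketch of that suggestion, with the mediated domain kept on domain_list
and only flagged by domain->mediated_device (the flag this patch already
adds), the map path would simply skip it; this is an illustration, not the
patch's code:

static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	struct vfio_domain *d;
	int ret;

	list_for_each_entry(d, &iommu->domain_list, next) {
		if (d->mediated_device)	/* no hardware IOMMU behind it */
			continue;

		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
				npage << PAGE_SHIFT, prot | d->prot);
		if (ret)
			goto unwind;

		cond_resched();
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
		if (!d->mediated_device)
			iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);

	return ret;
}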

> 
> > > Should vfio_pin_pages instead have a struct
> > > device* parameter from which we would lookup the iommu_group and get to
> > > the vfio_domain?  That's a bit heavy weight, but we need something
> > > along those lines.
> > >
> >
> > There could be multiple mdev devices from the same mediated vendor
> > driver in one container. In that case, that vendor driver needs a
> > reference to the container or container->iommu_data to pin and unpin
> > pages. Similarly, there could be multiple mdev devices from different
> > mediated vendor drivers in one container; in that case both vendor
> > drivers need a reference to the container or container->iommu_data to
> > pin and unpin pages in their driver.
> 
> As above, I think something like this is necessary; the proposed
> interface here is a bit of a side-channel hack.

Page-pinning should be counted per container, regardless of whether the
pinning request is for assigned devices or mediated devices. That way
double counting would be avoided, as Alex pointed out.

Thanks
Kevin

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 75b24e93cedb..f17dd104fe27 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -36,6 +36,7 @@ 
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
 #include <linux/workqueue.h>
+#include <linux/mdev.h>
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
@@ -55,6 +56,7 @@  MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
 	struct list_head	domain_list;
+	struct vfio_domain	*mediated_domain;
 	struct mutex		lock;
 	struct rb_root		dma_list;
 	bool			v2;
@@ -67,6 +69,13 @@  struct vfio_domain {
 	struct list_head	group_list;
 	int			prot;		/* IOMMU_CACHE */
 	bool			fgsp;		/* Fine-grained super pages */
+
+	/* Domain for mediated device which is without physical IOMMU */
+	bool			mediated_device;
+
+	struct mm_struct	*mm;
+	struct rb_root		pfn_list;	/* pinned Host pfn list */
+	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
 };
 
 struct vfio_dma {
@@ -79,10 +88,26 @@  struct vfio_dma {
 
 struct vfio_group {
 	struct iommu_group	*iommu_group;
+#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
+	struct mdev_device	*mdev;
+#endif
 	struct list_head	next;
 };
 
 /*
+ * Guest RAM pinning working set or DMA target
+ */
+struct vfio_pfn {
+	struct rb_node		node;
+	unsigned long		vaddr;		/* virtual addr */
+	dma_addr_t		iova;		/* IOVA */
+	unsigned long		npage;		/* number of pages */
+	unsigned long		pfn;		/* Host pfn */
+	size_t			prot;
+	atomic_t		ref_count;
+};
+
+/*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
  */
@@ -130,6 +155,64 @@  static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 	rb_erase(&old->node, &iommu->dma_list);
 }
 
+/*
+ * Helper Functions for host pfn list
+ */
+
+static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
+				      unsigned long pfn)
+{
+	struct rb_node *node;
+	struct vfio_pfn *vpfn, *ret = NULL;
+
+	mutex_lock(&domain->pfn_list_lock);
+	node = domain->pfn_list.rb_node;
+
+	while (node) {
+		vpfn = rb_entry(node, struct vfio_pfn, node);
+
+		if (pfn < vpfn->pfn)
+			node = node->rb_left;
+		else if (pfn > vpfn->pfn)
+			node = node->rb_right;
+		else {
+			ret = vpfn;
+			break;
+		}
+	}
+
+	mutex_unlock(&domain->pfn_list_lock);
+	return ret;
+}
+
+static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
+{
+	struct rb_node **link, *parent = NULL;
+	struct vfio_pfn *vpfn;
+
+	mutex_lock(&domain->pfn_list_lock);
+	link = &domain->pfn_list.rb_node;
+	while (*link) {
+		parent = *link;
+		vpfn = rb_entry(parent, struct vfio_pfn, node);
+
+		if (new->pfn < vpfn->pfn)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &domain->pfn_list);
+	mutex_unlock(&domain->pfn_list_lock);
+}
+
+/* call by holding domain->pfn_list_lock */
+static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
+{
+	rb_erase(&old->node, &domain->pfn_list);
+}
+
 struct vwork {
 	struct mm_struct	*mm;
 	long			npage;
@@ -228,20 +311,29 @@  static int put_pfn(unsigned long pfn, int prot)
 	return 0;
 }
 
-static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
+static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
+			 int prot, unsigned long *pfn)
 {
 	struct page *page[1];
 	struct vm_area_struct *vma;
+	struct mm_struct *local_mm = mm;
 	int ret = -EFAULT;
 
-	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
+	if (!local_mm && !current->mm)
+		return -ENODEV;
+
+	if (!local_mm)
+		local_mm = current->mm;
+
+	down_read(&local_mm->mmap_sem);
+	if (get_user_pages_remote(NULL, local_mm, vaddr, 1,
+				!!(prot & IOMMU_WRITE), 0, page, NULL) == 1) {
 		*pfn = page_to_pfn(page[0]);
-		return 0;
+		ret = 0;
+		goto done_pfn;
 	}
 
-	down_read(&current->mm->mmap_sem);
-
-	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
 
 	if (vma && vma->vm_flags & VM_PFNMAP) {
 		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -249,7 +341,8 @@  static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 			ret = 0;
 	}
 
-	up_read(&current->mm->mmap_sem);
+done_pfn:
+	up_read(&local_mm->mmap_sem);
 
 	return ret;
 }
@@ -259,18 +352,19 @@  static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
  * the iommu can only map chunks of consecutive pfns anyway, so get the
  * first page and all consecutive pages with the same locking.
  */
-static long vfio_pin_pages(unsigned long vaddr, long npage,
-			   int prot, unsigned long *pfn_base)
+static long vfio_pin_pages_internal(struct vfio_domain *domain,
+				    unsigned long vaddr, long npage,
+				    int prot, unsigned long *pfn_base)
 {
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	bool lock_cap = capable(CAP_IPC_LOCK);
 	long ret, i;
 	bool rsvd;
 
-	if (!current->mm)
+	if (!domain)
 		return -ENODEV;
 
-	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
+	ret = vaddr_get_pfn(domain->mm, vaddr, prot, pfn_base);
 	if (ret)
 		return ret;
 
@@ -293,7 +387,7 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 		unsigned long pfn = 0;
 
-		ret = vaddr_get_pfn(vaddr, prot, &pfn);
+		ret = vaddr_get_pfn(domain->mm, vaddr, prot, &pfn);
 		if (ret)
 			break;
 
@@ -318,20 +412,165 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 	return i;
 }
 
-static long vfio_unpin_pages(unsigned long pfn, long npage,
-			     int prot, bool do_accounting)
+static long vfio_unpin_pages_internal(struct vfio_domain *domain,
+				      unsigned long pfn, long npage, int prot,
+				      bool do_accounting)
 {
 	unsigned long unlocked = 0;
 	long i;
 
+	if (!domain)
+		return -ENODEV;
+
 	for (i = 0; i < npage; i++)
 		unlocked += put_pfn(pfn++, prot);
 
 	if (do_accounting)
 		vfio_lock_acct(-unlocked);
+	return unlocked;
+}
+
+/*
+ * Pin a set of guest PFNs and return their associated host PFNs for API
+ * supported domain only.
+ * @vaddr [in]: array of guest PFNs
+ * @npage [in]: count of array elements
+ * @prot [in] : protection flags
+ * @pfn_base[out] : array of host PFNs
+ */
+long vfio_pin_pages(void *iommu_data, dma_addr_t *vaddr, long npage,
+		   int prot, dma_addr_t *pfn_base)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain = NULL;
+	int i = 0, ret = 0;
+	long retpage;
+	unsigned long remote_vaddr = 0;
+	dma_addr_t *pfn = pfn_base;
+	struct vfio_dma *dma;
+
+	if (!iommu || !vaddr || !pfn_base)
+		return -EINVAL;
+
+	mutex_lock(&iommu->lock);
+
+	if (!iommu->mediated_domain) {
+		ret = -EINVAL;
+		goto pin_done;
+	}
+
+	domain = iommu->mediated_domain;
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p, *lpfn;
+		unsigned long tpfn;
+		dma_addr_t iova;
+		long pg_cnt = 1;
+
+		iova = vaddr[i] << PAGE_SHIFT;
+
+		dma = vfio_find_dma(iommu, iova, 0 /*  size */);
+		if (!dma) {
+			ret = -EINVAL;
+			goto pin_done;
+		}
+
+		remote_vaddr = dma->vaddr + iova - dma->iova;
+
+		retpage = vfio_pin_pages_internal(domain, remote_vaddr,
+						  pg_cnt, prot, &tpfn);
+		if (retpage <= 0) {
+			WARN_ON(!retpage);
+			ret = (int)retpage;
+			goto pin_done;
+		}
+
+		pfn[i] = tpfn;
+
+		/* search if pfn exist */
+		p = vfio_find_pfn(domain, tpfn);
+		if (p) {
+			atomic_inc(&p->ref_count);
+			continue;
+		}
+
+		/* add to pfn_list */
+		lpfn = kzalloc(sizeof(*lpfn), GFP_KERNEL);
+		if (!lpfn) {
+			ret = -ENOMEM;
+			goto pin_done;
+		}
+		lpfn->vaddr = remote_vaddr;
+		lpfn->iova = iova;
+		lpfn->pfn = pfn[i];
+		lpfn->npage = 1;
+		lpfn->prot = prot;
+		atomic_inc(&lpfn->ref_count);
+		vfio_link_pfn(domain, lpfn);
+	}
+
+	ret = i;
+
+pin_done:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+EXPORT_SYMBOL(vfio_pin_pages);
+
+static int vfio_unpin_pfn(struct vfio_domain *domain,
+			  struct vfio_pfn *vpfn, bool do_accounting)
+{
+	int ret;
+
+	ret = vfio_unpin_pages_internal(domain, vpfn->pfn, vpfn->npage,
+					vpfn->prot, do_accounting);
+
+	if (ret > 0 && atomic_dec_and_test(&vpfn->ref_count)) {
+		vfio_unlink_pfn(domain, vpfn);
+		kfree(vpfn);
+	}
+
+	return ret;
+}
+
+/*
+ * Unpin set of host PFNs for API supported domain only.
+ * @pfn	[in] : array of host PFNs to be unpinned.
+ * @npage [in] :count of elements in array, that is number of pages.
+ * @prot [in] : protection flags
+ */
+long vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
+		     int prot)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain = NULL;
+	long unlocked = 0;
+	int i;
+
+	if (!iommu || !pfn)
+		return -EINVAL;
+
+	if (!iommu->mediated_domain)
+		return -EINVAL;
+
+	domain = iommu->mediated_domain;
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p;
+
+		/* verify if pfn exist in pfn_list */
+		p = vfio_find_pfn(domain, *(pfn + i));
+		if (!p)
+			continue;
+
+		mutex_lock(&domain->pfn_list_lock);
+		unlocked += vfio_unpin_pfn(domain, p, true);
+		mutex_unlock(&domain->pfn_list_lock);
+	}
 
 	return unlocked;
 }
+EXPORT_SYMBOL(vfio_unpin_pages);
 
 static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
@@ -341,6 +580,10 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 
 	if (!dma->size)
 		return;
+
+	if (list_empty(&iommu->domain_list))
+		return;
+
 	/*
 	 * We use the IOMMU to track the physical addresses, otherwise we'd
 	 * need a much more complicated tracking system.  Unfortunately that
@@ -382,9 +625,10 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		if (WARN_ON(!unmapped))
 			break;
 
-		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
-					     unmapped >> PAGE_SHIFT,
-					     dma->prot, false);
+		unlocked += vfio_unpin_pages_internal(domain,
+						phys >> PAGE_SHIFT,
+						unmapped >> PAGE_SHIFT,
+						dma->prot, false);
 		iova += unmapped;
 
 		cond_resched();
@@ -517,6 +761,9 @@  static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
 	long i;
 	int ret;
 
+	if (domain->mediated_device)
+		return -EINVAL;
+
 	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
 		ret = iommu_map(domain->domain, iova,
 				(phys_addr_t)pfn << PAGE_SHIFT,
@@ -537,6 +784,9 @@  static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
 	struct vfio_domain *d;
 	int ret;
 
+	if (list_empty(&iommu->domain_list))
+		return 0;
+
 	list_for_each_entry(d, &iommu->domain_list, next) {
 		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
 				npage << PAGE_SHIFT, prot | d->prot);
@@ -569,6 +819,7 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	uint64_t mask;
 	struct vfio_dma *dma;
 	unsigned long pfn;
+	struct vfio_domain *domain = NULL;
 
 	/* Verify that none of our __u64 fields overflow */
 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
@@ -611,10 +862,21 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	/* Insert zero-sized and grow as we map chunks of it */
 	vfio_link_dma(iommu, dma);
 
+	/*
+	 * Skip pin and map if the domain list is empty
+	 */
+	if (list_empty(&iommu->domain_list)) {
+		dma->size = size;
+		goto map_done;
+	}
+
+	domain = list_first_entry(&iommu->domain_list,
+				  struct vfio_domain, next);
+
 	while (size) {
 		/* Pin a contiguous chunk of memory */
-		npage = vfio_pin_pages(vaddr + dma->size,
-				       size >> PAGE_SHIFT, prot, &pfn);
+		npage = vfio_pin_pages_internal(domain, vaddr + dma->size,
+						size >> PAGE_SHIFT, prot, &pfn);
 		if (npage <= 0) {
 			WARN_ON(!npage);
 			ret = (int)npage;
@@ -624,7 +886,8 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 		/* Map it! */
 		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
 		if (ret) {
-			vfio_unpin_pages(pfn, npage, prot, true);
+			vfio_unpin_pages_internal(domain, pfn, npage,
+						  prot, true);
 			break;
 		}
 
@@ -635,6 +898,7 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	if (ret)
 		vfio_remove_dma(iommu, dma);
 
+map_done:
 	mutex_unlock(&iommu->lock);
 	return ret;
 }
@@ -658,6 +922,9 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	struct rb_node *n;
 	int ret;
 
+	if (domain->mediated_device)
+		return 0;
+
 	/* Arbitrarily pick the first domain in the list for lookups */
 	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
 	n = rb_first(&iommu->dma_list);
@@ -716,6 +983,9 @@  static void vfio_test_domain_fgsp(struct vfio_domain *domain)
 	struct page *pages;
 	int ret, order = get_order(PAGE_SIZE * 2);
 
+	if (domain->mediated_device)
+		return;
+
 	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
 	if (!pages)
 		return;
@@ -734,11 +1004,25 @@  static void vfio_test_domain_fgsp(struct vfio_domain *domain)
 	__free_pages(pages, order);
 }
 
+static struct vfio_group *is_iommu_group_present(struct vfio_domain *domain,
+				   struct iommu_group *iommu_group)
+{
+	struct vfio_group *g;
+
+	list_for_each_entry(g, &domain->group_list, next) {
+		if (g->iommu_group != iommu_group)
+			continue;
+		return g;
+	}
+
+	return NULL;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
 	struct vfio_iommu *iommu = iommu_data;
-	struct vfio_group *group, *g;
+	struct vfio_group *group;
 	struct vfio_domain *domain, *d;
 	struct bus_type *bus = NULL;
 	int ret;
@@ -746,14 +1030,21 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	mutex_lock(&iommu->lock);
 
 	list_for_each_entry(d, &iommu->domain_list, next) {
-		list_for_each_entry(g, &d->group_list, next) {
-			if (g->iommu_group != iommu_group)
-				continue;
+		if (is_iommu_group_present(d, iommu_group)) {
+			mutex_unlock(&iommu->lock);
+			return -EINVAL;
+		}
+	}
 
+#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
+	if (iommu->mediated_domain) {
+		if (is_iommu_group_present(iommu->mediated_domain,
+					   iommu_group)) {
 			mutex_unlock(&iommu->lock);
 			return -EINVAL;
 		}
 	}
+#endif
 
 	group = kzalloc(sizeof(*group), GFP_KERNEL);
 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
@@ -769,6 +1060,36 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_free;
 
+#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
+	if (!iommu_present(bus) && (bus == &mdev_bus_type)) {
+		struct mdev_device *mdev = NULL;
+
+		mdev = mdev_get_device_by_group(iommu_group);
+		if (!mdev)
+			goto out_free;
+
+		mdev->iommu_data = iommu;
+		group->mdev = mdev;
+
+		if (iommu->mediated_domain) {
+			list_add(&group->next,
+				 &iommu->mediated_domain->group_list);
+			kfree(domain);
+			mutex_unlock(&iommu->lock);
+			return 0;
+		}
+		domain->mediated_device = true;
+		domain->mm = current->mm;
+		INIT_LIST_HEAD(&domain->group_list);
+		list_add(&group->next, &domain->group_list);
+		domain->pfn_list = RB_ROOT;
+		mutex_init(&domain->pfn_list_lock);
+		iommu->mediated_domain = domain;
+		mutex_unlock(&iommu->lock);
+		return 0;
+	}
+#endif
+
 	domain->domain = iommu_domain_alloc(bus);
 	if (!domain->domain) {
 		ret = -EIO;
@@ -859,6 +1180,20 @@  static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
 }
 
+#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
+static void vfio_iommu_unpin_api_domain(struct vfio_domain *domain)
+{
+	struct rb_node *node;
+
+	mutex_lock(&domain->pfn_list_lock);
+	while ((node = rb_first(&domain->pfn_list))) {
+		vfio_unpin_pfn(domain,
+				rb_entry(node, struct vfio_pfn, node), false);
+	}
+	mutex_unlock(&domain->pfn_list_lock);
+}
+#endif
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
@@ -868,31 +1203,55 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 
 	mutex_lock(&iommu->lock);
 
-	list_for_each_entry(domain, &iommu->domain_list, next) {
-		list_for_each_entry(group, &domain->group_list, next) {
-			if (group->iommu_group != iommu_group)
-				continue;
+#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
+	if (iommu->mediated_domain) {
+		domain = iommu->mediated_domain;
+		group = is_iommu_group_present(domain, iommu_group);
+		if (group) {
+			if (group->mdev) {
+				group->mdev->iommu_data = NULL;
+				mdev_put_device(group->mdev);
+			}
+			list_del(&group->next);
+			kfree(group);
+
+			if (list_empty(&domain->group_list)) {
+				vfio_iommu_unpin_api_domain(domain);
+
+				if (list_empty(&iommu->domain_list))
+					vfio_iommu_unmap_unpin_all(iommu);
+
+				kfree(domain);
+				iommu->mediated_domain = NULL;
+			}
+		}
+	}
+#endif
 
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		group = is_iommu_group_present(domain, iommu_group);
+		if (group) {
 			iommu_detach_group(domain->domain, iommu_group);
 			list_del(&group->next);
 			kfree(group);
 			/*
 			 * Group ownership provides privilege, if the group
 			 * list is empty, the domain goes away.  If it's the
-			 * last domain, then all the mappings go away too.
+			 * last domain with iommu and API-only domain doesn't
+			 * exist, then all the mappings go away too.
 			 */
 			if (list_empty(&domain->group_list)) {
-				if (list_is_singular(&iommu->domain_list))
+				if (list_is_singular(&iommu->domain_list) &&
+				    !iommu->mediated_domain)
 					vfio_iommu_unmap_unpin_all(iommu);
 				iommu_domain_free(domain->domain);
 				list_del(&domain->next);
 				kfree(domain);
 			}
-			goto done;
+			break;
 		}
 	}
 
-done:
 	mutex_unlock(&iommu->lock);
 }
 
@@ -930,8 +1289,28 @@  static void vfio_iommu_type1_release(void *iommu_data)
 	struct vfio_domain *domain, *domain_tmp;
 	struct vfio_group *group, *group_tmp;
 
+#if defined(CONFIG_MDEV) || defined(CONFIG_MDEV_MODULE)
+	if (iommu->mediated_domain) {
+		domain = iommu->mediated_domain;
+		list_for_each_entry_safe(group, group_tmp,
+					 &domain->group_list, next) {
+			if (group->mdev) {
+				group->mdev->iommu_data = NULL;
+				mdev_put_device(group->mdev);
+			}
+			list_del(&group->next);
+			kfree(group);
+		}
+		vfio_iommu_unpin_api_domain(domain);
+		kfree(domain);
+		iommu->mediated_domain = NULL;
+	}
+#endif
 	vfio_iommu_unmap_unpin_all(iommu);
 
+	if (list_empty(&iommu->domain_list))
+		goto release_exit;
+
 	list_for_each_entry_safe(domain, domain_tmp,
 				 &iommu->domain_list, next) {
 		list_for_each_entry_safe(group, group_tmp,
@@ -945,6 +1324,7 @@  static void vfio_iommu_type1_release(void *iommu_data)
 		kfree(domain);
 	}
 
+release_exit:
 	kfree(iommu);
 }
 
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 431b824b0d3e..0a907bb33426 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -134,6 +134,12 @@  static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
 }
 #endif /* CONFIG_EEH */
 
+extern long vfio_pin_pages(void *iommu_data, dma_addr_t *vaddr, long npage,
+			   int prot, dma_addr_t *pfn_base);
+
+extern long vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
+			     int prot);
+
 /*
  * IRQfd - generic
  */
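
For reference, a rough sketch of how a vendor driver might consume the
interface declared above, assuming mdev->iommu_data was populated on
attach_group as in this patch; vendor_dma_map() and its arrays are purely
illustrative, and per the comment in the patch the vaddr argument actually
carries guest PFNs:

static int vendor_dma_map(struct mdev_device *mdev, dma_addr_t *gfn_array,
			  long npage, dma_addr_t *hpfn_array)
{
	long pinned;

	pinned = vfio_pin_pages(mdev->iommu_data, gfn_array, npage,
				IOMMU_READ | IOMMU_WRITE, hpfn_array);
	if (pinned < 0)
		return (int)pinned;

	if (pinned != npage) {
		/* roll back a partial pin */
		vfio_unpin_pages(mdev->iommu_data, hpfn_array, pinned,
				 IOMMU_READ | IOMMU_WRITE);
		return -EFAULT;
	}

	/* hpfn_array[] now holds host PFNs the device can DMA to */
	return 0;
}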