[RFC,v3,3/3] VFIO Type1 IOMMU change: to support with iommu and without iommu

Message ID 1462214441-3732-4-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived

Commit Message

Kirti Wankhede May 2, 2016, 6:40 p.m. UTC
VFIO Type1 IOMMU driver is designed for devices which are IOMMU capable.
vGPU devices only use the IOMMU TYPE1 API; the underlying hardware can be
managed by an IOMMU domain. To reuse most of the IOMMU driver code for vGPU
devices, the type1 IOMMU driver is modified to support vGPU devices. This change
exports functions to pin and unpin pages for vGPU devices.
It maintains data about pinned pages for the vGPU domain. This data is used to verify
unpinning requests and also to unpin pages from detach_group().

Tested by assigning below combinations of devices to a single VM:
- GPU pass through only
- vGPU device only
- One GPU pass through and one vGPU device
- two GPU pass through

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I6e35e9fc7f14049226365e9ecef3814dc4ca1738
---
 drivers/vfio/vfio_iommu_type1.c |  427 ++++++++++++++++++++++++++++++++++++---
 include/linux/vfio.h            |    6 +
 include/linux/vgpu.h            |    4 +-
 3 files changed, 403 insertions(+), 34 deletions(-)

Comments

Jike Song May 3, 2016, 10:40 a.m. UTC | #1
On 05/03/2016 02:40 AM, Kirti Wankhede wrote:
> VFIO Type1 IOMMU driver is designed for the devices which are IOMMU capable.
> vGPU device are only using IOMMU TYPE1 API, the underlying hardware can be
> managed by an IOMMU domain. To use most of the code of IOMMU driver for vGPU
> devices, type1 IOMMU driver is modified to support vGPU devices. This change
> exports functions to pin and unpin pages for vGPU devices.
> It maintains data of pinned pages for vGPU domain. This data is used to verify
> unpinning request and also used to unpin pages from detach_group().
> 
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - two GPU pass through

{the patch trimmed}

Hi Kirti,

 I have a question: in the scenario above, how many PCI BDFs do your vGPUs consume?

 Per my understanding, you take the GPA of a KVM guest as the IOVA of the IOMMU domain,
and if there are multiple guests with vGPUs assigned, the vGPUs must belong to
different IOMMU domains (thereby having different BDFs).

 Am I missing anything?


--
Thanks,
Jike
Alex Williamson May 3, 2016, 10:43 p.m. UTC | #2
On Tue, 3 May 2016 00:10:41 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> VFIO Type1 IOMMU driver is designed for the devices which are IOMMU capable.
> vGPU device are only using IOMMU TYPE1 API, the underlying hardware can be
> managed by an IOMMU domain. To use most of the code of IOMMU driver for vGPU
> devices, type1 IOMMU driver is modified to support vGPU devices. This change
> exports functions to pin and unpin pages for vGPU devices.
> It maintains data of pinned pages for vGPU domain. This data is used to verify
> unpinning request and also used to unpin pages from detach_group().
> 
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - two GPU pass through
>
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I6e35e9fc7f14049226365e9ecef3814dc4ca1738
> ---
>  drivers/vfio/vfio_iommu_type1.c |  427 ++++++++++++++++++++++++++++++++++++---
>  include/linux/vfio.h            |    6 +
>  include/linux/vgpu.h            |    4 +-
>  3 files changed, 403 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 75b24e9..a970854 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -36,6 +36,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/vfio.h>
>  #include <linux/workqueue.h>
> +#include <linux/vgpu.h>
>  
>  #define DRIVER_VERSION  "0.2"
>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> @@ -67,6 +68,11 @@ struct vfio_domain {
>  	struct list_head	group_list;
>  	int			prot;		/* IOMMU_CACHE */
>  	bool			fgsp;		/* Fine-grained super pages */
> +	bool			vfio_iommu_api_only;	/* Domain for device which
> +							   is without physical IOMMU */
> +	struct mm_struct	*vmm_mm;	/* VMM's mm */

I really don't like assuming a VMM, vfio is a userspace driver
interface, this is just an mm associated with this set of mappings.

> +	struct rb_root		pfn_list;	/* Host pfn list for requested gfns */

This is just an iova to pfn list, whether it's a guest running on a
VMM, we don't care.

> +	struct mutex		lock;		/* mutex for pfn_list */

So pfn_list_lock might be a better name for it.

>  };
>  
>  struct vfio_dma {
> @@ -83,6 +89,19 @@ struct vfio_group {
>  };
>  
>  /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_vgpu_pfn {
> +	struct rb_node		node;
> +	unsigned long		vmm_va;		/* VMM virtual addr */

vaddr

> +	dma_addr_t		iova;		/* IOVA */
> +	unsigned long		npage;		/* number of pages */
> +	unsigned long		pfn;		/* Host pfn */
> +	int			prot;
> +	atomic_t		ref_count;
> +	struct list_head	next;
> +};

Why is any of this vgpu specific?  It's just a data structure for
tracking iova to vaddr pins.
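[Editor's note: purely as an illustration of the renaming Alex suggests here and
above (vaddr, pfn_list_lock, dropping the VMM/vGPU terminology), a generic
tracking structure might look like the sketch below. The struct and field names
are hypothetical, not part of the posted patch.]

/* Sketch: a generic iova-to-pfn pin tracking node, nothing vgpu-specific */
struct vfio_pfn {
	struct rb_node		node;
	unsigned long		vaddr;		/* user virtual address */
	dma_addr_t		iova;		/* IOVA */
	unsigned long		pfn;		/* host pfn */
	int			prot;
	atomic_t		ref_count;
};

struct vfio_pinned_pfns {
	struct rb_root		pfn_list;	/* pinned pfns, indexed by iova/pfn */
	struct mutex		pfn_list_lock;	/* protects pfn_list */
	struct mm_struct	*mm;		/* mm of the task that owns the mappings */
};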

> +/*
>   * This code handles mapping and unmapping of user data buffers
>   * into DMA'ble space using the IOMMU
>   */
> @@ -130,6 +149,53 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +/*
> + * Helper Functions for host pfn list
> + */
> +
> +static struct vfio_vgpu_pfn *vfio_find_vgpu_pfn(struct vfio_domain *domain,
> +						unsigned long pfn)
> +{
> +	struct rb_node *node = domain->pfn_list.rb_node;
> +
> +	while (node) {
> +		struct vfio_vgpu_pfn *vgpu_pfn = rb_entry(node, struct vfio_vgpu_pfn, node);
> +
> +		if (pfn <= vgpu_pfn->pfn)
> +			node = node->rb_left;
> +		else if (pfn >= vgpu_pfn->pfn)
> +			node = node->rb_right;
> +		else
> +			return vgpu_pfn;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void vfio_link_vgpu_pfn(struct vfio_domain *domain, struct vfio_vgpu_pfn *new)
> +{
> +	struct rb_node **link = &domain->pfn_list.rb_node, *parent = NULL;
> +	struct vfio_vgpu_pfn *vgpu_pfn;
> +
> +	while (*link) {
> +		parent = *link;
> +		vgpu_pfn = rb_entry(parent, struct vfio_vgpu_pfn, node);
> +
> +		if (new->pfn <= vgpu_pfn->pfn)
> +			link = &(*link)->rb_left;
> +		else
> +			link = &(*link)->rb_right;
> +	}
> +
> +	rb_link_node(&new->node, parent, link);
> +	rb_insert_color(&new->node, &domain->pfn_list);
> +}
> +
> +static void vfio_unlink_vgpu_pfn(struct vfio_domain *domain, struct vfio_vgpu_pfn *old)
> +{
> +	rb_erase(&old->node, &domain->pfn_list);
> +}
> +

None of the above is really vgpu specific either, just managing a tree
of mappings.  Name things based on what they do, not who they're for.

>  struct vwork {
>  	struct mm_struct	*mm;
>  	long			npage;
> @@ -228,20 +294,22 @@ static int put_pfn(unsigned long pfn, int prot)
>  	return 0;
>  }
>  
> -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> +			 int prot, unsigned long *pfn)
>  {
>  	struct page *page[1];
>  	struct vm_area_struct *vma;
>  	int ret = -EFAULT;
>  
> -	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
> +	if (get_user_pages_remote(NULL, mm, vaddr, 1, !!(prot & IOMMU_WRITE),
> +				    0, page, NULL) == 1) {

AIUI, _remote requires the mmap_sem to be held, _fast does not.  I don't
see that being accounted for anywhere.

>  		*pfn = page_to_pfn(page[0]);
>  		return 0;
>  	}
>  
> -	down_read(&current->mm->mmap_sem);
> +	down_read(&mm->mmap_sem);
>  
> -	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> +	vma = find_vma_intersection(mm, vaddr, vaddr + 1);
>  
>  	if (vma && vma->vm_flags & VM_PFNMAP) {
>  		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> @@ -249,28 +317,63 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>  			ret = 0;
>  	}
>  
> -	up_read(&current->mm->mmap_sem);
> +	up_read(&mm->mmap_sem);
>  
>  	return ret;
>  }
>  
>  /*
> + * Get first domain with iommu and without iommu from iommu's domain_list for
> + * lookups
> + * @iommu [in]: iommu structure
> + * @domain [out]: domain with iommu
> + * @domain_vgpu [out] : domain without iommu for vGPU
> + */
> +static void get_first_domains(struct vfio_iommu *iommu, struct vfio_domain **domain,
> +			      struct vfio_domain **domain_vgpu)
> +{
> +	struct vfio_domain *d;
> +
> +	if (!domain || !domain_vgpu)
> +		return;
> +
> +	list_for_each_entry(d, &iommu->domain_list, next) {
> +		if (d->vfio_iommu_api_only && !*domain_vgpu)
> +			*domain_vgpu = d;
> +		else if (!*domain)
> +			*domain = d;
> +		if (*domain_vgpu && *domain)
> +			break;
> +	}
> +}

This looks like pure overhead for existing code.  Also no need to
introduce the concept of vgpu here.  

> +
> +/*
>   * Attempt to pin pages.  We really don't want to track all the pfns and
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>   * first page and all consecutive pages with the same locking.
>   */
> -static long vfio_pin_pages(unsigned long vaddr, long npage,
> -			   int prot, unsigned long *pfn_base)
> +static long vfio_pin_pages_internal(void *domain_data, unsigned long vaddr, long npage,

I appears we know this as a struct vfio_domain in all callers, why is
this using void*?.

> +		             int prot, unsigned long *pfn_base)
>  {
> +	struct vfio_domain *domain = domain_data;
>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	bool lock_cap = capable(CAP_IPC_LOCK);
>  	long ret, i;
>  	bool rsvd;
> +	struct mm_struct *mm;
>  
> -	if (!current->mm)
> +	if (!domain)
>  		return -ENODEV;
>  
> -	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
> +	if (domain->vfio_iommu_api_only)
> +		mm = domain->vmm_mm;
> +	else
> +		mm = current->mm;
> +
> +	if (!mm)
> +		return -ENODEV;
> +
> +	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);

We could pass domain->mm unconditionally to vaddr_get_pfn(), let it be
NULL in the !api_only case and use it as a cue to vaddr_get_pfn() which
gup variant to use.  Of course we need to deal with mmap_sem somewhere
too without turning the code into swiss cheese.
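[Editor's note: a minimal sketch of that suggestion, assuming the domain carries
a struct mm_struct *mm that is NULL in the !api_only case, and reusing the gup
calls already present in the patch. The mmap_sem handling for the VM_PFNMAP
fallback path is elided.]

static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
			 int prot, unsigned long *pfn)
{
	struct page *page[1];
	int ret;

	if (!mm) {
		/* Pinning on behalf of current: fast path, no mmap_sem needed */
		ret = get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page);
	} else {
		/* External mm: _remote requires mmap_sem to be held */
		down_read(&mm->mmap_sem);
		ret = get_user_pages_remote(NULL, mm, vaddr, 1,
					    !!(prot & IOMMU_WRITE), 0, page, NULL);
		up_read(&mm->mmap_sem);
	}

	if (ret == 1) {
		*pfn = page_to_pfn(page[0]);
		return 0;
	}

	/* ...the original VM_PFNMAP fallback would go here... */
	return -EFAULT;
}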

Correct me if I'm wrong, but I assume the main benefit of interweaving
this into type1 vs pulling out common code and making a new vfio iommu
backend is the page accounting, ie. not over accounting locked pages.
TBH, I don't know if it's worth it.  Any idea what the high water mark
of pinned pages for a vgpu might be?

>  	if (ret)
>  		return ret;
>  
> @@ -293,7 +396,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
>  		unsigned long pfn = 0;
>  
> -		ret = vaddr_get_pfn(vaddr, prot, &pfn);
> +		ret = vaddr_get_pfn(mm, vaddr, prot, &pfn);
>  		if (ret)
>  			break;
>  
> @@ -318,25 +421,183 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	return i;
>  }
>  
> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> -			     int prot, bool do_accounting)
> +static long vfio_unpin_pages_internal(void *domain_data, unsigned long pfn, long npage,

Again, all the callers seem to know they have a struct vfio_domain*, I
don't see any justification for using a void* here.

It seems less disruptive to leave these function names alone and make
the new functions _external.

> +				      int prot, bool do_accounting)
>  {
> +	struct vfio_domain *domain = domain_data;
>  	unsigned long unlocked = 0;
>  	long i;
>  
> +	if (!domain)
> +		return -ENODEV;
> +

How is this possible?  Callers of this function need to be updated for
a possible negative return or accounting gets really broken.
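[Editor's note: i.e. the caller side would need something like the hypothetical
shape below, so an error code never gets folded into the unlocked-page count.]

		/* Sketch of the caller-side change in vfio_unmap_unpin() */
		long ret = vfio_unpin_pages_internal(domain, phys >> PAGE_SHIFT,
						     unmapped >> PAGE_SHIFT,
						     dma->prot, false);
		if (ret < 0)
			break;		/* or propagate the error */
		unlocked += ret;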

>  	for (i = 0; i < npage; i++)
>  		unlocked += put_pfn(pfn++, prot);
>  
>  	if (do_accounting)
>  		vfio_lock_acct(-unlocked);
> +	return unlocked;
> +}
> +
> +/*
> + * Pin a set of guest PFNs and return their associated host PFNs for vGPU.
> + * @vaddr [in]: array of guest PFNs
> + * @npage [in]: count of array elements
> + * @prot [in] : protection flags
> + * @pfn_base[out] : array of host PFNs
> + */
> +int vfio_pin_pages(void *iommu_data, dma_addr_t *vaddr, long npage,
> +		   int prot, dma_addr_t *pfn_base)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain = NULL, *domain_vgpu = NULL;
> +	int i = 0, ret = 0;
> +	long retpage;
> +	dma_addr_t remote_vaddr = 0;
> +	dma_addr_t *pfn = pfn_base;
> +	struct vfio_dma *dma;
> +
> +	if (!iommu || !pfn_base)
> +		return -EINVAL;
> +
> +	if (list_empty(&iommu->domain_list)) {
> +		ret = -EINVAL;
> +		goto pin_done;
> +	}
> +
> +	get_first_domains(iommu, &domain, &domain_vgpu);
> +
> +	// Return error if vGPU domain doesn't exist

No c++ style comments please.

> +	if (!domain_vgpu) {
> +		ret = -EINVAL;
> +		goto pin_done;
> +	}
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_vgpu_pfn *p;
> +		struct vfio_vgpu_pfn *lpfn;
> +		unsigned long tpfn;
> +		dma_addr_t iova;
> +
> +		mutex_lock(&iommu->lock);
> +
> +		iova = vaddr[i] << PAGE_SHIFT;
> +
> +		dma = vfio_find_dma(iommu, iova, 0 /*  size */);
> +		if (!dma) {
> +			mutex_unlock(&iommu->lock);
> +			ret = -EINVAL;
> +			goto pin_done;
> +		}
> +
> +		remote_vaddr = dma->vaddr + iova - dma->iova;
> +
> +		retpage = vfio_pin_pages_internal(domain_vgpu, remote_vaddr,
> +						  (long)1, prot, &tpfn);
> +		mutex_unlock(&iommu->lock);
> +		if (retpage <= 0) {
> +			WARN_ON(!retpage);
> +			ret = (int)retpage;
> +			goto pin_done;
> +		}
> +
> +		pfn[i] = tpfn;
> +
> +		mutex_lock(&domain_vgpu->lock);
> +
> +		// search if pfn exist
> +		if ((p = vfio_find_vgpu_pfn(domain_vgpu, tpfn))) {
> +			atomic_inc(&p->ref_count);
> +			mutex_unlock(&domain_vgpu->lock);
> +			continue;
> +		}

The only reason I can come up with for why we'd want to integrate an
api-only domain into the existing type1 code would be to avoid page
accounting issues where we count locked pages once for a normal
assigned device and again for a vgpu, but that's not what we're doing
here.  We're not only locking the pages again regardless of them
already being locked, we're counting every time we lock them through
this new interface.  So there's really no point at all to making type1
become this unsupportable.  In that case we should be pulling out the
common code that we want to share from type1 and making a new type1
compatible vfio iommu backend rather than conditionalizing everything
here.

> +
> +		// add to pfn_list
> +		lpfn = kzalloc(sizeof(*lpfn), GFP_KERNEL);
> +		if (!lpfn) {
> +			ret = -ENOMEM;
> +			mutex_unlock(&domain_vgpu->lock);
> +			goto pin_done;
> +		}
> +		lpfn->vmm_va = remote_vaddr;
> +		lpfn->iova = iova;
> +		lpfn->pfn = pfn[i];
> +		lpfn->npage = 1;
> +		lpfn->prot = prot;
> +		atomic_inc(&lpfn->ref_count);
> +		vfio_link_vgpu_pfn(domain_vgpu, lpfn);
> +		mutex_unlock(&domain_vgpu->lock);
> +	}
> +
> +	ret = i;
> +
> +pin_done:
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfio_pin_pages);
> +
> +static int vfio_vgpu_unpin_pfn(struct vfio_domain *domain, struct vfio_vgpu_pfn *vpfn)
> +{
> +	int ret;
> +
> +	ret = vfio_unpin_pages_internal(domain, vpfn->pfn, vpfn->npage, vpfn->prot, true);
> +
> +	if (atomic_dec_and_test(&vpfn->ref_count)) {
> +		// remove from pfn_list
> +		vfio_unlink_vgpu_pfn(domain, vpfn);
> +		kfree(vpfn);
> +	}
> +
> +	return ret;
> +}
> +
> +/*
> + * Unpin set of host PFNs for vGPU.
> + * @pfn	[in] : array of host PFNs to be unpinned.
> + * @npage [in] :count of elements in array, that is number of pages.
> + * @prot [in] : protection flags
> + */
> +int vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
> +		     int prot)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain = NULL, *domain_vgpu = NULL;
> +	long unlocked = 0;
> +	int i;
> +	if (!iommu || !pfn)
> +		return -EINVAL;
> +
> +	if (list_empty(&iommu->domain_list))
> +		return -EINVAL;
> +
> +	get_first_domains(iommu, &domain, &domain_vgpu);
> +
> +	// Return error if vGPU domain doesn't exist
> +	if (!domain_vgpu)
> +		return -EINVAL;
> +
> +	mutex_lock(&domain_vgpu->lock);
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_vgpu_pfn *p;
> +
> +		// verify if pfn exist in pfn_list
> +		if (!(p = vfio_find_vgpu_pfn(domain_vgpu, *(pfn + i)))) {
> +			continue;

How does the caller deal with this, the function returns number of
pages unpinned which will not match the requested number of pages to
unpin if there are any missing.  Also, no setting variables within a
test when easily avoidable please, separate to a set then test.
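[Editor's note: i.e. the preferred form would be roughly:]

		/* set, then test - rather than assigning inside the if () */
		p = vfio_find_vgpu_pfn(domain_vgpu, *(pfn + i));
		if (!p)
			continue;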

> +		}
> +
> +		unlocked += vfio_vgpu_unpin_pfn(domain_vgpu, p);
> +	}
> +	mutex_unlock(&domain_vgpu->lock);
>  
>  	return unlocked;
>  }
> +EXPORT_SYMBOL(vfio_unpin_pages);
>  
>  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  {
>  	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
> -	struct vfio_domain *domain, *d;
> +	struct vfio_domain *domain = NULL, *d, *domain_vgpu = NULL;
>  	long unlocked = 0;
>  
>  	if (!dma->size)
> @@ -348,12 +609,18 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  	 * pfns to unpin.  The rest need to be unmapped in advance so we have
>  	 * no iommu translations remaining when the pages are unpinned.
>  	 */
> -	domain = d = list_first_entry(&iommu->domain_list,
> -				      struct vfio_domain, next);
>  
> +	get_first_domains(iommu, &domain, &domain_vgpu);
> +
> +	if (!domain)
> +		return;
> +
> +	d = domain;
>  	list_for_each_entry_continue(d, &iommu->domain_list, next) {
> -		iommu_unmap(d->domain, dma->iova, dma->size);
> -		cond_resched();
> +		if (!d->vfio_iommu_api_only) {
> +			iommu_unmap(d->domain, dma->iova, dma->size);
> +			cond_resched();
> +		}
>  	}
>  
>  	while (iova < end) {

How do api-only domain not blowup on the iommu API code in this next
code block?  Are you just getting lucky that the api-only domain is
first in the list and the real domain is last?

> @@ -382,7 +649,7 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  		if (WARN_ON(!unmapped))
>  			break;
>  
> -		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
> +		unlocked += vfio_unpin_pages_internal(domain, phys >> PAGE_SHIFT,
>  					     unmapped >> PAGE_SHIFT,
>  					     dma->prot, false);
>  		iova += unmapped;
> @@ -406,8 +673,10 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>  	unsigned long bitmap = ULONG_MAX;
>  
>  	mutex_lock(&iommu->lock);
> -	list_for_each_entry(domain, &iommu->domain_list, next)
> -		bitmap &= domain->domain->ops->pgsize_bitmap;
> +	list_for_each_entry(domain, &iommu->domain_list, next) {
> +		if (!domain->vfio_iommu_api_only)
> +			bitmap &= domain->domain->ops->pgsize_bitmap;
> +	}
>  	mutex_unlock(&iommu->lock);
>  
>  	/*
> @@ -517,6 +786,9 @@ static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
>  	long i;
>  	int ret;
>  
> +	if (domain->vfio_iommu_api_only)
> +		return -EINVAL;
> +
>  	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
>  		ret = iommu_map(domain->domain, iova,
>  				(phys_addr_t)pfn << PAGE_SHIFT,
> @@ -538,6 +810,9 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
>  	int ret;
>  
>  	list_for_each_entry(d, &iommu->domain_list, next) {
> +		if (d->vfio_iommu_api_only)
> +			continue;
> +

Really disliking all these switches everywhere, too many different code
paths.

>  		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
>  				npage << PAGE_SHIFT, prot | d->prot);
>  		if (ret) {
> @@ -552,8 +827,11 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
>  	return 0;
>  
>  unwind:
> -	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
> +	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
> +		if (d->vfio_iommu_api_only)
> +			continue;
>  		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
> +	}
>  
>  	return ret;
>  }
> @@ -569,6 +847,8 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	uint64_t mask;
>  	struct vfio_dma *dma;
>  	unsigned long pfn;
> +	struct vfio_domain *domain = NULL;
> +	int domain_with_iommu_present = 0;
>  
>  	/* Verify that none of our __u64 fields overflow */
>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> @@ -611,9 +891,22 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	/* Insert zero-sized and grow as we map chunks of it */
>  	vfio_link_dma(iommu, dma);
>  
> +	list_for_each_entry(domain, &iommu->domain_list, next) {
> +		if (!domain->vfio_iommu_api_only) {
> +			domain_with_iommu_present = 1;
> +			break;
> +		}
> +	}
> +
> +	// Skip pin and map only if domain without IOMMU is present
> +	if (!domain_with_iommu_present) {
> +		dma->size = size;
> +		goto map_done;
> +	}
> +

Yet more special cases, the code is getting unsupportable.

>  	while (size) {
>  		/* Pin a contiguous chunk of memory */
> -		npage = vfio_pin_pages(vaddr + dma->size,
> +		npage = vfio_pin_pages_internal(domain, vaddr + dma->size,
>  				       size >> PAGE_SHIFT, prot, &pfn);
>  		if (npage <= 0) {
>  			WARN_ON(!npage);
> @@ -623,8 +916,8 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  
>  		/* Map it! */
>  		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
> -		if (ret) {
> -			vfio_unpin_pages(pfn, npage, prot, true);
> +		if (ret){
> +			vfio_unpin_pages_internal(domain, pfn, npage, prot, true);
>  			break;
>  		}
>  
> @@ -635,6 +928,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	if (ret)
>  		vfio_remove_dma(iommu, dma);
>  
> +map_done:
>  	mutex_unlock(&iommu->lock);
>  	return ret;
>  }
> @@ -654,12 +948,15 @@ static int vfio_bus_type(struct device *dev, void *data)
>  static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  			     struct vfio_domain *domain)
>  {
> -	struct vfio_domain *d;
> +	struct vfio_domain *d = NULL, *d_vgpu = NULL;
>  	struct rb_node *n;
>  	int ret;
>  
> +	if (domain->vfio_iommu_api_only)
> +		return -EINVAL;

Huh?  This only does iommu API stuff, shouldn't we skip it w/o error
for api-only?

> +
>  	/* Arbitrarily pick the first domain in the list for lookups */
> -	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
> +	get_first_domains(iommu, &d, &d_vgpu);

Gag, probably should have used a separate list.

>  	n = rb_first(&iommu->dma_list);
>  
>  	/* If there's not a domain, there better not be any mappings */
> @@ -716,6 +1013,9 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
>  	struct page *pages;
>  	int ret, order = get_order(PAGE_SIZE * 2);
>  
> +	if (domain->vfio_iommu_api_only)
> +		return;
> +
>  	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
>  	if (!pages)
>  		return;
> @@ -769,6 +1069,23 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_free;
>  
> +	if (!iommu_present(bus) && (bus == &vgpu_bus_type)) {
> +		struct vgpu_device *vgpu_dev = NULL;
> +
> +		vgpu_dev = get_vgpu_device_from_group(iommu_group);
> +		if (!vgpu_dev)
> +			goto out_free;
> +
> +		vgpu_dev->iommu_data = iommu;

Probably better to have a vgpu_set_iommu_data() function rather than
manipulate it ourselves.  We also have no guarantees about races since
vgpus have no reference counting.
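[Editor's note: such an accessor is hypothetical, it does not exist in the
posted vgpu patches, but it could be as simple as the sketch below; the
reference counting / race concern Alex raises would still need addressing.]

/* Sketch: setter in the vgpu core, rather than poking the field directly */
void vgpu_set_iommu_data(struct vgpu_device *vgpu_dev, void *iommu_data)
{
	vgpu_dev->iommu_data = iommu_data;
}
EXPORT_SYMBOL(vgpu_set_iommu_data);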

> +		domain->vfio_iommu_api_only = true;
> +		domain->vmm_mm = current->mm;
> +		INIT_LIST_HEAD(&domain->group_list);
> +		list_add(&group->next, &domain->group_list);
> +		domain->pfn_list = RB_ROOT;
> +		mutex_init(&domain->lock);
> +		goto out_success;
> +	}

Very little sharing going on here.

> +
>  	domain->domain = iommu_domain_alloc(bus);
>  	if (!domain->domain) {
>  		ret = -EIO;
> @@ -834,6 +1151,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_detach;
>  
> +out_success:
>  	list_add(&domain->next, &iommu->domain_list);
>  
>  	mutex_unlock(&iommu->lock);
> @@ -854,11 +1172,36 @@ out_free:
>  static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
>  {
>  	struct rb_node *node;
> +	struct vfio_domain *domain = NULL, *domain_vgpu = NULL;
> +
> +	get_first_domains(iommu, &domain, &domain_vgpu);
> +
> +	if (domain_vgpu) {
> +		int unlocked;
> +		mutex_lock(&domain_vgpu->lock);
> +		while ((node = rb_first(&domain_vgpu->pfn_list))) {
> +			unlocked = vfio_vgpu_unpin_pfn(domain_vgpu,
> +					rb_entry(node, struct vfio_vgpu_pfn, node));

Why bother to store the return?

> +		}
> +		mutex_unlock(&domain_vgpu->lock);
> +	}
>  
>  	while ((node = rb_first(&iommu->dma_list)))
>  		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
>  }
>  
> +static bool list_is_singular_iommu_domain(struct vfio_iommu *iommu)
> +{
> +	struct vfio_domain *domain;
> +	int domain_iommu = 0;
> +
> +	list_for_each_entry(domain, &iommu->domain_list, next) {
> +		if (!domain->vfio_iommu_api_only)
> +			domain_iommu++;
> +	}
> +	return (domain_iommu == 1);
> +}
> +
>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>  					  struct iommu_group *iommu_group)
>  {
> @@ -872,19 +1215,28 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  		list_for_each_entry(group, &domain->group_list, next) {
>  			if (group->iommu_group != iommu_group)
>  				continue;
> +			if (!domain->vfio_iommu_api_only)
> +				iommu_detach_group(domain->domain, iommu_group);
> +			else {
> +				struct vgpu_device *vgpu_dev = NULL;
>  
> -			iommu_detach_group(domain->domain, iommu_group);
> +				vgpu_dev = get_vgpu_device_from_group(iommu_group);
> +				if (vgpu_dev)
> +					vgpu_dev->iommu_data = NULL;
> +
> +			}
>  			list_del(&group->next);
>  			kfree(group);
>  			/*
>  			 * Group ownership provides privilege, if the group
>  			 * list is empty, the domain goes away.  If it's the
> -			 * last domain, then all the mappings go away too.
> +			 * last domain with iommu, then all the mappings go away too.
>  			 */
>  			if (list_empty(&domain->group_list)) {
> -				if (list_is_singular(&iommu->domain_list))
> +				if (list_is_singular_iommu_domain(iommu))
>  					vfio_iommu_unmap_unpin_all(iommu);
> -				iommu_domain_free(domain->domain);
> +				if (!domain->vfio_iommu_api_only)
> +					iommu_domain_free(domain->domain);
>  				list_del(&domain->next);
>  				kfree(domain);
>  			}
> @@ -936,11 +1288,22 @@ static void vfio_iommu_type1_release(void *iommu_data)
>  				 &iommu->domain_list, next) {
>  		list_for_each_entry_safe(group, group_tmp,
>  					 &domain->group_list, next) {
> -			iommu_detach_group(domain->domain, group->iommu_group);
> +			if (!domain->vfio_iommu_api_only)
> +				iommu_detach_group(domain->domain, group->iommu_group);
> +			else {
> +				struct vgpu_device *vgpu_dev = NULL;
> +
> +				vgpu_dev = get_vgpu_device_from_group(group->iommu_group);
> +				if (vgpu_dev)
> +					vgpu_dev->iommu_data = NULL;
> +
> +			}
> +
>  			list_del(&group->next);
>  			kfree(group);
>  		}
> -		iommu_domain_free(domain->domain);
> +		if (!domain->vfio_iommu_api_only)
> +			iommu_domain_free(domain->domain);
>  		list_del(&domain->next);
>  		kfree(domain);
>  	}

I'm really not convinced that pushing this into the type1 code is the
right approach vs pulling out shareable code chunks where it makes
sense and creating a separate iommu backend.  We're not getting
anything but code complexity out of this approach it seems.  Thanks,

Alex

> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0ecae0b..d280868 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -127,6 +127,12 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
>  }
>  #endif /* CONFIG_EEH */
>  
> +extern long vfio_pin_pages(void *iommu_data, dma_addr_t *vaddr, long npage,
> +		           int prot, dma_addr_t *pfn_base);
> +
> +extern long vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
> +			     int prot);
> +
>  /*
>   * IRQfd - generic
>   */
> diff --git a/include/linux/vgpu.h b/include/linux/vgpu.h
> index 03a77cf..cc18353 100644
> --- a/include/linux/vgpu.h
> +++ b/include/linux/vgpu.h
> @@ -36,6 +36,7 @@ struct vgpu_device {
>  	struct device		dev;
>  	struct gpu_device	*gpu_dev;
>  	struct iommu_group	*group;
> +	void			*iommu_data;
>  #define DEVICE_NAME_LEN		(64)
>  	char			dev_name[DEVICE_NAME_LEN];
>  	uuid_le			uuid;
> @@ -209,8 +210,7 @@ extern void vgpu_unregister_driver(struct vgpu_driver *drv);
>  
>  extern int vgpu_map_virtual_bar(uint64_t virt_bar_addr, uint64_t phys_bar_addr,
>  				uint32_t len, uint32_t flags);
> -extern int vgpu_dma_do_translate(dma_addr_t * gfn_buffer, uint32_t count);
>  
> -struct vgpu_device *get_vgpu_device_from_group(struct iommu_group *group);
> +extern struct vgpu_device *get_vgpu_device_from_group(struct iommu_group *group);
>  
>  #endif /* VGPU_H */

Tian, Kevin May 4, 2016, 3:39 a.m. UTC | #3
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Wednesday, May 04, 2016 6:43 AM
> > +		             int prot, unsigned long *pfn_base)
> >  {
> > +	struct vfio_domain *domain = domain_data;
> >  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >  	bool lock_cap = capable(CAP_IPC_LOCK);
> >  	long ret, i;
> >  	bool rsvd;
> > +	struct mm_struct *mm;
> >
> > -	if (!current->mm)
> > +	if (!domain)
> >  		return -ENODEV;
> >
> > -	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
> > +	if (domain->vfio_iommu_api_only)
> > +		mm = domain->vmm_mm;
> > +	else
> > +		mm = current->mm;
> > +
> > +	if (!mm)
> > +		return -ENODEV;
> > +
> > +	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
> 
> We could pass domain->mm unconditionally to vaddr_get_pfn(), let it be
> NULL in the !api_only case and use it as a cue to vaddr_get_pfn() which
> gup variant to use.  Of course we need to deal with mmap_sem somewhere
> too without turning the code into swiss cheese.
> 
> Correct me if I'm wrong, but I assume the main benefit of interweaving
> this into type1 vs pulling out common code and making a new vfio iommu
> backend is the page accounting, ie. not over accounting locked pages.
> TBH, I don't know if it's worth it.  Any idea what the high water mark
> of pinned pages for a vgpu might be?

The baseline is the same as today's PCI device passthrough, i.e. we need to
pin all memory pages allocated to the said VM, at least for the current KVMGT.
Ideally we may reduce the pinned set based on fine-grained resource
tracking within the vGPU device model (then it might be in the 100MBs, based on
the active graphics memory working set).

Thanks
Kevin
Jike Song May 5, 2016, 6:55 a.m. UTC | #4
On 05/04/2016 06:43 AM, Alex Williamson wrote:
> On Tue, 3 May 2016 00:10:41 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>> +
>> +/*
>> + * Pin a set of guest PFNs and return their associated host PFNs for vGPU.
>> + * @vaddr [in]: array of guest PFNs
>> + * @npage [in]: count of array elements
>> + * @prot [in] : protection flags
>> + * @pfn_base[out] : array of host PFNs
>> + */
>> +int vfio_pin_pages(void *iommu_data, dma_addr_t *vaddr, long npage,
>> +		   int prot, dma_addr_t *pfn_base)
>> +{
>> +	struct vfio_iommu *iommu = iommu_data;
>> +	struct vfio_domain *domain = NULL, *domain_vgpu = NULL;
>> +	int i = 0, ret = 0;
>> +	long retpage;
>> +	dma_addr_t remote_vaddr = 0;
>> +	dma_addr_t *pfn = pfn_base;
>> +	struct vfio_dma *dma;
>> +
>> +	if (!iommu || !pfn_base)
>> +		return -EINVAL;
>> +
>> +	if (list_empty(&iommu->domain_list)) {
>> +		ret = -EINVAL;
>> +		goto pin_done;
>> +	}
>> +
>> +	get_first_domains(iommu, &domain, &domain_vgpu);
>> +
>> +	// Return error if vGPU domain doesn't exist
> 
> No c++ style comments please.
> 
>> +	if (!domain_vgpu) {
>> +		ret = -EINVAL;
>> +		goto pin_done;
>> +	}
>> +
>> +	for (i = 0; i < npage; i++) {
>> +		struct vfio_vgpu_pfn *p;
>> +		struct vfio_vgpu_pfn *lpfn;
>> +		unsigned long tpfn;
>> +		dma_addr_t iova;
>> +
>> +		mutex_lock(&iommu->lock);
>> +
>> +		iova = vaddr[i] << PAGE_SHIFT;
>> +
>> +		dma = vfio_find_dma(iommu, iova, 0 /*  size */);
>> +		if (!dma) {
>> +			mutex_unlock(&iommu->lock);
>> +			ret = -EINVAL;
>> +			goto pin_done;
>> +		}
>> +
>> +		remote_vaddr = dma->vaddr + iova - dma->iova;
>> +
>> +		retpage = vfio_pin_pages_internal(domain_vgpu, remote_vaddr,
>> +						  (long)1, prot, &tpfn);
>> +		mutex_unlock(&iommu->lock);
>> +		if (retpage <= 0) {
>> +			WARN_ON(!retpage);
>> +			ret = (int)retpage;
>> +			goto pin_done;
>> +		}
>> +
>> +		pfn[i] = tpfn;
>> +
>> +		mutex_lock(&domain_vgpu->lock);
>> +
>> +		// search if pfn exist
>> +		if ((p = vfio_find_vgpu_pfn(domain_vgpu, tpfn))) {
>> +			atomic_inc(&p->ref_count);
>> +			mutex_unlock(&domain_vgpu->lock);
>> +			continue;
>> +		}
> 
> The only reason I can come up with for why we'd want to integrate an
> api-only domain into the existing type1 code would be to avoid page
> accounting issues where we count locked pages once for a normal
> assigned device and again for a vgpu, but that's not what we're doing
> here.  We're not only locking the pages again regardless of them
> already being locked, we're counting every time we lock them through
> this new interface.  So there's really no point at all to making type1
> become this unsupportable.  In that case we should be pulling out the
> common code that we want to share from type1 and making a new type1
> compatible vfio iommu backend rather than conditionalizing everything
> here.
> 
>> +
>> +		// add to pfn_list
>> +		lpfn = kzalloc(sizeof(*lpfn), GFP_KERNEL);
>> +		if (!lpfn) {
>> +			ret = -ENOMEM;
>> +			mutex_unlock(&domain_vgpu->lock);
>> +			goto pin_done;
>> +		}
>> +		lpfn->vmm_va = remote_vaddr;
>> +		lpfn->iova = iova;
>> +		lpfn->pfn = pfn[i];
>> +		lpfn->npage = 1;
>> +		lpfn->prot = prot;
>> +		atomic_inc(&lpfn->ref_count);
>> +		vfio_link_vgpu_pfn(domain_vgpu, lpfn);
>> +		mutex_unlock(&domain_vgpu->lock);
>> +	}
>> +
>> +	ret = i;
>> +
>> +pin_done:
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(vfio_pin_pages);
>> +

IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
hardware. It just, as you said in another mail, "rather than
programming them into an IOMMU for a device, it simply stores the
translations for use by later requests".

That imposes a constraint on the gfx driver: hardware IOMMU must be disabled.
Otherwise, if an IOMMU is present, the gfx driver eventually programs
the hardware IOMMU with the IOVA returned by pci_map_page or dma_map_page;
meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
translations without any knowledge about the hardware IOMMU. How is the
device model supposed to get an IOVA for a given GPA (thereby HPA
by the IOMMU backend here)?

If things go as guessed above, as vfio_pin_pages() indicates, it
pins & translates vaddr to PFN; then it will be very difficult for the
device model to figure out:

	1, for a given GPA, how to avoid calling dma_map_page multiple times?
	2, for which page to call dma_unmap_page?

--
Thanks,
Jike

Kirti Wankhede May 5, 2016, 7:51 a.m. UTC | #5
On 5/4/2016 4:13 AM, Alex Williamson wrote:
 > On Tue, 3 May 2016 00:10:41 +0530
 > Kirti Wankhede <kwankhede@nvidia.com> wrote:
 >
[..]


 >> +	if (domain->vfio_iommu_api_only)
 >> +		mm = domain->vmm_mm;
 >> +	else
 >> +		mm = current->mm;
 >> +
 >> +	if (!mm)
 >> +		return -ENODEV;
 >> +
 >> +	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
 >
 > We could pass domain->mm unconditionally to vaddr_get_pfn(), let it be
 > NULL in the !api_only case and use it as a cue to vaddr_get_pfn() which
 > gup variant to use.  Of course we need to deal with mmap_sem somewhere
 > too without turning the code into swiss cheese.
 >

Yes, I missed that. Thanks for pointing out. I'll fix it.

 > Correct me if I'm wrong, but I assume the main benefit of interweaving
 > this into type1 vs pulling out common code and making a new vfio iommu
 > backend is the page accounting, ie. not over accounting locked pages.
 > TBH, I don't know if it's worth it.  Any idea what the high water mark
 > of pinned pages for a vgpu might be?
 >

It depends on which VM (Linux/Windows) is running and what workload is 
running. On Windows, DMA pages are managed by the WDDM model. On Linux, each 
user space application can DMA to pages and there are no restrictions.


 > The only reason I can come up with for why we'd want to integrate an
 > api-only domain into the existing type1 code would be to avoid page
 > accounting issues where we count locked pages once for a normal
 > assigned device and again for a vgpu, but that's not what we're doing
 > here.  We're not only locking the pages again regardless of them
 > already being locked, we're counting every time we lock them through
 > this new interface.  So there's really no point at all to making type1
 > become this unsupportable.  In that case we should be pulling out the
 > common code that we want to share from type1 and making a new type1
 > compatible vfio iommu backend rather than conditionalizing everything
 > here.
 >

I tried to add pfn tracking logic and use already-locked pages, but that 
didn't work somehow; I'll revisit it again.
With this there will be additional pfn tracking logic for the case where a 
device is directly assigned and no vGPU device is present.



 >> +		// verify if pfn exist in pfn_list
 >> +		if (!(p = vfio_find_vgpu_pfn(domain_vgpu, *(pfn + i)))) {
 >> +			continue;
 >
 > How does the caller deal with this, the function returns number of
 > pages unpinned which will not match the requested number of pages to
 > unpin if there are any missing.  Also, no setting variables within a
 > test when easily avoidable please, separate to a set then test.
 >

Here we are following the current code logic. Do you have any suggestion for 
how to deal with that?


 >> +	get_first_domains(iommu, &domain, &domain_vgpu);
 >> +
 >> +	if (!domain)
 >> +		return;
 >> +
 >> +	d = domain;
 >>  	list_for_each_entry_continue(d, &iommu->domain_list, next) {
 >> -		iommu_unmap(d->domain, dma->iova, dma->size);
 >> -		cond_resched();
 >> +		if (!d->vfio_iommu_api_only) {
 >> +			iommu_unmap(d->domain, dma->iova, dma->size);
 >> +			cond_resched();
 >> +		}
 >>  	}
 >>
 >>  	while (iova < end) {
 >
 > How do api-only domain not blowup on the iommu API code in this next
 > code block?  Are you just getting lucky that the api-only domain is
 > first in the list and the real domain is last?
 >

Control will not come here if there is no domain with IOMMU due to below 
change:

 >> +	if (!domain)
 >> +		return;

get_first_domains() returns the first domain with IOMMU and first domain 
with api_only.


 >> +		if (d->vfio_iommu_api_only)
 >> +			continue;
 >> +
 >
 > Really disliking all these switches everywhere, too many different code
 > paths.
 >

I'll move such APIs to inline functions so that this check would be 
inside the inline functions and the code would look much cleaner.
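[Editor's note: for example, a hypothetical wrapper along these lines, so the
api-only check lives in one place instead of at every call site:]

static inline void vfio_domain_unmap(struct vfio_domain *d,
				     dma_addr_t iova, size_t size)
{
	if (d->vfio_iommu_api_only)
		return;

	iommu_unmap(d->domain, iova, size);
}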


 >> +	// Skip pin and map only if domain without IOMMU is present
 >> +	if (!domain_with_iommu_present) {
 >> +		dma->size = size;
 >> +		goto map_done;
 >> +	}
 >> +
 >
 > Yet more special cases, the code is getting unsupportable.

 From vfio_dma_do_map(), if there are no devices passed through then we 
don't want to pin all pages upfront, and that is the reason for this check.

 >
 > I'm really not convinced that pushing this into the type1 code is the
 > right approach vs pulling out shareable code chunks where it makes
 > sense and creating a separate iommu backend.  We're not getting
 > anything but code complexity out of this approach it seems.

I find pulling out the shared code is also not simple. I would like to 
revisit this again and sort out the concerns you raised rather than 
making a separate module.

Thanks,
Kirti.


Tian, Kevin May 5, 2016, 9:27 a.m. UTC | #6
> From: Song, Jike
> Sent: Thursday, May 05, 2016 2:56 PM
> >
> > The only reason I can come up with for why we'd want to integrate an
> > api-only domain into the existing type1 code would be to avoid page
> > accounting issues where we count locked pages once for a normal
> > assigned device and again for a vgpu, but that's not what we're doing
> > here.  We're not only locking the pages again regardless of them
> > already being locked, we're counting every time we lock them through
> > this new interface.  So there's really no point at all to making type1
> > become this unsupportable.  In that case we should be pulling out the
> > common code that we want to share from type1 and making a new type1
> > compatible vfio iommu backend rather than conditionalizing everything
> > here.
> >
> >> +
> >> +		// add to pfn_list
> >> +		lpfn = kzalloc(sizeof(*lpfn), GFP_KERNEL);
> >> +		if (!lpfn) {
> >> +			ret = -ENOMEM;
> >> +			mutex_unlock(&domain_vgpu->lock);
> >> +			goto pin_done;
> >> +		}
> >> +		lpfn->vmm_va = remote_vaddr;
> >> +		lpfn->iova = iova;
> >> +		lpfn->pfn = pfn[i];
> >> +		lpfn->npage = 1;
> >> +		lpfn->prot = prot;
> >> +		atomic_inc(&lpfn->ref_count);
> >> +		vfio_link_vgpu_pfn(domain_vgpu, lpfn);
> >> +		mutex_unlock(&domain_vgpu->lock);
> >> +	}
> >> +
> >> +	ret = i;
> >> +
> >> +pin_done:
> >> +	return ret;
> >> +}
> >> +EXPORT_SYMBOL(vfio_pin_pages);
> >> +
> 
> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
> hardware. It just, as you said in another mail, "rather than
> programming them into an IOMMU for a device, it simply stores the
> translations for use by later requests".
> 
> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
> Otherwise, if IOMMU is present, the gfx driver eventually programs
> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
> translations without any knowledge about hardware IOMMU, how is the
> device model supposed to do to get an IOVA for a given GPA (thereby HPA
> by the IOMMU backend here)?
> 
> If things go as guessed above, as vfio_pin_pages() indicates, it
> pin & translate vaddr to PFN, then it will be very difficult for the
> device model to figure out:
> 
> 	1, for a given GPA, how to avoid calling dma_map_page multiple times?
> 	2, for which page to call dma_unmap_page?
> 
> --

We have to support both the w/ iommu and w/o iommu cases, since
that fact is out of the GPU driver's control. A simple way is to use
dma_map_page, which internally will cope with the w/ and w/o iommu
cases gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
Then in this file we only need to cache GPA to whatever dma_addr_t is
returned by dma_map_page.
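[Editor's note: a rough sketch of that idea, using the vfio_pin_pages()
interface added by this patch; the function name and the assumption that the
device model has a struct device to map against are illustrative only.]

/* Pin one guest pfn, let the DMA API return HPA (no iommu) or IOVA
 * (iommu present), and cache gfn -> dma_addr_t in the device model.
 */
static int vgpu_map_gfn(struct device *dev, void *iommu_data,
			unsigned long gfn, int prot, dma_addr_t *dma)
{
	dma_addr_t gfn_arg = gfn;	/* vfio_pin_pages() takes guest pfns */
	dma_addr_t hpfn;
	int ret;

	ret = vfio_pin_pages(iommu_data, &gfn_arg, 1, prot, &hpfn);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	*dma = dma_map_page(dev, pfn_to_page((unsigned long)hpfn), 0,
			    PAGE_SIZE, DMA_BIDIRECTIONAL);
	if (dma_mapping_error(dev, *dma)) {
		vfio_unpin_pages(iommu_data, &hpfn, 1, prot);
		return -EFAULT;
	}

	/* ...store the gfn -> *dma translation for a later dma_unmap_page()... */
	return 0;
}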

Thanks
Kevin
Jike Song May 10, 2016, 7:52 a.m. UTC | #7
On 05/05/2016 05:27 PM, Tian, Kevin wrote:
>> From: Song, Jike
>>
>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
>> hardware. It just, as you said in another mail, "rather than
>> programming them into an IOMMU for a device, it simply stores the
>> translations for use by later requests".
>>
>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
>> Otherwise, if IOMMU is present, the gfx driver eventually programs
>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
>> translations without any knowledge about hardware IOMMU, how is the
>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
>> by the IOMMU backend here)?
>>
>> If things go as guessed above, as vfio_pin_pages() indicates, it
>> pin & translate vaddr to PFN, then it will be very difficult for the
>> device model to figure out:
>>
>> 	1, for a given GPA, how to avoid calling dma_map_page multiple times?
>> 	2, for which page to call dma_unmap_page?
>>
>> --
> 
> We have to support both w/ iommu and w/o iommu case, since
> that fact is out of GPU driver control. A simple way is to use
> dma_map_page which internally will cope with w/ and w/o iommu
> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
> Then in this file we only need to cache GPA to whatever dmadr_t
> returned by dma_map_page.
> 

Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?

--
Thanks,
Jike

Neo Jia May 10, 2016, 4:02 p.m. UTC | #8
On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:
> On 05/05/2016 05:27 PM, Tian, Kevin wrote:
> >> From: Song, Jike
> >>
> >> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
> >> hardware. It just, as you said in another mail, "rather than
> >> programming them into an IOMMU for a device, it simply stores the
> >> translations for use by later requests".
> >>
> >> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
> >> Otherwise, if IOMMU is present, the gfx driver eventually programs
> >> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
> >> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
> >> translations without any knowledge about hardware IOMMU, how is the
> >> device model supposed to do to get an IOVA for a given GPA (thereby HPA
> >> by the IOMMU backend here)?
> >>
> >> If things go as guessed above, as vfio_pin_pages() indicates, it
> >> pin & translate vaddr to PFN, then it will be very difficult for the
> >> device model to figure out:
> >>
> >> 	1, for a given GPA, how to avoid calling dma_map_page multiple times?
> >> 	2, for which page to call dma_unmap_page?
> >>
> >> --
> > 
> > We have to support both w/ iommu and w/o iommu case, since
> > that fact is out of GPU driver control. A simple way is to use
> > dma_map_page which internally will cope with w/ and w/o iommu
> > case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
> > Then in this file we only need to cache GPA to whatever dmadr_t
> > returned by dma_map_page.
> > 
> 
> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?

Hi Jike,

With mediated passthru, you can still use a hardware iommu, but more importantly,
that part is actually orthogonal to what we are discussing here, as we will only
cache the mapping between <gfn (iova if guest has iommu), (qemu) va>. Once we 
have pinned pages later with the help of the above info, you can map them into the
proper iommu domain if the system is configured so.

Thanks,
Neo

> 
> --
> Thanks,
> Jike
> 
Jike Song May 11, 2016, 9:15 a.m. UTC | #9
On 05/11/2016 12:02 AM, Neo Jia wrote:
> On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:
>> On 05/05/2016 05:27 PM, Tian, Kevin wrote:
>>>> From: Song, Jike
>>>>
>>>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
>>>> hardware. It just, as you said in another mail, "rather than
>>>> programming them into an IOMMU for a device, it simply stores the
>>>> translations for use by later requests".
>>>>
>>>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
>>>> Otherwise, if IOMMU is present, the gfx driver eventually programs
>>>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
>>>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
>>>> translations without any knowledge about hardware IOMMU, how is the
>>>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
>>>> by the IOMMU backend here)?
>>>>
>>>> If things go as guessed above, as vfio_pin_pages() indicates, it
>>>> pin & translate vaddr to PFN, then it will be very difficult for the
>>>> device model to figure out:
>>>>
>>>> 	1, for a given GPA, how to avoid calling dma_map_page multiple times?
>>>> 	2, for which page to call dma_unmap_page?
>>>>
>>>> --
>>>
>>> We have to support both w/ iommu and w/o iommu case, since
>>> that fact is out of GPU driver control. A simple way is to use
>>> dma_map_page which internally will cope with w/ and w/o iommu
>>> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
>>> Then in this file we only need to cache GPA to whatever dmadr_t
>>> returned by dma_map_page.
>>>
>>
>> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?
> 
> Hi Jike,
> 
> With mediated passthru, you still can use hardware iommu, but more important
> that part is actually orthogonal to what we are discussing here as we will only
> cache the mapping between <gfn (iova if guest has iommu), (qemu) va>, once we 
> have pinned pages later with the help of above info, you can map it into the
> proper iommu domain if the system has configured so.
>

Hi Neo,

Technically, yes, you can map a pfn into the proper IOMMU domain elsewhere,
but to find out whether a pfn was previously mapped or not, you have to
track it with another rbtree-like data structure (the IOMMU driver simply
doesn't bother with tracking); that seems to somewhat duplicate the vGPU
IOMMU backend we are discussing here.

And is it also semantically correct for an IOMMU backend to handle both w/
and w/o IOMMU hardware? :)


--
Thanks,
Jike

Alex Williamson May 11, 2016, 10:06 p.m. UTC | #10
On Wed, 11 May 2016 17:15:15 +0800
Jike Song <jike.song@intel.com> wrote:

> On 05/11/2016 12:02 AM, Neo Jia wrote:
> > On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:  
> >> On 05/05/2016 05:27 PM, Tian, Kevin wrote:  
> >>>> From: Song, Jike
> >>>>
> >>>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
> >>>> hardware. It just, as you said in another mail, "rather than
> >>>> programming them into an IOMMU for a device, it simply stores the
> >>>> translations for use by later requests".
> >>>>
> >>>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
> >>>> Otherwise, if IOMMU is present, the gfx driver eventually programs
> >>>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
> >>>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
> >>>> translations without any knowledge about hardware IOMMU, how is the
> >>>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
> >>>> by the IOMMU backend here)?
> >>>>
> >>>> If things go as guessed above, as vfio_pin_pages() indicates, it
> >>>> pin & translate vaddr to PFN, then it will be very difficult for the
> >>>> device model to figure out:
> >>>>
> >>>> 	1, for a given GPA, how to avoid calling dma_map_page multiple times?
> >>>> 	2, for which page to call dma_unmap_page?
> >>>>
> >>>> --  
> >>>
> >>> We have to support both w/ iommu and w/o iommu case, since
> >>> that fact is out of GPU driver control. A simple way is to use
> >>> dma_map_page which internally will cope with w/ and w/o iommu
> >>> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
> >>> Then in this file we only need to cache GPA to whatever dmadr_t
> >>> returned by dma_map_page.
> >>>  
> >>
> >> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?  
> > 
> > Hi Jike,
> > 
> > With mediated passthru, you still can use hardware iommu, but more important
> > that part is actually orthogonal to what we are discussing here as we will only
> > cache the mapping between <gfn (iova if guest has iommu), (qemu) va>, once we 
> > have pinned pages later with the help of above info, you can map it into the
> > proper iommu domain if the system has configured so.
> >  
> 
> Hi Neo,
> 
> Technically yes you can map a pfn into the proper IOMMU domain elsewhere,
> but to find out whether a pfn was previously mapped or not, you have to
> track it with another rbtree-alike data structure (the IOMMU driver simply
> doesn't bother with tracking), that seems somehow duplicate with the vGPU
> IOMMU backend we are discussing here.
> 
> And it is also semantically correct for an IOMMU backend to handle both w/
> and w/o an IOMMU hardware? :)

A problem with the iommu doing the dma_map_page() though is for what
device does it do this?  In the mediated case the vfio infrastructure
is dealing with a software representation of a device.  For all we
know that software model could transparently migrate from one physical
GPU to another.  There may not even be a physical device backing
the mediated device.  Those are details left to the vgpu driver itself.

Perhaps one possibility would be to allow the vgpu driver to register
map and unmap callbacks.  The unmap callback might provide the
invalidation interface that we're so far missing.  The combination of
map and unmap callbacks might simplify the Intel approach of pinning the
entire VM memory space, ie. for each map callback do a translation
(pin) and dma_map_page, for each unmap do a dma_unmap_page and release
the translation.  There's still the problem of where that dma_addr_t
from the dma_map_page is stored though.  Someone would need to keep
track of iova to dma_addr_t.  The vfio iommu might be a place to do
that since we're already tracking information based on iova, possibly
in an opaque data element provided by the vgpu driver.  However, we're
going to need to take a serious look at whether an rb-tree is the right
data structure for the job.  It works well for the current type1
functionality where we typically have tens of entries.  I think the
NVIDIA model of sparse pinning the VM is pushing that up to tens of
thousands.  If Intel intends to pin the entire guest, that's
potentially tens of millions of tracked entries and I don't know that
an rb-tree is the right tool for that job.  Thanks,

Alex
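[Editor's note: to make the callback idea above a little more concrete, one
hypothetical shape for it might be the sketch below; none of these names exist
in the posted patches.]

/* Callbacks a vgpu/mediated driver could register with the vfio iommu
 * backend: map() would translate/pin and dma_map as needed, unmap()
 * provides the missing invalidation path and undoes both.
 */
struct vfio_vgpu_dma_ops {
	int  (*map)(void *vgpu_priv, dma_addr_t iova, unsigned long vaddr,
		    size_t size, int prot);
	void (*unmap)(void *vgpu_priv, dma_addr_t iova, size_t size);
};

int vfio_register_vgpu_dma_ops(struct vfio_iommu *iommu,
			       const struct vfio_vgpu_dma_ops *ops,
			       void *vgpu_priv);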
Jike Song May 12, 2016, 4:11 a.m. UTC | #11
On Thu, May 12, 2016 at 6:06 AM, Alex Williamson
<alex.williamson@redhat.com> wrote:
> On Wed, 11 May 2016 17:15:15 +0800
> Jike Song <jike.song@intel.com> wrote:
>
>> On 05/11/2016 12:02 AM, Neo Jia wrote:
>> > On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:
>> >> On 05/05/2016 05:27 PM, Tian, Kevin wrote:
>> >>>> From: Song, Jike
>> >>>>
>> >>>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
>> >>>> hardware. It just, as you said in another mail, "rather than
>> >>>> programming them into an IOMMU for a device, it simply stores the
>> >>>> translations for use by later requests".
>> >>>>
>> >>>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
>> >>>> Otherwise, if IOMMU is present, the gfx driver eventually programs
>> >>>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
>> >>>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
>> >>>> translations without any knowledge about hardware IOMMU, how is the
>> >>>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
>> >>>> by the IOMMU backend here)?
>> >>>>
>> >>>> If things go as guessed above, as vfio_pin_pages() indicates, it
>> >>>> pin & translate vaddr to PFN, then it will be very difficult for the
>> >>>> device model to figure out:
>> >>>>
>> >>>>  1, for a given GPA, how to avoid calling dma_map_page multiple times?
>> >>>>  2, for which page to call dma_unmap_page?
>> >>>>
>> >>>> --
>> >>>
>> >>> We have to support both w/ iommu and w/o iommu case, since
>> >>> that fact is out of GPU driver control. A simple way is to use
>> >>> dma_map_page which internally will cope with w/ and w/o iommu
>> >>> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
>> >>> Then in this file we only need to cache GPA to whatever dmadr_t
>> >>> returned by dma_map_page.
>> >>>
>> >>
>> >> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?
>> >
>> > Hi Jike,
>> >
>> > With mediated passthru, you still can use hardware iommu, but more important
>> > that part is actually orthogonal to what we are discussing here as we will only
>> > cache the mapping between <gfn (iova if guest has iommu), (qemu) va>, once we
>> > have pinned pages later with the help of above info, you can map it into the
>> > proper iommu domain if the system has configured so.
>> >
>>
>> Hi Neo,
>>
>> Technically yes you can map a pfn into the proper IOMMU domain elsewhere,
>> but to find out whether a pfn was previously mapped or not, you have to
>> track it with another rbtree-alike data structure (the IOMMU driver simply
>> doesn't bother with tracking), that seems somehow duplicate with the vGPU
>> IOMMU backend we are discussing here.
>>
>> And it is also semantically correct for an IOMMU backend to handle both w/
>> and w/o an IOMMU hardware? :)
>
> A problem with the iommu doing the dma_map_page() though is for what
> device does it do this?  In the mediated case the vfio infrastructure
> is dealing with a software representation of a device.  For all we
> know that software model could transparently migrate from one physical
> GPU to another.  There may not even be a physical device backing
> the mediated device.  Those are details left to the vgpu driver itself.
>

Great point :) Yes, I agree it's a bit intrusive to do the mapping for a
particular pdev in a vGPU IOMMU BE.

> Perhaps one possibility would be to allow the vgpu driver to register
> map and unmap callbacks.  The unmap callback might provide the
> invalidation interface that we're so far missing.  The combination of
> map and unmap callbacks might simplify the Intel approach of pinning the
> entire VM memory space, ie. for each map callback do a translation
> (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
> the translation.

Yes, adding map/unmap ops in the pGPU driver (I assume you are referring to
the gpu_device_ops as implemented in Kirti's patch) sounds like a good idea,
satisfying both: 1) keeping vGPU purely virtual; 2) dealing with the Linux
DMA API to achieve hardware IOMMU compatibility.
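
Something along these lines is what I have in mind, as a rough sketch only
with illustrative names (this is not the gpu_device_ops actually defined in
the patch):

/*
 * Illustrative sketch: callbacks the vendor (pGPU) driver could register
 * with the vGPU core.  The vfio iommu backend would invoke map on
 * MAP_DMA and unmap on UNMAP_DMA / invalidation.
 */
struct vgpu_dma_ops {
	/*
	 * Pin/translate the page backing @vaddr and set up whatever DMA
	 * mapping the vendor driver needs; return the bus address in
	 * @dma_addr.
	 */
	int (*map)(struct vgpu_device *vgpu, unsigned long vaddr,
		   dma_addr_t *dma_addr);
	/* Tear down the mapping and unpin; the missing invalidation path. */
	void (*unmap)(struct vgpu_device *vgpu, dma_addr_t dma_addr);
};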

PS, this has very little to do with pinning wholly or partially. Intel KVMGT
once had the whole guest memory pinned, only because we used a spinlock,
which can't sleep at runtime.  We have removed that spinlock in another of
our upstreaming efforts, not here but in the i915 driver, so probably no biggie.


> There's still the problem of where that dma_addr_t
> from the dma_map_page is stored though.  Someone would need to keep
> track of iova to dma_addr_t.  The vfio iommu might be a place to do
> that since we're already tracking information based on iova, possibly
> in an opaque data element provided by the vgpu driver.

Any reason to keep it opaque? Given that the vfio iommu is already tracking
the PFN for each iova (vaddr in the vGPU case), adding dma_addr_t as another
field seems simple. But I don't have a strong opinion here; opaque definitely
works for me :)

> However, we're
> going to need to take a serious look at whether an rb-tree is the right
> data structure for the job.  It works well for the current type1
> functionality where we typically have tens of entries.  I think the
> NVIDIA model of sparse pinning the VM is pushing that up to tens of
> thousands.  If Intel intends to pin the entire guest, that's
> potentially tens of millions of tracked entries and I don't know that
> an rb-tree is the right tool for that job.  Thanks,
>

With the rbtree efficiency considered, there is yet another reason for us
to pin partially. Assuming that partial pinning is guaranteed, do you think
an rbtree is good enough?

--
Thanks,
Jike
Tian, Kevin May 12, 2016, 8 a.m. UTC | #12
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Thursday, May 12, 2016 6:06 AM
> 
> On Wed, 11 May 2016 17:15:15 +0800
> Jike Song <jike.song@intel.com> wrote:
> 
> > On 05/11/2016 12:02 AM, Neo Jia wrote:
> > > On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:
> > >> On 05/05/2016 05:27 PM, Tian, Kevin wrote:
> > >>>> From: Song, Jike
> > >>>>
> > >>>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
> > >>>> hardware. It just, as you said in another mail, "rather than
> > >>>> programming them into an IOMMU for a device, it simply stores the
> > >>>> translations for use by later requests".
> > >>>>
> > >>>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
> > >>>> Otherwise, if IOMMU is present, the gfx driver eventually programs
> > >>>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
> > >>>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
> > >>>> translations without any knowledge about hardware IOMMU, how is the
> > >>>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
> > >>>> by the IOMMU backend here)?
> > >>>>
> > >>>> If things go as guessed above, as vfio_pin_pages() indicates, it
> > >>>> pin & translate vaddr to PFN, then it will be very difficult for the
> > >>>> device model to figure out:
> > >>>>
> > >>>> 	1, for a given GPA, how to avoid calling dma_map_page multiple times?
> > >>>> 	2, for which page to call dma_unmap_page?
> > >>>>
> > >>>> --
> > >>>
> > >>> We have to support both w/ iommu and w/o iommu case, since
> > >>> that fact is out of GPU driver control. A simple way is to use
> > >>> dma_map_page which internally will cope with w/ and w/o iommu
> > >>> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
> > >>> Then in this file we only need to cache GPA to whatever dmadr_t
> > >>> returned by dma_map_page.
> > >>>
> > >>
> > >> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?
> > >
> > > Hi Jike,
> > >
> > > With mediated passthru, you still can use hardware iommu, but more important
> > > that part is actually orthogonal to what we are discussing here as we will only
> > > cache the mapping between <gfn (iova if guest has iommu), (qemu) va>, once we
> > > have pinned pages later with the help of above info, you can map it into the
> > > proper iommu domain if the system has configured so.
> > >
> >
> > Hi Neo,
> >
> > Technically yes you can map a pfn into the proper IOMMU domain elsewhere,
> > but to find out whether a pfn was previously mapped or not, you have to
> > track it with another rbtree-alike data structure (the IOMMU driver simply
> > doesn't bother with tracking), that seems somehow duplicate with the vGPU
> > IOMMU backend we are discussing here.
> >
> > And it is also semantically correct for an IOMMU backend to handle both w/
> > and w/o an IOMMU hardware? :)
> 
> A problem with the iommu doing the dma_map_page() though is for what
> device does it do this?  In the mediated case the vfio infrastructure
> is dealing with a software representation of a device.  For all we
> know that software model could transparently migrate from one physical
> GPU to another.  There may not even be a physical device backing
> the mediated device.  Those are details left to the vgpu driver itself.

This is a fair argument. The VFIO iommu driver simply serves user space
requests, where only the vaddr<->iova mapping (iova is essentially gpa in the
kvm case) matters. How the iova is mapped into the real IOMMU is not VFIO's
concern.

> 
> Perhaps one possibility would be to allow the vgpu driver to register
> map and unmap callbacks.  The unmap callback might provide the
> invalidation interface that we're so far missing.  The combination of
> map and unmap callbacks might simplify the Intel approach of pinning the
> entire VM memory space, ie. for each map callback do a translation
> (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
> the translation.  There's still the problem of where that dma_addr_t
> from the dma_map_page is stored though.  Someone would need to keep
> track of iova to dma_addr_t.  The vfio iommu might be a place to do
> that since we're already tracking information based on iova, possibly
> in an opaque data element provided by the vgpu driver.  However, we're
> going to need to take a serious look at whether an rb-tree is the right
> data structure for the job.  It works well for the current type1
> functionality where we typically have tens of entries.  I think the
> NVIDIA model of sparse pinning the VM is pushing that up to tens of
> thousands.  If Intel intends to pin the entire guest, that's
> potentially tens of millions of tracked entries and I don't know that
> an rb-tree is the right tool for that job.  Thanks,
> 

Based on the above thoughts, I'm wondering whether the below would work
(let's use gpa for what the type1 driver currently calls iova, and iova
for the address actually used in the vGPU driver; assume the 'pin-all'
scenario first, which matches the existing vfio logic):

- No change to the existing vfio_dma structure. VFIO still maintains the
gpa<->vaddr mapping, in coarse-grained regions;

- Leverage the same page accounting/pinning logic in the type1 driver,
which should be enough for 'pin-all' usage;

- Then the main divergence point for vGPU would be in vfio_unmap_unpin
and vfio_iommu_map. I'm not sure whether it's easy to fake an
iommu_domain for vGPU so that the same iommu_map/unmap can be reused.
If not, we may introduce two new map/unmap callbacks provided
specifically by the vGPU core driver, as you suggested:

	* The vGPU core driver uses dma_map_page to map the specified pfns:

		o When the IOMMU is enabled, we'll get back an iova different
from the pfn;
		o When the IOMMU is disabled, the returned iova is the same as
the pfn;

	* Then the vGPU core driver just maintains its own gpa<->iova lookup
table (e.g. called vgpu_dma);

	* Because each vfio_iommu_map invocation is about a contiguous
region, we can expect the same number of vgpu_dma entries as are maintained
for the vfio_dma list.

Then it's the vGPU core driver's responsibility to provide the gpa<->iova
lookup for the vendor-specific GPU driver, and we don't need to worry about
tens of thousands of entries. Once we get this simple 'pin-all' model ready,
it can be further extended to support the 'pin-sparse' scenario: we still
maintain a top-level vgpu_dma list, with each entry further linking to its
own sparse mapping structure. In reality I don't expect we really need to
maintain a per-page translation even with sparse pinning.
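
To make that concrete, a rough sketch of what such a vgpu_dma entry might
look like (names and fields are purely illustrative, not from any patch, and
this assumes one dma mapping per region as described above):

/*
 * Illustrative only.  One entry per vfio_iommu_map invocation, i.e. per
 * contiguous region, kept in a list or tree owned by the vGPU core driver.
 */
struct vgpu_dma {
	struct rb_node	node;	/* indexed by gpa */
	dma_addr_t	gpa;	/* guest physical start address */
	dma_addr_t	iova;	/* whatever dma_map_page returned */
	unsigned long	vaddr;	/* corresponding QEMU virtual address */
	size_t		size;	/* length of the region */
};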

Thanks
Kevin
Alex Williamson May 12, 2016, 7:05 p.m. UTC | #13
On Thu, 12 May 2016 08:00:36 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Thursday, May 12, 2016 6:06 AM
> > 
> > On Wed, 11 May 2016 17:15:15 +0800
> > Jike Song <jike.song@intel.com> wrote:
> >   
> > > On 05/11/2016 12:02 AM, Neo Jia wrote:  
> > > > On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:  
> > > >> On 05/05/2016 05:27 PM, Tian, Kevin wrote:  
> > > >>>> From: Song, Jike
> > > >>>>
> > > >>>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
> > > >>>> hardware. It just, as you said in another mail, "rather than
> > > >>>> programming them into an IOMMU for a device, it simply stores the
> > > >>>> translations for use by later requests".
> > > >>>>
> > > >>>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
> > > >>>> Otherwise, if IOMMU is present, the gfx driver eventually programs
> > > >>>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
> > > >>>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
> > > >>>> translations without any knowledge about hardware IOMMU, how is the
> > > >>>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
> > > >>>> by the IOMMU backend here)?
> > > >>>>
> > > >>>> If things go as guessed above, as vfio_pin_pages() indicates, it
> > > >>>> pin & translate vaddr to PFN, then it will be very difficult for the
> > > >>>> device model to figure out:
> > > >>>>
> > > >>>> 	1, for a given GPA, how to avoid calling dma_map_page multiple times?
> > > >>>> 	2, for which page to call dma_unmap_page?
> > > >>>>
> > > >>>> --  
> > > >>>
> > > >>> We have to support both w/ iommu and w/o iommu case, since
> > > >>> that fact is out of GPU driver control. A simple way is to use
> > > >>> dma_map_page which internally will cope with w/ and w/o iommu
> > > >>> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
> > > >>> Then in this file we only need to cache GPA to whatever dmadr_t
> > > >>> returned by dma_map_page.
> > > >>>  
> > > >>
> > > >> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?  
> > > >
> > > > Hi Jike,
> > > >
> > > > With mediated passthru, you still can use hardware iommu, but more important
> > > > that part is actually orthogonal to what we are discussing here as we will only
> > > > cache the mapping between <gfn (iova if guest has iommu), (qemu) va>, once we
> > > > have pinned pages later with the help of above info, you can map it into the
> > > > proper iommu domain if the system has configured so.
> > > >  
> > >
> > > Hi Neo,
> > >
> > > Technically yes you can map a pfn into the proper IOMMU domain elsewhere,
> > > but to find out whether a pfn was previously mapped or not, you have to
> > > track it with another rbtree-alike data structure (the IOMMU driver simply
> > > doesn't bother with tracking), that seems somehow duplicate with the vGPU
> > > IOMMU backend we are discussing here.
> > >
> > > And it is also semantically correct for an IOMMU backend to handle both w/
> > > and w/o an IOMMU hardware? :)  
> > 
> > A problem with the iommu doing the dma_map_page() though is for what
> > device does it do this?  In the mediated case the vfio infrastructure
> > is dealing with a software representation of a device.  For all we
> > know that software model could transparently migrate from one physical
> > GPU to another.  There may not even be a physical device backing
> > the mediated device.  Those are details left to the vgpu driver itself.  
> 
> This is a fair argument. VFIO iommu driver simply serves user space
> requests, where only vaddr<->iova (essentially gpa in kvm case) is
> mattered. How iova is mapped into real IOMMU is not VFIO's interest.
> 
> > 
> > Perhaps one possibility would be to allow the vgpu driver to register
> > map and unmap callbacks.  The unmap callback might provide the
> > invalidation interface that we're so far missing.  The combination of
> > map and unmap callbacks might simplify the Intel approach of pinning the
> > entire VM memory space, ie. for each map callback do a translation
> > (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
> > the translation.  There's still the problem of where that dma_addr_t
> > from the dma_map_page is stored though.  Someone would need to keep
> > track of iova to dma_addr_t.  The vfio iommu might be a place to do
> > that since we're already tracking information based on iova, possibly
> > in an opaque data element provided by the vgpu driver.  However, we're
> > going to need to take a serious look at whether an rb-tree is the right
> > data structure for the job.  It works well for the current type1
> > functionality where we typically have tens of entries.  I think the
> > NVIDIA model of sparse pinning the VM is pushing that up to tens of
> > thousands.  If Intel intends to pin the entire guest, that's
> > potentially tens of millions of tracked entries and I don't know that
> > an rb-tree is the right tool for that job.  Thanks,
> >   
> 
> Based on above thought I'm thinking whether below would work:
> (let's use gpa to replace existing iova in type1 driver, while using iova
> for the one actually used in vGPU driver. Assume 'pin-all' scenario first
> which matches existing vfio logic)
> 
> - No change to existing vfio_dma structure. VFIO still maintains gpa<->vaddr
> mapping, in coarse-grained regions;
> 
> - Leverage same page accounting/pinning logic in type1 driver, which 
> should be enough for 'pin-all' usage;
> 
> - Then main divergence point for vGPU would be in vfio_unmap_unpin
> and vfio_iommu_map. I'm not sure whether it's easy to fake an 
> iommu_domain for vGPU so same iommu_map/unmap can be reused.

This seems troublesome.  Kirti's version used numerous api-only tests
to avoid these which made the code difficult to trace.  Clearly one
option is to split out the common code so that a new mediated-type1
backend skips this, but they thought they could clean it up without
this, so we'll see what happens in the next version.

> If not, we may introduce two new map/unmap callbacks provided
> specifically by vGPU core driver, as you suggested:
> 
> 	* vGPU core driver uses dma_map_page to map specified pfns:
> 
> 		o When IOMMU is enabled, we'll get an iova returned different
> from pfn;
> 		o When IOMMU is disabled, returned iova is same as pfn;

Either way each iova needs to be stored and we have a worst case of one
iova per page of guest memory.
 
> 	* Then vGPU core driver just maintains its own gpa<->iova lookup
> table (e.g. called vgpu_dma)
> 
> 	* Because each vfio_iommu_map invocation is about a contiguous 
> region, we can expect same number of vgpu_dma entries as maintained 
> for vfio_dma list;
>
> Then it's vGPU core driver's responsibility to provide gpa<->iova
> lookup for vendor specific GPU driver. And we don't need worry about
> tens of thousands of entries. Once we get this simple 'pin-all' model
> ready, then it can be further extended to support 'pin-sparse'
> scenario. We still maintain a top-level vgpu_dma list with each entry to
> further link its own sparse mapping structure. In reality I don't expect
> we really need to maintain per-page translation even with sparse pinning.

If you're trying to equate the scale of what we need to track vs what
type1 currently tracks, they're significantly different.  Possible
things we need to track include the pfn, the iova, and possibly a
reference count or some sort of pinned page map.  In the pin-all model
we can assume that every page is pinned on map and unpinned on unmap,
so a reference count or map is unnecessary.  We can also assume that we
can always regenerate the pfn with get_user_pages() from the vaddr, so
we don't need to track that.  I don't see any way around tracking the
iova.  The iommu can't tell us this like it can with the normal type1
model because the pfn is the result of the translation, not the key for
the translation. So we're always going to have between 1 and
(size/PAGE_SIZE) iova entries per vgpu_dma entry.  You might be able to
manage the vgpu_dma with an rb-tree, but each vgpu_dma entry needs some
data structure tracking every iova.

Sparse mapping has the same issue but of course the tree of iovas is
potentially incomplete and we need a way to determine where it's
incomplete.  A page table rooted in the vgpu_dma and indexed by the
offset from the start vaddr seems like the way to go here.  It's also
possible that some mediated device models might store the iova in the
command sent to the device and therefore be able to parse those entries
back out to unmap them without storing them separately.  This might be
how the s390 channel-io model would prefer to work.  That seems like
further validation that such tracking is going to be dependent on the
mediated driver itself and probably not something to centralize in a
mediated iommu driver.  Thanks,
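
As a sketch only, per-page iova tracking rooted in a vgpu_dma entry could
look like the snippet below (assuming, purely for illustration, a vgpu_dma
with a start vaddr and an iova_table of size >> PAGE_SHIFT slots; a real
sparse implementation would likely want a multi-level, page-table-like
structure rather than one flat array):

/* Sketch only: a dma_addr_t of 0 stands for "not pinned/mapped yet". */
static dma_addr_t vgpu_dma_iova(struct vgpu_dma *dma, unsigned long vaddr)
{
	unsigned long pgoff = (vaddr - dma->vaddr) >> PAGE_SHIFT;

	return dma->iova_table[pgoff];
}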

Alex
Neo Jia May 12, 2016, 7:49 p.m. UTC | #14
On Thu, May 12, 2016 at 12:11:00PM +0800, Jike Song wrote:
> On Thu, May 12, 2016 at 6:06 AM, Alex Williamson
> <alex.williamson@redhat.com> wrote:
> > On Wed, 11 May 2016 17:15:15 +0800
> > Jike Song <jike.song@intel.com> wrote:
> >
> >> On 05/11/2016 12:02 AM, Neo Jia wrote:
> >> > On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:
> >> >> On 05/05/2016 05:27 PM, Tian, Kevin wrote:
> >> >>>> From: Song, Jike
> >> >>>>
> >> >>>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
> >> >>>> hardware. It just, as you said in another mail, "rather than
> >> >>>> programming them into an IOMMU for a device, it simply stores the
> >> >>>> translations for use by later requests".
> >> >>>>
> >> >>>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
> >> >>>> Otherwise, if IOMMU is present, the gfx driver eventually programs
> >> >>>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
> >> >>>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
> >> >>>> translations without any knowledge about hardware IOMMU, how is the
> >> >>>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
> >> >>>> by the IOMMU backend here)?
> >> >>>>
> >> >>>> If things go as guessed above, as vfio_pin_pages() indicates, it
> >> >>>> pin & translate vaddr to PFN, then it will be very difficult for the
> >> >>>> device model to figure out:
> >> >>>>
> >> >>>>  1, for a given GPA, how to avoid calling dma_map_page multiple times?
> >> >>>>  2, for which page to call dma_unmap_page?
> >> >>>>
> >> >>>> --
> >> >>>
> >> >>> We have to support both w/ iommu and w/o iommu case, since
> >> >>> that fact is out of GPU driver control. A simple way is to use
> >> >>> dma_map_page which internally will cope with w/ and w/o iommu
> >> >>> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
> >> >>> Then in this file we only need to cache GPA to whatever dmadr_t
> >> >>> returned by dma_map_page.
> >> >>>
> >> >>
> >> >> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?
> >> >
> >> > Hi Jike,
> >> >
> >> > With mediated passthru, you still can use hardware iommu, but more important
> >> > that part is actually orthogonal to what we are discussing here as we will only
> >> > cache the mapping between <gfn (iova if guest has iommu), (qemu) va>, once we
> >> > have pinned pages later with the help of above info, you can map it into the
> >> > proper iommu domain if the system has configured so.
> >> >
> >>
> >> Hi Neo,
> >>
> >> Technically yes you can map a pfn into the proper IOMMU domain elsewhere,
> >> but to find out whether a pfn was previously mapped or not, you have to
> >> track it with another rbtree-alike data structure (the IOMMU driver simply
> >> doesn't bother with tracking), that seems somehow duplicate with the vGPU
> >> IOMMU backend we are discussing here.
> >>
> >> And it is also semantically correct for an IOMMU backend to handle both w/
> >> and w/o an IOMMU hardware? :)
> >
> > A problem with the iommu doing the dma_map_page() though is for what
> > device does it do this?  In the mediated case the vfio infrastructure
> > is dealing with a software representation of a device.  For all we
> > know that software model could transparently migrate from one physical
> > GPU to another.  There may not even be a physical device backing
> > the mediated device.  Those are details left to the vgpu driver itself.
> >
> 
> Great point :) Yes, I agree it's a bit intrusive to do the mapping for
> a particular
> pdev in an vGPU IOMMU BE.
> 
> > Perhaps one possibility would be to allow the vgpu driver to register
> > map and unmap callbacks.  The unmap callback might provide the
> > invalidation interface that we're so far missing.  The combination of
> > map and unmap callbacks might simplify the Intel approach of pinning the
> > entire VM memory space, ie. for each map callback do a translation
> > (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
> > the translation.
> 
> Yes adding map/unmap ops in pGPU drvier (I assume you are refering to
> gpu_device_ops as
> implemented in Kirti's patch) sounds a good idea, satisfying both: 1)
> keeping vGPU purely
> virtual; 2) dealing with the Linux DMA API to achive hardware IOMMU
> compatibility.
> 
> PS, this has very little to do with pinning wholly or partially. Intel KVMGT has
> once been had the whole guest memory pinned, only because we used a spinlock,
> which can't sleep at runtime.  We have removed that spinlock in our another
> upstreaming effort, not here but for i915 driver, so probably no biggie.
> 

OK, then you guys don't need to pin everything. The next question will be
whether you can send a pinning request from your mediated driver backend to
pin memory, like we have demonstrated in the v3 patch with the functions
vfio_pin_pages and vfio_unpin_pages?

Thanks,
Neo

> 
> > There's still the problem of where that dma_addr_t
> > from the dma_map_page is stored though.  Someone would need to keep
> > track of iova to dma_addr_t.  The vfio iommu might be a place to do
> > that since we're already tracking information based on iova, possibly
> > in an opaque data element provided by the vgpu driver.
> 
> Any reason to keep it opaque? Given that vfio iommu is already tracking
> PFN for iova (vaddr as vGPU is), seems adding dma_addr_t as another field is
> simple. But I don't have a strong opinion here, opaque definitely
> works for me :)
> 
> > However, we're
> > going to need to take a serious look at whether an rb-tree is the right
> > data structure for the job.  It works well for the current type1
> > functionality where we typically have tens of entries.  I think the
> > NVIDIA model of sparse pinning the VM is pushing that up to tens of
> > thousands.  If Intel intends to pin the entire guest, that's
> > potentially tens of millions of tracked entries and I don't know that
> > an rb-tree is the right tool for that job.  Thanks,
> >
> 
> Having the rbtree efficiency considered there is yet another reason for us
> to pin partially. Assuming that partially pinning guaranteed, do you
> think rbtree
> is good enough?
> 
> --
> Thanks,
> Jike
Neo Jia May 12, 2016, 8:12 p.m. UTC | #15
On Thu, May 12, 2016 at 01:05:52PM -0600, Alex Williamson wrote:
> On Thu, 12 May 2016 08:00:36 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Thursday, May 12, 2016 6:06 AM
> > > 
> > > On Wed, 11 May 2016 17:15:15 +0800
> > > Jike Song <jike.song@intel.com> wrote:
> > >   
> > > > On 05/11/2016 12:02 AM, Neo Jia wrote:  
> > > > > On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:  
> > > > >> On 05/05/2016 05:27 PM, Tian, Kevin wrote:  
> > > > >>>> From: Song, Jike
> > > > >>>>
> > > > >>>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
> > > > >>>> hardware. It just, as you said in another mail, "rather than
> > > > >>>> programming them into an IOMMU for a device, it simply stores the
> > > > >>>> translations for use by later requests".
> > > > >>>>
> > > > >>>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
> > > > >>>> Otherwise, if IOMMU is present, the gfx driver eventually programs
> > > > >>>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
> > > > >>>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
> > > > >>>> translations without any knowledge about hardware IOMMU, how is the
> > > > >>>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
> > > > >>>> by the IOMMU backend here)?
> > > > >>>>
> > > > >>>> If things go as guessed above, as vfio_pin_pages() indicates, it
> > > > >>>> pin & translate vaddr to PFN, then it will be very difficult for the
> > > > >>>> device model to figure out:
> > > > >>>>
> > > > >>>> 	1, for a given GPA, how to avoid calling dma_map_page multiple times?
> > > > >>>> 	2, for which page to call dma_unmap_page?
> > > > >>>>
> > > > >>>> --  
> > > > >>>
> > > > >>> We have to support both w/ iommu and w/o iommu case, since
> > > > >>> that fact is out of GPU driver control. A simple way is to use
> > > > >>> dma_map_page which internally will cope with w/ and w/o iommu
> > > > >>> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
> > > > >>> Then in this file we only need to cache GPA to whatever dmadr_t
> > > > >>> returned by dma_map_page.
> > > > >>>  
> > > > >>
> > > > >> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?  
> > > > >
> > > > > Hi Jike,
> > > > >
> > > > > With mediated passthru, you still can use hardware iommu, but more important
> > > > > that part is actually orthogonal to what we are discussing here as we will only
> > > > > cache the mapping between <gfn (iova if guest has iommu), (qemu) va>, once we
> > > > > have pinned pages later with the help of above info, you can map it into the
> > > > > proper iommu domain if the system has configured so.
> > > > >  
> > > >
> > > > Hi Neo,
> > > >
> > > > Technically yes you can map a pfn into the proper IOMMU domain elsewhere,
> > > > but to find out whether a pfn was previously mapped or not, you have to
> > > > track it with another rbtree-alike data structure (the IOMMU driver simply
> > > > doesn't bother with tracking), that seems somehow duplicate with the vGPU
> > > > IOMMU backend we are discussing here.
> > > >
> > > > And it is also semantically correct for an IOMMU backend to handle both w/
> > > > and w/o an IOMMU hardware? :)  
> > > 
> > > A problem with the iommu doing the dma_map_page() though is for what
> > > device does it do this?  In the mediated case the vfio infrastructure
> > > is dealing with a software representation of a device.  For all we
> > > know that software model could transparently migrate from one physical
> > > GPU to another.  There may not even be a physical device backing
> > > the mediated device.  Those are details left to the vgpu driver itself.  
> > 
> > This is a fair argument. VFIO iommu driver simply serves user space
> > requests, where only vaddr<->iova (essentially gpa in kvm case) is
> > mattered. How iova is mapped into real IOMMU is not VFIO's interest.
> > 
> > > 
> > > Perhaps one possibility would be to allow the vgpu driver to register
> > > map and unmap callbacks.  The unmap callback might provide the
> > > invalidation interface that we're so far missing.  The combination of
> > > map and unmap callbacks might simplify the Intel approach of pinning the
> > > entire VM memory space, ie. for each map callback do a translation
> > > (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
> > > the translation.  There's still the problem of where that dma_addr_t
> > > from the dma_map_page is stored though.  Someone would need to keep
> > > track of iova to dma_addr_t.  The vfio iommu might be a place to do
> > > that since we're already tracking information based on iova, possibly
> > > in an opaque data element provided by the vgpu driver.  However, we're
> > > going to need to take a serious look at whether an rb-tree is the right
> > > data structure for the job.  It works well for the current type1
> > > functionality where we typically have tens of entries.  I think the
> > > NVIDIA model of sparse pinning the VM is pushing that up to tens of
> > > thousands.  If Intel intends to pin the entire guest, that's
> > > potentially tens of millions of tracked entries and I don't know that
> > > an rb-tree is the right tool for that job.  Thanks,
> > >   
> > 
> > Based on above thought I'm thinking whether below would work:
> > (let's use gpa to replace existing iova in type1 driver, while using iova
> > for the one actually used in vGPU driver. Assume 'pin-all' scenario first
> > which matches existing vfio logic)
> > 
> > - No change to existing vfio_dma structure. VFIO still maintains gpa<->vaddr
> > mapping, in coarse-grained regions;
> > 
> > - Leverage same page accounting/pinning logic in type1 driver, which 
> > should be enough for 'pin-all' usage;
> > 
> > - Then main divergence point for vGPU would be in vfio_unmap_unpin
> > and vfio_iommu_map. I'm not sure whether it's easy to fake an 
> > iommu_domain for vGPU so same iommu_map/unmap can be reused.
> 
> This seems troublesome.  Kirti's version used numerous api-only tests
> to avoid these which made the code difficult to trace.  Clearly one
> option is to split out the common code so that a new mediated-type1
> backend skips this, but they thought they could clean it up without
> this, so we'll see what happens in the next version.
> 
> > If not, we may introduce two new map/unmap callbacks provided
> > specifically by vGPU core driver, as you suggested:
> > 
> > 	* vGPU core driver uses dma_map_page to map specified pfns:
> > 
> > 		o When IOMMU is enabled, we'll get an iova returned different
> > from pfn;
> > 		o When IOMMU is disabled, returned iova is same as pfn;
> 
> Either way each iova needs to be stored and we have a worst case of one
> iova per page of guest memory.
>  
> > 	* Then vGPU core driver just maintains its own gpa<->iova lookup
> > table (e.g. called vgpu_dma)
> > 
> > 	* Because each vfio_iommu_map invocation is about a contiguous 
> > region, we can expect same number of vgpu_dma entries as maintained 
> > for vfio_dma list;
> >
> > Then it's vGPU core driver's responsibility to provide gpa<->iova
> > lookup for vendor specific GPU driver. And we don't need worry about
> > tens of thousands of entries. Once we get this simple 'pin-all' model
> > ready, then it can be further extended to support 'pin-sparse'
> > scenario. We still maintain a top-level vgpu_dma list with each entry to
> > further link its own sparse mapping structure. In reality I don't expect
> > we really need to maintain per-page translation even with sparse pinning.
> 
> If you're trying to equate the scale of what we need to track vs what
> type1 currently tracks, they're significantly different.  Possible
> things we need to track include the pfn, the iova, and possibly a
> reference count or some sort of pinned page map.  In the pin-all model
> we can assume that every page is pinned on map and unpinned on unmap,
> so a reference count or map is unnecessary.  We can also assume that we
> can always regenerate the pfn with get_user_pages() from the vaddr, so
> we don't need to track that.  

Hi Alex,

Thanks for pointing this out. We will not track those in our next rev, and
get_user_pages will be used from the vaddr as you suggested to handle the
case of a single VM with both a passthru and a mediated device.

Thanks,
Neo

> I don't see any way around tracking the
> iova.  The iommu can't tell us this like it can with the normal type1
> model because the pfn is the result of the translation, not the key for
> the translation. So we're always going to have between 1 and
> (size/PAGE_SIZE) iova entries per vgpu_dma entry.  You might be able to
> manage the vgpu_dma with an rb-tree, but each vgpu_dma entry needs some
> data structure tracking every iova.
> 
> Sparse mapping has the same issue but of course the tree of iovas is
> potentially incomplete and we need a way to determine where it's
> incomplete.  A page table rooted in the vgpu_dma and indexed by the
> offset from the start vaddr seems like the way to go here.  It's also
> possible that some mediated device models might store the iova in the
> command sent to the device and therefore be able to parse those entries
> back out to unmap them without storing them separately.  This might be
> how the s390 channel-io model would prefer to work.  That seems like
> further validation that such tracking is going to be dependent on the
> mediated driver itself and probably not something to centralize in a
> mediated iommu driver.  Thanks,

> 
> Alex
Tian, Kevin May 13, 2016, 2:41 a.m. UTC | #16
> From: Neo Jia [mailto:cjia@nvidia.com]
> Sent: Friday, May 13, 2016 3:49 AM
> 
> >
> > > Perhaps one possibility would be to allow the vgpu driver to register
> > > map and unmap callbacks.  The unmap callback might provide the
> > > invalidation interface that we're so far missing.  The combination of
> > > map and unmap callbacks might simplify the Intel approach of pinning the
> > > entire VM memory space, ie. for each map callback do a translation
> > > (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
> > > the translation.
> >
> > Yes adding map/unmap ops in pGPU drvier (I assume you are refering to
> > gpu_device_ops as
> > implemented in Kirti's patch) sounds a good idea, satisfying both: 1)
> > keeping vGPU purely
> > virtual; 2) dealing with the Linux DMA API to achive hardware IOMMU
> > compatibility.
> >
> > PS, this has very little to do with pinning wholly or partially. Intel KVMGT has
> > once been had the whole guest memory pinned, only because we used a spinlock,
> > which can't sleep at runtime.  We have removed that spinlock in our another
> > upstreaming effort, not here but for i915 driver, so probably no biggie.
> >
> 
> OK, then you guys don't need to pin everything. The next question will be if you
> can send the pinning request from your mediated driver backend to request memory
> pinning like we have demonstrated in the v3 patch, function vfio_pin_pages and
> vfio_unpin_pages?
> 

Jike, can you confirm this statement? My feeling is that we don't have such
logic in our device model to figure out which pages need to be pinned on
demand. So currently pin-everything is the same requirement on both the KVM
and Xen sides...

Thanks
Kevin
Tian, Kevin May 13, 2016, 3:55 a.m. UTC | #17
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Friday, May 13, 2016 3:06 AM
> 
> > >
> >
> > Based on above thought I'm thinking whether below would work:
> > (let's use gpa to replace existing iova in type1 driver, while using iova
> > for the one actually used in vGPU driver. Assume 'pin-all' scenario first
> > which matches existing vfio logic)
> >
> > - No change to existing vfio_dma structure. VFIO still maintains gpa<->vaddr
> > mapping, in coarse-grained regions;
> >
> > - Leverage same page accounting/pinning logic in type1 driver, which
> > should be enough for 'pin-all' usage;
> >
> > - Then main divergence point for vGPU would be in vfio_unmap_unpin
> > and vfio_iommu_map. I'm not sure whether it's easy to fake an
> > iommu_domain for vGPU so same iommu_map/unmap can be reused.
> 
> This seems troublesome.  Kirti's version used numerous api-only tests
> to avoid these which made the code difficult to trace.  Clearly one
> option is to split out the common code so that a new mediated-type1
> backend skips this, but they thought they could clean it up without
> this, so we'll see what happens in the next version.
> 
> > If not, we may introduce two new map/unmap callbacks provided
> > specifically by vGPU core driver, as you suggested:
> >
> > 	* vGPU core driver uses dma_map_page to map specified pfns:
> >
> > 		o When IOMMU is enabled, we'll get an iova returned different
> > from pfn;
> > 		o When IOMMU is disabled, returned iova is same as pfn;
> 
> Either way each iova needs to be stored and we have a worst case of one
> iova per page of guest memory.
> 
> > 	* Then vGPU core driver just maintains its own gpa<->iova lookup
> > table (e.g. called vgpu_dma)
> >
> > 	* Because each vfio_iommu_map invocation is about a contiguous
> > region, we can expect same number of vgpu_dma entries as maintained
> > for vfio_dma list;
> >
> > Then it's vGPU core driver's responsibility to provide gpa<->iova
> > lookup for vendor specific GPU driver. And we don't need worry about
> > tens of thousands of entries. Once we get this simple 'pin-all' model
> > ready, then it can be further extended to support 'pin-sparse'
> > scenario. We still maintain a top-level vgpu_dma list with each entry to
> > further link its own sparse mapping structure. In reality I don't expect
> > we really need to maintain per-page translation even with sparse pinning.
> 
> If you're trying to equate the scale of what we need to track vs what
> type1 currently tracks, they're significantly different.  Possible
> things we need to track include the pfn, the iova, and possibly a
> reference count or some sort of pinned page map.  In the pin-all model
> we can assume that every page is pinned on map and unpinned on unmap,
> so a reference count or map is unnecessary.  We can also assume that we
> can always regenerate the pfn with get_user_pages() from the vaddr, so
> we don't need to track that.  I don't see any way around tracking the
> iova.  The iommu can't tell us this like it can with the normal type1
> model because the pfn is the result of the translation, not the key for
> the translation. So we're always going to have between 1 and
> (size/PAGE_SIZE) iova entries per vgpu_dma entry.  You might be able to
> manage the vgpu_dma with an rb-tree, but each vgpu_dma entry needs some
> data structure tracking every iova.

There is one option. We may use alloc_iova to reserve a contiguous iova
range for each vgpu_dma range and then use iommu_map/unmap to write the
iommu ptes later upon a map request (then it could be the same number of
entries as vfio_dma, compared to an unbounded number of entries when using
dma_map_page). Of course this needs to be done in the vGPU core driver,
since vfio type1 only sees a faked iommu domain.
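
A minimal sketch of that idea, assuming the vGPU core driver owns an iova
domain and a (real) iommu domain, with error handling omitted; names other
than the kernel APIs are illustrative:

#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/dma-mapping.h>

/* Reserve a contiguous iova range for one vgpu_dma region. */
static dma_addr_t vgpu_reserve_iova(struct iova_domain *iovad, size_t size)
{
	/* alloc_iova() works in pages, not bytes */
	struct iova *iova = alloc_iova(iovad, size >> PAGE_SHIFT,
				       DMA_BIT_MASK(32) >> PAGE_SHIFT, true);

	return iova ? (dma_addr_t)iova->pfn_lo << PAGE_SHIFT : 0;
}

/* Later, write a single pte when one page actually gets pinned. */
static int vgpu_map_one(struct iommu_domain *domain, dma_addr_t iova,
			struct page *page)
{
	return iommu_map(domain, iova, page_to_phys(page), PAGE_SIZE,
			 IOMMU_READ | IOMMU_WRITE);
}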

> 
> Sparse mapping has the same issue but of course the tree of iovas is
> potentially incomplete and we need a way to determine where it's
> incomplete.  A page table rooted in the vgpu_dma and indexed by the
> offset from the start vaddr seems like the way to go here.  It's also
> possible that some mediated device models might store the iova in the
> command sent to the device and therefore be able to parse those entries
> back out to unmap them without storing them separately.  This might be
> how the s390 channel-io model would prefer to work.  That seems like
> further validation that such tracking is going to be dependent on the
> mediated driver itself and probably not something to centralize in a
> mediated iommu driver.  Thanks,
> 

Another, simpler way might be to allocate an array for each memory region
registered from user space. For a 512MB region, it means a 512K*4=2MB array
to track the pfn or iova mapping corresponding to each gfn. It may consume
more resources than an rb tree when not many pages need to be pinned, but
could consume less when the rb tree grows a lot.

Is such an array-based approach considered ugly in the kernel? :-)
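
For what it's worth, the allocation side of that idea would be trivial,
e.g. something like the sketch below (names are illustrative; the array is
vmalloc'ed since it can run into megabytes for a large region):

#include <linux/slab.h>
#include <linux/vmalloc.h>

struct vgpu_region_map {
	unsigned long	gfn_base;	/* first gfn of the region */
	unsigned long	npages;
	dma_addr_t	*iova;		/* one slot per gfn, 0 = unpinned */
};

static struct vgpu_region_map *vgpu_region_map_alloc(unsigned long gfn_base,
						     unsigned long npages)
{
	struct vgpu_region_map *map = kzalloc(sizeof(*map), GFP_KERNEL);

	if (!map)
		return NULL;
	map->gfn_base = gfn_base;
	map->npages = npages;
	map->iova = vzalloc(npages * sizeof(dma_addr_t));
	if (!map->iova) {
		kfree(map);
		return NULL;
	}
	return map;
}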

Thanks
Kevin
Jike Song May 13, 2016, 6:08 a.m. UTC | #18
On 05/13/2016 03:49 AM, Neo Jia wrote:
> On Thu, May 12, 2016 at 12:11:00PM +0800, Jike Song wrote:
>> On Thu, May 12, 2016 at 6:06 AM, Alex Williamson
>> <alex.williamson@redhat.com> wrote:
>>> On Wed, 11 May 2016 17:15:15 +0800
>>> Jike Song <jike.song@intel.com> wrote:
>>>
>>>> On 05/11/2016 12:02 AM, Neo Jia wrote:
>>>>> On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:
>>>>>> On 05/05/2016 05:27 PM, Tian, Kevin wrote:
>>>>>>>> From: Song, Jike
>>>>>>>>
>>>>>>>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
>>>>>>>> hardware. It just, as you said in another mail, "rather than
>>>>>>>> programming them into an IOMMU for a device, it simply stores the
>>>>>>>> translations for use by later requests".
>>>>>>>>
>>>>>>>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
>>>>>>>> Otherwise, if IOMMU is present, the gfx driver eventually programs
>>>>>>>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
>>>>>>>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
>>>>>>>> translations without any knowledge about hardware IOMMU, how is the
>>>>>>>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
>>>>>>>> by the IOMMU backend here)?
>>>>>>>>
>>>>>>>> If things go as guessed above, as vfio_pin_pages() indicates, it
>>>>>>>> pin & translate vaddr to PFN, then it will be very difficult for the
>>>>>>>> device model to figure out:
>>>>>>>>
>>>>>>>>  1, for a given GPA, how to avoid calling dma_map_page multiple times?
>>>>>>>>  2, for which page to call dma_unmap_page?
>>>>>>>>
>>>>>>>> --
>>>>>>>
>>>>>>> We have to support both w/ iommu and w/o iommu case, since
>>>>>>> that fact is out of GPU driver control. A simple way is to use
>>>>>>> dma_map_page which internally will cope with w/ and w/o iommu
>>>>>>> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
>>>>>>> Then in this file we only need to cache GPA to whatever dmadr_t
>>>>>>> returned by dma_map_page.
>>>>>>>
>>>>>>
>>>>>> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?
>>>>>
>>>>> Hi Jike,
>>>>>
>>>>> With mediated passthru, you still can use hardware iommu, but more important
>>>>> that part is actually orthogonal to what we are discussing here as we will only
>>>>> cache the mapping between <gfn (iova if guest has iommu), (qemu) va>, once we
>>>>> have pinned pages later with the help of above info, you can map it into the
>>>>> proper iommu domain if the system has configured so.
>>>>>
>>>>
>>>> Hi Neo,
>>>>
>>>> Technically yes you can map a pfn into the proper IOMMU domain elsewhere,
>>>> but to find out whether a pfn was previously mapped or not, you have to
>>>> track it with another rbtree-alike data structure (the IOMMU driver simply
>>>> doesn't bother with tracking), that seems somehow duplicate with the vGPU
>>>> IOMMU backend we are discussing here.
>>>>
>>>> And it is also semantically correct for an IOMMU backend to handle both w/
>>>> and w/o an IOMMU hardware? :)
>>>
>>> A problem with the iommu doing the dma_map_page() though is for what
>>> device does it do this?  In the mediated case the vfio infrastructure
>>> is dealing with a software representation of a device.  For all we
>>> know that software model could transparently migrate from one physical
>>> GPU to another.  There may not even be a physical device backing
>>> the mediated device.  Those are details left to the vgpu driver itself.
>>>
>>
>> Great point :) Yes, I agree it's a bit intrusive to do the mapping for
>> a particular
>> pdev in an vGPU IOMMU BE.
>>
>>> Perhaps one possibility would be to allow the vgpu driver to register
>>> map and unmap callbacks.  The unmap callback might provide the
>>> invalidation interface that we're so far missing.  The combination of
>>> map and unmap callbacks might simplify the Intel approach of pinning the
>>> entire VM memory space, ie. for each map callback do a translation
>>> (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
>>> the translation.
>>
>> Yes adding map/unmap ops in pGPU drvier (I assume you are refering to
>> gpu_device_ops as
>> implemented in Kirti's patch) sounds a good idea, satisfying both: 1)
>> keeping vGPU purely
>> virtual; 2) dealing with the Linux DMA API to achive hardware IOMMU
>> compatibility.
>>
>> PS, this has very little to do with pinning wholly or partially. Intel KVMGT has
>> once been had the whole guest memory pinned, only because we used a spinlock,
>> which can't sleep at runtime.  We have removed that spinlock in our another
>> upstreaming effort, not here but for i915 driver, so probably no biggie.
>>
> 
> OK, then you guys don't need to pin everything.

Yes :)

> The next question will be if you
> can send the pinning request from your mediated driver backend to request memory
> pinning like we have demonstrated in the v3 patch, function vfio_pin_pages and
> vfio_unpin_pages?

Kind of yes, but not exactly.

IMO the mediated driver backend cares not only about pinning but also, more
importantly, about translation. The vfio_pin_pages of the v3 patch does the
pinning and translation simultaneously, whereas I do think the API is better
named 'translate' instead of 'pin', like v2 did.

We possibly have the same requirement from the mediated driver backend:

	a) get a GFN, when the guest tries to tell the hardware;
	b) consult the vfio iommu with that GFN[1]: will you find me a proper dma_addr?

The vfio iommu backend searches its tracking table with this GFN[1]:

	c) if an entry is found, return the dma_addr;
	d) if nothing is found, call GUP to pin the page and dma_map_page to get the dma_addr[2], then return it;

The dma_addr will then be programmed into the real GPU hardware.

I can't simply say 'yes' here, since we may consult the dma_addr for a GFN
multiple times, but only the first time do we need to pin the page.

IOW, pinning is kind of an internal action in the iommu backend (see the
sketch below).
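
Roughly, the a)-d) flow above in C-like pseudocode (illustrative only: the
tracking helpers, the vfio_vgpu_domain type and the choice of which struct
device to map against are all assumptions, and the last one is exactly the
open question Alex raised):

#include <linux/mm.h>
#include <linux/dma-mapping.h>

dma_addr_t vgpu_translate(struct vfio_vgpu_domain *d, unsigned long vaddr)
{
	dma_addr_t dma_addr;
	struct page *page;

	/* c) hit: this page was pinned and mapped before */
	dma_addr = vgpu_tracking_lookup(d, vaddr);
	if (dma_addr)
		return dma_addr;

	/* d) miss: pin the page first ... */
	if (get_user_pages_fast(vaddr, 1, 1, &page) != 1)
		return 0;

	/* ... then map it for DMA and remember the result */
	dma_addr = dma_map_page(d->dev, page, 0, PAGE_SIZE,
				DMA_BIDIRECTIONAL);
	vgpu_tracking_insert(d, vaddr, dma_addr);
	return dma_addr;
}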


//Sorry for the long, maybe boring explanation.. :)


[1] GFN or vaddr, no biggie
[2] As pointed out by Alex, dma_map_page can be called elsewhere like a callback.


--
Thanks,
Jike
Jike Song May 13, 2016, 6:22 a.m. UTC | #19
On 05/13/2016 10:41 AM, Tian, Kevin wrote:
>> From: Neo Jia [mailto:cjia@nvidia.com]
>> Sent: Friday, May 13, 2016 3:49 AM
>>
>>>
>>>> Perhaps one possibility would be to allow the vgpu driver to register
>>>> map and unmap callbacks.  The unmap callback might provide the
>>>> invalidation interface that we're so far missing.  The combination of
>>>> map and unmap callbacks might simplify the Intel approach of pinning the
>>>> entire VM memory space, ie. for each map callback do a translation
>>>> (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
>>>> the translation.
>>>
>>> Yes adding map/unmap ops in pGPU drvier (I assume you are refering to
>>> gpu_device_ops as
>>> implemented in Kirti's patch) sounds a good idea, satisfying both: 1)
>>> keeping vGPU purely
>>> virtual; 2) dealing with the Linux DMA API to achive hardware IOMMU
>>> compatibility.
>>>
>>> PS, this has very little to do with pinning wholly or partially. Intel KVMGT has
>>> once been had the whole guest memory pinned, only because we used a spinlock,
>>> which can't sleep at runtime.  We have removed that spinlock in our another
>>> upstreaming effort, not here but for i915 driver, so probably no biggie.
>>>
>>
>> OK, then you guys don't need to pin everything. The next question will be if you
>> can send the pinning request from your mediated driver backend to request memory
>> pinning like we have demonstrated in the v3 patch, function vfio_pin_pages and
>> vfio_unpin_pages?
>>
> 
> Jike can you confirm this statement? My feeling is that we don't have such logic
> in our device model to figure out which pages need to be pinned on demand. So
> currently pin-everything is same requirement in both KVM and Xen side...

[Correct me in case of any neglect :)]

IMO the ultimate reason to pin a page is DMA. Accessing RAM from a GPU is
certainly a DMA operation, and the DMA facility of most platforms, IGD and
NVIDIA GPUs included, is not capable of faulting, handling and retrying.

As for vGPU solutions like the ones Nvidia and Intel provide: whenever the
Guest sets up mappings for the memory regions it uses for GPU access, that is
intercepted by the Host, so it's safe to pin a page only just before it gets
used by the Guest. This probably doesn't require the device model to change :)


--
Thanks,
Jike

Neo Jia May 13, 2016, 6:41 a.m. UTC | #20
On Fri, May 13, 2016 at 02:08:36PM +0800, Jike Song wrote:
> On 05/13/2016 03:49 AM, Neo Jia wrote:
> > On Thu, May 12, 2016 at 12:11:00PM +0800, Jike Song wrote:
> >> On Thu, May 12, 2016 at 6:06 AM, Alex Williamson
> >> <alex.williamson@redhat.com> wrote:
> >>> On Wed, 11 May 2016 17:15:15 +0800
> >>> Jike Song <jike.song@intel.com> wrote:
> >>>
> >>>> On 05/11/2016 12:02 AM, Neo Jia wrote:
> >>>>> On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:
> >>>>>> On 05/05/2016 05:27 PM, Tian, Kevin wrote:
> >>>>>>>> From: Song, Jike
> >>>>>>>>
> >>>>>>>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
> >>>>>>>> hardware. It just, as you said in another mail, "rather than
> >>>>>>>> programming them into an IOMMU for a device, it simply stores the
> >>>>>>>> translations for use by later requests".
> >>>>>>>>
> >>>>>>>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
> >>>>>>>> Otherwise, if IOMMU is present, the gfx driver eventually programs
> >>>>>>>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
> >>>>>>>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
> >>>>>>>> translations without any knowledge about hardware IOMMU, how is the
> >>>>>>>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
> >>>>>>>> by the IOMMU backend here)?
> >>>>>>>>
> >>>>>>>> If things go as guessed above, as vfio_pin_pages() indicates, it
> >>>>>>>> pin & translate vaddr to PFN, then it will be very difficult for the
> >>>>>>>> device model to figure out:
> >>>>>>>>
> >>>>>>>>  1, for a given GPA, how to avoid calling dma_map_page multiple times?
> >>>>>>>>  2, for which page to call dma_unmap_page?
> >>>>>>>>
> >>>>>>>> --
> >>>>>>>
> >>>>>>> We have to support both w/ iommu and w/o iommu case, since
> >>>>>>> that fact is out of GPU driver control. A simple way is to use
> >>>>>>> dma_map_page which internally will cope with w/ and w/o iommu
> >>>>>>> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
> >>>>>>> Then in this file we only need to cache GPA to whatever dmadr_t
> >>>>>>> returned by dma_map_page.
> >>>>>>>
> >>>>>>
> >>>>>> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?
> >>>>>
> >>>>> Hi Jike,
> >>>>>
> >>>>> With mediated passthru, you still can use hardware iommu, but more important
> >>>>> that part is actually orthogonal to what we are discussing here as we will only
> >>>>> cache the mapping between <gfn (iova if guest has iommu), (qemu) va>, once we
> >>>>> have pinned pages later with the help of above info, you can map it into the
> >>>>> proper iommu domain if the system has configured so.
> >>>>>
> >>>>
> >>>> Hi Neo,
> >>>>
> >>>> Technically yes you can map a pfn into the proper IOMMU domain elsewhere,
> >>>> but to find out whether a pfn was previously mapped or not, you have to
> >>>> track it with another rbtree-alike data structure (the IOMMU driver simply
> >>>> doesn't bother with tracking), that seems somehow duplicate with the vGPU
> >>>> IOMMU backend we are discussing here.
> >>>>
> >>>> And it is also semantically correct for an IOMMU backend to handle both w/
> >>>> and w/o an IOMMU hardware? :)
> >>>
> >>> A problem with the iommu doing the dma_map_page() though is for what
> >>> device does it do this?  In the mediated case the vfio infrastructure
> >>> is dealing with a software representation of a device.  For all we
> >>> know that software model could transparently migrate from one physical
> >>> GPU to another.  There may not even be a physical device backing
> >>> the mediated device.  Those are details left to the vgpu driver itself.
> >>>
> >>
> >> Great point :) Yes, I agree it's a bit intrusive to do the mapping for
> >> a particular
> >> pdev in an vGPU IOMMU BE.
> >>
> >>> Perhaps one possibility would be to allow the vgpu driver to register
> >>> map and unmap callbacks.  The unmap callback might provide the
> >>> invalidation interface that we're so far missing.  The combination of
> >>> map and unmap callbacks might simplify the Intel approach of pinning the
> >>> entire VM memory space, ie. for each map callback do a translation
> >>> (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
> >>> the translation.
> >>
> >> Yes adding map/unmap ops in pGPU drvier (I assume you are refering to
> >> gpu_device_ops as
> >> implemented in Kirti's patch) sounds a good idea, satisfying both: 1)
> >> keeping vGPU purely
> >> virtual; 2) dealing with the Linux DMA API to achive hardware IOMMU
> >> compatibility.
> >>
> >> PS, this has very little to do with pinning wholly or partially. Intel KVMGT has
> >> once been had the whole guest memory pinned, only because we used a spinlock,
> >> which can't sleep at runtime.  We have removed that spinlock in our another
> >> upstreaming effort, not here but for i915 driver, so probably no biggie.
> >>
> > 
> > OK, then you guys don't need to pin everything.
> 
> Yes :)
> 
> > The next question will be if you
> > can send the pinning request from your mediated driver backend to request memory
> > pinning like we have demonstrated in the v3 patch, function vfio_pin_pages and
> > vfio_unpin_pages?
> 
> Kind of yes, not exactly.
> 
> IMO the mediated driver backend cares not only about pinning, but also the more
> important translation. The vfio_pin_pages of v3 patch does the pinning and
> translation simultaneously, whereas I do think the API is better named to
> 'translate' instead of 'pin', like v2 did.

Hi Jike,

Let me explain here.

The "pin and translation" has to be done all together and the pinning here
doesn't mean installing anything into the real IOMMU hardware.

The pinning locks down the underlying pages for a given QEMU VA, which
corresponds to a guest physical address. Why do we have to do that? If we
don't, the underlying physical pages can be moved and DMA will not work
properly; this is exactly why the default iommu type1 driver uses
get_user_pages to *pin* memory. The translation part is easy to understand,
I think.

If you want to read more, you can check the latest email from Alex about a
recent regression introduced by THP, where the underlying page for a QEMU VA
was moved by THP, so DMA was broken.

https://lkml.org/lkml/2016/4/28/604

Once you have the pfn, the vendor driver can decide what to do next.
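
To put it in pseudo-C, the pin & translate step boils down to roughly the
following (illustrative sketch only, not the v3 patch code):

#include <linux/mm.h>
#include <linux/iommu.h>

/*
 * Sketch: pin the page backing a QEMU vaddr (i.e. a guest physical page)
 * so it cannot be moved, and return the host pfn (the "translation").
 */
static int pin_and_translate(unsigned long vaddr, int prot,
                             unsigned long *pfn)
{
    struct page *page;
    int ret;

    ret = get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), &page);
    if (ret != 1)
        return ret < 0 ? ret : -EFAULT;

    *pfn = page_to_pfn(page);
    return 0;   /* a later unpin does put_page(page) */
}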

> 
> We possibly have the same requirement from the mediate driver backend:
> 
> 	a) get a GFN, when guest try to tell hardware;
> 	b) consult the vfio iommu with that GFN[1]: will you find me a proper dma_addr?

We will provide you the pfn via vfio_pin_pages, so you can map it for DMA
purposes in your i915 driver, which is what we are doing today.
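
As a rough illustration of that flow (not actual i915 code; the
vfio_pin_pages()/vfio_unpin_pages() prototypes below are simplified
one-page stand-ins for whatever the v3 patch exports):

#include <linux/mm.h>
#include <linux/iommu.h>
#include <linux/dma-mapping.h>

static int map_guest_page(struct device *gpu, unsigned long gfn,
                          dma_addr_t *dma)
{
    unsigned long pfn;

    /* ask the vfio iommu backend to pin & translate gfn -> host pfn */
    if (vfio_pin_pages(&gfn, 1, IOMMU_READ | IOMMU_WRITE, &pfn) <= 0)
        return -EFAULT;

    /*
     * Map the page for the physical GPU: with a hardware iommu this
     * returns an IOVA, without one it is effectively the HPA.
     */
    *dma = dma_map_page(gpu, pfn_to_page(pfn), 0, PAGE_SIZE,
                        DMA_BIDIRECTIONAL);
    if (dma_mapping_error(gpu, *dma)) {
        vfio_unpin_pages(&pfn, 1);
        return -EIO;
    }
    return 0;
}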

> 
> The vfio iommu backend search the tracking table with this GFN[1]:
> 
> 	c) if entry found, return the dma_addr;

> 	d) if nothing found, call GUP to pin the page, and dma_map_page to get the dma_addr[2], return it;
> 
> The dma_addr will be told to real GPU hardware.
> 
> I can't simply say a 'Yes' here, since we may consult dma_addr for a GFN
> multiple times, but only for the first time we need to pin the page.

It is very important to keep consistency from the kernel's point of view and
also not to trust the device driver. For example, it is always good to assume
the device is going to reference a page whenever the driver asks for that
information, and therefore to keep the reference counter going when it asks
for it.

So it is the caller's responsibility to know what they are doing when calling
vfio_pin_pages; the same actually applies to get_user_pages.

Thanks,
Neo

> 
> IOW, pinning is kind of an internal action in the iommu backend.
> 
> 
> //Sorry for the long, maybe boring explanation.. :)
> 
> 
> [1] GFN or vaddr, no biggie
> [2] As pointed out by Alex, dma_map_page can be called elsewhere like a callback.
> 
> 
> --
> Thanks,
> Jike
Neo Jia May 13, 2016, 6:43 a.m. UTC | #21
On Fri, May 13, 2016 at 02:22:37PM +0800, Jike Song wrote:
> On 05/13/2016 10:41 AM, Tian, Kevin wrote:
> >> From: Neo Jia [mailto:cjia@nvidia.com]
> >> Sent: Friday, May 13, 2016 3:49 AM
> >>
> >>>
> >>>> Perhaps one possibility would be to allow the vgpu driver to register
> >>>> map and unmap callbacks.  The unmap callback might provide the
> >>>> invalidation interface that we're so far missing.  The combination of
> >>>> map and unmap callbacks might simplify the Intel approach of pinning the
> >>>> entire VM memory space, ie. for each map callback do a translation
> >>>> (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
> >>>> the translation.
> >>>
> >>> Yes adding map/unmap ops in pGPU drvier (I assume you are refering to
> >>> gpu_device_ops as
> >>> implemented in Kirti's patch) sounds a good idea, satisfying both: 1)
> >>> keeping vGPU purely
> >>> virtual; 2) dealing with the Linux DMA API to achive hardware IOMMU
> >>> compatibility.
> >>>
> >>> PS, this has very little to do with pinning wholly or partially. Intel KVMGT has
> >>> once been had the whole guest memory pinned, only because we used a spinlock,
> >>> which can't sleep at runtime.  We have removed that spinlock in our another
> >>> upstreaming effort, not here but for i915 driver, so probably no biggie.
> >>>
> >>
> >> OK, then you guys don't need to pin everything. The next question will be if you
> >> can send the pinning request from your mediated driver backend to request memory
> >> pinning like we have demonstrated in the v3 patch, function vfio_pin_pages and
> >> vfio_unpin_pages?
> >>
> > 
> > Jike can you confirm this statement? My feeling is that we don't have such logic
> > in our device model to figure out which pages need to be pinned on demand. So
> > currently pin-everything is same requirement in both KVM and Xen side...
> 
> [Correct me in case of any neglect:)]
> 
> IMO the ultimate reason to pin a page, is for DMA. Accessing RAM from a GPU is
> certainly a DMA operation. The DMA facility of most platforms, IGD and NVIDIA
> GPU included, is not capable of faulting-handling-retrying.
> 
> As for vGPU solutions like Nvidia and Intel provide, the memory address region
> used by Guest for GPU access, whenever Guest sets the mappings, it is
> intercepted by Host, so it's safe to only pin the page before it get used by
> Guest. This probably doesn't need device model to change :)

Hi Jike,

Just out of curiosity, how does the host intercept this before it goes on the
bus?

Thanks,
Neo

> 
> 
> --
> Thanks,
> Jike
> 
Dong Jia Shi May 13, 2016, 7:10 a.m. UTC | #22
On Thu, 12 May 2016 13:05:52 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Thu, 12 May 2016 08:00:36 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Thursday, May 12, 2016 6:06 AM
> > > 
> > > On Wed, 11 May 2016 17:15:15 +0800
> > > Jike Song <jike.song@intel.com> wrote:
> > >   
> > > > On 05/11/2016 12:02 AM, Neo Jia wrote:  
> > > > > On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:  
> > > > >> On 05/05/2016 05:27 PM, Tian, Kevin wrote:  
> > > > >>>> From: Song, Jike
> > > > >>>>
> > > > >>>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
> > > > >>>> hardware. It just, as you said in another mail, "rather than
> > > > >>>> programming them into an IOMMU for a device, it simply stores the
> > > > >>>> translations for use by later requests".
> > > > >>>>
> > > > >>>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
> > > > >>>> Otherwise, if IOMMU is present, the gfx driver eventually programs
> > > > >>>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
> > > > >>>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
> > > > >>>> translations without any knowledge about hardware IOMMU, how is the
> > > > >>>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
> > > > >>>> by the IOMMU backend here)?
> > > > >>>>
> > > > >>>> If things go as guessed above, as vfio_pin_pages() indicates, it
> > > > >>>> pin & translate vaddr to PFN, then it will be very difficult for the
> > > > >>>> device model to figure out:
> > > > >>>>
> > > > >>>> 	1, for a given GPA, how to avoid calling dma_map_page multiple times?
> > > > >>>> 	2, for which page to call dma_unmap_page?
> > > > >>>>
> > > > >>>> --  
> > > > >>>
> > > > >>> We have to support both w/ iommu and w/o iommu case, since
> > > > >>> that fact is out of GPU driver control. A simple way is to use
> > > > >>> dma_map_page which internally will cope with w/ and w/o iommu
> > > > >>> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
> > > > >>> Then in this file we only need to cache GPA to whatever dmadr_t
> > > > >>> returned by dma_map_page.
> > > > >>>  
> > > > >>
> > > > >> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?  
> > > > >
> > > > > Hi Jike,
> > > > >
> > > > > With mediated passthru, you still can use hardware iommu, but more important
> > > > > that part is actually orthogonal to what we are discussing here as we will only
> > > > > cache the mapping between <gfn (iova if guest has iommu), (qemu) va>, once we
> > > > > have pinned pages later with the help of above info, you can map it into the
> > > > > proper iommu domain if the system has configured so.
> > > > >  
> > > >
> > > > Hi Neo,
> > > >
> > > > Technically yes you can map a pfn into the proper IOMMU domain elsewhere,
> > > > but to find out whether a pfn was previously mapped or not, you have to
> > > > track it with another rbtree-alike data structure (the IOMMU driver simply
> > > > doesn't bother with tracking), that seems somehow duplicate with the vGPU
> > > > IOMMU backend we are discussing here.
> > > >
> > > > And it is also semantically correct for an IOMMU backend to handle both w/
> > > > and w/o an IOMMU hardware? :)  
> > > 
> > > A problem with the iommu doing the dma_map_page() though is for what
> > > device does it do this?  In the mediated case the vfio infrastructure
> > > is dealing with a software representation of a device.  For all we
> > > know that software model could transparently migrate from one physical
> > > GPU to another.  There may not even be a physical device backing
> > > the mediated device.  Those are details left to the vgpu driver itself.  
> > 
> > This is a fair argument. VFIO iommu driver simply serves user space
> > requests, where only vaddr<->iova (essentially gpa in kvm case) is
> > mattered. How iova is mapped into real IOMMU is not VFIO's interest.
> > 
> > > 
> > > Perhaps one possibility would be to allow the vgpu driver to register
> > > map and unmap callbacks.  The unmap callback might provide the
> > > invalidation interface that we're so far missing.  The combination of
> > > map and unmap callbacks might simplify the Intel approach of pinning the
> > > entire VM memory space, ie. for each map callback do a translation
> > > (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
> > > the translation.  There's still the problem of where that dma_addr_t
> > > from the dma_map_page is stored though.  Someone would need to keep
> > > track of iova to dma_addr_t.  The vfio iommu might be a place to do
> > > that since we're already tracking information based on iova, possibly
> > > in an opaque data element provided by the vgpu driver.  However, we're
> > > going to need to take a serious look at whether an rb-tree is the right
> > > data structure for the job.  It works well for the current type1
> > > functionality where we typically have tens of entries.  I think the
> > > NVIDIA model of sparse pinning the VM is pushing that up to tens of
> > > thousands.  If Intel intends to pin the entire guest, that's
> > > potentially tens of millions of tracked entries and I don't know that
> > > an rb-tree is the right tool for that job.  Thanks,
> > >   
> > 
> > Based on above thought I'm thinking whether below would work:
> > (let's use gpa to replace existing iova in type1 driver, while using iova
> > for the one actually used in vGPU driver. Assume 'pin-all' scenario first
> > which matches existing vfio logic)
> > 
> > - No change to existing vfio_dma structure. VFIO still maintains gpa<->vaddr
> > mapping, in coarse-grained regions;
> > 
> > - Leverage same page accounting/pinning logic in type1 driver, which 
> > should be enough for 'pin-all' usage;
> > 
> > - Then main divergence point for vGPU would be in vfio_unmap_unpin
> > and vfio_iommu_map. I'm not sure whether it's easy to fake an 
> > iommu_domain for vGPU so same iommu_map/unmap can be reused.
> 
> This seems troublesome.  Kirti's version used numerous api-only tests
> to avoid these which made the code difficult to trace.  Clearly one
> option is to split out the common code so that a new mediated-type1
> backend skips this, but they thought they could clean it up without
> this, so we'll see what happens in the next version.
> 
> > If not, we may introduce two new map/unmap callbacks provided
> > specifically by vGPU core driver, as you suggested:
> > 
> > 	* vGPU core driver uses dma_map_page to map specified pfns:
> > 
> > 		o When IOMMU is enabled, we'll get an iova returned different
> > from pfn;
> > 		o When IOMMU is disabled, returned iova is same as pfn;
> 
> Either way each iova needs to be stored and we have a worst case of one
> iova per page of guest memory.
> 
> > 	* Then vGPU core driver just maintains its own gpa<->iova lookup
> > table (e.g. called vgpu_dma)
> > 
> > 	* Because each vfio_iommu_map invocation is about a contiguous 
> > region, we can expect same number of vgpu_dma entries as maintained 
> > for vfio_dma list;
> >
> > Then it's vGPU core driver's responsibility to provide gpa<->iova
> > lookup for vendor specific GPU driver. And we don't need worry about
> > tens of thousands of entries. Once we get this simple 'pin-all' model
> > ready, then it can be further extended to support 'pin-sparse'
> > scenario. We still maintain a top-level vgpu_dma list with each entry to
> > further link its own sparse mapping structure. In reality I don't expect
> > we really need to maintain per-page translation even with sparse pinning.
> 
> If you're trying to equate the scale of what we need to track vs what
> type1 currently tracks, they're significantly different.  Possible
> things we need to track include the pfn, the iova, and possibly a
> reference count or some sort of pinned page map.  In the pin-all model
> we can assume that every page is pinned on map and unpinned on unmap,
> so a reference count or map is unnecessary.  We can also assume that we
> can always regenerate the pfn with get_user_pages() from the vaddr, so
> we don't need to track that.  I don't see any way around tracking the
> iova.  The iommu can't tell us this like it can with the normal type1
> model because the pfn is the result of the translation, not the key for
> the translation. So we're always going to have between 1 and
> (size/PAGE_SIZE) iova entries per vgpu_dma entry.  You might be able to
> manage the vgpu_dma with an rb-tree, but each vgpu_dma entry needs some
> data structure tracking every iova.
> 
> Sparse mapping has the same issue but of course the tree of iovas is
> potentially incomplete and we need a way to determine where it's
> incomplete.  A page table rooted in the vgpu_dma and indexed by the
> offset from the start vaddr seems like the way to go here.  It's also
> possible that some mediated device models might store the iova in the
> command sent to the device and therefore be able to parse those entries
> back out to unmap them without storing them separately.  This might be
> how the s390 channel-io model would prefer to work.
Dear Alex:

For the s390 channel-io model, when an I/O instruction is intercepted
and issued to the device driver for further translation, the operand of
the instruction contains iovas only. Since the iova is the key to locate an
entry in the database (r-b tree or whatever), we can indeed parse the
entries back out one by one when doing the unmap operation.
                 ^^^^^^^^^^

BTW, if the mediated-iommu backend could potentially offer transaction-level
support for the unmap operation, I believe it would benefit the performance
of this case.

e.g.:
handler = vfio_transaction_begin();
foreach(iova in the command) {
    pfn = vfio_transaction_map(handler, iova);
    do_io(pfn);
}

/*
 * Expect to unmap all of the pfns mapped in this transaction with the
 * next statement. The mediated-iommu backend could use handler as the
 * key to track the list of the entries.
 */
vfio_transaction_unmap(handler);
vfio_transaction_end(handler);

Not sure if this could benefit the vgpu sparse mapping use case though.
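
Just to make the idea a bit more concrete, the handle could be little more
than a list of what was pinned during the transaction (purely hypothetical
sketch; backend_pin_and_translate()/backend_unpin() are made-up placeholders):

#include <linux/list.h>
#include <linux/slab.h>

struct vfio_txn_entry {
    struct list_head next;
    unsigned long iova;
    unsigned long pfn;
};

struct vfio_transaction {
    /* vfio_transaction_begin() would INIT_LIST_HEAD(&txn->pinned) */
    struct list_head pinned;
};

static unsigned long vfio_transaction_map(struct vfio_transaction *txn,
                                          unsigned long iova)
{
    struct vfio_txn_entry *e = kzalloc(sizeof(*e), GFP_KERNEL);

    if (!e)
        return 0;
    e->iova = iova;
    e->pfn = backend_pin_and_translate(iova);   /* placeholder helper */
    list_add(&e->next, &txn->pinned);
    return e->pfn;
}

static void vfio_transaction_unmap(struct vfio_transaction *txn)
{
    struct vfio_txn_entry *e, *tmp;

    /* one pass over the handle unpins everything mapped in it */
    list_for_each_entry_safe(e, tmp, &txn->pinned, next) {
        backend_unpin(e->pfn);                  /* placeholder helper */
        list_del(&e->next);
        kfree(e);
    }
}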

>  That seems like
> further validation that such tracking is going to be dependent on the
> mediated driver itself and probably not something to centralize in a
> mediated iommu driver.  Thanks,
> 
> Alex
> 



--------
Dong Jia

Tian, Kevin May 13, 2016, 7:13 a.m. UTC | #23
> From: Neo Jia [mailto:cjia@nvidia.com]
> Sent: Friday, May 13, 2016 2:42 PM
> 
> 
> >
> > We possibly have the same requirement from the mediate driver backend:
> >
> > 	a) get a GFN, when guest try to tell hardware;
> > 	b) consult the vfio iommu with that GFN[1]: will you find me a proper dma_addr?
> 
> We will provide you the pfn via vfio_pin_pages, so you can map it for dma
> purpose in your i915 driver, which is what we are doing today.
> 

Can such 'map' operation be consolidated in vGPU core driver? I don't think 
Intel vGPU driver has any feature proactively relying on iommu. The reason 
why we keep talking iommu is just because the kernel may enable iommu 
for physical GPU so we need make sure our device model can work in such
configuration. And this requirement should apply to all vendors, not Intel
specific (like you said you are doing it already today).

Thanks
Kevin
Neo Jia May 13, 2016, 7:24 a.m. UTC | #24
On Fri, May 13, 2016 at 03:10:22PM +0800, Dong Jia wrote:
> On Thu, 12 May 2016 13:05:52 -0600
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> > On Thu, 12 May 2016 08:00:36 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > 
> > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > Sent: Thursday, May 12, 2016 6:06 AM
> > > > 
> > > > On Wed, 11 May 2016 17:15:15 +0800
> > > > Jike Song <jike.song@intel.com> wrote:
> > > >   
> > > > > On 05/11/2016 12:02 AM, Neo Jia wrote:  
> > > > > > On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:  
> > > > > >> On 05/05/2016 05:27 PM, Tian, Kevin wrote:  
> > > > > >>>> From: Song, Jike
> > > > > >>>>
> > > > > >>>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
> > > > > >>>> hardware. It just, as you said in another mail, "rather than
> > > > > >>>> programming them into an IOMMU for a device, it simply stores the
> > > > > >>>> translations for use by later requests".
> > > > > >>>>
> > > > > >>>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
> > > > > >>>> Otherwise, if IOMMU is present, the gfx driver eventually programs
> > > > > >>>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
> > > > > >>>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
> > > > > >>>> translations without any knowledge about hardware IOMMU, how is the
> > > > > >>>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
> > > > > >>>> by the IOMMU backend here)?
> > > > > >>>>
> > > > > >>>> If things go as guessed above, as vfio_pin_pages() indicates, it
> > > > > >>>> pin & translate vaddr to PFN, then it will be very difficult for the
> > > > > >>>> device model to figure out:
> > > > > >>>>
> > > > > >>>> 	1, for a given GPA, how to avoid calling dma_map_page multiple times?
> > > > > >>>> 	2, for which page to call dma_unmap_page?
> > > > > >>>>
> > > > > >>>> --  
> > > > > >>>
> > > > > >>> We have to support both w/ iommu and w/o iommu case, since
> > > > > >>> that fact is out of GPU driver control. A simple way is to use
> > > > > >>> dma_map_page which internally will cope with w/ and w/o iommu
> > > > > >>> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
> > > > > >>> Then in this file we only need to cache GPA to whatever dmadr_t
> > > > > >>> returned by dma_map_page.
> > > > > >>>  
> > > > > >>
> > > > > >> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?  
> > > > > >
> > > > > > Hi Jike,
> > > > > >
> > > > > > With mediated passthru, you still can use hardware iommu, but more important
> > > > > > that part is actually orthogonal to what we are discussing here as we will only
> > > > > > cache the mapping between <gfn (iova if guest has iommu), (qemu) va>, once we
> > > > > > have pinned pages later with the help of above info, you can map it into the
> > > > > > proper iommu domain if the system has configured so.
> > > > > >  
> > > > >
> > > > > Hi Neo,
> > > > >
> > > > > Technically yes you can map a pfn into the proper IOMMU domain elsewhere,
> > > > > but to find out whether a pfn was previously mapped or not, you have to
> > > > > track it with another rbtree-alike data structure (the IOMMU driver simply
> > > > > doesn't bother with tracking), that seems somehow duplicate with the vGPU
> > > > > IOMMU backend we are discussing here.
> > > > >
> > > > > And it is also semantically correct for an IOMMU backend to handle both w/
> > > > > and w/o an IOMMU hardware? :)  
> > > > 
> > > > A problem with the iommu doing the dma_map_page() though is for what
> > > > device does it do this?  In the mediated case the vfio infrastructure
> > > > is dealing with a software representation of a device.  For all we
> > > > know that software model could transparently migrate from one physical
> > > > GPU to another.  There may not even be a physical device backing
> > > > the mediated device.  Those are details left to the vgpu driver itself.  
> > > 
> > > This is a fair argument. VFIO iommu driver simply serves user space
> > > requests, where only vaddr<->iova (essentially gpa in kvm case) is
> > > mattered. How iova is mapped into real IOMMU is not VFIO's interest.
> > > 
> > > > 
> > > > Perhaps one possibility would be to allow the vgpu driver to register
> > > > map and unmap callbacks.  The unmap callback might provide the
> > > > invalidation interface that we're so far missing.  The combination of
> > > > map and unmap callbacks might simplify the Intel approach of pinning the
> > > > entire VM memory space, ie. for each map callback do a translation
> > > > (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
> > > > the translation.  There's still the problem of where that dma_addr_t
> > > > from the dma_map_page is stored though.  Someone would need to keep
> > > > track of iova to dma_addr_t.  The vfio iommu might be a place to do
> > > > that since we're already tracking information based on iova, possibly
> > > > in an opaque data element provided by the vgpu driver.  However, we're
> > > > going to need to take a serious look at whether an rb-tree is the right
> > > > data structure for the job.  It works well for the current type1
> > > > functionality where we typically have tens of entries.  I think the
> > > > NVIDIA model of sparse pinning the VM is pushing that up to tens of
> > > > thousands.  If Intel intends to pin the entire guest, that's
> > > > potentially tens of millions of tracked entries and I don't know that
> > > > an rb-tree is the right tool for that job.  Thanks,
> > > >   
> > > 
> > > Based on above thought I'm thinking whether below would work:
> > > (let's use gpa to replace existing iova in type1 driver, while using iova
> > > for the one actually used in vGPU driver. Assume 'pin-all' scenario first
> > > which matches existing vfio logic)
> > > 
> > > - No change to existing vfio_dma structure. VFIO still maintains gpa<->vaddr
> > > mapping, in coarse-grained regions;
> > > 
> > > - Leverage same page accounting/pinning logic in type1 driver, which 
> > > should be enough for 'pin-all' usage;
> > > 
> > > - Then main divergence point for vGPU would be in vfio_unmap_unpin
> > > and vfio_iommu_map. I'm not sure whether it's easy to fake an 
> > > iommu_domain for vGPU so same iommu_map/unmap can be reused.
> > 
> > This seems troublesome.  Kirti's version used numerous api-only tests
> > to avoid these which made the code difficult to trace.  Clearly one
> > option is to split out the common code so that a new mediated-type1
> > backend skips this, but they thought they could clean it up without
> > this, so we'll see what happens in the next version.
> > 
> > > If not, we may introduce two new map/unmap callbacks provided
> > > specifically by vGPU core driver, as you suggested:
> > > 
> > > 	* vGPU core driver uses dma_map_page to map specified pfns:
> > > 
> > > 		o When IOMMU is enabled, we'll get an iova returned different
> > > from pfn;
> > > 		o When IOMMU is disabled, returned iova is same as pfn;
> > 
> > Either way each iova needs to be stored and we have a worst case of one
> > iova per page of guest memory.
> > 
> > > 	* Then vGPU core driver just maintains its own gpa<->iova lookup
> > > table (e.g. called vgpu_dma)
> > > 
> > > 	* Because each vfio_iommu_map invocation is about a contiguous 
> > > region, we can expect same number of vgpu_dma entries as maintained 
> > > for vfio_dma list;
> > >
> > > Then it's vGPU core driver's responsibility to provide gpa<->iova
> > > lookup for vendor specific GPU driver. And we don't need worry about
> > > tens of thousands of entries. Once we get this simple 'pin-all' model
> > > ready, then it can be further extended to support 'pin-sparse'
> > > scenario. We still maintain a top-level vgpu_dma list with each entry to
> > > further link its own sparse mapping structure. In reality I don't expect
> > > we really need to maintain per-page translation even with sparse pinning.
> > 
> > If you're trying to equate the scale of what we need to track vs what
> > type1 currently tracks, they're significantly different.  Possible
> > things we need to track include the pfn, the iova, and possibly a
> > reference count or some sort of pinned page map.  In the pin-all model
> > we can assume that every page is pinned on map and unpinned on unmap,
> > so a reference count or map is unnecessary.  We can also assume that we
> > can always regenerate the pfn with get_user_pages() from the vaddr, so
> > we don't need to track that.  I don't see any way around tracking the
> > iova.  The iommu can't tell us this like it can with the normal type1
> > model because the pfn is the result of the translation, not the key for
> > the translation. So we're always going to have between 1 and
> > (size/PAGE_SIZE) iova entries per vgpu_dma entry.  You might be able to
> > manage the vgpu_dma with an rb-tree, but each vgpu_dma entry needs some
> > data structure tracking every iova.
> > 
> > Sparse mapping has the same issue but of course the tree of iovas is
> > potentially incomplete and we need a way to determine where it's
> > incomplete.  A page table rooted in the vgpu_dma and indexed by the
> > offset from the start vaddr seems like the way to go here.  It's also
> > possible that some mediated device models might store the iova in the
> > command sent to the device and therefore be able to parse those entries
> > back out to unmap them without storing them separately.  This might be
> > how the s390 channel-io model would prefer to work.
> Dear Alex:
> 
> For the s390 channel-io model, when an I/O instruction was intercepted
> and issued to the device driver for further translation, the operand of
> the instruction contents iovas only. Since iova is the key to locate an
> entry in the database (r-b tree or whatever), we do can parse the
> entries back out one by one when doing the unmap operation.
>                  ^^^^^^^^^^
> 
> BTW, if the mediated-iommu backend can potentially offer a transaction
> level support for the unmap operation, I believe it will benefit the
> performance for this case.
> 
> e.g.:
> handler = vfio_trasaction_begin();
> foreach(iova in the command) {
>     pfn = vfio_trasaction_map(handler, iova);
>     do_io(pfn);
> }

Hi Dong,

Could you please help me understand the performance benefit here? 

Is the perf argument coming from searching the rbtree of the tracking data
structure, or from something else?

For example, you can do a similar thing with the following sequence from your
backend driver:

    vfio_pin_pages(gfn_list/iova_list /* in */, npages, prot, pfn_bases /* out */)
    foreach (pfn)
        do_io(pfn)
    vfio_unpin_pages(pfn_bases)

Thanks,
Neo

> 
> /*
>  * Expect to unmap all of the pfns mapped in this trasaction with the
>  * next statement. The mediated-iommu backend could use handler as the
>  * key to track the list of the entries.
>  */
> vfio_trasaction_unmap(handler);
> vfio_trasaction_end(handler);
> 
> Not sure if this could benefit the vgpu sparse mapping use case though.





> 
> >  That seems like
> > further validation that such tracking is going to be dependent on the
> > mediated driver itself and probably not something to centralize in a
> > mediated iommu driver.  Thanks,
> > 
> > Alex
> > 
> 
> 
> 
> --------
> Dong Jia
> 
Jike Song May 13, 2016, 7:30 a.m. UTC | #25
On 05/13/2016 02:43 PM, Neo Jia wrote:
> On Fri, May 13, 2016 at 02:22:37PM +0800, Jike Song wrote:
>> On 05/13/2016 10:41 AM, Tian, Kevin wrote:
>>>> From: Neo Jia [mailto:cjia@nvidia.com] Sent: Friday, May 13,
>>>> 2016 3:49 AM
>>>> 
>>>>> 
>>>>>> Perhaps one possibility would be to allow the vgpu driver
>>>>>> to register map and unmap callbacks.  The unmap callback
>>>>>> might provide the invalidation interface that we're so far
>>>>>> missing.  The combination of map and unmap callbacks might
>>>>>> simplify the Intel approach of pinning the entire VM memory
>>>>>> space, ie. for each map callback do a translation (pin) and
>>>>>> dma_map_page, for each unmap do a dma_unmap_page and
>>>>>> release the translation.
>>>>> 
>>>>> Yes adding map/unmap ops in pGPU drvier (I assume you are
>>>>> refering to gpu_device_ops as implemented in Kirti's patch)
>>>>> sounds a good idea, satisfying both: 1) keeping vGPU purely 
>>>>> virtual; 2) dealing with the Linux DMA API to achive hardware
>>>>> IOMMU compatibility.
>>>>> 
>>>>> PS, this has very little to do with pinning wholly or
>>>>> partially. Intel KVMGT has once been had the whole guest
>>>>> memory pinned, only because we used a spinlock, which can't
>>>>> sleep at runtime.  We have removed that spinlock in our
>>>>> another upstreaming effort, not here but for i915 driver, so
>>>>> probably no biggie.
>>>>> 
>>>> 
>>>> OK, then you guys don't need to pin everything. The next
>>>> question will be if you can send the pinning request from your
>>>> mediated driver backend to request memory pinning like we have
>>>> demonstrated in the v3 patch, function vfio_pin_pages and 
>>>> vfio_unpin_pages?
>>>> 
>>> 
>>> Jike can you confirm this statement? My feeling is that we don't
>>> have such logic in our device model to figure out which pages
>>> need to be pinned on demand. So currently pin-everything is same
>>> requirement in both KVM and Xen side...
>> 
>> [Correct me in case of any neglect:)]
>> 
>> IMO the ultimate reason to pin a page, is for DMA. Accessing RAM
>> from a GPU is certainly a DMA operation. The DMA facility of most
>> platforms, IGD and NVIDIA GPU included, is not capable of
>> faulting-handling-retrying.
>> 
>> As for vGPU solutions like Nvidia and Intel provide, the memory
>> address region used by Guest for GPU access, whenever Guest sets
>> the mappings, it is intercepted by Host, so it's safe to only pin
>> the page before it get used by Guest. This probably doesn't need
>> device model to change :)
> 
> Hi Jike
> 
> Just out of curiosity, how does the host intercept this before it
> goes on the bus?
> 

Hi Neo,

[apologies if I mis-expressed myself, bad English ..]

I was talking about intercepting the setting-up of GPU page tables,
not the DMA itself.  For current Intel GPUs, the page tables are
MMIO registers or simply RAM pages, called the GTT (Graphics Translation
Table); a write to a GTT entry from the Guest is always intercepted
by the Host.

--
Thanks,
Jike

Neo Jia May 13, 2016, 7:38 a.m. UTC | #26
On Fri, May 13, 2016 at 07:13:44AM +0000, Tian, Kevin wrote:
> > From: Neo Jia [mailto:cjia@nvidia.com]
> > Sent: Friday, May 13, 2016 2:42 PM
> > 
> > 
> > >
> > > We possibly have the same requirement from the mediate driver backend:
> > >
> > > 	a) get a GFN, when guest try to tell hardware;
> > > 	b) consult the vfio iommu with that GFN[1]: will you find me a proper dma_addr?
> > 
> > We will provide you the pfn via vfio_pin_pages, so you can map it for dma
> > purpose in your i915 driver, which is what we are doing today.
> > 
> 
> Can such 'map' operation be consolidated in vGPU core driver? I don't think 
> Intel vGPU driver has any feature proactively relying on iommu. The reason 
> why we keep talking iommu is just because the kernel may enable iommu 
> for physical GPU so we need make sure our device model can work in such
> configuration. And this requirement should apply to all vendors, not Intel
> specific (like you said you are doing it already today).

Hi Kevin,

Actually, such a requirement is already satisfied today, as all vendor drivers
should transparently work with and without a system iommu on bare metal, right?

So I don't see any new requirement here; also, such consolidation doesn't help
anything but adds complexity to the system, since vendor drivers will not
remove their own dma_map_xxx functions as they are still required to support
non-mediated cases.

Thanks,
Neo

> 
> Thanks
> Kevin
Neo Jia May 13, 2016, 7:42 a.m. UTC | #27
On Fri, May 13, 2016 at 03:30:27PM +0800, Jike Song wrote:
> On 05/13/2016 02:43 PM, Neo Jia wrote:
> > On Fri, May 13, 2016 at 02:22:37PM +0800, Jike Song wrote:
> >> On 05/13/2016 10:41 AM, Tian, Kevin wrote:
> >>>> From: Neo Jia [mailto:cjia@nvidia.com] Sent: Friday, May 13,
> >>>> 2016 3:49 AM
> >>>> 
> >>>>> 
> >>>>>> Perhaps one possibility would be to allow the vgpu driver
> >>>>>> to register map and unmap callbacks.  The unmap callback
> >>>>>> might provide the invalidation interface that we're so far
> >>>>>> missing.  The combination of map and unmap callbacks might
> >>>>>> simplify the Intel approach of pinning the entire VM memory
> >>>>>> space, ie. for each map callback do a translation (pin) and
> >>>>>> dma_map_page, for each unmap do a dma_unmap_page and
> >>>>>> release the translation.
> >>>>> 
> >>>>> Yes adding map/unmap ops in pGPU drvier (I assume you are
> >>>>> refering to gpu_device_ops as implemented in Kirti's patch)
> >>>>> sounds a good idea, satisfying both: 1) keeping vGPU purely 
> >>>>> virtual; 2) dealing with the Linux DMA API to achive hardware
> >>>>> IOMMU compatibility.
> >>>>> 
> >>>>> PS, this has very little to do with pinning wholly or
> >>>>> partially. Intel KVMGT has once been had the whole guest
> >>>>> memory pinned, only because we used a spinlock, which can't
> >>>>> sleep at runtime.  We have removed that spinlock in our
> >>>>> another upstreaming effort, not here but for i915 driver, so
> >>>>> probably no biggie.
> >>>>> 
> >>>> 
> >>>> OK, then you guys don't need to pin everything. The next
> >>>> question will be if you can send the pinning request from your
> >>>> mediated driver backend to request memory pinning like we have
> >>>> demonstrated in the v3 patch, function vfio_pin_pages and 
> >>>> vfio_unpin_pages?
> >>>> 
> >>> 
> >>> Jike can you confirm this statement? My feeling is that we don't
> >>> have such logic in our device model to figure out which pages
> >>> need to be pinned on demand. So currently pin-everything is same
> >>> requirement in both KVM and Xen side...
> >> 
> >> [Correct me in case of any neglect:)]
> >> 
> >> IMO the ultimate reason to pin a page, is for DMA. Accessing RAM
> >> from a GPU is certainly a DMA operation. The DMA facility of most
> >> platforms, IGD and NVIDIA GPU included, is not capable of
> >> faulting-handling-retrying.
> >> 
> >> As for vGPU solutions like Nvidia and Intel provide, the memory
> >> address region used by Guest for GPU access, whenever Guest sets
> >> the mappings, it is intercepted by Host, so it's safe to only pin
> >> the page before it get used by Guest. This probably doesn't need
> >> device model to change :)
> > 
> > Hi Jike
> > 
> > Just out of curiosity, how does the host intercept this before it
> > goes on the bus?
> > 
> 
> Hi Neo,
> 
> [prologize if I mis-expressed myself, bad English ..] 
> 
> I was talking about intercepting the setting-up of GPU page tables,
> not the DMA itself.  For currently Intel GPU, the page tables are
> MMIO registers or simply RAM pages, called GTT (Graphics Translation
> Table), the writing event to an GTT entry from Guest, is always
> intercepted by Host.

Hi Jike,

Thanks for the details. One more question: if the page tables are in guest RAM,
how do you intercept them from the host? I can see how they get intercepted
when they are in an MMIO range.

Thanks,
Neo

> 
> --
> Thanks,
> Jike
> 
Tian, Kevin May 13, 2016, 7:45 a.m. UTC | #28
> From: Neo Jia [mailto:cjia@nvidia.com]
> Sent: Friday, May 13, 2016 3:42 PM
> 
> On Fri, May 13, 2016 at 03:30:27PM +0800, Jike Song wrote:
> > On 05/13/2016 02:43 PM, Neo Jia wrote:
> > > On Fri, May 13, 2016 at 02:22:37PM +0800, Jike Song wrote:
> > >> On 05/13/2016 10:41 AM, Tian, Kevin wrote:
> > >>>> From: Neo Jia [mailto:cjia@nvidia.com] Sent: Friday, May 13,
> > >>>> 2016 3:49 AM
> > >>>>
> > >>>>>
> > >>>>>> Perhaps one possibility would be to allow the vgpu driver
> > >>>>>> to register map and unmap callbacks.  The unmap callback
> > >>>>>> might provide the invalidation interface that we're so far
> > >>>>>> missing.  The combination of map and unmap callbacks might
> > >>>>>> simplify the Intel approach of pinning the entire VM memory
> > >>>>>> space, ie. for each map callback do a translation (pin) and
> > >>>>>> dma_map_page, for each unmap do a dma_unmap_page and
> > >>>>>> release the translation.
> > >>>>>
> > >>>>> Yes adding map/unmap ops in pGPU drvier (I assume you are
> > >>>>> refering to gpu_device_ops as implemented in Kirti's patch)
> > >>>>> sounds a good idea, satisfying both: 1) keeping vGPU purely
> > >>>>> virtual; 2) dealing with the Linux DMA API to achive hardware
> > >>>>> IOMMU compatibility.
> > >>>>>
> > >>>>> PS, this has very little to do with pinning wholly or
> > >>>>> partially. Intel KVMGT has once been had the whole guest
> > >>>>> memory pinned, only because we used a spinlock, which can't
> > >>>>> sleep at runtime.  We have removed that spinlock in our
> > >>>>> another upstreaming effort, not here but for i915 driver, so
> > >>>>> probably no biggie.
> > >>>>>
> > >>>>
> > >>>> OK, then you guys don't need to pin everything. The next
> > >>>> question will be if you can send the pinning request from your
> > >>>> mediated driver backend to request memory pinning like we have
> > >>>> demonstrated in the v3 patch, function vfio_pin_pages and
> > >>>> vfio_unpin_pages?
> > >>>>
> > >>>
> > >>> Jike can you confirm this statement? My feeling is that we don't
> > >>> have such logic in our device model to figure out which pages
> > >>> need to be pinned on demand. So currently pin-everything is same
> > >>> requirement in both KVM and Xen side...
> > >>
> > >> [Correct me in case of any neglect:)]
> > >>
> > >> IMO the ultimate reason to pin a page, is for DMA. Accessing RAM
> > >> from a GPU is certainly a DMA operation. The DMA facility of most
> > >> platforms, IGD and NVIDIA GPU included, is not capable of
> > >> faulting-handling-retrying.
> > >>
> > >> As for vGPU solutions like Nvidia and Intel provide, the memory
> > >> address region used by Guest for GPU access, whenever Guest sets
> > >> the mappings, it is intercepted by Host, so it's safe to only pin
> > >> the page before it get used by Guest. This probably doesn't need
> > >> device model to change :)
> > >
> > > Hi Jike
> > >
> > > Just out of curiosity, how does the host intercept this before it
> > > goes on the bus?
> > >
> >
> > Hi Neo,
> >
> > [prologize if I mis-expressed myself, bad English ..]
> >
> > I was talking about intercepting the setting-up of GPU page tables,
> > not the DMA itself.  For currently Intel GPU, the page tables are
> > MMIO registers or simply RAM pages, called GTT (Graphics Translation
> > Table), the writing event to an GTT entry from Guest, is always
> > intercepted by Host.
> 
> Hi Jike,
> 
> Thanks for the details, one more question if the page tables are guest RAM, how do you
> intercept it from host? I can see it get intercepted when it is in MMIO range.
> 

We use the page tracking framework, which was recently added to KVM,
to mark RAM pages as read-only so that write accesses are intercepted and
forwarded to the device model.

Thanks
Kevin
Tian, Kevin May 13, 2016, 8:02 a.m. UTC | #29
> From: Neo Jia [mailto:cjia@nvidia.com]
> Sent: Friday, May 13, 2016 3:38 PM
> 
> On Fri, May 13, 2016 at 07:13:44AM +0000, Tian, Kevin wrote:
> > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > Sent: Friday, May 13, 2016 2:42 PM
> > >
> > >
> > > >
> > > > We possibly have the same requirement from the mediate driver backend:
> > > >
> > > > 	a) get a GFN, when guest try to tell hardware;
> > > > 	b) consult the vfio iommu with that GFN[1]: will you find me a proper dma_addr?
> > >
> > > We will provide you the pfn via vfio_pin_pages, so you can map it for dma
> > > purpose in your i915 driver, which is what we are doing today.
> > >
> >
> > Can such 'map' operation be consolidated in vGPU core driver? I don't think
> > Intel vGPU driver has any feature proactively relying on iommu. The reason
> > why we keep talking iommu is just because the kernel may enable iommu
> > for physical GPU so we need make sure our device model can work in such
> > configuration. And this requirement should apply to all vendors, not Intel
> > specific (like you said you are doing it already today).
> 
> Hi Kevin,
> 
> Actually, such requirement is already satisfied today as all vendor drivers
> should transparently work with and without system iommu on bare-metal, right?
> 
> So I don't see any new requirement here, also such consolidation doesn't help
> any but adding complexity to the system as vendor driver will not remove
> their own dma_map_xxx functions as they are still required to support
> non-mediated cases.
> 

Thanks for your information, which makes it clearer where the difference is. :-)

Based on your description, it looks like you treat guest pages the same as
normal process pages, which all share the same code path when mapped as a DMA
target, so it is pointless to separate the guest page map out into the vGPU
core driver. Is this understanding correct?

On our side, so far guest pages are treated differently from normal process
pages, which is the main reason I asked whether we could consolidate that part.
It looks like that's not necessary now, since it's not a common requirement.

One additional question though. Jike already mentioned the need to shadow
the GPU MMU (called the GTT table on the Intel side) in our device model.
'Shadow' here basically means we need to translate from the 'gfn' in a guest
pte to the 'dma_addr_t' returned by dma_map_xxx. Based on the gfn->pfn
translation provided by VFIO (in your v3 driver), the gfn->dma_addr_t mapping
can be constructed accordingly in the vendor driver. So do you have a similar
requirement? If yes, do you see any value in unifying that translation
structure, or would you prefer to maintain it in the vendor driver?
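
Just to be concrete about the structure I have in mind, something along these
lines (illustrative only; pin_and_translate_gfn() is a placeholder, and
initialization/error handling are omitted for brevity):

#include <linux/radix-tree.h>
#include <linux/mutex.h>
#include <linux/mm.h>
#include <linux/dma-mapping.h>

struct gfn_dma_cache {
    struct radix_tree_root map;     /* gfn -> dma_addr_t */
    struct mutex lock;
};

static dma_addr_t gfn_to_dma(struct gfn_dma_cache *c, struct device *gpu,
                             unsigned long gfn)
{
    void *entry;
    dma_addr_t dma;

    mutex_lock(&c->lock);
    entry = radix_tree_lookup(&c->map, gfn);
    if (entry) {
        /* already mapped once; avoids a second dma_map_page() */
        dma = (dma_addr_t)(unsigned long)entry;
    } else {
        unsigned long pfn = pin_and_translate_gfn(gfn);

        dma = dma_map_page(gpu, pfn_to_page(pfn), 0, PAGE_SIZE,
                           DMA_BIDIRECTIONAL);
        /* a real implementation would store a small struct, not the raw value */
        radix_tree_insert(&c->map, gfn, (void *)(unsigned long)dma);
    }
    mutex_unlock(&c->lock);
    return dma;
}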

Thanks
Kevin
Neo Jia May 13, 2016, 8:31 a.m. UTC | #30
On Fri, May 13, 2016 at 07:45:14AM +0000, Tian, Kevin wrote:
> > From: Neo Jia [mailto:cjia@nvidia.com]
> > Sent: Friday, May 13, 2016 3:42 PM
> > 
> > On Fri, May 13, 2016 at 03:30:27PM +0800, Jike Song wrote:
> > > On 05/13/2016 02:43 PM, Neo Jia wrote:
> > > > On Fri, May 13, 2016 at 02:22:37PM +0800, Jike Song wrote:
> > > >> On 05/13/2016 10:41 AM, Tian, Kevin wrote:
> > > >>>> From: Neo Jia [mailto:cjia@nvidia.com] Sent: Friday, May 13,
> > > >>>> 2016 3:49 AM
> > > >>>>
> > > >>>>>
> > > >>>>>> Perhaps one possibility would be to allow the vgpu driver
> > > >>>>>> to register map and unmap callbacks.  The unmap callback
> > > >>>>>> might provide the invalidation interface that we're so far
> > > >>>>>> missing.  The combination of map and unmap callbacks might
> > > >>>>>> simplify the Intel approach of pinning the entire VM memory
> > > >>>>>> space, ie. for each map callback do a translation (pin) and
> > > >>>>>> dma_map_page, for each unmap do a dma_unmap_page and
> > > >>>>>> release the translation.
> > > >>>>>
> > > >>>>> Yes adding map/unmap ops in pGPU drvier (I assume you are
> > > >>>>> refering to gpu_device_ops as implemented in Kirti's patch)
> > > >>>>> sounds a good idea, satisfying both: 1) keeping vGPU purely
> > > >>>>> virtual; 2) dealing with the Linux DMA API to achive hardware
> > > >>>>> IOMMU compatibility.
> > > >>>>>
> > > >>>>> PS, this has very little to do with pinning wholly or
> > > >>>>> partially. Intel KVMGT has once been had the whole guest
> > > >>>>> memory pinned, only because we used a spinlock, which can't
> > > >>>>> sleep at runtime.  We have removed that spinlock in our
> > > >>>>> another upstreaming effort, not here but for i915 driver, so
> > > >>>>> probably no biggie.
> > > >>>>>
> > > >>>>
> > > >>>> OK, then you guys don't need to pin everything. The next
> > > >>>> question will be if you can send the pinning request from your
> > > >>>> mediated driver backend to request memory pinning like we have
> > > >>>> demonstrated in the v3 patch, function vfio_pin_pages and
> > > >>>> vfio_unpin_pages?
> > > >>>>
> > > >>>
> > > >>> Jike can you confirm this statement? My feeling is that we don't
> > > >>> have such logic in our device model to figure out which pages
> > > >>> need to be pinned on demand. So currently pin-everything is same
> > > >>> requirement in both KVM and Xen side...
> > > >>
> > > >> [Correct me in case of any neglect:)]
> > > >>
> > > >> IMO the ultimate reason to pin a page, is for DMA. Accessing RAM
> > > >> from a GPU is certainly a DMA operation. The DMA facility of most
> > > >> platforms, IGD and NVIDIA GPU included, is not capable of
> > > >> faulting-handling-retrying.
> > > >>
> > > >> As for vGPU solutions like Nvidia and Intel provide, the memory
> > > >> address region used by Guest for GPU access, whenever Guest sets
> > > >> the mappings, it is intercepted by Host, so it's safe to only pin
> > > >> the page before it get used by Guest. This probably doesn't need
> > > >> device model to change :)
> > > >
> > > > Hi Jike
> > > >
> > > > Just out of curiosity, how does the host intercept this before it
> > > > goes on the bus?
> > > >
> > >
> > > Hi Neo,
> > >
> > > [prologize if I mis-expressed myself, bad English ..]
> > >
> > > I was talking about intercepting the setting-up of GPU page tables,
> > > not the DMA itself.  For currently Intel GPU, the page tables are
> > > MMIO registers or simply RAM pages, called GTT (Graphics Translation
> > > Table), the writing event to an GTT entry from Guest, is always
> > > intercepted by Host.
> > 
> > Hi Jike,
> > 
> > Thanks for the details, one more question if the page tables are guest RAM, how do you
> > intercept it from host? I can see it get intercepted when it is in MMIO range.
> > 
> 
> We use page tracking framework, which is newly added to KVM recently,
> to mark RAM pages as read-only so write accesses are intercepted to 
> device model.

Yes, I am aware of that patchset from Guangrong. So far the interfaces all
require a struct kvm *, copied from https://lkml.org/lkml/2015/11/30/644:

- kvm_page_track_add_page(): add the page to the tracking pool after
  that later specified access on that page will be tracked

- kvm_page_track_remove_page(): remove the page from the tracking pool,
  the specified access on the page is not tracked after the last user is
  gone

void kvm_page_track_add_page(struct kvm *kvm, gfn_t gfn,
                enum kvm_page_track_mode mode);
void kvm_page_track_remove_page(struct kvm *kvm, gfn_t gfn,
               enum kvm_page_track_mode mode);
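
If I read that right, write-protecting a GTT page would be roughly the
following (sketch only; the KVM_PAGE_TRACK_WRITE mode name comes from that
patchset, and the write fault itself would be delivered through a page-track
notifier which I'm not showing here):

static void gtt_page_track_start(struct kvm *kvm, gfn_t gfn)
{
    /* write-protect the guest page that holds GTT entries */
    kvm_page_track_add_page(kvm, gfn, KVM_PAGE_TRACK_WRITE);
}

static void gtt_page_track_stop(struct kvm *kvm, gfn_t gfn)
{
    kvm_page_track_remove_page(kvm, gfn, KVM_PAGE_TRACK_WRITE);
}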

I'm really curious how you are going to get access to that struct kvm *kvm,
or are you relying on userfaultfd to track the write faults only, as part of
the QEMU userfault thread?

Thanks,
Neo

> 
> Thanks
> Kevin

Dong Jia Shi May 13, 2016, 8:39 a.m. UTC | #31
On Fri, 13 May 2016 00:24:34 -0700
Neo Jia <cjia@nvidia.com> wrote:

> On Fri, May 13, 2016 at 03:10:22PM +0800, Dong Jia wrote:
> > On Thu, 12 May 2016 13:05:52 -0600
> > Alex Williamson <alex.williamson@redhat.com> wrote:
> > 
> > > On Thu, 12 May 2016 08:00:36 +0000
> > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > 
> > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > Sent: Thursday, May 12, 2016 6:06 AM
> > > > > 
> > > > > On Wed, 11 May 2016 17:15:15 +0800
> > > > > Jike Song <jike.song@intel.com> wrote:
> > > > >   
> > > > > > On 05/11/2016 12:02 AM, Neo Jia wrote:  
> > > > > > > On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:  
> > > > > > >> On 05/05/2016 05:27 PM, Tian, Kevin wrote:  
> > > > > > >>>> From: Song, Jike
> > > > > > >>>>
> > > > > > >>>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
> > > > > > >>>> hardware. It just, as you said in another mail, "rather than
> > > > > > >>>> programming them into an IOMMU for a device, it simply stores the
> > > > > > >>>> translations for use by later requests".
> > > > > > >>>>
> > > > > > >>>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
> > > > > > >>>> Otherwise, if IOMMU is present, the gfx driver eventually programs
> > > > > > >>>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
> > > > > > >>>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
> > > > > > >>>> translations without any knowledge about hardware IOMMU, how is the
> > > > > > >>>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
> > > > > > >>>> by the IOMMU backend here)?
> > > > > > >>>>
> > > > > > >>>> If things go as guessed above, as vfio_pin_pages() indicates, it
> > > > > > >>>> pin & translate vaddr to PFN, then it will be very difficult for the
> > > > > > >>>> device model to figure out:
> > > > > > >>>>
> > > > > > >>>> 	1, for a given GPA, how to avoid calling dma_map_page multiple times?
> > > > > > >>>> 	2, for which page to call dma_unmap_page?
> > > > > > >>>>
> > > > > > >>>> --  
> > > > > > >>>
> > > > > > >>> We have to support both w/ iommu and w/o iommu case, since
> > > > > > >>> that fact is out of GPU driver control. A simple way is to use
> > > > > > >>> dma_map_page which internally will cope with w/ and w/o iommu
> > > > > > >>> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
> > > > > > >>> Then in this file we only need to cache GPA to whatever dmadr_t
> > > > > > >>> returned by dma_map_page.
> > > > > > >>>  
> > > > > > >>
> > > > > > >> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?  
> > > > > > >
> > > > > > > Hi Jike,
> > > > > > >
> > > > > > > With mediated passthru, you still can use hardware iommu, but more important
> > > > > > > that part is actually orthogonal to what we are discussing here as we will only
> > > > > > > cache the mapping between <gfn (iova if guest has iommu), (qemu) va>, once we
> > > > > > > have pinned pages later with the help of above info, you can map it into the
> > > > > > > proper iommu domain if the system has configured so.
> > > > > > >  
> > > > > >
> > > > > > Hi Neo,
> > > > > >
> > > > > > Technically yes you can map a pfn into the proper IOMMU domain elsewhere,
> > > > > > but to find out whether a pfn was previously mapped or not, you have to
> > > > > > track it with another rbtree-alike data structure (the IOMMU driver simply
> > > > > > doesn't bother with tracking), that seems somehow duplicate with the vGPU
> > > > > > IOMMU backend we are discussing here.
> > > > > >
> > > > > > And it is also semantically correct for an IOMMU backend to handle both w/
> > > > > > and w/o an IOMMU hardware? :)  
> > > > > 
> > > > > A problem with the iommu doing the dma_map_page() though is for what
> > > > > device does it do this?  In the mediated case the vfio infrastructure
> > > > > is dealing with a software representation of a device.  For all we
> > > > > know that software model could transparently migrate from one physical
> > > > > GPU to another.  There may not even be a physical device backing
> > > > > the mediated device.  Those are details left to the vgpu driver itself.  
> > > > 
> > > > This is a fair argument. VFIO iommu driver simply serves user space
> > > > requests, where only vaddr<->iova (essentially gpa in kvm case) is
> > > > mattered. How iova is mapped into real IOMMU is not VFIO's interest.
> > > > 
> > > > > 
> > > > > Perhaps one possibility would be to allow the vgpu driver to register
> > > > > map and unmap callbacks.  The unmap callback might provide the
> > > > > invalidation interface that we're so far missing.  The combination of
> > > > > map and unmap callbacks might simplify the Intel approach of pinning the
> > > > > entire VM memory space, ie. for each map callback do a translation
> > > > > (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
> > > > > the translation.  There's still the problem of where that dma_addr_t
> > > > > from the dma_map_page is stored though.  Someone would need to keep
> > > > > track of iova to dma_addr_t.  The vfio iommu might be a place to do
> > > > > that since we're already tracking information based on iova, possibly
> > > > > in an opaque data element provided by the vgpu driver.  However, we're
> > > > > going to need to take a serious look at whether an rb-tree is the right
> > > > > data structure for the job.  It works well for the current type1
> > > > > functionality where we typically have tens of entries.  I think the
> > > > > NVIDIA model of sparse pinning the VM is pushing that up to tens of
> > > > > thousands.  If Intel intends to pin the entire guest, that's
> > > > > potentially tens of millions of tracked entries and I don't know that
> > > > > an rb-tree is the right tool for that job.  Thanks,
> > > > >   
> > > > 
> > > > Based on above thought I'm thinking whether below would work:
> > > > (let's use gpa to replace existing iova in type1 driver, while using iova
> > > > for the one actually used in vGPU driver. Assume 'pin-all' scenario first
> > > > which matches existing vfio logic)
> > > > 
> > > > - No change to existing vfio_dma structure. VFIO still maintains gpa<->vaddr
> > > > mapping, in coarse-grained regions;
> > > > 
> > > > - Leverage same page accounting/pinning logic in type1 driver, which 
> > > > should be enough for 'pin-all' usage;
> > > > 
> > > > - Then main divergence point for vGPU would be in vfio_unmap_unpin
> > > > and vfio_iommu_map. I'm not sure whether it's easy to fake an 
> > > > iommu_domain for vGPU so same iommu_map/unmap can be reused.
> > > 
> > > This seems troublesome.  Kirti's version used numerous api-only tests
> > > to avoid these which made the code difficult to trace.  Clearly one
> > > option is to split out the common code so that a new mediated-type1
> > > backend skips this, but they thought they could clean it up without
> > > this, so we'll see what happens in the next version.
> > > 
> > > > If not, we may introduce two new map/unmap callbacks provided
> > > > specifically by vGPU core driver, as you suggested:
> > > > 
> > > > 	* vGPU core driver uses dma_map_page to map specified pfns:
> > > > 
> > > > 		o When IOMMU is enabled, we'll get an iova returned different
> > > > from pfn;
> > > > 		o When IOMMU is disabled, returned iova is same as pfn;
> > > 
> > > Either way each iova needs to be stored and we have a worst case of one
> > > iova per page of guest memory.
> > > 
> > > > 	* Then vGPU core driver just maintains its own gpa<->iova lookup
> > > > table (e.g. called vgpu_dma)
> > > > 
> > > > 	* Because each vfio_iommu_map invocation is about a contiguous 
> > > > region, we can expect same number of vgpu_dma entries as maintained 
> > > > for vfio_dma list;
> > > >
> > > > Then it's vGPU core driver's responsibility to provide gpa<->iova
> > > > lookup for vendor specific GPU driver. And we don't need worry about
> > > > tens of thousands of entries. Once we get this simple 'pin-all' model
> > > > ready, then it can be further extended to support 'pin-sparse'
> > > > scenario. We still maintain a top-level vgpu_dma list with each entry to
> > > > further link its own sparse mapping structure. In reality I don't expect
> > > > we really need to maintain per-page translation even with sparse pinning.
> > > 
> > > If you're trying to equate the scale of what we need to track vs what
> > > type1 currently tracks, they're significantly different.  Possible
> > > things we need to track include the pfn, the iova, and possibly a
> > > reference count or some sort of pinned page map.  In the pin-all model
> > > we can assume that every page is pinned on map and unpinned on unmap,
> > > so a reference count or map is unnecessary.  We can also assume that we
> > > can always regenerate the pfn with get_user_pages() from the vaddr, so
> > > we don't need to track that.  I don't see any way around tracking the
> > > iova.  The iommu can't tell us this like it can with the normal type1
> > > model because the pfn is the result of the translation, not the key for
> > > the translation. So we're always going to have between 1 and
> > > (size/PAGE_SIZE) iova entries per vgpu_dma entry.  You might be able to
> > > manage the vgpu_dma with an rb-tree, but each vgpu_dma entry needs some
> > > data structure tracking every iova.
> > > 
> > > Sparse mapping has the same issue but of course the tree of iovas is
> > > potentially incomplete and we need a way to determine where it's
> > > incomplete.  A page table rooted in the vgpu_dma and indexed by the
> > > offset from the start vaddr seems like the way to go here.  It's also
> > > possible that some mediated device models might store the iova in the
> > > command sent to the device and therefore be able to parse those entries
> > > back out to unmap them without storing them separately.  This might be
> > > how the s390 channel-io model would prefer to work.
> > Dear Alex:
> > 
> > For the s390 channel-io model, when an I/O instruction was intercepted
> > and issued to the device driver for further translation, the operand of
> > the instruction contains iovas only. Since iova is the key to locate an
> > entry in the database (r-b tree or whatever), we can indeed parse the
> > entries back out one by one when doing the unmap operation.
> >                  ^^^^^^^^^^
> > 
> > BTW, if the mediated-iommu backend can potentially offer a transaction
> > level support for the unmap operation, I believe it will benefit the
> > performance for this case.
> > 
> > e.g.:
> > handler = vfio_transaction_begin();
> > foreach(iova in the command) {
> >     pfn = vfio_transaction_map(handler, iova);
> >     do_io(pfn);
> > }
> 
> Hi Dong,
> 
> Could you please help me understand the performance benefit here? 
> 
> Is the perf argument coming from the searching of rbtree of the tracking data
> structure or something else?
> 
> For example you can do similar thing by the following sequence from your backend
> driver:
> 
>     vfio_pin_pages(gfn_list/iova_list /* in */, npages, prot, pfn_bases /* out */)
>     foreach (pfn)
>         do_io(pfn)
>     vfio_unpin_pages(pfn_bases)
Dear Neo:

FWIU, the channel-io driver could leverage these interfaces without
obvious feasibility issues. Since the implementation of the current
vfio_unpin_pages iterates @pfn_bases and find the corresponding entry
from the rb tree for each of the pfn_base, I'm wondering if a dedicated
list of the entries for the whole @pfn_bases could offer us some
benefits. I have to say that I know it's too early to consider performance,
and the current interfaces are fine for the channel-io case.
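
Just to illustrate what I mean by a dedicated list, here is a very rough
sketch (all names below are made up for illustration, not actual code):
the pin call could hand back a handle recording what was pinned, so the
unpin path walks that list instead of doing one rb-tree lookup per
pfn_base:

#include <linux/list.h>
#include <linux/mm.h>
#include <linux/slab.h>

struct pin_entry {
	struct list_head	next;
	unsigned long		pfn_base;	/* host pfn recorded at pin time */
};

struct pin_handle {
	struct list_head	entries;	/* filled in by the pin call */
};

/* hypothetical batch unpin: no per-pfn rb-tree search needed */
static void unpin_all(struct pin_handle *h)
{
	struct pin_entry *e, *tmp;

	list_for_each_entry_safe(e, tmp, &h->entries, next) {
		put_page(pfn_to_page(e->pfn_base));	/* drop the pin reference */
		list_del(&e->next);
		kfree(e);
	}
}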

I'm also wondering if such an idea could contribute a little to your
discussion of the management of the key-value mapping issue. If this is
just noise (sorry for that :<), please ignore it.

My major intention is to show up, and to elaborate a bit about the
channel-io use case. So you will see that there is really another user
of the mediated-iommu backend, and as Alex mentioned before, getting rid
of the 'vgpu_dev' and other vgpu specific stuff is indeed necessary. :>

> 
> Thanks,
> Neo
> 
> > 
> > /*
> >  * Expect to unmap all of the pfns mapped in this transaction with the
> >  * next statement. The mediated-iommu backend could use handler as the
> >  * key to track the list of the entries.
> >  */
> > vfio_transaction_unmap(handler);
> > vfio_transaction_end(handler);
> > 
> > Not sure if this could benefit the vgpu sparse mapping use case though.
> 
> 
> 
> 
> 
> > 
> > >  That seems like
> > > further validation that such tracking is going to be dependent on the
> > > mediated driver itself and probably not something to centralize in a
> > > mediated iommu driver.  Thanks,
> > > 
> > > Alex
> > > 
> > 
> > 
> > 
> > --------
> > Dong Jia
> > 
> 



--------
Dong Jia

Neo Jia May 13, 2016, 8:41 a.m. UTC | #32
On Fri, May 13, 2016 at 08:02:41AM +0000, Tian, Kevin wrote:
> > From: Neo Jia [mailto:cjia@nvidia.com]
> > Sent: Friday, May 13, 2016 3:38 PM
> > 
> > On Fri, May 13, 2016 at 07:13:44AM +0000, Tian, Kevin wrote:
> > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > Sent: Friday, May 13, 2016 2:42 PM
> > > >
> > > >
> > > > >
> > > > > We possibly have the same requirement from the mediated driver backend:
> > > > >
> > > > > 	a) get a GFN, when guest try to tell hardware;
> > > > > 	b) consult the vfio iommu with that GFN[1]: will you find me a proper dma_addr?
> > > >
> > > > We will provide you the pfn via vfio_pin_pages, so you can map it for dma
> > > > purpose in your i915 driver, which is what we are doing today.
> > > >
> > >
> > > Can such 'map' operation be consolidated in vGPU core driver? I don't think
> > > Intel vGPU driver has any feature proactively relying on iommu. The reason
> > > why we keep talking iommu is just because the kernel may enable iommu
> > > for physical GPU so we need make sure our device model can work in such
> > > configuration. And this requirement should apply to all vendors, not Intel
> > > specific (like you said you are doing it already today).
> > 
> > Hi Kevin,
> > 
> > Actually, such requirement is already satisfied today as all vendor drivers
> > should transparently work with and without system iommu on bare-metal, right?
> > 
> > So I don't see any new requirement here, also such consolidation doesn't help
> > any but adding complexity to the system as vendor driver will not remove
> > their own dma_map_xxx functions as they are still required to support
> > non-mediated cases.
> > 
> 
> Thanks for your information, which makes it clearer where the difference is. :-)
> 
> Based on your description, it looks like you treat guest pages the same as
> normal process pages, which all share the same code path when mapped as a DMA
> target, so it is pointless to separate guest page mapping out into the vGPU
> core driver. Is this understanding correct?

Yes.

It is Linux's responsibility to allocate the physical pages for the QEMU process,
which will happen to be the guest physical memory that we might use as a DMA
target. From the device's point of view, it is just some physical location it
needs to hit.

> 
> In our side, so far guest pages are treated differently from normal process
> pages, which is the main reason why I asked whether we can consolidate that
> part. Looks now it's not necessary since it's already not a common requirement.

> 
> One additional question though. Jike already mentioned the need to shadow the
> GPU MMU (called the GTT table on the Intel side) in our device model. 'shadow'
> here basically means we need to translate from 'gfn' in a guest pte to the
> 'dma_addr_t' returned by dma_map_xxx. Based on the gfn->pfn translation provided
> by VFIO (in your v3 driver), the gfn->dma_addr_t mapping can be constructed
> accordingly in the vendor driver. So do you have a similar requirement? If yes,
> do you see any value in unifying that translation structure, or would you prefer
> to maintain it in the vendor driver?

Yes, I think it would make sense to do this in the vendor driver as it keeps the
iommu type1 clean - it will only track the gfn to pfn translation/pinning (on
CPU). Then, you can reuse your existing driver code to map the pfn as DMA
target.

Also you can do some kind of optimization such as keeping a small cache within
your device driver: if the gfn is already translated, there is no need to query again.
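
For example, a minimal sketch of such a cache (illustrative only: it uses
the vfio_pin_pages() interface proposed in this thread, and
gfn_cache_find()/gfn_cache_insert() are made-up helper names; error
handling is omitted):

#include <linux/dma-mapping.h>
#include <linux/iommu.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
#include <linux/slab.h>

struct gfn_dma_entry {
	struct rb_node		node;
	unsigned long		gfn;
	dma_addr_t		dma;	/* whatever dma_map_page() returned */
};

/* look up the per-device cache first; pin and map only on a miss */
static dma_addr_t vendor_gfn_to_dma(struct device *dev, struct rb_root *cache,
				    unsigned long gfn)
{
	struct gfn_dma_entry *e = gfn_cache_find(cache, gfn);
	unsigned long pfn;

	if (e)
		return e->dma;

	/* proposed interface from this thread, not an existing kernel API */
	vfio_pin_pages(&gfn, 1, IOMMU_READ | IOMMU_WRITE, &pfn);

	e = kzalloc(sizeof(*e), GFP_KERNEL);
	e->gfn = gfn;
	e->dma = dma_map_page(dev, pfn_to_page(pfn), 0, PAGE_SIZE,
			      DMA_BIDIRECTIONAL);
	gfn_cache_insert(cache, e);

	return e->dma;
}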

Thanks,
Neo

> 
> Thanks
> Kevin
Neo Jia May 13, 2016, 9:05 a.m. UTC | #33
On Fri, May 13, 2016 at 04:39:37PM +0800, Dong Jia wrote:
> On Fri, 13 May 2016 00:24:34 -0700
> Neo Jia <cjia@nvidia.com> wrote:
> 
> > On Fri, May 13, 2016 at 03:10:22PM +0800, Dong Jia wrote:
> > > On Thu, 12 May 2016 13:05:52 -0600
> > > Alex Williamson <alex.williamson@redhat.com> wrote:
> > > 
> > > > On Thu, 12 May 2016 08:00:36 +0000
> > > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > > 
> > > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > > Sent: Thursday, May 12, 2016 6:06 AM
> > > > > > 
> > > > > > On Wed, 11 May 2016 17:15:15 +0800
> > > > > > Jike Song <jike.song@intel.com> wrote:
> > > > > >   
> > > > > > > On 05/11/2016 12:02 AM, Neo Jia wrote:  
> > > > > > > > On Tue, May 10, 2016 at 03:52:27PM +0800, Jike Song wrote:  
> > > > > > > >> On 05/05/2016 05:27 PM, Tian, Kevin wrote:  
> > > > > > > >>>> From: Song, Jike
> > > > > > > >>>>
> > > > > > > >>>> IIUC, an api-only domain is a VFIO domain *without* underlying IOMMU
> > > > > > > >>>> hardware. It just, as you said in another mail, "rather than
> > > > > > > >>>> programming them into an IOMMU for a device, it simply stores the
> > > > > > > >>>> translations for use by later requests".
> > > > > > > >>>>
> > > > > > > >>>> That imposes a constraint on gfx driver: hardware IOMMU must be disabled.
> > > > > > > >>>> Otherwise, if IOMMU is present, the gfx driver eventually programs
> > > > > > > >>>> the hardware IOMMU with IOVA returned by pci_map_page or dma_map_page;
> > > > > > > >>>> Meanwhile, the IOMMU backend for vgpu only maintains GPA <-> HPA
> > > > > > > >>>> translations without any knowledge about hardware IOMMU, how is the
> > > > > > > >>>> device model supposed to do to get an IOVA for a given GPA (thereby HPA
> > > > > > > >>>> by the IOMMU backend here)?
> > > > > > > >>>>
> > > > > > > >>>> If things go as guessed above, as vfio_pin_pages() indicates, it
> > > > > > > >>>> pin & translate vaddr to PFN, then it will be very difficult for the
> > > > > > > >>>> device model to figure out:
> > > > > > > >>>>
> > > > > > > >>>> 	1, for a given GPA, how to avoid calling dma_map_page multiple times?
> > > > > > > >>>> 	2, for which page to call dma_unmap_page?
> > > > > > > >>>>
> > > > > > > >>>> --  
> > > > > > > >>>
> > > > > > > >>> We have to support both w/ iommu and w/o iommu case, since
> > > > > > > >>> that fact is out of GPU driver control. A simple way is to use
> > > > > > > >>> dma_map_page which internally will cope with w/ and w/o iommu
> > > > > > > >>> case gracefully, i.e. return HPA w/o iommu and IOVA w/ iommu.
> > > > > > > >>> Then in this file we only need to cache GPA to whatever dmadr_t
> > > > > > > >>> returned by dma_map_page.
> > > > > > > >>>  
> > > > > > > >>
> > > > > > > >> Hi Alex, Kirti and Neo, any thought on the IOMMU compatibility here?  
> > > > > > > >
> > > > > > > > Hi Jike,
> > > > > > > >
> > > > > > > > With mediated passthru, you still can use hardware iommu, but more important
> > > > > > > > that part is actually orthogonal to what we are discussing here as we will only
> > > > > > > > cache the mapping between <gfn (iova if guest has iommu), (qemu) va>, once we
> > > > > > > > have pinned pages later with the help of above info, you can map it into the
> > > > > > > > proper iommu domain if the system has configured so.
> > > > > > > >  
> > > > > > >
> > > > > > > Hi Neo,
> > > > > > >
> > > > > > > Technically yes you can map a pfn into the proper IOMMU domain elsewhere,
> > > > > > > but to find out whether a pfn was previously mapped or not, you have to
> > > > > > > track it with another rbtree-alike data structure (the IOMMU driver simply
> > > > > > > doesn't bother with tracking), that seems somehow duplicate with the vGPU
> > > > > > > IOMMU backend we are discussing here.
> > > > > > >
> > > > > > > And it is also semantically correct for an IOMMU backend to handle both w/
> > > > > > > and w/o an IOMMU hardware? :)  
> > > > > > 
> > > > > > A problem with the iommu doing the dma_map_page() though is for what
> > > > > > device does it do this?  In the mediated case the vfio infrastructure
> > > > > > is dealing with a software representation of a device.  For all we
> > > > > > know that software model could transparently migrate from one physical
> > > > > > GPU to another.  There may not even be a physical device backing
> > > > > > the mediated device.  Those are details left to the vgpu driver itself.  
> > > > > 
> > > > > This is a fair argument. VFIO iommu driver simply serves user space
> > > > > requests, where only vaddr<->iova (essentially gpa in kvm case) is
> > > > > mattered. How iova is mapped into real IOMMU is not VFIO's interest.
> > > > > 
> > > > > > 
> > > > > > Perhaps one possibility would be to allow the vgpu driver to register
> > > > > > map and unmap callbacks.  The unmap callback might provide the
> > > > > > invalidation interface that we're so far missing.  The combination of
> > > > > > map and unmap callbacks might simplify the Intel approach of pinning the
> > > > > > entire VM memory space, ie. for each map callback do a translation
> > > > > > (pin) and dma_map_page, for each unmap do a dma_unmap_page and release
> > > > > > the translation.  There's still the problem of where that dma_addr_t
> > > > > > from the dma_map_page is stored though.  Someone would need to keep
> > > > > > track of iova to dma_addr_t.  The vfio iommu might be a place to do
> > > > > > that since we're already tracking information based on iova, possibly
> > > > > > in an opaque data element provided by the vgpu driver.  However, we're
> > > > > > going to need to take a serious look at whether an rb-tree is the right
> > > > > > data structure for the job.  It works well for the current type1
> > > > > > functionality where we typically have tens of entries.  I think the
> > > > > > NVIDIA model of sparse pinning the VM is pushing that up to tens of
> > > > > > thousands.  If Intel intends to pin the entire guest, that's
> > > > > > potentially tens of millions of tracked entries and I don't know that
> > > > > > an rb-tree is the right tool for that job.  Thanks,
> > > > > >   
> > > > > 
> > > > > Based on above thought I'm thinking whether below would work:
> > > > > (let's use gpa to replace existing iova in type1 driver, while using iova
> > > > > for the one actually used in vGPU driver. Assume 'pin-all' scenario first
> > > > > which matches existing vfio logic)
> > > > > 
> > > > > - No change to existing vfio_dma structure. VFIO still maintains gpa<->vaddr
> > > > > mapping, in coarse-grained regions;
> > > > > 
> > > > > - Leverage same page accounting/pinning logic in type1 driver, which 
> > > > > should be enough for 'pin-all' usage;
> > > > > 
> > > > > - Then main divergence point for vGPU would be in vfio_unmap_unpin
> > > > > and vfio_iommu_map. I'm not sure whether it's easy to fake an 
> > > > > iommu_domain for vGPU so same iommu_map/unmap can be reused.
> > > > 
> > > > This seems troublesome.  Kirti's version used numerous api-only tests
> > > > to avoid these which made the code difficult to trace.  Clearly one
> > > > option is to split out the common code so that a new mediated-type1
> > > > backend skips this, but they thought they could clean it up without
> > > > this, so we'll see what happens in the next version.
> > > > 
> > > > > If not, we may introduce two new map/unmap callbacks provided
> > > > > specifically by vGPU core driver, as you suggested:
> > > > > 
> > > > > 	* vGPU core driver uses dma_map_page to map specified pfns:
> > > > > 
> > > > > 		o When IOMMU is enabled, we'll get an iova returned different
> > > > > from pfn;
> > > > > 		o When IOMMU is disabled, returned iova is same as pfn;
> > > > 
> > > > Either way each iova needs to be stored and we have a worst case of one
> > > > iova per page of guest memory.
> > > > 
> > > > > 	* Then vGPU core driver just maintains its own gpa<->iova lookup
> > > > > table (e.g. called vgpu_dma)
> > > > > 
> > > > > 	* Because each vfio_iommu_map invocation is about a contiguous 
> > > > > region, we can expect same number of vgpu_dma entries as maintained 
> > > > > for vfio_dma list;
> > > > >
> > > > > Then it's vGPU core driver's responsibility to provide gpa<->iova
> > > > > lookup for vendor specific GPU driver. And we don't need worry about
> > > > > tens of thousands of entries. Once we get this simple 'pin-all' model
> > > > > ready, then it can be further extended to support 'pin-sparse'
> > > > > scenario. We still maintain a top-level vgpu_dma list with each entry to
> > > > > further link its own sparse mapping structure. In reality I don't expect
> > > > > we really need to maintain per-page translation even with sparse pinning.
> > > > 
> > > > If you're trying to equate the scale of what we need to track vs what
> > > > type1 currently tracks, they're significantly different.  Possible
> > > > things we need to track include the pfn, the iova, and possibly a
> > > > reference count or some sort of pinned page map.  In the pin-all model
> > > > we can assume that every page is pinned on map and unpinned on unmap,
> > > > so a reference count or map is unnecessary.  We can also assume that we
> > > > can always regenerate the pfn with get_user_pages() from the vaddr, so
> > > > we don't need to track that.  I don't see any way around tracking the
> > > > iova.  The iommu can't tell us this like it can with the normal type1
> > > > model because the pfn is the result of the translation, not the key for
> > > > the translation. So we're always going to have between 1 and
> > > > (size/PAGE_SIZE) iova entries per vgpu_dma entry.  You might be able to
> > > > manage the vgpu_dma with an rb-tree, but each vgpu_dma entry needs some
> > > > data structure tracking every iova.
> > > > 
> > > > Sparse mapping has the same issue but of course the tree of iovas is
> > > > potentially incomplete and we need a way to determine where it's
> > > > incomplete.  A page table rooted in the vgpu_dma and indexed by the
> > > > offset from the start vaddr seems like the way to go here.  It's also
> > > > possible that some mediated device models might store the iova in the
> > > > command sent to the device and therefore be able to parse those entries
> > > > back out to unmap them without storing them separately.  This might be
> > > > how the s390 channel-io model would prefer to work.
> > > Dear Alex:
> > > 
> > > For the s390 channel-io model, when an I/O instruction was intercepted
> > > and issued to the device driver for further translation, the operand of
> > > the instruction contains iovas only. Since iova is the key to locate an
> > > entry in the database (r-b tree or whatever), we can indeed parse the
> > > entries back out one by one when doing the unmap operation.
> > >                  ^^^^^^^^^^
> > > 
> > > BTW, if the mediated-iommu backend can potentially offer a transaction
> > > level support for the unmap operation, I believe it will benefit the
> > > performance for this case.
> > > 
> > > e.g.:
> > > handler = vfio_transaction_begin();
> > > foreach(iova in the command) {
> > >     pfn = vfio_transaction_map(handler, iova);
> > >     do_io(pfn);
> > > }
> > 
> > Hi Dong,
> > 
> > Could you please help me understand the performance benefit here? 
> > 
> > Is the perf argument coming from the searching of rbtree of the tracking data
> > structure or something else?
> > 
> > For example you can do similar thing by the following sequence from your backend
> > driver:
> > 
> >     vfio_pin_pages(gfn_list/iova_list /* in */, npages, prot, pfn_bases /* out */)
> >     foreach (pfn)
> >         do_io(pfn)
> >     vfio_unpin_pages(pfn_bases)
> Dear Neo:
> 
> FWIU, the channel-io driver could leverage these interfaces without
> obvious feasibility issues. Since the implementation of the current
> vfio_unpin_pages iterates @pfn_bases and find the corresponding entry
> from the rb tree for each of the pfn_base, I'm wondering if a dedicated
> list of the entries for the whole @pfn_bases could offer us some
> benefits. I have to say that I know it's too early to consider performance,
> and the current interfaces are fine for the channel-io case.

Hi Dong,

We should definitely be mindful about the data structure performance especially
dealing with kernel. But for now, we haven't done any performance analysis yet
for the current rbtree implementation, later we will definitely run it through
large guest RAM configuration and multiple virtual devices cases, etc. to
collect data.

Regarding your use case, may I ask if there will be concurrent command streams
running for the same VM? If yes, those two transaction requests (if we
implement them) will compete not only for the rbtree lock but also for the GUP locks.

Also, what is the typical guest RAM we are talking about here for your use case,
and do you have any rough estimation of the active working set of those DMA pages?

> 
> I'm also wondering if such an idea could contribute a little to your
> discussion of the management of the key-value mapping issue. If this is
> just noise (sorry for that :<), please ignore it.
> 
> My major intention is to show up, and to elaborate a bit about the
> channel-io use case. So you will see that there is really another user
> of the mediated-iommu backend, and as Alex mentioned before, getting rid
> of the 'vgpu_dev' and other vgpu specific stuff is indeed necessary. :>

Definitely, we are changing the module/variable names to reflect this general
purpose already. 

Thanks,
Neo

> 
> > 
> > Thanks,
> > Neo
> > 
> > > 
> > > /*
> > >  * Expect to unmap all of the pfns mapped in this transaction with the
> > >  * next statement. The mediated-iommu backend could use handler as the
> > >  * key to track the list of the entries.
> > >  */
> > > vfio_transaction_unmap(handler);
> > > vfio_transaction_end(handler);
> > > 
> > > Not sure if this could benefit the vgpu sparse mapping use case though.
> > 
> > 
> > 
> > 
> > 
> > > 
> > > >  That seems like
> > > > further validation that such tracking is going to be dependent on the
> > > > mediated driver itself and probably not something to centralize in a
> > > > mediated iommu driver.  Thanks,
> > > > 
> > > > Alex
> > > > 
> > > 
> > > 
> > > 
> > > --------
> > > Dong Jia
> > > 
> > 
> 
> 
> 
> --------
> Dong Jia
> 
Jike Song May 13, 2016, 9:23 a.m. UTC | #34
On 05/13/2016 04:31 PM, Neo Jia wrote:
> On Fri, May 13, 2016 at 07:45:14AM +0000, Tian, Kevin wrote:
>>
>> We use page tracking framework, which is newly added to KVM recently,
>> to mark RAM pages as read-only so write accesses are intercepted to 
>> device model.
> 
> Yes, I am aware of that patchset from Guangrong. So far the interface are all
> requiring struct *kvm, copied from https://lkml.org/lkml/2015/11/30/644
> 
> - kvm_page_track_add_page(): add the page to the tracking pool after
>   that later specified access on that page will be tracked
> 
> - kvm_page_track_remove_page(): remove the page from the tracking pool,
>   the specified access on the page is not tracked after the last user is
>   gone
> 
> void kvm_page_track_add_page(struct kvm *kvm, gfn_t gfn,
>                 enum kvm_page_track_mode mode);
> void kvm_page_track_remove_page(struct kvm *kvm, gfn_t gfn,
>                enum kvm_page_track_mode mode);
> 
> Really curious how you are going to have access to the struct kvm *kvm, or you
> are relying on the userfaultfd to track the write faults only as part of the
> QEMU userfault thread?
>

Hi Neo,

For the vGPU used as a device for a KVM guest, there will be interfaces
wrapped or implemented in the KVM layer, as a counterpart to the
interfaces for Xen. That is where the KVM-related code is supposed to be.
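
As a rough sketch of what I mean (hypothetical wrapper names; it assumes
the kvm_page_track interface quoted above together with its write-tracking
mode, KVM_PAGE_TRACK_WRITE), the KVM-specific glue could be as thin as:

#include <linux/kvm_host.h>

/*
 * The device model never touches struct kvm directly; only this KVM-side
 * glue, which owns the kvm pointer, calls into the page-track framework.
 */
static void kvmgt_write_protect_gfn(struct kvm *kvm, gfn_t gfn)
{
	kvm_page_track_add_page(kvm, gfn, KVM_PAGE_TRACK_WRITE);
}

static void kvmgt_write_unprotect_gfn(struct kvm *kvm, gfn_t gfn)
{
	kvm_page_track_remove_page(kvm, gfn, KVM_PAGE_TRACK_WRITE);
}

The intercepted writes would then come back through a page-track notifier
registered by the same glue layer, which forwards them to the device
model's shadow GTT handler.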

--
Thanks,
Jike
Jike Song May 13, 2016, 9:46 a.m. UTC | #35
On 05/13/2016 04:12 AM, Neo Jia wrote:
> On Thu, May 12, 2016 at 01:05:52PM -0600, Alex Williamson wrote:
>>
>> If you're trying to equate the scale of what we need to track vs what
>> type1 currently tracks, they're significantly different.  Possible
>> things we need to track include the pfn, the iova, and possibly a
>> reference count or some sort of pinned page map.  In the pin-all model
>> we can assume that every page is pinned on map and unpinned on unmap,
>> so a reference count or map is unnecessary.  We can also assume that we
>> can always regenerate the pfn with get_user_pages() from the vaddr, so
>> we don't need to track that.  
> 
> Hi Alex,
> 
> Thanks for pointing this out, we will not track those in our next rev and
> get_user_pages will be used from the vaddr as you suggested to handle the
> single VM with both passthru + mediated device case.
>

Just a gut feeling:

Calling GUP every time for a particular vaddr, means locking mm->mmap_sem
every time for a particular process. If the VM has dozens of VCPU, which
is not rare, the semaphore is likely to be the bottleneck.
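
The pattern I have in mind is roughly the following (sketch only; the
exact get_user_pages variant and signature depend on the kernel version,
and error handling is omitted):

#include <linux/mm.h>
#include <linux/sched.h>

/* translate one guest vaddr to a host page, as done per pin request */
static struct page *pin_one_page(struct mm_struct *mm, unsigned long vaddr)
{
	struct page *page = NULL;

	down_read(&mm->mmap_sem);	/* every pinning caller takes this */
	/* ~4.6-era signature; newer kernels take gup_flags instead */
	get_user_pages_remote(NULL, mm, vaddr, 1, 1 /* write */, 0 /* force */,
			      &page, NULL);
	up_read(&mm->mmap_sem);

	return page;
}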


--
Thanks,
Jike

Neo Jia May 13, 2016, 3:48 p.m. UTC | #36
On Fri, May 13, 2016 at 05:46:17PM +0800, Jike Song wrote:
> On 05/13/2016 04:12 AM, Neo Jia wrote:
> > On Thu, May 12, 2016 at 01:05:52PM -0600, Alex Williamson wrote:
> >>
> >> If you're trying to equate the scale of what we need to track vs what
> >> type1 currently tracks, they're significantly different.  Possible
> >> things we need to track include the pfn, the iova, and possibly a
> >> reference count or some sort of pinned page map.  In the pin-all model
> >> we can assume that every page is pinned on map and unpinned on unmap,
> >> so a reference count or map is unnecessary.  We can also assume that we
> >> can always regenerate the pfn with get_user_pages() from the vaddr, so
> >> we don't need to track that.  
> > 
> > Hi Alex,
> > 
> > Thanks for pointing this out, we will not track those in our next rev and
> > get_user_pages will be used from the vaddr as you suggested to handle the
> > single VM with both passthru + mediated device case.
> >
> 
> Just a gut feeling:
> 
> Calling GUP every time for a particular vaddr, means locking mm->mmap_sem
> every time for a particular process. If the VM has dozens of VCPU, which
> is not rare, the semaphore is likely to be the bottleneck.

Hi Jike,

We do need to hold the lock of mm->mmap_sem for the VMM/QEMU process, but I
don't quite follow the reasoning with "dozens of vcpus". One situation that I
can think of is that we have other threads competing for the mmap_sem of the
VMM/QEMU process within the KVM kernel, such as hva_to_pfn; after a quick search
it seems it mostly gets used by the ioctl "KVM_ASSIGN_PCI_DEVICE".

We will definitely conduct performance analysis with large configuration on
servers with E5-2697 v4. :-)

Thanks,
Neo

> 
> 
> --
> Thanks,
> Jike
> 
Neo Jia May 13, 2016, 3:50 p.m. UTC | #37
On Fri, May 13, 2016 at 05:23:44PM +0800, Jike Song wrote:
> On 05/13/2016 04:31 PM, Neo Jia wrote:
> > On Fri, May 13, 2016 at 07:45:14AM +0000, Tian, Kevin wrote:
> >>
> >> We use page tracking framework, which is newly added to KVM recently,
> >> to mark RAM pages as read-only so write accesses are intercepted to 
> >> device model.
> > 
> > Yes, I am aware of that patchset from Guangrong. So far the interface are all
> > requiring struct *kvm, copied from https://lkml.org/lkml/2015/11/30/644
> > 
> > - kvm_page_track_add_page(): add the page to the tracking pool after
> >   that later specified access on that page will be tracked
> > 
> > - kvm_page_track_remove_page(): remove the page from the tracking pool,
> >   the specified access on the page is not tracked after the last user is
> >   gone
> > 
> > void kvm_page_track_add_page(struct kvm *kvm, gfn_t gfn,
> >                 enum kvm_page_track_mode mode);
> > void kvm_page_track_remove_page(struct kvm *kvm, gfn_t gfn,
> >                enum kvm_page_track_mode mode);
> > 
> > Really curious how you are going to have access to the struct kvm *kvm, or you
> > are relying on the userfaultfd to track the write faults only as part of the
> > QEMU userfault thread?
> >
> 
> Hi Neo,
> 
> For the vGPU used as a device for a KVM guest, there will be interfaces
> wrapped or implemented in the KVM layer, as a counterpart to the
> interfaces for Xen. That is where the KVM-related code is supposed to be.

Hi Jike,

Is this discussed anywhere on the mailing list already? Sorry if I have missed
such conversation.

Thanks,
Neo

> 
> --
> Thanks,
> Jike
Alex Williamson May 13, 2016, 4:16 p.m. UTC | #38
On Fri, 13 May 2016 03:55:09 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Friday, May 13, 2016 3:06 AM
> >   
> > > >  
> > >
> > > Based on above thought I'm thinking whether below would work:
> > > (let's use gpa to replace existing iova in type1 driver, while using iova
> > > for the one actually used in vGPU driver. Assume 'pin-all' scenario first
> > > which matches existing vfio logic)
> > >
> > > - No change to existing vfio_dma structure. VFIO still maintains gpa<->vaddr
> > > mapping, in coarse-grained regions;
> > >
> > > - Leverage same page accounting/pinning logic in type1 driver, which
> > > should be enough for 'pin-all' usage;
> > >
> > > - Then main divergence point for vGPU would be in vfio_unmap_unpin
> > > and vfio_iommu_map. I'm not sure whether it's easy to fake an
> > > iommu_domain for vGPU so same iommu_map/unmap can be reused.  
> > 
> > This seems troublesome.  Kirti's version used numerous api-only tests
> > to avoid these which made the code difficult to trace.  Clearly one
> > option is to split out the common code so that a new mediated-type1
> > backend skips this, but they thought they could clean it up without
> > this, so we'll see what happens in the next version.
> >   
> > > If not, we may introduce two new map/unmap callbacks provided
> > > specifically by vGPU core driver, as you suggested:
> > >
> > > 	* vGPU core driver uses dma_map_page to map specified pfns:
> > >
> > > 		o When IOMMU is enabled, we'll get an iova returned different
> > > from pfn;
> > > 		o When IOMMU is disabled, returned iova is same as pfn;  
> > 
> > Either way each iova needs to be stored and we have a worst case of one
> > iova per page of guest memory.
> >   
> > > 	* Then vGPU core driver just maintains its own gpa<->iova lookup
> > > table (e.g. called vgpu_dma)
> > >
> > > 	* Because each vfio_iommu_map invocation is about a contiguous
> > > region, we can expect same number of vgpu_dma entries as maintained
> > > for vfio_dma list;
> > >
> > > Then it's vGPU core driver's responsibility to provide gpa<->iova
> > > lookup for vendor specific GPU driver. And we don't need worry about
> > > tens of thousands of entries. Once we get this simple 'pin-all' model
> > > ready, then it can be further extended to support 'pin-sparse'
> > > scenario. We still maintain a top-level vgpu_dma list with each entry to
> > > further link its own sparse mapping structure. In reality I don't expect
> > > we really need to maintain per-page translation even with sparse pinning.  
> > 
> > If you're trying to equate the scale of what we need to track vs what
> > type1 currently tracks, they're significantly different.  Possible
> > things we need to track include the pfn, the iova, and possibly a
> > reference count or some sort of pinned page map.  In the pin-all model
> > we can assume that every page is pinned on map and unpinned on unmap,
> > so a reference count or map is unnecessary.  We can also assume that we
> > can always regenerate the pfn with get_user_pages() from the vaddr, so
> > we don't need to track that.  I don't see any way around tracking the
> > iova.  The iommu can't tell us this like it can with the normal type1
> > model because the pfn is the result of the translation, not the key for
> > the translation. So we're always going to have between 1 and
> > (size/PAGE_SIZE) iova entries per vgpu_dma entry.  You might be able to
> > manage the vgpu_dma with an rb-tree, but each vgpu_dma entry needs some
> > data structure tracking every iova.  
> 
> There is one option. We may use alloc_iova to reserve continuous iova
> range for each vgpu_dma range and then use iommu_map/unmap to
> write iommu ptes later upon map request (then could be same #entries
> as vfio_dma compared to unbounded entries when using dma_map_page). 
> Of course this needs to be done in vGPU core driver, since vfio type1 only 
> sees a faked iommu domain.

I'm not sure this is really how iova domains work.  There's only one
iova domain per iommu domain using the dma-iommu API, and
iommu_map/unmap are part of a different API.  iova domain may be an
interesting solution though.
 
> > Sparse mapping has the same issue but of course the tree of iovas is
> > potentially incomplete and we need a way to determine where it's
> > incomplete.  A page table rooted in the vgpu_dma and indexed by the
> > offset from the start vaddr seems like the way to go here.  It's also
> > possible that some mediated device models might store the iova in the
> > command sent to the device and therefore be able to parse those entries
> > back out to unmap them without storing them separately.  This might be
> > how the s390 channel-io model would prefer to work.  That seems like
> > further validation that such tracking is going to be dependent on the
> > mediated driver itself and probably not something to centralize in a
> > mediated iommu driver.  Thanks,
> >   
> 
> Another simpler way might be allocate an array for each memory
> regions registered from user space. For a 512MB region, it means
> 512K*4=2MB array to track pfn or iova mapping corresponding to
> a gfn. It may consume more resource than rb tree when not many
> pages need to be pinned, but could be less when rb tree increases
> a lot. 

An array is only the most space efficient structure for a fully pinned
area where we have no contiguous iova.  If we're either mapping a
larger hugepage or we have a larger continuous iova space due to
scatter-gather mapping or we're sparsely pinning the region, an array
can waste a lot of space.  512MB is also a pretty anemic example; 2MB
is a reasonable overhead, but 2MB per 512MB looks pretty bad when we
have a 512GB VM.  Thanks,

Alex
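
For reference, a bare-bones sketch of the kind of per-vgpu_dma tracking
discussed above (pin-sparse case; all structure and helper names here are
illustrative, not from the posted patch):

#include <linux/sizes.h>
#include <linux/slab.h>

struct vgpu_dma_track {
	unsigned long	start_vaddr;
	size_t		size;
	dma_addr_t	**dir;	/* first level, one pointer per 2MB chunk,
				   allocated when the vgpu_dma is created */
};

#define TRACK_ENTRIES_PER_CHUNK	(SZ_2M / PAGE_SIZE)

/* find (allocating on demand) the slot recording the iova for one page */
static dma_addr_t *track_slot(struct vgpu_dma_track *t, unsigned long vaddr)
{
	unsigned long pgoff = (vaddr - t->start_vaddr) >> PAGE_SHIFT;
	unsigned long hi = pgoff / TRACK_ENTRIES_PER_CHUNK;
	unsigned long lo = pgoff % TRACK_ENTRIES_PER_CHUNK;

	if (!t->dir[hi])	/* second level allocated only when first pinned */
		t->dir[hi] = kzalloc(TRACK_ENTRIES_PER_CHUNK *
				     sizeof(dma_addr_t), GFP_KERNEL);

	return t->dir[hi] ? &t->dir[hi][lo] : NULL;
}

A fully pinned region costs one second-level page per 2MB chunk, while a
sparsely pinned one only pays for the chunks actually touched.
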
Jike Song May 16, 2016, 2:27 a.m. UTC | #39
On 05/13/2016 11:48 PM, Neo Jia wrote:
> On Fri, May 13, 2016 at 05:46:17PM +0800, Jike Song wrote:
>> On 05/13/2016 04:12 AM, Neo Jia wrote:
>>> On Thu, May 12, 2016 at 01:05:52PM -0600, Alex Williamson wrote:
>>>>
>>>> If you're trying to equate the scale of what we need to track vs what
>>>> type1 currently tracks, they're significantly different.  Possible
>>>> things we need to track include the pfn, the iova, and possibly a
>>>> reference count or some sort of pinned page map.  In the pin-all model
>>>> we can assume that every page is pinned on map and unpinned on unmap,
>>>> so a reference count or map is unnecessary.  We can also assume that we
>>>> can always regenerate the pfn with get_user_pages() from the vaddr, so
>>>> we don't need to track that.  
>>>
>>> Hi Alex,
>>>
>>> Thanks for pointing this out, we will not track those in our next rev and
>>> get_user_pages will be used from the vaddr as you suggested to handle the
>>> single VM with both passthru + mediated device case.
>>>
>>
>> Just a gut feeling:
>>
>> Calling GUP every time for a particular vaddr, means locking mm->mmap_sem
>> every time for a particular process. If the VM has dozens of VCPU, which
>> is not rare, the semaphore is likely to be the bottleneck.
> 
> Hi Jike,
> 
> We do need to hold the lock of mm->mmap_sem for the VMM/QEMU process, but I
> don't quite follow the reasoning with "dozens of vcpus". One situation that I
> can think of is that we have other threads competing for the mmap_sem of the
> VMM/QEMU process within the KVM kernel, such as hva_to_pfn; after a quick search
> it seems it mostly gets used by the ioctl "KVM_ASSIGN_PCI_DEVICE".
>

I meant that a guest write of a gfn into the GPU MMU could happen on any vcpu,
so a vmexit happens and mmap_sem is required.  But now I realize that it's
also the situation even if we store the pfn in the rbtree.

> We will definitely conduct performance analysis with large configuration on
> servers with E5-2697 v4. :-)

My homage :)

--
Thanks,
Jike

Jike Song May 16, 2016, 6:57 a.m. UTC | #40
On 05/13/2016 11:50 PM, Neo Jia wrote:
> On Fri, May 13, 2016 at 05:23:44PM +0800, Jike Song wrote:
>> On 05/13/2016 04:31 PM, Neo Jia wrote:
>>> On Fri, May 13, 2016 at 07:45:14AM +0000, Tian, Kevin wrote:
>>>>
>>>> We use page tracking framework, which is newly added to KVM recently,
>>>> to mark RAM pages as read-only so write accesses are intercepted to 
>>>> device model.
>>>
>>> Yes, I am aware of that patchset from Guangrong. So far the interface are all
>>> requiring struct *kvm, copied from https://lkml.org/lkml/2015/11/30/644
>>>
>>> - kvm_page_track_add_page(): add the page to the tracking pool after
>>>   that later specified access on that page will be tracked
>>>
>>> - kvm_page_track_remove_page(): remove the page from the tracking pool,
>>>   the specified access on the page is not tracked after the last user is
>>>   gone
>>>
>>> void kvm_page_track_add_page(struct kvm *kvm, gfn_t gfn,
>>>                 enum kvm_page_track_mode mode);
>>> void kvm_page_track_remove_page(struct kvm *kvm, gfn_t gfn,
>>>                enum kvm_page_track_mode mode);
>>>
>>> Really curious how you are going to have access to the struct kvm *kvm, or you
>>> are relying on the userfaultfd to track the write faults only as part of the
>>> QEMU userfault thread?
>>>
>>
>> Hi Neo,
>>
>> For the vGPU used as a device for a KVM guest, there will be interfaces
>> wrapped or implemented in the KVM layer, as a counterpart to the
>> interfaces for Xen. That is where the KVM-related code is supposed to be.
> 
> Hi Jike,
> 
> Is this discussed anywhere on the mailing list already? Sorry if I have missed
> such conversation.
>

Hi Neo,

Not exactly, but we can discuss it if necessary :)

The Intel vGPU device-model, which is part of the i915 driver, has to be able to
emulate a vGPU for *both* XenGT and KVMGT guests. That means there must be
a branch point somewhere, directing to Xen-specific or KVM-specific logic accordingly.


--
Thanks,
Jike
Dong Jia Shi May 19, 2016, 7:28 a.m. UTC | #41
On Fri, 13 May 2016 02:05:01 -0700
Neo Jia <cjia@nvidia.com> wrote:

...snip...

> 
> Hi Dong,
> 
> We should definitely be mindful about the data structure performance especially
> dealing with kernel. But for now, we haven't done any performance analysis yet
> for the current rbtree implementation, later we will definitely run it through
> large guest RAM configuration and multiple virtual devices cases, etc. to
> collect data.
> 
> Regarding your use case, may I ask if there will be concurrent command streams
> running for the same VM?
Hi Neo:

Sorry for the late response. Spent some time to make the confirmation.

For our case, one iommu group will add one (and only one) ccw-device.
For one ccw-device, there will be no concurrent command streams from it.

> If yes, those two transaction requests (if we
> implement them) will compete not only for the rbtree lock but also for the GUP locks.
Since the answer is 'no', I guess we needn't do this. :>

> 
> Also, what is the typical guest RAM we are talking about here for your use case,
> and do you have any rough estimation of the active working set of those DMA pages?
> 
I'm afraid there is no typical guest RAM for the I/O instructions
issued by the passed-through ccw-device drivers. They can use any
memory chunk allocated by a kmalloc.

The working set depends on how much memory is used by the device drivers,
and of course on the amount of available memory. Since there are no
restrictions on memory usage for this case, it varies...

[...]

--------
Dong Jia

Tian, Kevin May 20, 2016, 3:21 a.m. UTC | #42
> From: Dong Jia [mailto:bjsdjshi@linux.vnet.ibm.com]
> Sent: Thursday, May 19, 2016 3:28 PM
> 
> On Fri, 13 May 2016 02:05:01 -0700
> Neo Jia <cjia@nvidia.com> wrote:
> 
> ...snip...
> 
> >
> > Hi Dong,
> >
> > We should definitely be mindful about the data structure performance especially
> > dealing with kernel. But for now, we haven't done any performance analysis yet
> > for the current rbtree implementation, later we will definitely run it through
> > large guest RAM configuration and multiple virtual devices cases, etc. to
> > collect data.
> >
> > Regarding your use case, may I ask if there will be concurrent command streams
> > running for the same VM?
> Hi Neo:
> 
> Sorry for the late response. Spent some time to make the confirmation.
> 
> For our case, one iommu group will add one (and only one) ccw-device.
> For one ccw-device, there will be no concurrent command streams from it.
> 

Hi, Dong,

Looks there can be multiple devices behind one channel, according to:
https://en.wikipedia.org/wiki/Channel_I/O

Do they need to be assigned together as one iommu group? If not, how is
the isolation being done in your implementation? Based on cmd scanning in 
Qemu-side?

Another curious question about channel io itself. I'm unclear whether the 
channel here only fulfills the role of DMA controller (i.e. controlling how
device access memory), or also offloads CPU accesses to the registers
on the ccw-device. Are ccw-device registers directly addressable by CPU
on s390, similar to MMIO concept on x86? If yes, I guess you also need
provide region info in vfio-ccw to control which I/O resource can be accessed
by user space (looks not there in your vfio-ccw patch series). If not, how 
do you control the isolation in that aspect? :-)

Thanks
Kevin
Dong Jia Shi June 6, 2016, 6:59 a.m. UTC | #43
On Fri, 20 May 2016 03:21:31 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Dong Jia [mailto:bjsdjshi@linux.vnet.ibm.com]
> > Sent: Thursday, May 19, 2016 3:28 PM
> > 
> > On Fri, 13 May 2016 02:05:01 -0700
> > Neo Jia <cjia@nvidia.com> wrote:
> > 
> > ...snip...
> > 
> > >
> > > Hi Dong,
> > >
> > > We should definitely be mindful about the data structure performance especially
> > > dealing with kernel. But for now, we haven't done any performance analysis yet
> > > for the current rbtree implementation, later we will definitely run it through
> > > large guest RAM configuration and multiple virtual devices cases, etc. to
> > > collect data.
> > >
> > > Regarding your use case, may I ask if there will be concurrent command streams
> > > running for the same VM?
> > Hi Neo:
> > 
> > Sorry for the late response. Spent some time to make the confirmation.
> > 
> > For our case, one iommu group will add one (and only one) ccw-device.
> > For one ccw-device, there will be no concurrent command streams from it.
> > 
> 
> Hi, Dong,
> 
> Looks there can be multiple devices behind one channel, according to:
> https://en.wikipedia.org/wiki/Channel_I/O
Dear Kevin:

One subchannel (the co-processor to offload the I/O operations) could
be assigned to one device at a time. See below.

> 
> Do they need to be assigned together as one iommu group?
So, 'N/A' to this question.

> If not, how is
> the isolation being done in your implementation? Based on cmd scanning in 
> Qemu-side?
It's a 'one device'-'one subchannel'-'one iommu group' relation then.
The isolation looks quite natural.

> 
> Another curious question about channel io itself. I'm unclear whether the 
> channel here only fulfills the role of DMA controller (i.e. controlling how
> device access memory), or also offloads CPU accesses to the registers
> on the ccw-device. Are ccw-device registers directly addressable by CPU
> on s390, similar to MMIO concept on x86? If yes, I guess you also need
> provide region info in vfio-ccw to control which I/O resource can be accessed
> by user space (looks not there in your vfio-ccw patch series). If not, how 
> do you control the isolation in that aspect? :-)
Channel I/O is quite different to PCI, so I copied some more details
here. Hope these could help.

Channel subsystem:
The channel subsystem directs the flow of information between I/O devices
and main storage. It relieves CPUs of the task of communicating directly
with I/O devices and permits data processing to proceed concurrently with
I/O processing.

Channel path:
The channel subsystem uses one or more channel paths as the communication
link in managing the flow of information to or from I/O devices.

Subchannel:
Within the channel subsystem are subchannels. One subchannel of type I/O
is provided for and dedicated to each I/O device accessible to the channel
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
subsystem.

Control unit:
A control unit implements a standardized interface which translates between
the Channel Subsystem and the actual device. It does this by adapting the
characteristics of each device so that it can respond to the standard form
of control provided by the channel subsystem.

Channel Path:
The channel subsystem communicates with the I/O devices by means of
channel paths between the channel subsystem and control units.

+-------------------+
| channel subsystem |
+-------------------+
|                   |
|   +----------+    |              +--------------+    +------------+
|   |subchannel|    | channel path | control unit |    | I/O device |
|   +---------------------------------------------------------------+
|   | subchno  |    |              |              |    |    devno   |
|   +----------+    |              +--------------+    +------------+
|                   |
+-------------------+

There is no concept of ccw-device registers by the subchannel. Control
unit will interact with the device, collect the I/O result, and inform
the result to the subchannel.
So it seems to me, there is no need to provide region info for
isolation. As mentioned above, the isolation is quite natural.

Please correct me in case I misunderstood some of the concepts in your
questions and gave irrelevant answers. :>

> 
> Thanks
> Kevin
> 



--------
Dong Jia

Tian, Kevin June 7, 2016, 2:47 a.m. UTC | #44
> From: Dong Jia
> Sent: Monday, June 06, 2016 2:59 PM
> 

[...]

> Channel I/O is quite different to PCI, so I copied some more details
> here. Hope these could help.
> 
> Channel subsystem:
> The channel subsystem directs the flow of information between I/O devices
> and main storage. It relieves CPUs of the task of communicating directly
> with I/O devices and permits data processing to proceed concurrently with
> I/O processing.
> 
> Channel path:
> The channel subsystem uses one or more channel paths as the communication
> link in managing the flow of information to or from I/O devices.
> 
> Subchannel:
> Within the channel subsystem are subchannels. One subchannel of type I/O
> is provided for and dedicated to each I/O device accessible to the channel
>                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> subsystem.
> 
> Control unit:
> A control unit implements a standardized interface which translates between
> the Channel Subsystem and the actual device. It does this by adapting the
> characteristics of each device so that it can respond to the standard form
> of control provided by the channel subsystem.
> 
> Channel Path:
> The channel subsystem communicates with the I/O devices by means of
> channel paths between the channel subsystem and control units.
> 
> +-------------------+
> | channel subsystem |
> +-------------------+
> |                   |
> |   +----------+    |              +--------------+    +------------+
> |   |subchannel|    | channel path | control unit |    | I/O device |
> |   +---------------------------------------------------------------+
> |   | subchno  |    |              |              |    |    devno   |
> |   +----------+    |              +--------------+    +------------+
> |                   |
> +-------------------+
> 
> There is no concept of ccw-device registers by the subchannel. Control
> unit will interact with the device, collect the I/O result, and inform
> the result to the subchannel.
> So it seems to me, there is no need to provide region info for
> isolation. As mentioned above, the isolation is quite natural.
> 
> Please correct me in case I misunderstood some of the concepts in your
> questions and gave irrelevant answers. :>
> 

Thanks for above background which is very useful. Several follow-up Qs:

1) Does it mean that VFIO is managing resource in subchannel level, so Qemu
can only operate subchannels assigned to itself (then emulate as the complete
channel io sub-system to guest)?

2) How are ccw commands associated with a subchannel? Are they submitted
through a dedicated subchannel interface (so VFIO can easily map that interface)
or that subchannel is specified by a special ccw cmd (means VFIO-ccw needs
to scan cmds to avoid malicious attempts on non-assigned subchannels)?

Thanks
Kevin
Dong Jia Shi June 7, 2016, 7:04 a.m. UTC | #45
On Tue, 7 Jun 2016 02:47:10 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Dong Jia
> > Sent: Monday, June 06, 2016 2:59 PM
> > 
> 
> [...]
> 
> > Channel I/O is quite different to PCI, so I copied some more details
> > here. Hope these could help.
> > 
> > Channel subsystem:
> > The channel subsystem directs the flow of information between I/O devices
> > and main storage. It relieves CPUs of the task of communicating directly
> > with I/O devices and permits data processing to proceed concurrently with
> > I/O processing.
> > 
> > Channel path:
> > The channel subsystem uses one or more channel paths as the communication
> > link in managing the flow of information to or from I/O devices.
> > 
> > Subchannel:
> > Within the channel subsystem are subchannels. One subchannel of type I/O
> > is provided for and dedicated to each I/O device accessible to the channel
> >                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > subsystem.
> > 
> > Control unit:
> > A control unit implements a standardized interface which translates between
> > the Channel Subsystem and the actual device. It does this by adapting the
> > characteristics of each device so that it can respond to the standard form
> > of control provided by the channel subsystem.
> > 
> > Channel Path:
> > The channel subsystem communicates with the I/O devices by means of
> > channel paths between the channel subsystem and control units.
> > 
> > +-------------------+
> > | channel subsystem |
> > +-------------------+
> > |                   |
> > |   +----------+    |              +--------------+    +------------+
> > |   |subchannel|    | channel path | control unit |    | I/O device |
> > |   +---------------------------------------------------------------+
> > |   | subchno  |    |              |              |    |    devno   |
> > |   +----------+    |              +--------------+    +------------+
> > |                   |
> > +-------------------+
> > 
> > There is no concept of ccw-device registers by the subchannel. Control
> > unit will interact with the device, collect the I/O result, and inform
> > the result to the subchannel.
> > So it seems to me, there is no need to provide region info for
> > isolation. As mentioned above, the isolation is quite natural.
> > 
> > Please correct me in case I misunderstood some of the concepts in your
> > questions and gave irrelevant answers. :>
> > 
> 
> Thanks for above background which is very useful. Several follow-up Qs:
> 
> 1) Does it mean that VFIO is managing resource in subchannel level, so Qemu
> can only operate subchannels assigned to itself
Dear Kevin,

This understanding is basically right, but not exactly.

Linux creates a 'struct ccw_device' for each device it has detected, and
a 'struct subchannel' for the corresponding subchannel. When we issue
a command to a device instance, the device can find the subchannel and
pass the command to it.
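
To illustrate the relationship, here is a simplified, self-contained sketch;
the structures and the helper below are illustrative stand-ins, not the
actual drivers/s390/cio definitions:

/* Illustrative stand-ins only -- not the real cio structures. */
struct subchannel_id {
	unsigned int ssid;
	unsigned int sch_no;
};

struct subchannel {
	struct subchannel_id schid;	/* how the hardware names this subchannel */
};

struct ccw_device {
	struct subchannel *sch;		/* the I/O subchannel dedicated to this device */
	unsigned int devno;		/* device number visible to the administrator */
};

/* Hypothetical helper standing in for the ssch (start subchannel) path. */
static int start_subchannel(struct subchannel *sch, void *channel_program)
{
	(void)sch;
	(void)channel_program;
	return 0;
}

/* A command issued against the device instance is forwarded to its subchannel. */
static int ccw_device_start_sketch(struct ccw_device *cdev, void *channel_program)
{
	return start_subchannel(cdev->sch, channel_program);
}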

The current vfio-ccw implementation targets device passthrough, so I'd say
that VFIO is managing resources at both the device and the subchannel level.

However, there is a discussion inside our team about whether subchannel
passthrough would be better, so in the future it's possible that the
management will be done at the subchannel level only.

> (then emulate as the complete channel io sub-system to guest)?
This is right.

> 
> 2) How are ccw commands associated with a subchannel?
The I/O instruction requires a subchannel id and an ORB
(Operation-Request Block, which contains the execution parameters,
including the address of the CCWs). So when an I/O instruction is
intercepted, we know which subchannel is the target, and using this
target information we can find the real physical device and the
subchannel to perform the instruction.
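
Roughly, the interception path looks like this (a hedged sketch only; the
ORB layout and the helpers are simplified assumptions, not the real
vfio-ccw code):

#include <errno.h>

/* Simplified ORB: the real Operation-Request Block carries more fields. */
struct orb_sketch {
	unsigned long cpa;		/* channel-program address: guest address of the first CCW */
};

struct vfio_ccw_sub {
	unsigned int guest_schid;	/* subchannel id as the guest sees it */
	unsigned int host_schid;	/* the real subchannel backing it */
};

/* Hypothetical helper: translate the channel program and issue it on the host. */
static int issue_on_host_subchannel(unsigned int host_schid, struct orb_sketch *orb)
{
	(void)host_schid;
	(void)orb;
	return 0;
}

/*
 * The intercepted SSCH instruction itself carries the subchannel id and the
 * ORB carries the CCW address, so the target is known without scanning CCWs.
 */
static int handle_ssch_intercept(struct vfio_ccw_sub *sub,
				 unsigned int schid, struct orb_sketch *orb)
{
	if (schid != sub->guest_schid)
		return -ENODEV;		/* not a subchannel assigned to this guest */

	return issue_on_host_subchannel(sub->host_schid, orb);
}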

> Are they submitted
> through a dedicated subchannel interface (so VFIO can easily map that interface)
Yes, you can understand it that way.

> or that subchannel is specified by a special ccw cmd (means VFIO-ccw needs
> to scan cmds to avoid malicious attempts on non-assigned subchannels)?
No. CCWs themselves don't contain subchannel information.
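
To make that concrete, a format-1 CCW is only an 8-byte command element,
sketched below from the architecture description (the field names are the
conventional ones; nothing here comes from this patch):

#include <stdint.h>

/*
 * Format-1 channel command word.  Note there is no subchannel field anywhere;
 * the subchannel is named only by the SSCH instruction that starts the program.
 */
struct ccw1_sketch {
	uint8_t  cmd_code;	/* read, write, control, transfer-in-channel, ... */
	uint8_t  flags;		/* command/data chaining, SLI, skip, ... */
	uint16_t count;		/* byte count */
	uint32_t cda;		/* data address (or next CCW address for a TIC) */
};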

> 
> Thanks
> Kevin
> 

--------
Dong Jia

diff mbox

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 75b24e9..a970854 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -36,6 +36,7 @@ 
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
 #include <linux/workqueue.h>
+#include <linux/vgpu.h>
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
@@ -67,6 +68,11 @@  struct vfio_domain {
 	struct list_head	group_list;
 	int			prot;		/* IOMMU_CACHE */
 	bool			fgsp;		/* Fine-grained super pages */
+	bool			vfio_iommu_api_only;	/* Domain for device which
+							   is without physical IOMMU */
+	struct mm_struct	*vmm_mm;	/* VMM's mm */
+	struct rb_root		pfn_list;	/* Host pfn list for requested gfns */
+	struct mutex		lock;		/* mutex for pfn_list */
 };
 
 struct vfio_dma {
@@ -83,6 +89,19 @@  struct vfio_group {
 };
 
 /*
+ * Guest RAM pinning working set or DMA target
+ */
+struct vfio_vgpu_pfn {
+	struct rb_node		node;
+	unsigned long		vmm_va;		/* VMM virtual addr */
+	dma_addr_t		iova;		/* IOVA */
+	unsigned long		npage;		/* number of pages */
+	unsigned long		pfn;		/* Host pfn */
+	int			prot;
+	atomic_t		ref_count;
+	struct list_head	next;
+};
+/*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
  */
@@ -130,6 +149,53 @@  static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 	rb_erase(&old->node, &iommu->dma_list);
 }
 
+/*
+ * Helper Functions for host pfn list
+ */
+
+static struct vfio_vgpu_pfn *vfio_find_vgpu_pfn(struct vfio_domain *domain,
+						unsigned long pfn)
+{
+	struct rb_node *node = domain->pfn_list.rb_node;
+
+	while (node) {
+		struct vfio_vgpu_pfn *vgpu_pfn = rb_entry(node, struct vfio_vgpu_pfn, node);
+
+		if (pfn <= vgpu_pfn->pfn)
+			node = node->rb_left;
+		else if (pfn >= vgpu_pfn->pfn)
+			node = node->rb_right;
+		else
+			return vgpu_pfn;
+	}
+
+	return NULL;
+}
+
+static void vfio_link_vgpu_pfn(struct vfio_domain *domain, struct vfio_vgpu_pfn *new)
+{
+	struct rb_node **link = &domain->pfn_list.rb_node, *parent = NULL;
+	struct vfio_vgpu_pfn *vgpu_pfn;
+
+	while (*link) {
+		parent = *link;
+		vgpu_pfn = rb_entry(parent, struct vfio_vgpu_pfn, node);
+
+		if (new->pfn <= vgpu_pfn->pfn)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &domain->pfn_list);
+}
+
+static void vfio_unlink_vgpu_pfn(struct vfio_domain *domain, struct vfio_vgpu_pfn *old)
+{
+	rb_erase(&old->node, &domain->pfn_list);
+}
+
 struct vwork {
 	struct mm_struct	*mm;
 	long			npage;
@@ -228,20 +294,22 @@  static int put_pfn(unsigned long pfn, int prot)
 	return 0;
 }
 
-static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
+static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
+			 int prot, unsigned long *pfn)
 {
 	struct page *page[1];
 	struct vm_area_struct *vma;
 	int ret = -EFAULT;
 
-	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
+	if (get_user_pages_remote(NULL, mm, vaddr, 1, !!(prot & IOMMU_WRITE),
+				    0, page, NULL) == 1) {
 		*pfn = page_to_pfn(page[0]);
 		return 0;
 	}
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&mm->mmap_sem);
 
-	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+	vma = find_vma_intersection(mm, vaddr, vaddr + 1);
 
 	if (vma && vma->vm_flags & VM_PFNMAP) {
 		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -249,28 +317,63 @@  static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 			ret = 0;
 	}
 
-	up_read(&current->mm->mmap_sem);
+	up_read(&mm->mmap_sem);
 
 	return ret;
 }
 
 /*
+ * Get first domain with iommu and without iommu from iommu's domain_list for
+ * lookups
+ * @iommu [in]: iommu structure
+ * @domain [out]: domain with iommu
+ * @domain_vgpu [out] : domain without iommu for vGPU
+ */
+static void get_first_domains(struct vfio_iommu *iommu, struct vfio_domain **domain,
+			      struct vfio_domain **domain_vgpu)
+{
+	struct vfio_domain *d;
+
+	if (!domain || !domain_vgpu)
+		return;
+
+	list_for_each_entry(d, &iommu->domain_list, next) {
+		if (d->vfio_iommu_api_only && !*domain_vgpu)
+			*domain_vgpu = d;
+		else if (!*domain)
+			*domain = d;
+		if (*domain_vgpu && *domain)
+			break;
+	}
+}
+
+/*
  * Attempt to pin pages.  We really don't want to track all the pfns and
  * the iommu can only map chunks of consecutive pfns anyway, so get the
  * first page and all consecutive pages with the same locking.
  */
-static long vfio_pin_pages(unsigned long vaddr, long npage,
-			   int prot, unsigned long *pfn_base)
+static long vfio_pin_pages_internal(void *domain_data, unsigned long vaddr, long npage,
+		             int prot, unsigned long *pfn_base)
 {
+	struct vfio_domain *domain = domain_data;
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	bool lock_cap = capable(CAP_IPC_LOCK);
 	long ret, i;
 	bool rsvd;
+	struct mm_struct *mm;
 
-	if (!current->mm)
+	if (!domain)
 		return -ENODEV;
 
-	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
+	if (domain->vfio_iommu_api_only)
+		mm = domain->vmm_mm;
+	else
+		mm = current->mm;
+
+	if (!mm)
+		return -ENODEV;
+
+	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
 	if (ret)
 		return ret;
 
@@ -293,7 +396,7 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 		unsigned long pfn = 0;
 
-		ret = vaddr_get_pfn(vaddr, prot, &pfn);
+		ret = vaddr_get_pfn(mm, vaddr, prot, &pfn);
 		if (ret)
 			break;
 
@@ -318,25 +421,183 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 	return i;
 }
 
-static long vfio_unpin_pages(unsigned long pfn, long npage,
-			     int prot, bool do_accounting)
+static long vfio_unpin_pages_internal(void *domain_data, unsigned long pfn, long npage,
+				      int prot, bool do_accounting)
 {
+	struct vfio_domain *domain = domain_data;
 	unsigned long unlocked = 0;
 	long i;
 
+	if (!domain)
+		return -ENODEV;
+
 	for (i = 0; i < npage; i++)
 		unlocked += put_pfn(pfn++, prot);
 
 	if (do_accounting)
 		vfio_lock_acct(-unlocked);
+	return unlocked;
+}
+
+/*
+ * Pin a set of guest PFNs and return their associated host PFNs for vGPU.
+ * @vaddr [in]: array of guest PFNs
+ * @npage [in]: count of array elements
+ * @prot [in] : protection flags
+ * @pfn_base[out] : array of host PFNs
+ */
+int vfio_pin_pages(void *iommu_data, dma_addr_t *vaddr, long npage,
+		   int prot, dma_addr_t *pfn_base)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain = NULL, *domain_vgpu = NULL;
+	int i = 0, ret = 0;
+	long retpage;
+	dma_addr_t remote_vaddr = 0;
+	dma_addr_t *pfn = pfn_base;
+	struct vfio_dma *dma;
+
+	if (!iommu || !pfn_base)
+		return -EINVAL;
+
+	if (list_empty(&iommu->domain_list)) {
+		ret = -EINVAL;
+		goto pin_done;
+	}
+
+	get_first_domains(iommu, &domain, &domain_vgpu);
+
+	// Return error if vGPU domain doesn't exist
+	if (!domain_vgpu) {
+		ret = -EINVAL;
+		goto pin_done;
+	}
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_vgpu_pfn *p;
+		struct vfio_vgpu_pfn *lpfn;
+		unsigned long tpfn;
+		dma_addr_t iova;
+
+		mutex_lock(&iommu->lock);
+
+		iova = vaddr[i] << PAGE_SHIFT;
+
+		dma = vfio_find_dma(iommu, iova, 0 /*  size */);
+		if (!dma) {
+			mutex_unlock(&iommu->lock);
+			ret = -EINVAL;
+			goto pin_done;
+		}
+
+		remote_vaddr = dma->vaddr + iova - dma->iova;
+
+		retpage = vfio_pin_pages_internal(domain_vgpu, remote_vaddr,
+						  (long)1, prot, &tpfn);
+		mutex_unlock(&iommu->lock);
+		if (retpage <= 0) {
+			WARN_ON(!retpage);
+			ret = (int)retpage;
+			goto pin_done;
+		}
+
+		pfn[i] = tpfn;
+
+		mutex_lock(&domain_vgpu->lock);
+
+		// search if pfn exist
+		if ((p = vfio_find_vgpu_pfn(domain_vgpu, tpfn))) {
+			atomic_inc(&p->ref_count);
+			mutex_unlock(&domain_vgpu->lock);
+			continue;
+		}
+
+		// add to pfn_list
+		lpfn = kzalloc(sizeof(*lpfn), GFP_KERNEL);
+		if (!lpfn) {
+			ret = -ENOMEM;
+			mutex_unlock(&domain_vgpu->lock);
+			goto pin_done;
+		}
+		lpfn->vmm_va = remote_vaddr;
+		lpfn->iova = iova;
+		lpfn->pfn = pfn[i];
+		lpfn->npage = 1;
+		lpfn->prot = prot;
+		atomic_inc(&lpfn->ref_count);
+		vfio_link_vgpu_pfn(domain_vgpu, lpfn);
+		mutex_unlock(&domain_vgpu->lock);
+	}
+
+	ret = i;
+
+pin_done:
+	return ret;
+}
+EXPORT_SYMBOL(vfio_pin_pages);
+
+static int vfio_vgpu_unpin_pfn(struct vfio_domain *domain, struct vfio_vgpu_pfn *vpfn)
+{
+	int ret;
+
+	ret = vfio_unpin_pages_internal(domain, vpfn->pfn, vpfn->npage, vpfn->prot, true);
+
+	if (atomic_dec_and_test(&vpfn->ref_count)) {
+		// remove from pfn_list
+		vfio_unlink_vgpu_pfn(domain, vpfn);
+		kfree(vpfn);
+	}
+
+	return ret;
+}
+
+/*
+ * Unpin set of host PFNs for vGPU.
+ * @pfn	[in] : array of host PFNs to be unpinned.
+ * @npage [in] :count of elements in array, that is number of pages.
+ * @prot [in] : protection flags
+ */
+int vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
+		     int prot)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain = NULL, *domain_vgpu = NULL;
+	long unlocked = 0;
+	int i;
+	if (!iommu || !pfn)
+		return -EINVAL;
+
+	if (list_empty(&iommu->domain_list))
+		return -EINVAL;
+
+	get_first_domains(iommu, &domain, &domain_vgpu);
+
+	// Return error if vGPU domain doesn't exist
+	if (!domain_vgpu)
+		return -EINVAL;
+
+	mutex_lock(&domain_vgpu->lock);
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_vgpu_pfn *p;
+
+		// verify if pfn exist in pfn_list
+		if (!(p = vfio_find_vgpu_pfn(domain_vgpu, *(pfn + i)))) {
+			continue;
+		}
+
+		unlocked += vfio_vgpu_unpin_pfn(domain_vgpu, p);
+	}
+	mutex_unlock(&domain_vgpu->lock);
 
 	return unlocked;
 }
+EXPORT_SYMBOL(vfio_unpin_pages);
 
 static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
 	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
-	struct vfio_domain *domain, *d;
+	struct vfio_domain *domain = NULL, *d, *domain_vgpu = NULL;
 	long unlocked = 0;
 
 	if (!dma->size)
@@ -348,12 +609,18 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 	 * pfns to unpin.  The rest need to be unmapped in advance so we have
 	 * no iommu translations remaining when the pages are unpinned.
 	 */
-	domain = d = list_first_entry(&iommu->domain_list,
-				      struct vfio_domain, next);
 
+	get_first_domains(iommu, &domain, &domain_vgpu);
+
+	if (!domain)
+		return;
+
+	d = domain;
 	list_for_each_entry_continue(d, &iommu->domain_list, next) {
-		iommu_unmap(d->domain, dma->iova, dma->size);
-		cond_resched();
+		if (!d->vfio_iommu_api_only) {
+			iommu_unmap(d->domain, dma->iova, dma->size);
+			cond_resched();
+		}
 	}
 
 	while (iova < end) {
@@ -382,7 +649,7 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		if (WARN_ON(!unmapped))
 			break;
 
-		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
+		unlocked += vfio_unpin_pages_internal(domain, phys >> PAGE_SHIFT,
 					     unmapped >> PAGE_SHIFT,
 					     dma->prot, false);
 		iova += unmapped;
@@ -406,8 +673,10 @@  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 	unsigned long bitmap = ULONG_MAX;
 
 	mutex_lock(&iommu->lock);
-	list_for_each_entry(domain, &iommu->domain_list, next)
-		bitmap &= domain->domain->ops->pgsize_bitmap;
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		if (!domain->vfio_iommu_api_only)
+			bitmap &= domain->domain->ops->pgsize_bitmap;
+	}
 	mutex_unlock(&iommu->lock);
 
 	/*
@@ -517,6 +786,9 @@  static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
 	long i;
 	int ret;
 
+	if (domain->vfio_iommu_api_only)
+		return -EINVAL;
+
 	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
 		ret = iommu_map(domain->domain, iova,
 				(phys_addr_t)pfn << PAGE_SHIFT,
@@ -538,6 +810,9 @@  static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
 	int ret;
 
 	list_for_each_entry(d, &iommu->domain_list, next) {
+		if (d->vfio_iommu_api_only)
+			continue;
+
 		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
 				npage << PAGE_SHIFT, prot | d->prot);
 		if (ret) {
@@ -552,8 +827,11 @@  static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
 	return 0;
 
 unwind:
-	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
+	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
+		if (d->vfio_iommu_api_only)
+			continue;
 		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
+	}
 
 	return ret;
 }
@@ -569,6 +847,8 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	uint64_t mask;
 	struct vfio_dma *dma;
 	unsigned long pfn;
+	struct vfio_domain *domain = NULL;
+	int domain_with_iommu_present = 0;
 
 	/* Verify that none of our __u64 fields overflow */
 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
@@ -611,9 +891,22 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	/* Insert zero-sized and grow as we map chunks of it */
 	vfio_link_dma(iommu, dma);
 
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		if (!domain->vfio_iommu_api_only) {
+			domain_with_iommu_present = 1;
+			break;
+		}
+	}
+
+	// Skip pin and map only if domain without IOMMU is present
+	if (!domain_with_iommu_present) {
+		dma->size = size;
+		goto map_done;
+	}
+
 	while (size) {
 		/* Pin a contiguous chunk of memory */
-		npage = vfio_pin_pages(vaddr + dma->size,
+		npage = vfio_pin_pages_internal(domain, vaddr + dma->size,
 				       size >> PAGE_SHIFT, prot, &pfn);
 		if (npage <= 0) {
 			WARN_ON(!npage);
@@ -623,8 +916,8 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 
 		/* Map it! */
 		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
-		if (ret) {
-			vfio_unpin_pages(pfn, npage, prot, true);
+		if (ret){
+			vfio_unpin_pages_internal(domain, pfn, npage, prot, true);
 			break;
 		}
 
@@ -635,6 +928,7 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	if (ret)
 		vfio_remove_dma(iommu, dma);
 
+map_done:
 	mutex_unlock(&iommu->lock);
 	return ret;
 }
@@ -654,12 +948,15 @@  static int vfio_bus_type(struct device *dev, void *data)
 static int vfio_iommu_replay(struct vfio_iommu *iommu,
 			     struct vfio_domain *domain)
 {
-	struct vfio_domain *d;
+	struct vfio_domain *d = NULL, *d_vgpu = NULL;
 	struct rb_node *n;
 	int ret;
 
+	if (domain->vfio_iommu_api_only)
+		return -EINVAL;
+
 	/* Arbitrarily pick the first domain in the list for lookups */
-	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
+	get_first_domains(iommu, &d, &d_vgpu);
 	n = rb_first(&iommu->dma_list);
 
 	/* If there's not a domain, there better not be any mappings */
@@ -716,6 +1013,9 @@  static void vfio_test_domain_fgsp(struct vfio_domain *domain)
 	struct page *pages;
 	int ret, order = get_order(PAGE_SIZE * 2);
 
+	if (domain->vfio_iommu_api_only)
+		return;
+
 	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
 	if (!pages)
 		return;
@@ -769,6 +1069,23 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_free;
 
+	if (!iommu_present(bus) && (bus == &vgpu_bus_type)) {
+		struct vgpu_device *vgpu_dev = NULL;
+
+		vgpu_dev = get_vgpu_device_from_group(iommu_group);
+		if (!vgpu_dev)
+			goto out_free;
+
+		vgpu_dev->iommu_data = iommu;
+		domain->vfio_iommu_api_only = true;
+		domain->vmm_mm = current->mm;
+		INIT_LIST_HEAD(&domain->group_list);
+		list_add(&group->next, &domain->group_list);
+		domain->pfn_list = RB_ROOT;
+		mutex_init(&domain->lock);
+		goto out_success;
+	}
+
 	domain->domain = iommu_domain_alloc(bus);
 	if (!domain->domain) {
 		ret = -EIO;
@@ -834,6 +1151,7 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_detach;
 
+out_success:
 	list_add(&domain->next, &iommu->domain_list);
 
 	mutex_unlock(&iommu->lock);
@@ -854,11 +1172,36 @@  out_free:
 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
 {
 	struct rb_node *node;
+	struct vfio_domain *domain = NULL, *domain_vgpu = NULL;
+
+	get_first_domains(iommu, &domain, &domain_vgpu);
+
+	if (domain_vgpu) {
+		int unlocked;
+		mutex_lock(&domain_vgpu->lock);
+		while ((node = rb_first(&domain_vgpu->pfn_list))) {
+			unlocked = vfio_vgpu_unpin_pfn(domain_vgpu,
+					rb_entry(node, struct vfio_vgpu_pfn, node));
+		}
+		mutex_unlock(&domain_vgpu->lock);
+	}
 
 	while ((node = rb_first(&iommu->dma_list)))
 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
 }
 
+static bool list_is_singular_iommu_domain(struct vfio_iommu *iommu)
+{
+	struct vfio_domain *domain;
+	int domain_iommu = 0;
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		if (!domain->vfio_iommu_api_only)
+			domain_iommu++;
+	}
+	return (domain_iommu == 1);
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
@@ -872,19 +1215,28 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 		list_for_each_entry(group, &domain->group_list, next) {
 			if (group->iommu_group != iommu_group)
 				continue;
+			if (!domain->vfio_iommu_api_only)
+				iommu_detach_group(domain->domain, iommu_group);
+			else {
+				struct vgpu_device *vgpu_dev = NULL;
 
-			iommu_detach_group(domain->domain, iommu_group);
+				vgpu_dev = get_vgpu_device_from_group(iommu_group);
+				if (vgpu_dev)
+					vgpu_dev->iommu_data = NULL;
+
+			}
 			list_del(&group->next);
 			kfree(group);
 			/*
 			 * Group ownership provides privilege, if the group
 			 * list is empty, the domain goes away.  If it's the
-			 * last domain, then all the mappings go away too.
+			 * last domain with iommu, then all the mappings go away too.
 			 */
 			if (list_empty(&domain->group_list)) {
-				if (list_is_singular(&iommu->domain_list))
+				if (list_is_singular_iommu_domain(iommu))
 					vfio_iommu_unmap_unpin_all(iommu);
-				iommu_domain_free(domain->domain);
+				if (!domain->vfio_iommu_api_only)
+					iommu_domain_free(domain->domain);
 				list_del(&domain->next);
 				kfree(domain);
 			}
@@ -936,11 +1288,22 @@  static void vfio_iommu_type1_release(void *iommu_data)
 				 &iommu->domain_list, next) {
 		list_for_each_entry_safe(group, group_tmp,
 					 &domain->group_list, next) {
-			iommu_detach_group(domain->domain, group->iommu_group);
+			if (!domain->vfio_iommu_api_only)
+				iommu_detach_group(domain->domain, group->iommu_group);
+			else {
+				struct vgpu_device *vgpu_dev = NULL;
+
+				vgpu_dev = get_vgpu_device_from_group(group->iommu_group);
+				if (vgpu_dev)
+					vgpu_dev->iommu_data = NULL;
+
+			}
+
 			list_del(&group->next);
 			kfree(group);
 		}
-		iommu_domain_free(domain->domain);
+		if (!domain->vfio_iommu_api_only)
+			iommu_domain_free(domain->domain);
 		list_del(&domain->next);
 		kfree(domain);
 	}
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0ecae0b..d280868 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -127,6 +127,12 @@  static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
 }
 #endif /* CONFIG_EEH */
 
+extern long vfio_pin_pages(void *iommu_data, dma_addr_t *vaddr, long npage,
+		           int prot, dma_addr_t *pfn_base);
+
+extern long vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
+			     int prot);
+
 /*
  * IRQfd - generic
  */
diff --git a/include/linux/vgpu.h b/include/linux/vgpu.h
index 03a77cf..cc18353 100644
--- a/include/linux/vgpu.h
+++ b/include/linux/vgpu.h
@@ -36,6 +36,7 @@  struct vgpu_device {
 	struct device		dev;
 	struct gpu_device	*gpu_dev;
 	struct iommu_group	*group;
+	void			*iommu_data;
 #define DEVICE_NAME_LEN		(64)
 	char			dev_name[DEVICE_NAME_LEN];
 	uuid_le			uuid;
@@ -209,8 +210,7 @@  extern void vgpu_unregister_driver(struct vgpu_driver *drv);
 
 extern int vgpu_map_virtual_bar(uint64_t virt_bar_addr, uint64_t phys_bar_addr,
 				uint32_t len, uint32_t flags);
-extern int vgpu_dma_do_translate(dma_addr_t * gfn_buffer, uint32_t count);
 
-struct vgpu_device *get_vgpu_device_from_group(struct iommu_group *group);
+extern struct vgpu_device *get_vgpu_device_from_group(struct iommu_group *group);
 
 #endif /* VGPU_H */
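
For reference, a minimal sketch of how a caller (e.g. a vendor vGPU driver)
might use the two symbols this patch exports; the helper names and the
surrounding context here are illustrative assumptions, not part of the patch:

#include <linux/iommu.h>
#include <linux/slab.h>
#include <linux/vfio.h>

/*
 * Pin a run of guest PFNs before programming them into the device's own MMU.
 * 'iommu_data' is the opaque handle the attach path above stores in
 * vgpu_dev->iommu_data; 'gfns' holds guest page frame numbers.
 */
static long pin_guest_range(void *iommu_data, dma_addr_t *gfns, long count,
			    dma_addr_t **hpfns_out)
{
	dma_addr_t *hpfns;
	long pinned;

	hpfns = kcalloc(count, sizeof(*hpfns), GFP_KERNEL);
	if (!hpfns)
		return -ENOMEM;

	pinned = vfio_pin_pages(iommu_data, gfns, count,
				IOMMU_READ | IOMMU_WRITE, hpfns);
	if (pinned < 0) {
		kfree(hpfns);
		return pinned;
	}

	*hpfns_out = hpfns;	/* host PFNs to program into the device */
	return pinned;
}

static void unpin_guest_range(void *iommu_data, dma_addr_t *hpfns, long count)
{
	vfio_unpin_pages(iommu_data, hpfns, count, IOMMU_READ | IOMMU_WRITE);
	kfree(hpfns);
}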