diff mbox

[v8,3/6] vfio iommu: Add support for mediated devices

Message ID 1476131317-358-4-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived
Headers show

Commit Message

Kirti Wankhede Oct. 10, 2016, 8:28 p.m. UTC
VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
Mediated device only uses IOMMU APIs, the underlying hardware can be
managed by an IOMMU domain.

Aim of this change is:
- To use most of the code of TYPE1 IOMMU driver for mediated devices
- To support direct assigned device and mediated device in single module

Added two new callback functions to struct vfio_iommu_driver_ops. A backend
IOMMU module that supports pinning and unpinning pages for mdev devices
should provide these functions.
Added APIs for pinning and unpinning pages to the VFIO module. These call back
into the backend IOMMU module to actually pin and unpin pages.

This change adds pin and unpin support for mediated device to TYPE1 IOMMU
backend module. More details:
- When iommu_group of mediated devices is attached, task structure is
  cached which is used later to pin pages and page accounting.
- It keeps track of pinned pages for mediated domain. This data is used to
  verify unpinning request and to unpin remaining pages while detaching, if
  there are any.
- Used existing mechanism for page accounting. If an iommu capable domain
  exists in the container then all pages are already pinned and accounted.
  Accounting for mdev device is only done if there is no iommu capable
  domain in the container.
- Page accounting is updated on hot plug and unplug of mdev device and pass
  through device.

Tested by assigning below combinations of devices to a single VM:
- GPU pass through only
- vGPU device only
- One GPU pass through and one vGPU device
- Linux VM hot plug and unplug vGPU device while GPU pass through device
  exist
- Linux VM hot plug and unplug GPU pass through device while vGPU device
  exist

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
---
 drivers/vfio/vfio.c             | 117 +++++++
 drivers/vfio/vfio_iommu_type1.c | 685 ++++++++++++++++++++++++++++++++++------
 include/linux/vfio.h            |  13 +-
 3 files changed, 724 insertions(+), 91 deletions(-)

Comments

Alex Williamson Oct. 11, 2016, 10:06 p.m. UTC | #1
On Tue, 11 Oct 2016 01:58:34 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> Mediated device only uses IOMMU APIs, the underlying hardware can be
> managed by an IOMMU domain.
> 
> Aim of this change is:
> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> - To support direct assigned device and mediated device in single module
> 
> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
> IOMMU module that supports pining and unpinning pages for mdev devices
> should provide these functions.
> Added APIs for pining and unpining pages to VFIO module. These calls back
> into backend iommu module to actually pin and unpin pages.
> 
> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> backend module. More details:
> - When iommu_group of mediated devices is attached, task structure is
>   cached which is used later to pin pages and page accounting.
> - It keeps track of pinned pages for mediated domain. This data is used to
>   verify unpinning request and to unpin remaining pages while detaching, if
>   there are any.
> - Used existing mechanism for page accounting. If iommu capable domain
>   exist in the container then all pages are already pinned and accounted.
>   Accouting for mdev device is only done if there is no iommu capable
>   domain in the container.
> - Page accouting is updated on hot plug and unplug mdev device and pass
>   through device.
> 
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>   exist
> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>   exist
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> ---
>  drivers/vfio/vfio.c             | 117 +++++++
>  drivers/vfio/vfio_iommu_type1.c | 685 ++++++++++++++++++++++++++++++++++------
>  include/linux/vfio.h            |  13 +-
>  3 files changed, 724 insertions(+), 91 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 6fd6fa5469de..e3e342861e04 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1782,6 +1782,123 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
>  }
>  EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
>  
> +static struct vfio_group *vfio_group_from_dev(struct device *dev)
> +{
> +	struct vfio_device *device;
> +	struct vfio_group *group;
> +	int ret;
> +
> +	device = vfio_device_get_from_dev(dev);

Note how this does dev->iommu_group->vfio_group->vfio_device and then
we back out one level to get the vfio_group, it's not a terribly
lightweight path.  Perhaps we should have:

/*
 * Look up the vfio_group for @dev by way of the device's iommu_group,
 * without walking through a vfio_device first.
 *
 * Returns NULL when @dev has no iommu_group; otherwise returns whatever
 * vfio_group_get_from_iommu() yields for that iommu_group.
 *
 * NOTE(review): presumably the returned vfio_group carries a reference
 * that the caller must drop with vfio_group_put() -- confirm against
 * vfio_group_get_from_iommu().
 */
struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	/* The iommu_group reference is only needed for the lookup itself. */
	iommu_group_put(iommu_group);

	return group;
}

vfio_device_get_from_dev() would make use of this.

Then create a separate:

static int vfio_group_add_container_user(struct vfio_group *group)
{

> +	if (!atomic_inc_not_zero(&group->container_users)) {
		return -EINVAL;
> +	}
> +
> +	if (group->noiommu) {
> +		atomic_dec(&group->container_users);
		return -EPERM;
> +	}
> +
> +	if (!group->container->iommu_driver ||
> +	    !vfio_group_viable(group)) {
> +		atomic_dec(&group->container_users);
		return -EINVAL;
> +	}
> +
	return 0;
}

vfio_group_get_external_user() would be updated to use this.  In fact,
creating these two functions and updating the existing code to use
these should be a separate patch.

Note that your version did not hold a group reference while doing the
pin/unpin operations below, which seems like a bug.

> +
> +err_ret:
> +	vfio_device_put(device);
> +	return ERR_PTR(ret);
> +}
> +
> +/*
> + * Pin a set of guest PFNs and return their associated host PFNs for local
> + * domain only.
> + * @dev [in] : device
> + * @user_pfn [in]: array of user/guest PFNs
> + * @npage [in]: count of array elements
> + * @prot [in] : protection flags
> + * @phys_pfn[out] : array of host PFNs
> + */
> +long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> +		    long npage, int prot, unsigned long *phys_pfn)
> +{
> +	struct vfio_container *container;
> +	struct vfio_group *group;
> +	struct vfio_iommu_driver *driver;
> +	ssize_t ret = -EINVAL;
> +
> +	if (!dev || !user_pfn || !phys_pfn)
> +		return -EINVAL;
> +
> +	group = vfio_group_from_dev(dev);
> +	if (IS_ERR(group))
> +		return PTR_ERR(group);

As suggested above:

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	/*
	 * Register as a container user; on failure, release the group
	 * obtained above before propagating the error.
	 */
	ret = vfio_group_add_container_user(group);
	if (ret) {
		vfio_group_put(group);
		return ret;
	}

> +
> +	container = group->container;
> +	if (IS_ERR(container))
> +		return PTR_ERR(container);
> +
> +	down_read(&container->group_lock);
> +
> +	driver = container->iommu_driver;
> +	if (likely(driver && driver->ops->pin_pages))
> +		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
> +					     npage, prot, phys_pfn);
> +
> +	up_read(&container->group_lock);
> +	vfio_group_try_dissolve_container(group);

Even if you're considering that the container_user reference holds the
driver, I think we need a group reference throughout this and this
should end with a vfio_group_put(group);

> +
> +	return ret;
> +
> +}
> +EXPORT_SYMBOL(vfio_pin_pages);
> +
> +/*
> + * Unpin set of host PFNs for local domain only.
> + * @dev [in] : device
> + * @pfn [in] : array of host PFNs to be unpinned.
> + * @npage [in] :count of elements in array, that is number of pages.
> + */
> +long vfio_unpin_pages(struct device *dev, unsigned long *pfn, long npage)
> +{
> +	struct vfio_container *container;
> +	struct vfio_group *group;
> +	struct vfio_iommu_driver *driver;
> +	ssize_t ret = -EINVAL;
> +
> +	if (!dev || !pfn)
> +		return -EINVAL;
> +
> +	group = vfio_group_from_dev(dev);
> +	if (IS_ERR(group))
> +		return PTR_ERR(group);
> +
> +	container = group->container;
> +	if (IS_ERR(container))
> +		return PTR_ERR(container);
> +
> +	down_read(&container->group_lock);
> +
> +	driver = container->iommu_driver;
> +	if (likely(driver && driver->ops->unpin_pages))
> +		ret = driver->ops->unpin_pages(container->iommu_data, pfn,
> +					       npage);
> +
> +	up_read(&container->group_lock);
> +	vfio_group_try_dissolve_container(group);


Same as above on all counts.

> +	return ret;
> +}
> +EXPORT_SYMBOL(vfio_unpin_pages);
> +
>  /**
>   * Module/class support
>   */
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 2ba19424e4a1..ce6d6dcbd9a8 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -55,18 +55,26 @@ MODULE_PARM_DESC(disable_hugepages,
>  
>  struct vfio_iommu {
>  	struct list_head	domain_list;
> +	struct vfio_domain	*local_domain;
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
>  	bool			v2;
>  	bool			nesting;
>  };
>  
> +struct local_addr_space {
> +	struct task_struct	*task;
> +	struct rb_root		pfn_list;	/* pinned Host pfn list */
> +	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
> +};
> +
>  struct vfio_domain {
>  	struct iommu_domain	*domain;
>  	struct list_head	next;
>  	struct list_head	group_list;
>  	int			prot;		/* IOMMU_CACHE */
>  	bool			fgsp;		/* Fine-grained super pages */
> +	struct local_addr_space	*local_addr_space;
>  };

Consider structure internal alignment, this should be placed below
group_list.

>  
>  struct vfio_dma {
> @@ -75,6 +83,7 @@ struct vfio_dma {
>  	unsigned long		vaddr;		/* Process virtual addr */
>  	size_t			size;		/* Map size (bytes) */
>  	int			prot;		/* IOMMU_READ/WRITE */
> +	bool			iommu_mapped;
>  };
>  
>  struct vfio_group {
> @@ -83,6 +92,22 @@ struct vfio_group {
>  };
>  
>  /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> +	struct rb_node		node;
> +	unsigned long		vaddr;		/* virtual addr */
> +	dma_addr_t		iova;		/* IOVA */
> +	unsigned long		pfn;		/* Host pfn */
> +	size_t			prot;

size_t?  Shouldn't this be an int?

> +	atomic_t		ref_count;
> +};
> +
> +
> +#define IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)	\
> +			 (list_empty(&iommu->domain_list) ? false : true)

(!list_empty(...))

> +
> +/*
>   * This code handles mapping and unmapping of user data buffers
>   * into DMA'ble space using the IOMMU
>   */
> @@ -130,6 +155,84 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +/*
> + * Helper Functions for host pfn list
> + */
> +
> +static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
> +				      unsigned long pfn)
> +{
> +	struct rb_node *node;
> +	struct vfio_pfn *vpfn, *ret = NULL;
> +
> +	node = domain->local_addr_space->pfn_list.rb_node;
> +
> +	while (node) {
> +		vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> +		if (pfn < vpfn->pfn)
> +			node = node->rb_left;
> +		else if (pfn > vpfn->pfn)
> +			node = node->rb_right;
> +		else {
> +			ret = vpfn;
> +			break;
> +		}
> +	}
> +
> +	return ret;
> +}

Some unnecessary style differences from vfio_find_dma()

> +
> +static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
> +{
> +	struct rb_node **link, *parent = NULL;
> +	struct vfio_pfn *vpfn;
> +
> +	link = &domain->local_addr_space->pfn_list.rb_node;
> +	while (*link) {
> +		parent = *link;
> +		vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> +		if (new->pfn < vpfn->pfn)
> +			link = &(*link)->rb_left;
> +		else
> +			link = &(*link)->rb_right;
> +	}
> +
> +	rb_link_node(&new->node, parent, link);
> +	rb_insert_color(&new->node, &domain->local_addr_space->pfn_list);
> +}
> +
> +static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
> +{
> +	rb_erase(&old->node, &domain->local_addr_space->pfn_list);
> +}
> +
> +static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr,
> +				dma_addr_t iova, unsigned long pfn, size_t prot)

size_t?

> +{
> +	struct vfio_pfn *vpfn;
> +
> +	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> +	if (!vpfn)
> +		return -ENOMEM;
> +
> +	vpfn->vaddr = vaddr;
> +	vpfn->iova = iova;
> +	vpfn->pfn = pfn;
> +	vpfn->prot = prot;
> +	atomic_set(&vpfn->ref_count, 1);
> +	vfio_link_pfn(domain, vpfn);
> +	return 0;
> +}
> +
> +static void vfio_remove_from_pfn_list(struct vfio_domain *domain,
> +				      struct vfio_pfn *vpfn)
> +{
> +	vfio_unlink_pfn(domain, vpfn);
> +	kfree(vpfn);
> +}
> +
>  struct vwork {
>  	struct mm_struct	*mm;
>  	long			npage;
> @@ -150,17 +253,17 @@ static void vfio_lock_acct_bg(struct work_struct *work)
>  	kfree(vwork);
>  }
>  
> -static void vfio_lock_acct(long npage)
> +static void vfio_lock_acct(struct task_struct *task, long npage)
>  {
>  	struct vwork *vwork;
>  	struct mm_struct *mm;
>  
> -	if (!current->mm || !npage)
> +	if (!task->mm || !npage)
>  		return; /* process exited or nothing to do */
>  
> -	if (down_write_trylock(&current->mm->mmap_sem)) {
> -		current->mm->locked_vm += npage;
> -		up_write(&current->mm->mmap_sem);
> +	if (down_write_trylock(&task->mm->mmap_sem)) {
> +		task->mm->locked_vm += npage;
> +		up_write(&task->mm->mmap_sem);
>  		return;
>  	}
>  
> @@ -172,7 +275,7 @@ static void vfio_lock_acct(long npage)
>  	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
>  	if (!vwork)
>  		return;
> -	mm = get_task_mm(current);
> +	mm = get_task_mm(task);
>  	if (!mm) {
>  		kfree(vwork);
>  		return;
> @@ -228,20 +331,31 @@ static int put_pfn(unsigned long pfn, int prot)
>  	return 0;
>  }
>  
> -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> +			 int prot, unsigned long *pfn)
>  {
>  	struct page *page[1];
>  	struct vm_area_struct *vma;
> +	struct mm_struct *local_mm = (mm ? mm : current->mm);
>  	int ret = -EFAULT;
>  
> -	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
> +	if (mm) {
> +		down_read(&local_mm->mmap_sem);
> +		ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
> +					!!(prot & IOMMU_WRITE), 0, page, NULL);
> +		up_read(&local_mm->mmap_sem);
> +	} else
> +		ret = get_user_pages_fast(vaddr, 1,
> +					  !!(prot & IOMMU_WRITE), page);
> +
> +	if (ret == 1) {
>  		*pfn = page_to_pfn(page[0]);
>  		return 0;
>  	}
>  
> -	down_read(&current->mm->mmap_sem);
> +	down_read(&local_mm->mmap_sem);
>  
> -	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> +	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
>  
>  	if (vma && vma->vm_flags & VM_PFNMAP) {
>  		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> @@ -249,7 +363,7 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>  			ret = 0;
>  	}
>  
> -	up_read(&current->mm->mmap_sem);
> +	up_read(&local_mm->mmap_sem);
>  
>  	return ret;
>  }
> @@ -259,8 +373,8 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>   * first page and all consecutive pages with the same locking.
>   */
> -static long vfio_pin_pages(unsigned long vaddr, long npage,
> -			   int prot, unsigned long *pfn_base)
> +static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
> +				    int prot, unsigned long *pfn_base)
>  {
>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	bool lock_cap = capable(CAP_IPC_LOCK);
> @@ -270,7 +384,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	if (!current->mm)
>  		return -ENODEV;
>  
> -	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
> +	ret = vaddr_get_pfn(NULL, vaddr, prot, pfn_base);
>  	if (ret)
>  		return ret;
>  
> @@ -285,7 +399,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  
>  	if (unlikely(disable_hugepages)) {
>  		if (!rsvd)
> -			vfio_lock_acct(1);
> +			vfio_lock_acct(current, 1);
>  		return 1;
>  	}
>  
> @@ -293,7 +407,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
>  		unsigned long pfn = 0;
>  
> -		ret = vaddr_get_pfn(vaddr, prot, &pfn);
> +		ret = vaddr_get_pfn(NULL, vaddr, prot, &pfn);
>  		if (ret)
>  			break;
>  
> @@ -313,13 +427,13 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	}
>  
>  	if (!rsvd)
> -		vfio_lock_acct(i);
> +		vfio_lock_acct(current, i);
>  
>  	return i;
>  }
>  
> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> -			     int prot, bool do_accounting)
> +static long __vfio_unpin_pages_remote(unsigned long pfn, long npage, int prot,
> +				      bool do_accounting)
>  {
>  	unsigned long unlocked = 0;
>  	long i;
> @@ -328,7 +442,188 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
>  		unlocked += put_pfn(pfn++, prot);
>  
>  	if (do_accounting)
> -		vfio_lock_acct(-unlocked);
> +		vfio_lock_acct(current, -unlocked);
> +	return unlocked;
> +}
> +
> +static long __vfio_pin_pages_local(struct vfio_domain *domain,
> +				   unsigned long vaddr, int prot,
> +				   unsigned long *pfn_base,
> +				   bool do_accounting)
> +{
> +	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	bool lock_cap = capable(CAP_IPC_LOCK);
> +	long ret;
> +	bool rsvd;
> +	struct task_struct *task = domain->local_addr_space->task;
> +
> +	if (!task->mm)
> +		return -ENODEV;
> +
> +	ret = vaddr_get_pfn(task->mm, vaddr, prot, pfn_base);
> +	if (ret)
> +		return ret;
> +
> +	rsvd = is_invalid_reserved_pfn(*pfn_base);
> +
> +	if (!rsvd && !lock_cap && task->mm->locked_vm + 1 > limit) {
> +		put_pfn(*pfn_base, prot);
> +		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> +			limit << PAGE_SHIFT);
> +		return -ENOMEM;
> +	}
> +
> +	if (!rsvd && do_accounting)
> +		vfio_lock_acct(task, 1);
> +
> +	return 1;
> +}
> +
> +static void __vfio_unpin_pages_local(struct vfio_domain *domain,
> +				     unsigned long pfn, int prot,
> +				     bool do_accounting)
> +{
> +	put_pfn(pfn, prot);
> +
> +	if (do_accounting)
> +		vfio_lock_acct(domain->local_addr_space->task, -1);
> +}
> +
> +static int vfio_unpin_pfn(struct vfio_domain *domain,
> +			  struct vfio_pfn *vpfn, bool do_accounting)
> +{
> +	__vfio_unpin_pages_local(domain, vpfn->pfn, vpfn->prot,
> +				 do_accounting);
> +
> +	if (atomic_dec_and_test(&vpfn->ref_count))
> +		vfio_remove_from_pfn_list(domain, vpfn);
> +
> +	return 1;
> +}
> +
> +static long vfio_iommu_type1_pin_pages(void *iommu_data,
> +				       unsigned long *user_pfn,
> +				       long npage, int prot,
> +				       unsigned long *phys_pfn)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain;
> +	int i, j, ret;
> +	long retpage;
> +	unsigned long remote_vaddr;
> +	unsigned long *pfn = phys_pfn;
> +	struct vfio_dma *dma;
> +	bool do_accounting = false;
> +
> +	if (!iommu || !user_pfn || !phys_pfn)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	if (!iommu->local_domain) {
> +		ret = -EINVAL;
> +		goto pin_done;
> +	}
> +
> +	domain = iommu->local_domain;
> +
> +	/*
> +	 * If iommu capable domain exist in the container then all pages are
> +	 * already pinned and accounted. Accouting should be done if there is no
> +	 * iommu capable domain in the container.
> +	 */
> +	do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p;
> +		dma_addr_t iova;
> +
> +		iova = user_pfn[i] << PAGE_SHIFT;
> +
> +		dma = vfio_find_dma(iommu, iova, 0);
> +		if (!dma) {
> +			ret = -EINVAL;
> +			goto pin_unwind;
> +		}
> +
> +		remote_vaddr = dma->vaddr + iova - dma->iova;
> +
> +		retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
> +						 &pfn[i], do_accounting);
> +		if (retpage <= 0) {
> +			WARN_ON(!retpage);
> +			ret = (int)retpage;
> +			goto pin_unwind;
> +		}
> +
> +		mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +		/* search if pfn exist */
> +		p = vfio_find_pfn(domain, pfn[i]);
> +		if (p) {
> +			atomic_inc(&p->ref_count);
> +			mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +			continue;
> +		}
> +
> +		ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova,
> +					   pfn[i], prot);
> +		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +
> +		if (ret) {
> +			__vfio_unpin_pages_local(domain, pfn[i], prot,
> +						 do_accounting);
> +			goto pin_unwind;
> +		}
> +	}
> +
> +	ret = i;
> +	goto pin_done;
> +
> +pin_unwind:
> +	pfn[i] = 0;
> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +	for (j = 0; j < i; j++) {
> +		struct vfio_pfn *p;
> +
> +		p = vfio_find_pfn(domain, pfn[j]);
> +		if (p)
> +			vfio_unpin_pfn(domain, p, do_accounting);
> +
> +		pfn[j] = 0;
> +	}
> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +
> +pin_done:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
> +static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned long *pfn,
> +					 long npage)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain = NULL;
> +	long unlocked = 0;
> +	int i;
> +
> +	if (!iommu || !pfn)
> +		return -EINVAL;
> +

We need iommu->lock here, right?

> +	domain = iommu->local_domain;
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p;
> +
> +		mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +		/* verify if pfn exist in pfn_list */
> +		p = vfio_find_pfn(domain, pfn[i]);
> +		if (p)
> +			unlocked += vfio_unpin_pfn(domain, p, true);
> +
> +		mutex_unlock(&domain->local_addr_space->pfn_list_lock);

We hold this mutex outside the loop in the pin unwind case, why is it
different here?

> +	}
>  
>  	return unlocked;
>  }
> @@ -341,6 +636,12 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  
>  	if (!dma->size)
>  		return;
> +
> +	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> +		return;
> +
> +	if (!dma->iommu_mapped)
> +		return;
>  	/*
>  	 * We use the IOMMU to track the physical addresses, otherwise we'd
>  	 * need a much more complicated tracking system.  Unfortunately that
> @@ -382,15 +683,16 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  		if (WARN_ON(!unmapped))
>  			break;
>  
> -		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
> -					     unmapped >> PAGE_SHIFT,
> -					     dma->prot, false);
> +		unlocked += __vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
> +						      unmapped >> PAGE_SHIFT,
> +						      dma->prot, false);
>  		iova += unmapped;
>  
>  		cond_resched();
>  	}
>  
> -	vfio_lock_acct(-unlocked);
> +	dma->iommu_mapped = false;
> +	vfio_lock_acct(current, -unlocked);
>  }
>  
>  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> @@ -558,17 +860,85 @@ unwind:
>  	return ret;
>  }
>  
> +void vfio_update_accounting(struct vfio_iommu *iommu, struct vfio_dma *dma)
> +{
> +	struct vfio_domain *domain = iommu->local_domain;
> +	struct rb_node *n;
> +	long locked = 0;
> +
> +	if (!iommu->local_domain)
> +		return;
> +
> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +	n = rb_first(&domain->local_addr_space->pfn_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_pfn *vpfn;
> +
> +		vpfn = rb_entry(n, struct vfio_pfn, node);
> +
> +		if ((vpfn->iova >= dma->iova) &&
> +		    (vpfn->iova < dma->iova + dma->size))
> +			locked++;
> +	}
> +	vfio_lock_acct(current, -locked);
> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +}
> +
> +static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> +			    size_t map_size)
> +{
> +	dma_addr_t iova = dma->iova;
> +	unsigned long vaddr = dma->vaddr;
> +	size_t size = map_size, dma_size = 0;
> +	long npage;
> +	unsigned long pfn;
> +	int ret = 0;
> +
> +	while (size) {
> +		/* Pin a contiguous chunk of memory */
> +		npage = __vfio_pin_pages_remote(vaddr + dma_size,
> +						size >> PAGE_SHIFT, dma->prot,
> +						&pfn);
> +		if (npage <= 0) {
> +			WARN_ON(!npage);
> +			ret = (int)npage;
> +			break;
> +		}
> +
> +		/* Map it! */
> +		ret = vfio_iommu_map(iommu, iova + dma_size, pfn, npage,
> +				     dma->prot);
> +		if (ret) {
> +			__vfio_unpin_pages_remote(pfn, npage, dma->prot, true);
> +			break;
> +		}
> +
> +		size -= npage << PAGE_SHIFT;
> +		dma_size += npage << PAGE_SHIFT;
> +	}
> +
> +	if (ret)
> +		vfio_remove_dma(iommu, dma);


There's a bug introduced here, vfio_remove_dma() needs dma->size to be
accurate to the point of failure, it's not updated until the success
branch below, so it's never going to unmap/unpin anything.

> +	else {
> +		dma->size = dma_size;
> +		dma->iommu_mapped = true;
> +		vfio_update_accounting(iommu, dma);

I'm confused how this works, when called from vfio_dma_do_map() we're
populating a vfio_dma, that is we're populating part of the iova space
of the device.  How could we have pinned pfns in the local address
space that overlap that?  It would be invalid to have such pinned pfns
since that part of the iova space was not previously mapped.

Another issue is that if there were existing overlaps, userspace would
need to have locked memory limits sufficient for this temporary double
accounting.  I'm not sure how they'd come up with heuristics to handle
that since we're potentially looking at the bulk of VM memory in a
single vfio_dma entry.

> +	}
> +
> +	return ret;
> +}
> +
>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  			   struct vfio_iommu_type1_dma_map *map)
>  {
>  	dma_addr_t iova = map->iova;
>  	unsigned long vaddr = map->vaddr;
>  	size_t size = map->size;
> -	long npage;
>  	int ret = 0, prot = 0;
>  	uint64_t mask;
>  	struct vfio_dma *dma;
> -	unsigned long pfn;
>  
>  	/* Verify that none of our __u64 fields overflow */
>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> @@ -611,29 +981,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	/* Insert zero-sized and grow as we map chunks of it */
>  	vfio_link_dma(iommu, dma);
>  
> -	while (size) {
> -		/* Pin a contiguous chunk of memory */
> -		npage = vfio_pin_pages(vaddr + dma->size,
> -				       size >> PAGE_SHIFT, prot, &pfn);
> -		if (npage <= 0) {
> -			WARN_ON(!npage);
> -			ret = (int)npage;
> -			break;
> -		}
> -
> -		/* Map it! */
> -		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
> -		if (ret) {
> -			vfio_unpin_pages(pfn, npage, prot, true);
> -			break;
> -		}
> -
> -		size -= npage << PAGE_SHIFT;
> -		dma->size += npage << PAGE_SHIFT;
> -	}
> -
> -	if (ret)
> -		vfio_remove_dma(iommu, dma);
> +	/* Don't pin and map if container doesn't contain IOMMU capable domain*/
> +	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> +		dma->size = size;
> +	else
> +		ret = vfio_pin_map_dma(iommu, dma, size);
>  
>  	mutex_unlock(&iommu->lock);
>  	return ret;
> @@ -662,10 +1014,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
>  	n = rb_first(&iommu->dma_list);
>  
> -	/* If there's not a domain, there better not be any mappings */
> -	if (WARN_ON(n && !d))
> -		return -EINVAL;
> -
>  	for (; n; n = rb_next(n)) {
>  		struct vfio_dma *dma;
>  		dma_addr_t iova;
> @@ -674,20 +1022,43 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  		iova = dma->iova;
>  
>  		while (iova < dma->iova + dma->size) {
> -			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
> +			phys_addr_t phys;
>  			size_t size;
>  
> -			if (WARN_ON(!phys)) {
> -				iova += PAGE_SIZE;
> -				continue;
> -			}
> +			if (dma->iommu_mapped) {
> +				phys = iommu_iova_to_phys(d->domain, iova);
> +
> +				if (WARN_ON(!phys)) {
> +					iova += PAGE_SIZE;
> +					continue;
> +				}
>  
> -			size = PAGE_SIZE;
> +				size = PAGE_SIZE;
>  
> -			while (iova + size < dma->iova + dma->size &&
> -			       phys + size == iommu_iova_to_phys(d->domain,
> +				while (iova + size < dma->iova + dma->size &&
> +				    phys + size == iommu_iova_to_phys(d->domain,
>  								 iova + size))
> -				size += PAGE_SIZE;
> +					size += PAGE_SIZE;
> +			} else {
> +				unsigned long pfn;
> +				unsigned long vaddr = dma->vaddr +
> +						     (iova - dma->iova);
> +				size_t n = dma->iova + dma->size - iova;
> +				long npage;
> +
> +				npage = __vfio_pin_pages_remote(vaddr,
> +								n >> PAGE_SHIFT,
> +								dma->prot,
> +								&pfn);
> +				if (npage <= 0) {
> +					WARN_ON(!npage);
> +					ret = (int)npage;
> +					return ret;
> +				}
> +
> +				phys = pfn << PAGE_SHIFT;
> +				size = npage << PAGE_SHIFT;
> +			}
>  
>  			ret = iommu_map(domain->domain, iova, phys,
>  					size, dma->prot | domain->prot);
> @@ -696,6 +1067,11 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  
>  			iova += size;
>  		}
> +
> +		if (!dma->iommu_mapped) {
> +			dma->iommu_mapped = true;
> +			vfio_update_accounting(iommu, dma);
> +		}

This is the case where we potentially have pinned pfns and we've added
an iommu mapped device and need to adjust accounting.  But we've fully
pinned and accounted the entire iommu mapped space while still holding
the accounting for any pfn mapped space.  So for a time, assuming some
pfn pinned pages, we have duplicate accounting.  How does userspace
deal with that?  For instance, if I'm using an mdev device where the
vendor driver has pinned 512MB of guest memory, then I hot-add an
assigned NIC and the entire VM address space gets pinned, that pinning
will fail unless my locked memory limits are at least 512MB in excess
of my VM size.  Additionally, the user doesn't know how much memory the
vendor driver is going to pin, it might be the whole VM address space,
so the user would need 2x the locked memory limits.

>  	}
>  
>  	return 0;
> @@ -734,11 +1110,24 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
>  	__free_pages(pages, order);
>  }
>  
> +static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
> +				   struct iommu_group *iommu_group)
> +{
> +	struct vfio_group *g;
> +
> +	list_for_each_entry(g, &domain->group_list, next) {
> +		if (g->iommu_group == iommu_group)
> +			return g;
> +	}
> +
> +	return NULL;
> +}

It would make review easier if changes like splitting this into a
separate function with no functional change on the calling path could
be a separate patch.

> +
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>  					 struct iommu_group *iommu_group)
>  {
>  	struct vfio_iommu *iommu = iommu_data;
> -	struct vfio_group *group, *g;
> +	struct vfio_group *group;
>  	struct vfio_domain *domain, *d;
>  	struct bus_type *bus = NULL;
>  	int ret;
> @@ -746,10 +1135,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	mutex_lock(&iommu->lock);
>  
>  	list_for_each_entry(d, &iommu->domain_list, next) {
> -		list_for_each_entry(g, &d->group_list, next) {
> -			if (g->iommu_group != iommu_group)
> -				continue;
> +		if (find_iommu_group(d, iommu_group)) {
> +			mutex_unlock(&iommu->lock);
> +			return -EINVAL;
> +		}
> +	}
>  
> +	if (iommu->local_domain) {
> +		if (find_iommu_group(iommu->local_domain, iommu_group)) {
>  			mutex_unlock(&iommu->lock);
>  			return -EINVAL;
>  		}
> @@ -769,6 +1162,34 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_free;
>  
> +	if (IS_ENABLED(CONFIG_VFIO_MDEV) && !iommu_present(bus) &&
> +	    (bus == &mdev_bus_type)) {
> +		if (iommu->local_domain) {
> +			list_add(&group->next,
> +				 &iommu->local_domain->group_list);
> +			kfree(domain);
> +			mutex_unlock(&iommu->lock);
> +			return 0;
> +		}
> +
> +		domain->local_addr_space =
> +				      kzalloc(sizeof(*domain->local_addr_space),
> +					      GFP_KERNEL);
> +		if (!domain->local_addr_space) {
> +			ret = -ENOMEM;
> +			goto out_free;
> +		}
> +
> +		domain->local_addr_space->task = current;
> +		INIT_LIST_HEAD(&domain->group_list);
> +		list_add(&group->next, &domain->group_list);
> +		domain->local_addr_space->pfn_list = RB_ROOT;
> +		mutex_init(&domain->local_addr_space->pfn_list_lock);
> +		iommu->local_domain = domain;
> +		mutex_unlock(&iommu->lock);
> +		return 0;


This could have been

		if (!iommu->local_domain) {
			domain->local_addr_space =
				kzalloc(sizeof(*domain->local_addr_space),
					GFP_KERNEL);
			if (!domain->local_addr_space) {
				ret = -ENOMEM;
				goto out_free;
			}

			domain->local_addr_space->task = current;
			domain->local_addr_space->pfn_list = RB_ROOT;
			mutex_init(&domain->local_addr_space->pfn_list_lock);
			INIT_LIST_HEAD(&domain->group_list);
			iommu->local_domain = domain;
		} else {
			kfree(domain);
		}

		list_add(&group->next, &iommu->local_domain->group_list);

		mutex_unlock(&iommu->lock);
		return 0;

> +	}
> +
>  	domain->domain = iommu_domain_alloc(bus);
>  	if (!domain->domain) {
>  		ret = -EIO;
> @@ -859,6 +1280,41 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
>  		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
>  }
>  
> +static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> +{
> +	struct vfio_domain *domain = iommu->local_domain;
> +	struct vfio_dma *dma, *tdma;
> +	struct rb_node *n;
> +	long locked = 0;
> +
> +	rbtree_postorder_for_each_entry_safe(dma, tdma, &iommu->dma_list,
> +					     node) {
> +		vfio_unmap_unpin(iommu, dma);
> +	}
> +
> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +	n = rb_first(&domain->local_addr_space->pfn_list);
> +
> +	for (; n; n = rb_next(n))
> +		locked++;
> +
> +	vfio_lock_acct(domain->local_addr_space->task, locked);
> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +}
> +
> +static void vfio_local_unpin_all(struct vfio_domain *domain)
> +{
> +	struct rb_node *node;
> +
> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +	while ((node = rb_first(&domain->local_addr_space->pfn_list)))
> +		vfio_unpin_pfn(domain,
> +				rb_entry(node, struct vfio_pfn, node), false);
> +
> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +}
> +
>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>  					  struct iommu_group *iommu_group)
>  {
> @@ -868,31 +1324,57 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  
>  	mutex_lock(&iommu->lock);
>  
> -	list_for_each_entry(domain, &iommu->domain_list, next) {
> -		list_for_each_entry(group, &domain->group_list, next) {
> -			if (group->iommu_group != iommu_group)
> -				continue;
> -
> -			iommu_detach_group(domain->domain, iommu_group);
> +	if (iommu->local_domain) {
> +		domain = iommu->local_domain;
> +		group = find_iommu_group(domain, iommu_group);
> +		if (group) {
>  			list_del(&group->next);
>  			kfree(group);
> -			/*
> -			 * Group ownership provides privilege, if the group
> -			 * list is empty, the domain goes away.  If it's the
> -			 * last domain, then all the mappings go away too.
> -			 */
> +
>  			if (list_empty(&domain->group_list)) {
> -				if (list_is_singular(&iommu->domain_list))
> +				vfio_local_unpin_all(domain);
> +				if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
>  					vfio_iommu_unmap_unpin_all(iommu);
> -				iommu_domain_free(domain->domain);
> -				list_del(&domain->next);
>  				kfree(domain);
> +				iommu->local_domain = NULL;
> +			}
> +			goto detach_group_done;
> +		}
> +	}
> +
> +	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> +		goto detach_group_done;
> +
> +	list_for_each_entry(domain, &iommu->domain_list, next) {
> +		group = find_iommu_group(domain, iommu_group);
> +		if (!group)
> +			continue;
> +
> +		iommu_detach_group(domain->domain, iommu_group);
> +		list_del(&group->next);
> +		kfree(group);
> +		/*
> +		 * Group ownership provides privilege, if the group list is
> +		 * empty, the domain goes away. If it's the last domain with
> +		 * iommu and local domain doesn't exist, then all the mappings
> +		 * go away too. If it's the last domain with iommu and local
> +		 * domain exist, update accounting
> +		 */
> +		if (list_empty(&domain->group_list)) {
> +			if (list_is_singular(&iommu->domain_list)) {
> +				if (!iommu->local_domain)
> +					vfio_iommu_unmap_unpin_all(iommu);
> +				else
> +					vfio_iommu_unmap_unpin_reaccount(iommu);
>  			}
> -			goto done;
> +			iommu_domain_free(domain->domain);
> +			list_del(&domain->next);
> +			kfree(domain);
>  		}
> +		break;
>  	}
>  
> -done:
> +detach_group_done:
>  	mutex_unlock(&iommu->lock);
>  }
>  
> @@ -924,27 +1406,48 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	return iommu;
>  }
>  
> +static void vfio_release_domain(struct vfio_domain *domain)
> +{
> +	struct vfio_group *group, *group_tmp;
> +
> +	list_for_each_entry_safe(group, group_tmp,
> +				 &domain->group_list, next) {
> +		if (!domain->local_addr_space)
> +			iommu_detach_group(domain->domain, group->iommu_group);
> +		list_del(&group->next);
> +		kfree(group);
> +	}
> +
> +	if (domain->local_addr_space)
> +		vfio_local_unpin_all(domain);
> +	else
> +		iommu_domain_free(domain->domain);
> +}
> +
>  static void vfio_iommu_type1_release(void *iommu_data)
>  {
>  	struct vfio_iommu *iommu = iommu_data;
>  	struct vfio_domain *domain, *domain_tmp;
> -	struct vfio_group *group, *group_tmp;
> +
> +	if (iommu->local_domain) {
> +		vfio_release_domain(iommu->local_domain);
> +		kfree(iommu->local_domain);
> +		iommu->local_domain = NULL;
> +	}
>  
>  	vfio_iommu_unmap_unpin_all(iommu);
>  
> +	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> +		goto release_exit;
> +
>  	list_for_each_entry_safe(domain, domain_tmp,
>  				 &iommu->domain_list, next) {
> -		list_for_each_entry_safe(group, group_tmp,
> -					 &domain->group_list, next) {
> -			iommu_detach_group(domain->domain, group->iommu_group);
> -			list_del(&group->next);
> -			kfree(group);
> -		}
> -		iommu_domain_free(domain->domain);
> +		vfio_release_domain(domain);
>  		list_del(&domain->next);
>  		kfree(domain);
>  	}
>  
> +release_exit:
>  	kfree(iommu);
>  }
>  
> @@ -1048,6 +1551,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
>  	.ioctl		= vfio_iommu_type1_ioctl,
>  	.attach_group	= vfio_iommu_type1_attach_group,
>  	.detach_group	= vfio_iommu_type1_detach_group,
> +	.pin_pages	= vfio_iommu_type1_pin_pages,
> +	.unpin_pages	= vfio_iommu_type1_unpin_pages,
>  };
>  
>  static int __init vfio_iommu_type1_init(void)
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0ecae0b1cd34..0bd25ba6223d 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -17,6 +17,7 @@
>  #include <linux/workqueue.h>
>  #include <linux/poll.h>
>  #include <uapi/linux/vfio.h>
> +#include <linux/mdev.h>
>  
>  /**
>   * struct vfio_device_ops - VFIO bus driver device callbacks
> @@ -75,7 +76,11 @@ struct vfio_iommu_driver_ops {
>  					struct iommu_group *group);
>  	void		(*detach_group)(void *iommu_data,
>  					struct iommu_group *group);
> -
> +	long		(*pin_pages)(void *iommu_data, unsigned long *user_pfn,
> +				     long npage, int prot,
> +				     unsigned long *phys_pfn);
> +	long		(*unpin_pages)(void *iommu_data, unsigned long *pfn,
> +				       long npage);
>  };
>  
>  extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
> @@ -127,6 +132,12 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
>  }
>  #endif /* CONFIG_EEH */
>  
> +extern long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> +			   long npage, int prot, unsigned long *phys_pfn);
> +
> +extern long vfio_unpin_pages(struct device *dev, unsigned long *pfn,
> +			     long npage);
> +
>  /*
>   * IRQfd - generic
>   */

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tian, Kevin Oct. 12, 2016, 10:31 a.m. UTC | #2
> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> Sent: Tuesday, October 11, 2016 4:29 AM
> 
[...]
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 2ba19424e4a1..ce6d6dcbd9a8 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -55,18 +55,26 @@ MODULE_PARM_DESC(disable_hugepages,
> 
>  struct vfio_iommu {
>  	struct list_head	domain_list;
> +	struct vfio_domain	*local_domain;

Hi Kirti, can you help explain the meaning of 'local' here? I have a hard time
understanding its intention... In your later change to vaddr_get_pfn, it's
even more confusing, since get_user_pages_remote is used on a 'local_mm':

+	if (mm) {
+		down_read(&local_mm->mmap_sem);
+		ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
+					!!(prot & IOMMU_WRITE), 0, page, NULL);
+		up_read(&local_mm->mmap_sem);
+	} else
+		ret = get_user_pages_fast(vaddr, 1,
+					  !!(prot & IOMMU_WRITE), page);


[...]
> -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> +			 int prot, unsigned long *pfn)
>  {
>  	struct page *page[1];
>  	struct vm_area_struct *vma;
> +	struct mm_struct *local_mm = (mm ? mm : current->mm);

It'd be clearer if you named this variable 'mm' and named the input parameter
above 'local_mm'.

>  	int ret = -EFAULT;
> 
> -	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
> +	if (mm) {
> +		down_read(&local_mm->mmap_sem);
> +		ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
> +					!!(prot & IOMMU_WRITE), 0, page, NULL);
> +		up_read(&local_mm->mmap_sem);
> +	} else
> +		ret = get_user_pages_fast(vaddr, 1,
> +					  !!(prot & IOMMU_WRITE), page);
> +
> +	if (ret == 1) {
>  		*pfn = page_to_pfn(page[0]);
>  		return 0;
>  	}
> 
> -	down_read(&current->mm->mmap_sem);
> +	down_read(&local_mm->mmap_sem);
> 
> -	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> +	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
> 
>  	if (vma && vma->vm_flags & VM_PFNMAP) {
>  		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

[...]
> +static long __vfio_pin_pages_local(struct vfio_domain *domain,
> +				   unsigned long vaddr, int prot,
> +				   unsigned long *pfn_base,
> +				   bool do_accounting)

'pages' -> 'page' since only one page is handled here.

[...]
> +
> +static void __vfio_unpin_pages_local(struct vfio_domain *domain,
> +				     unsigned long pfn, int prot,
> +				     bool do_accounting)

ditto

> +{
> +	put_pfn(pfn, prot);
> +
> +	if (do_accounting)
> +		vfio_lock_acct(domain->local_addr_space->task, -1);
> +}
> +
> +static int vfio_unpin_pfn(struct vfio_domain *domain,
> +			  struct vfio_pfn *vpfn, bool do_accounting)
> +{
> +	__vfio_unpin_pages_local(domain, vpfn->pfn, vpfn->prot,
> +				 do_accounting);
> +
> +	if (atomic_dec_and_test(&vpfn->ref_count))
> +		vfio_remove_from_pfn_list(domain, vpfn);
> +
> +	return 1;
> +}
> +
> +static long vfio_iommu_type1_pin_pages(void *iommu_data,
> +				       unsigned long *user_pfn,
> +				       long npage, int prot,
> +				       unsigned long *phys_pfn)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain;
> +	int i, j, ret;
> +	long retpage;
> +	unsigned long remote_vaddr;
> +	unsigned long *pfn = phys_pfn;
> +	struct vfio_dma *dma;
> +	bool do_accounting = false;
> +
> +	if (!iommu || !user_pfn || !phys_pfn)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	if (!iommu->local_domain) {
> +		ret = -EINVAL;
> +		goto pin_done;
> +	}
> +
> +	domain = iommu->local_domain;
> +
> +	/*
> +	 * If iommu capable domain exist in the container then all pages are
> +	 * already pinned and accounted. Accouting should be done if there is no
> +	 * iommu capable domain in the container.
> +	 */
> +	do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p;
> +		dma_addr_t iova;
> +
> +		iova = user_pfn[i] << PAGE_SHIFT;
> +
> +		dma = vfio_find_dma(iommu, iova, 0);
> +		if (!dma) {
> +			ret = -EINVAL;
> +			goto pin_unwind;
> +		}
> +
> +		remote_vaddr = dma->vaddr + iova - dma->iova;

again, why "remote"_vaddr on a 'local' function?

> +
> +		retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
> +						 &pfn[i], do_accounting);
> +		if (retpage <= 0) {
> +			WARN_ON(!retpage);
> +			ret = (int)retpage;
> +			goto pin_unwind;
> +		}
> +
> +		mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +		/* search if pfn exist */
> +		p = vfio_find_pfn(domain, pfn[i]);
> +		if (p) {
> +			atomic_inc(&p->ref_count);
> +			mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +			continue;
> +		}
> +
> +		ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova,
> +					   pfn[i], prot);
> +		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +
> +		if (ret) {
> +			__vfio_unpin_pages_local(domain, pfn[i], prot,
> +						 do_accounting);
> +			goto pin_unwind;
> +		}
> +	}
> +
> +	ret = i;
> +	goto pin_done;
> +
> +pin_unwind:
> +	pfn[i] = 0;
> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +	for (j = 0; j < i; j++) {
> +		struct vfio_pfn *p;
> +
> +		p = vfio_find_pfn(domain, pfn[j]);
> +		if (p)
> +			vfio_unpin_pfn(domain, p, do_accounting);
> +
> +		pfn[j] = 0;
> +	}
> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +
> +pin_done:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
> +static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned long *pfn,
> +					 long npage)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain = NULL;
> +	long unlocked = 0;
> +	int i;
> +
> +	if (!iommu || !pfn)
> +		return -EINVAL;
> +

iommu->lock needs to be acquired here, as is done in pin_pages...

> +	domain = iommu->local_domain;
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p;
> +
> +		mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +		/* verify if pfn exist in pfn_list */
> +		p = vfio_find_pfn(domain, pfn[i]);
> +		if (p)
> +			unlocked += vfio_unpin_pfn(domain, p, true);

Should we force an accounting update here even when there is an IOMMU-capable
domain? That is not consistent with the earlier pin_pages, which only does
accounting when there is no IOMMU-capable domain in the container.

> +
> +		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +	}
> 
>  	return unlocked;
>  }
> @@ -341,6 +636,12 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct
> vfio_dma *dma)
> 
>  	if (!dma->size)
>  		return;
> +
> +	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> +		return;

Is the above check redundant with the following dma->iommu_mapped check?

> +
> +	if (!dma->iommu_mapped)
> +		return;
>  	/*
>  	 * We use the IOMMU to track the physical addresses, otherwise we'd
>  	 * need a much more complicated tracking system.  Unfortunately that

Thanks
Kevin
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tian, Kevin Oct. 12, 2016, 10:38 a.m. UTC | #3
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Wednesday, October 12, 2016 6:07 AM
> > @@ -696,6 +1067,11 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> >
> >  			iova += size;
> >  		}
> > +
> > +		if (!dma->iommu_mapped) {
> > +			dma->iommu_mapped = true;
> > +			vfio_update_accounting(iommu, dma);
> > +		}
> 
> This is the case where we potentially have pinned pfns and we've added
> an iommu mapped device and need to adjust accounting.  But we've fully
> pinned and accounted the entire iommu mapped space while still holding
> the accounting for any pfn mapped space.  So for a time, assuming some
> pfn pinned pages, we have duplicate accounting.  How does userspace
> deal with that?  For instance, if I'm using an mdev device where the
> vendor driver has pinned 512MB of guest memory, then I hot-add an
> assigned NIC and the entire VM address space gets pinned, that pinning
> will fail unless my locked memory limits are at least 512MB in excess
> of my VM size.  Additionally, the user doesn't know how much memory the
> vendor driver is going to pin, it might be the whole VM address space,
> so the user would need 2x the locked memory limits.
> 

It looks like we have inconsistent policies for local/remote pinning:

- for local pinning, accounting is increased only when the region hasn't
been pinned in the remote path

- however, for remote pinning, accounting is always increased and then
adjusted back if the region was pinned in the local path earlier. This leaves
a window, as you said, where double accounting may occur on some pages.

What about adding a similar check in remote pinning, i.e. increasing the
accounting only when the region hasn't been pinned in the local path? That way
the accounting would always be accurate...

Thanks
Kevin
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kirti Wankhede Oct. 13, 2016, 2:34 p.m. UTC | #4
On 10/12/2016 3:36 AM, Alex Williamson wrote:
> On Tue, 11 Oct 2016 01:58:34 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>
...


>> +static struct vfio_group *vfio_group_from_dev(struct device *dev)
>> +{
>> +	struct vfio_device *device;
>> +	struct vfio_group *group;
>> +	int ret;
>> +
>> +	device = vfio_device_get_from_dev(dev);
>
> Note how this does dev->iommu_group->vfio_group->vfio_device and then
> we back out one level to get the vfio_group, it's not a terribly
> lightweight path.  Perhaps we should have:
>
> struct vfio_device *vfio_group_get_from_dev(struct device *dev)
> {
>         struct iommu_group *iommu_group;
>         struct vfio_group *group;
>
>         iommu_group = iommu_group_get(dev);
>         if (!iommu_group)
>                 return NULL;
>
>         group = vfio_group_get_from_iommu(iommu_group);
> 	iommu_group_put(iommu_group);
>
> 	return group;
> }
>
> vfio_device_get_from_dev() would make use of this.
>
> Then create a separate:
>
> static int vfio_group_add_container_user(struct vfio_group *group)
> {
>
>> +	if (!atomic_inc_not_zero(&group->container_users)) {
> 		return -EINVAL;
>> +	}
>> +
>> +	if (group->noiommu) {
>> +		atomic_dec(&group->container_users);
> 		return -EPERM;
>> +	}
>> +
>> +	if (!group->container->iommu_driver ||
>> +	    !vfio_group_viable(group)) {
>> +		atomic_dec(&group->container_users);
> 		return -EINVAL;
>> +	}
>> +
> 	return 0;
> }
>
> vfio_group_get_external_user() would be updated to use this.  In fact,
> creating these two functions and updating the existing code to use
> these should be a separate patch.
>

Ok. I'll update.


> Note that your version did not hold a group reference while doing the
> pin/unpin operations below, which seems like a bug.
>

container->group_lock is held for pin/unpin. I think then we don't have
to hold the reference to group, because groups are attached and detached
holding this lock, right?


>> +
>> +err_ret:
>> +	vfio_device_put(device);
>> +	return ERR_PTR(ret);
>> +}
>> +
>> +/*
>> + * Pin a set of guest PFNs and return their associated host PFNs for
local
>> + * domain only.
>> + * @dev [in] : device
>> + * @user_pfn [in]: array of user/guest PFNs
>> + * @npage [in]: count of array elements
>> + * @prot [in] : protection flags
>> + * @phys_pfn[out] : array of host PFNs
>> + */
>> +long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
>> +		    long npage, int prot, unsigned long *phys_pfn)
>> +{
>> +	struct vfio_container *container;
>> +	struct vfio_group *group;
>> +	struct vfio_iommu_driver *driver;
>> +	ssize_t ret = -EINVAL;
>> +
>> +	if (!dev || !user_pfn || !phys_pfn)
>> +		return -EINVAL;
>> +
>> +	group = vfio_group_from_dev(dev);
>> +	if (IS_ERR(group))
>> +		return PTR_ERR(group);
>
> As suggested above:
>
> 	group = vfio_group_get_from_dev(dev);
> 	if (!group)
> 		return -ENODEV;
>
> 	ret = vfio_group_add_container_user(group)
> 	if (ret)
> 		vfio_group_put(group);
> 		return ret;
> 	}
>

Ok.


>> +
>> +	container = group->container;
>> +	if (IS_ERR(container))
>> +		return PTR_ERR(container);
>> +
>> +	down_read(&container->group_lock);
>> +
>> +	driver = container->iommu_driver;
>> +	if (likely(driver && driver->ops->pin_pages))
>> +		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
>> +					     npage, prot, phys_pfn);
>> +
>> +	up_read(&container->group_lock);
>> +	vfio_group_try_dissolve_container(group);
>
> Even if you're considering that the container_user reference holds the
> driver, I think we need a group reference throughout this and this
> should end with a vfio_group_put(group);
>

Same as I mentioned above, container->group_lock is held here.

...

>> +
>> +static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned
long *pfn,
>> +					 long npage)
>> +{
>> +	struct vfio_iommu *iommu = iommu_data;
>> +	struct vfio_domain *domain = NULL;
>> +	long unlocked = 0;
>> +	int i;
>> +
>> +	if (!iommu || !pfn)
>> +		return -EINVAL;
>> +
>
> We need iommu->lock here, right?
>

Oh, yes.

>> +	domain = iommu->local_domain;
>> +
>> +	for (i = 0; i < npage; i++) {
>> +		struct vfio_pfn *p;
>> +
>> +		mutex_lock(&domain->local_addr_space->pfn_list_lock);
>> +
>> +		/* verify if pfn exist in pfn_list */
>> +		p = vfio_find_pfn(domain, pfn[i]);
>> +		if (p)
>> +			unlocked += vfio_unpin_pfn(domain, p, true);
>> +
>> +		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
>
> We hold this mutex outside the loop in the pin unwind case, why is it
> different here?
>

pin_unwind is an error path, so it should be done in one go.
This is not an error case. Holding the lock for a long time could block other
threads if there are multiple threads.



>> +static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct
vfio_dma *dma,
>> +			    size_t map_size)
>> +{
>> +	dma_addr_t iova = dma->iova;
>> +	unsigned long vaddr = dma->vaddr;
>> +	size_t size = map_size, dma_size = 0;
>> +	long npage;
>> +	unsigned long pfn;
>> +	int ret = 0;
>> +
>> +	while (size) {
>> +		/* Pin a contiguous chunk of memory */
>> +		npage = __vfio_pin_pages_remote(vaddr + dma_size,
>> +						size >> PAGE_SHIFT, dma->prot,
>> +						&pfn);
>> +		if (npage <= 0) {
>> +			WARN_ON(!npage);
>> +			ret = (int)npage;
>> +			break;
>> +		}
>> +
>> +		/* Map it! */
>> +		ret = vfio_iommu_map(iommu, iova + dma_size, pfn, npage,
>> +				     dma->prot);
>> +		if (ret) {
>> +			__vfio_unpin_pages_remote(pfn, npage, dma->prot, true);
>> +			break;
>> +		}
>> +
>> +		size -= npage << PAGE_SHIFT;
>> +		dma_size += npage << PAGE_SHIFT;
>> +	}
>> +
>> +	if (ret)
>> +		vfio_remove_dma(iommu, dma);
>
>
> There's a bug introduced here, vfio_remove_dma() needs dma->size to be
> accurate to the point of failure, it's not updated until the success
> branch below, so it's never going to unmap/unpin anything.
>

Oops, yes. I'll fix this.

>> +	else {
>> +		dma->size = dma_size;
>> +		dma->iommu_mapped = true;
>> +		vfio_update_accounting(iommu, dma);
>
> I'm confused how this works, when called from vfio_dma_do_map() we're
> populating a vfio_dma, that is we're populating part of the iova space
> of the device.  How could we have pinned pfns in the local address
> space that overlap that?  It would be invalid to have such pinned pfns
> since that part of the iova space was not previously mapped.
>
> Another issue is that if there were existing overlaps, userspace would
> need to have locked memory limits sufficient for this temporary double
> accounting.  I'm not sure how they'd come up with heuristics to handle
> that since we're potentially looking at the bulk of VM memory in a
> single vfio_dma entry.
>

I see that when QEMU boots a VM in the case where a vGPU device is attached
first and a pass through device is attached afterwards, pinning and
iommu_map() are skipped on the first call to vfio_dma_do_map(). Then, when the
pass through device is attached, all mappings are unmapped and
vfio_dma_do_map() is called again. At that point an IOMMU capable domain is
present, so pinning and iommu_map() are done on all of system memory. If any
pages were pinned by the vendor driver between these two device attaches, then
the accounting should be updated.


>> +	}
>> +
>> +	return ret;
>> +}
>> +
>>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>  			   struct vfio_iommu_type1_dma_map *map)
>>  {
>>  	dma_addr_t iova = map->iova;
>>  	unsigned long vaddr = map->vaddr;
>>  	size_t size = map->size;
>> -	long npage;
>>  	int ret = 0, prot = 0;
>>  	uint64_t mask;
>>  	struct vfio_dma *dma;
>> -	unsigned long pfn;
>>
>>  	/* Verify that none of our __u64 fields overflow */
>>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
>> @@ -611,29 +981,11 @@ static int vfio_dma_do_map(struct vfio_iommu
*iommu,
>>  	/* Insert zero-sized and grow as we map chunks of it */
>>  	vfio_link_dma(iommu, dma);
>>
>> -	while (size) {
>> -		/* Pin a contiguous chunk of memory */
>> -		npage = vfio_pin_pages(vaddr + dma->size,
>> -				       size >> PAGE_SHIFT, prot, &pfn);
>> -		if (npage <= 0) {
>> -			WARN_ON(!npage);
>> -			ret = (int)npage;
>> -			break;
>> -		}
>> -
>> -		/* Map it! */
>> -		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
>> -		if (ret) {
>> -			vfio_unpin_pages(pfn, npage, prot, true);
>> -			break;
>> -		}
>> -
>> -		size -= npage << PAGE_SHIFT;
>> -		dma->size += npage << PAGE_SHIFT;
>> -	}
>> -
>> -	if (ret)
>> -		vfio_remove_dma(iommu, dma);
>> +	/* Don't pin and map if container doesn't contain IOMMU capable
domain*/
>> +	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
>> +		dma->size = size;
>> +	else
>> +		ret = vfio_pin_map_dma(iommu, dma, size);
>>
>>  	mutex_unlock(&iommu->lock);
>>  	return ret;
>> @@ -662,10 +1014,6 @@ static int vfio_iommu_replay(struct vfio_iommu
*iommu,
>>  	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
>>  	n = rb_first(&iommu->dma_list);
>>
>> -	/* If there's not a domain, there better not be any mappings */
>> -	if (WARN_ON(n && !d))
>> -		return -EINVAL;
>> -
>>  	for (; n; n = rb_next(n)) {
>>  		struct vfio_dma *dma;
>>  		dma_addr_t iova;
>> @@ -674,20 +1022,43 @@ static int vfio_iommu_replay(struct vfio_iommu
*iommu,
>>  		iova = dma->iova;
>>
>>  		while (iova < dma->iova + dma->size) {
>> -			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
>> +			phys_addr_t phys;
>>  			size_t size;
>>
>> -			if (WARN_ON(!phys)) {
>> -				iova += PAGE_SIZE;
>> -				continue;
>> -			}
>> +			if (dma->iommu_mapped) {
>> +				phys = iommu_iova_to_phys(d->domain, iova);
>> +
>> +				if (WARN_ON(!phys)) {
>> +					iova += PAGE_SIZE;
>> +					continue;
>> +				}
>>
>> -			size = PAGE_SIZE;
>> +				size = PAGE_SIZE;
>>
>> -			while (iova + size < dma->iova + dma->size &&
>> -			       phys + size == iommu_iova_to_phys(d->domain,
>> +				while (iova + size < dma->iova + dma->size &&
>> +				    phys + size == iommu_iova_to_phys(d->domain,
>>  								 iova + size))
>> -				size += PAGE_SIZE;
>> +					size += PAGE_SIZE;
>> +			} else {
>> +				unsigned long pfn;
>> +				unsigned long vaddr = dma->vaddr +
>> +						     (iova - dma->iova);
>> +				size_t n = dma->iova + dma->size - iova;
>> +				long npage;
>> +
>> +				npage = __vfio_pin_pages_remote(vaddr,
>> +								n >> PAGE_SHIFT,
>> +								dma->prot,
>> +								&pfn);
>> +				if (npage <= 0) {
>> +					WARN_ON(!npage);
>> +					ret = (int)npage;
>> +					return ret;
>> +				}
>> +
>> +				phys = pfn << PAGE_SHIFT;
>> +				size = npage << PAGE_SHIFT;
>> +			}
>>
>>  			ret = iommu_map(domain->domain, iova, phys,
>>  					size, dma->prot | domain->prot);
>> @@ -696,6 +1067,11 @@ static int vfio_iommu_replay(struct vfio_iommu
*iommu,
>>
>>  			iova += size;
>>  		}
>> +
>> +		if (!dma->iommu_mapped) {
>> +			dma->iommu_mapped = true;
>> +			vfio_update_accounting(iommu, dma);
>> +		}
>
> This is the case where we potentially have pinned pfns and we've added
> an iommu mapped device and need to adjust accounting.  But we've fully
> pinned and accounted the entire iommu mapped space while still holding
> the accounting for any pfn mapped space.  So for a time, assuming some
> pfn pinned pages, we have duplicate accounting.  How does userspace
> deal with that?  For instance, if I'm using an mdev device where the
> vendor driver has pinned 512MB of guest memory, then I hot-add an
> assigned NIC and the entire VM address space gets pinned, that pinning
> will fail unless my locked memory limits are at least 512MB in excess
> of my VM size.  Additionally, the user doesn't know how much memory the
> vendor driver is going to pin, it might be the whole VM address space,
> so the user would need 2x the locked memory limits.
>

Is RLIMIT_MEMLOCK set that low? I get your point, though. I'll update
__vfio_pin_pages_remote() so that it checks, within the function itself,
whether a page being pinned has already been accounted.


>>  	}
>>
>>  	return 0;
>> @@ -734,11 +1110,24 @@ static void vfio_test_domain_fgsp(struct
vfio_domain *domain)
>>  	__free_pages(pages, order);
>>  }
>>
>> +static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
>> +				   struct iommu_group *iommu_group)
>> +{
>> +	struct vfio_group *g;
>> +
>> +	list_for_each_entry(g, &domain->group_list, next) {
>> +		if (g->iommu_group == iommu_group)
>> +			return g;
>> +	}
>> +
>> +	return NULL;
>> +}
>
> It would make review easier if changes like splitting this into a
> separate function with no functional change on the calling path could
> be a separate patch.
>

OK.

Thanks,
Kirti

-----------------------------------------------------------------------------------
This email message is for the sole use of the intended recipient(s) and may contain
confidential information.  Any unauthorized review, use, disclosure or distribution
is prohibited.  If you are not the intended recipient, please contact the sender by
reply email and destroy all copies of the original message.
-----------------------------------------------------------------------------------
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Oct. 13, 2016, 5:12 p.m. UTC | #5
On Thu, 13 Oct 2016 20:04:43 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 10/12/2016 3:36 AM, Alex Williamson wrote:
> > On Tue, 11 Oct 2016 01:58:34 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >  
> ...
> 
> 
> >> +static struct vfio_group *vfio_group_from_dev(struct device *dev)
> >> +{
> >> +	struct vfio_device *device;
> >> +	struct vfio_group *group;
> >> +	int ret;
> >> +
> >> +	device = vfio_device_get_from_dev(dev);  
> >
> > Note how this does dev->iommu_group->vfio_group->vfio_device and then
> > we back out one level to get the vfio_group, it's not a terribly
> > lightweight path.  Perhaps we should have:
> >
> > struct vfio_device *vfio_group_get_from_dev(struct device *dev)
> > {
> >         struct iommu_group *iommu_group;
> >         struct vfio_group *group;
> >
> >         iommu_group = iommu_group_get(dev);
> >         if (!iommu_group)
> >                 return NULL;
> >
> >         group = vfio_group_get_from_iommu(iommu_group);
> > 	iommu_group_put(iommu_group);
> >
> > 	return group;
> > }
> >
> > vfio_device_get_from_dev() would make use of this.
> >
> > Then create a separate:
> >
> > static int vfio_group_add_container_user(struct vfio_group *group)
> > {
> >  
> >> +	if (!atomic_inc_not_zero(&group->container_users)) {  
> > 		return -EINVAL;  
> >> +	}
> >> +
> >> +	if (group->noiommu) {
> >> +		atomic_dec(&group->container_users);  
> > 		return -EPERM;  
> >> +	}
> >> +
> >> +	if (!group->container->iommu_driver ||
> >> +	    !vfio_group_viable(group)) {
> >> +		atomic_dec(&group->container_users);  
> > 		return -EINVAL;  
> >> +	}
> >> +  
> > 	return 0;
> > }
> >
> > vfio_group_get_external_user() would be updated to use this.  In fact,
> > creating these two functions and updating the existing code to use
> > these should be a separate patch.
> >  
> 
> Ok. I'll update.
> 
> 
> > Note that your version did not hold a group reference while doing the
> > pin/unpin operations below, which seems like a bug.
> >  
> 
> container->group_lock is held for pin/unpin. I think then we don't have
> to hold the reference to group, because groups are attached and detached
> holding this lock, right?
> 
> 
> >> +
> >> +err_ret:
> >> +	vfio_device_put(device);
> >> +	return ERR_PTR(ret);
> >> +}
> >> +
> >> +/*
> >> + * Pin a set of guest PFNs and return their associated host PFNs for  
> local
> >> + * domain only.
> >> + * @dev [in] : device
> >> + * @user_pfn [in]: array of user/guest PFNs
> >> + * @npage [in]: count of array elements
> >> + * @prot [in] : protection flags
> >> + * @phys_pfn[out] : array of host PFNs
> >> + */
> >> +long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> >> +		    long npage, int prot, unsigned long *phys_pfn)
> >> +{
> >> +	struct vfio_container *container;
> >> +	struct vfio_group *group;
> >> +	struct vfio_iommu_driver *driver;
> >> +	ssize_t ret = -EINVAL;
> >> +
> >> +	if (!dev || !user_pfn || !phys_pfn)
> >> +		return -EINVAL;
> >> +
> >> +	group = vfio_group_from_dev(dev);
> >> +	if (IS_ERR(group))
> >> +		return PTR_ERR(group);  
> >
> > As suggested above:
> >
> > 	group = vfio_group_get_from_dev(dev);
> > 	if (!group)
> > 		return -ENODEV;
> >
> > 	ret = vfio_group_add_container_user(group);
> > 	if (ret) {
> > 		vfio_group_put(group);
> > 		return ret;
> > 	}
> >  
> 
> Ok.
> 
> 
> >> +
> >> +	container = group->container;
> >> +	if (IS_ERR(container))
> >> +		return PTR_ERR(container);
> >> +
> >> +	down_read(&container->group_lock);
> >> +
> >> +	driver = container->iommu_driver;
> >> +	if (likely(driver && driver->ops->pin_pages))
> >> +		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
> >> +					     npage, prot, phys_pfn);
> >> +
> >> +	up_read(&container->group_lock);
> >> +	vfio_group_try_dissolve_container(group);  
> >
> > Even if you're considering that the container_user reference holds the
> > driver, I think we need a group reference throughout this and this
> > should end with a vfio_group_put(group);
> >  
> 
> Same as I mentioned above, container->group_lock is held here.

What allows you to assume that your @group pointer is valid when you
finish with vfio_group_try_dissolve_container()?  You have no reference
to the group, the only device reference you have is the struct device,
not the vfio_device, so that might have been unbound from vfio.  I'm
still inclined to believe you need to hold the reference to the group.

> >> +
> >> +static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned  
> long *pfn,
> >> +					 long npage)
> >> +{
> >> +	struct vfio_iommu *iommu = iommu_data;
> >> +	struct vfio_domain *domain = NULL;
> >> +	long unlocked = 0;
> >> +	int i;
> >> +
> >> +	if (!iommu || !pfn)
> >> +		return -EINVAL;
> >> +  
> >
> > We need iommu->lock here, right?
> >  
> 
> Oh, yes.
> 
> >> +	domain = iommu->local_domain;
> >> +
> >> +	for (i = 0; i < npage; i++) {
> >> +		struct vfio_pfn *p;
> >> +
> >> +		mutex_lock(&domain->local_addr_space->pfn_list_lock);
> >> +
> >> +		/* verify if pfn exist in pfn_list */
> >> +		p = vfio_find_pfn(domain, pfn[i]);
> >> +		if (p)
> >> +			unlocked += vfio_unpin_pfn(domain, p, true);
> >> +
> >> +		mutex_unlock(&domain->local_addr_space->pfn_list_lock);  
> >
> > We hold this mutex outside the loop in the pin unwind case, why is it
> > different here?
> >  
> 
> pin_unwind is an error condition, so it should be done in one go.
> This is not an error case. Holding the lock for a long time could block
> other threads if there are multiple threads.


Ok, iommu->lock will need to be inside the loop then too or else
there's likely no gain anyway.


> >> +static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct  
> vfio_dma *dma,
> >> +			    size_t map_size)
> >> +{
> >> +	dma_addr_t iova = dma->iova;
> >> +	unsigned long vaddr = dma->vaddr;
> >> +	size_t size = map_size, dma_size = 0;
> >> +	long npage;
> >> +	unsigned long pfn;
> >> +	int ret = 0;
> >> +
> >> +	while (size) {
> >> +		/* Pin a contiguous chunk of memory */
> >> +		npage = __vfio_pin_pages_remote(vaddr + dma_size,
> >> +						size >> PAGE_SHIFT, dma->prot,
> >> +						&pfn);
> >> +		if (npage <= 0) {
> >> +			WARN_ON(!npage);
> >> +			ret = (int)npage;
> >> +			break;
> >> +		}
> >> +
> >> +		/* Map it! */
> >> +		ret = vfio_iommu_map(iommu, iova + dma_size, pfn, npage,
> >> +				     dma->prot);
> >> +		if (ret) {
> >> +			__vfio_unpin_pages_remote(pfn, npage, dma->prot, true);
> >> +			break;
> >> +		}
> >> +
> >> +		size -= npage << PAGE_SHIFT;
> >> +		dma_size += npage << PAGE_SHIFT;
> >> +	}
> >> +
> >> +	if (ret)
> >> +		vfio_remove_dma(iommu, dma);  
> >
> >
> > There's a bug introduced here, vfio_remove_dma() needs dma->size to be
> > accurate to the point of failure, it's not updated until the success
> > branch below, so it's never going to unmap/unpin anything.
> >  
> 
> Oops, yes. I'll fix this.
> 
> >> +	else {
> >> +		dma->size = dma_size;
> >> +		dma->iommu_mapped = true;
> >> +		vfio_update_accounting(iommu, dma);  
> >
> > I'm confused how this works, when called from vfio_dma_do_map() we're
> > populating a vfio_dma, that is we're populating part of the iova space
> > of the device.  How could we have pinned pfns in the local address
> > space that overlap that?  It would be invalid to have such pinned pfns
> > since that part of the iova space was not previously mapped.
> >
> > Another issue is that if there were existing overlaps, userspace would
> > need to have locked memory limits sufficient for this temporary double
> > accounting.  I'm not sure how they'd come up with heuristics to handle
> > that since we're potentially looking at the bulk of VM memory in a
> > single vfio_dma entry.
> >  
> 
> I see that when QEMU boots a VM where a vGPU device is attached first
> and a pass through device is attached afterwards, pin and iommu_map()
> are skipped on the first call to vfio_dma_do_map(). Then, when the pass
> through device is attached, all mappings are unmapped and
> vfio_dma_do_map() is called again. At this moment an IOMMU capable
> domain is present, so pin and iommu_map() are done on all system memory.
> If any pages were pinned by the vendor driver between these two device
> attaches, then accounting should be updated.

So that actually points out something that was on my todo list to check
in this patch, when an unmap occurs, we need to invalidate the vendor
driver mappings.  For that period you describe above, the mappings the
vendor driver holds are invalid, we cannot assume that they will
return and certainly cannot assume they will have the same GPA to HVA
mapping.  So the sequence should be that the unmap causes invalidation
of any potential vendor mappings and then there's no reason that pfn
path would need to update accounting on a vfio_dma_do_map(), it should
not be possible that anything is currently pinned within that IOVA
range.

> >> +	}
> >> +
> >> +	return ret;
> >> +}
> >> +
> >>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>  			   struct vfio_iommu_type1_dma_map *map)
> >>  {
> >>  	dma_addr_t iova = map->iova;
> >>  	unsigned long vaddr = map->vaddr;
> >>  	size_t size = map->size;
> >> -	long npage;
> >>  	int ret = 0, prot = 0;
> >>  	uint64_t mask;
> >>  	struct vfio_dma *dma;
> >> -	unsigned long pfn;
> >>
> >>  	/* Verify that none of our __u64 fields overflow */
> >>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> >> @@ -611,29 +981,11 @@ static int vfio_dma_do_map(struct vfio_iommu  
> *iommu,
> >>  	/* Insert zero-sized and grow as we map chunks of it */
> >>  	vfio_link_dma(iommu, dma);
> >>
> >> -	while (size) {
> >> -		/* Pin a contiguous chunk of memory */
> >> -		npage = vfio_pin_pages(vaddr + dma->size,
> >> -				       size >> PAGE_SHIFT, prot, &pfn);
> >> -		if (npage <= 0) {
> >> -			WARN_ON(!npage);
> >> -			ret = (int)npage;
> >> -			break;
> >> -		}
> >> -
> >> -		/* Map it! */
> >> -		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
> >> -		if (ret) {
> >> -			vfio_unpin_pages(pfn, npage, prot, true);
> >> -			break;
> >> -		}
> >> -
> >> -		size -= npage << PAGE_SHIFT;
> >> -		dma->size += npage << PAGE_SHIFT;
> >> -	}
> >> -
> >> -	if (ret)
> >> -		vfio_remove_dma(iommu, dma);
> >> +	/* Don't pin and map if container doesn't contain IOMMU capable  
> domain*/
> >> +	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> >> +		dma->size = size;
> >> +	else
> >> +		ret = vfio_pin_map_dma(iommu, dma, size);
> >>
> >>  	mutex_unlock(&iommu->lock);
> >>  	return ret;
> >> @@ -662,10 +1014,6 @@ static int vfio_iommu_replay(struct vfio_iommu  
> *iommu,
> >>  	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
> >>  	n = rb_first(&iommu->dma_list);
> >>
> >> -	/* If there's not a domain, there better not be any mappings */
> >> -	if (WARN_ON(n && !d))
> >> -		return -EINVAL;
> >> -
> >>  	for (; n; n = rb_next(n)) {
> >>  		struct vfio_dma *dma;
> >>  		dma_addr_t iova;
> >> @@ -674,20 +1022,43 @@ static int vfio_iommu_replay(struct vfio_iommu  
> *iommu,
> >>  		iova = dma->iova;
> >>
> >>  		while (iova < dma->iova + dma->size) {
> >> -			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
> >> +			phys_addr_t phys;
> >>  			size_t size;
> >>
> >> -			if (WARN_ON(!phys)) {
> >> -				iova += PAGE_SIZE;
> >> -				continue;
> >> -			}
> >> +			if (dma->iommu_mapped) {
> >> +				phys = iommu_iova_to_phys(d->domain, iova);
> >> +
> >> +				if (WARN_ON(!phys)) {
> >> +					iova += PAGE_SIZE;
> >> +					continue;
> >> +				}
> >>
> >> -			size = PAGE_SIZE;
> >> +				size = PAGE_SIZE;
> >>
> >> -			while (iova + size < dma->iova + dma->size &&
> >> -			       phys + size == iommu_iova_to_phys(d->domain,
> >> +				while (iova + size < dma->iova + dma->size &&
> >> +				    phys + size == iommu_iova_to_phys(d->domain,
> >>  								 iova + size))
> >> -				size += PAGE_SIZE;
> >> +					size += PAGE_SIZE;
> >> +			} else {
> >> +				unsigned long pfn;
> >> +				unsigned long vaddr = dma->vaddr +
> >> +						     (iova - dma->iova);
> >> +				size_t n = dma->iova + dma->size - iova;
> >> +				long npage;
> >> +
> >> +				npage = __vfio_pin_pages_remote(vaddr,
> >> +								n >> PAGE_SHIFT,
> >> +								dma->prot,
> >> +								&pfn);
> >> +				if (npage <= 0) {
> >> +					WARN_ON(!npage);
> >> +					ret = (int)npage;
> >> +					return ret;
> >> +				}
> >> +
> >> +				phys = pfn << PAGE_SHIFT;
> >> +				size = npage << PAGE_SHIFT;
> >> +			}
> >>
> >>  			ret = iommu_map(domain->domain, iova, phys,
> >>  					size, dma->prot | domain->prot);
> >> @@ -696,6 +1067,11 @@ static int vfio_iommu_replay(struct vfio_iommu  
> *iommu,
> >>
> >>  			iova += size;
> >>  		}
> >> +
> >> +		if (!dma->iommu_mapped) {
> >> +			dma->iommu_mapped = true;
> >> +			vfio_update_accounting(iommu, dma);
> >> +		}  
> >
> > This is the case where we potentially have pinned pfns and we've added
> > an iommu mapped device and need to adjust accounting.  But we've fully
> > pinned and accounted the entire iommu mapped space while still holding
> > the accounting for any pfn mapped space.  So for a time, assuming some
> > pfn pinned pages, we have duplicate accounting.  How does userspace
> > deal with that?  For instance, if I'm using an mdev device where the
> > vendor driver has pinned 512MB of guest memory, then I hot-add an
> > assigned NIC and the entire VM address space gets pinned, that pinning
> > will fail unless my locked memory limits are at least 512MB in excess
> > of my VM size.  Additionally, the user doesn't know how much memory the
> > vendor driver is going to pin, it might be the whole VM address space,
> > so the user would need 2x the locked memory limits.
> >  
> 
> Is the RLIMIT_MEMLOCK set so low? I got your point. I'll update
> __vfio_pin_pages_remote() to check if page which is pinned is already
> accounted in __vfio_pin_pages_remote() itself.

I believe we currently support running a VM with RLIMIT set to exactly
the VM memory size.  We should not regress from that.  libvirt provides
a small "fudge factor", but we shouldn't count on it.
 
> >>  	}
> >>
> >>  	return 0;
> >> @@ -734,11 +1110,24 @@ static void vfio_test_domain_fgsp(struct  
> vfio_domain *domain)
> >>  	__free_pages(pages, order);
> >>  }
> >>
> >> +static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
> >> +				   struct iommu_group *iommu_group)
> >> +{
> >> +	struct vfio_group *g;
> >> +
> >> +	list_for_each_entry(g, &domain->group_list, next) {
> >> +		if (g->iommu_group == iommu_group)
> >> +			return g;
> >> +	}
> >> +
> >> +	return NULL;
> >> +}  
> >
> > It would make review easier if changes like splitting this into a
> > separate function with no functional change on the calling path could
> > be a separate patch.
> >  
> 
> OK.
> 
> Thanks,
> Kirti
> 
> -----------------------------------------------------------------------------------
> This email message is for the sole use of the intended recipient(s) and may contain
> confidential information.  Any unauthorized review, use, disclosure or distribution
> is prohibited.  If you are not the intended recipient, please contact the sender by
> reply email and destroy all copies of the original message.
> -----------------------------------------------------------------------------------

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kirti Wankhede Oct. 14, 2016, 11:35 a.m. UTC | #6
On 10/12/2016 4:01 PM, Tian, Kevin wrote:
>> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
>> Sent: Tuesday, October 11, 2016 4:29 AM
>>
> [...]
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 2ba19424e4a1..ce6d6dcbd9a8 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -55,18 +55,26 @@ MODULE_PARM_DESC(disable_hugepages,
>>
>>  struct vfio_iommu {
>>  	struct list_head	domain_list;
>> +	struct vfio_domain	*local_domain;
> 
> Hi, Kirti, can you help explain the meaning of 'local' here? I have a hard time
> understanding its intention... In your later change of vaddr_get_pfn, it's
> even more confusing where get_user_pages_remote is used on a 'local_mm':
> 

'local' in local_domain describes that the domain is used for local page
tracking. 'local_mm' in vaddr_get_pfn() is a local variable of the
vaddr_get_pfn() function.
    struct mm_struct *local_mm = (mm ? mm : current->mm);


> +	if (mm) {
> +		down_read(&local_mm->mmap_sem);
> +		ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
> +					!!(prot & IOMMU_WRITE), 0, page, NULL);
> +		up_read(&local_mm->mmap_sem);
> +	} else
> +		ret = get_user_pages_fast(vaddr, 1,
> +					  !!(prot & IOMMU_WRITE), page);
> 
> 
> [...]
>> -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>> +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>> +			 int prot, unsigned long *pfn)
>>  {
>>  	struct page *page[1];
>>  	struct vm_area_struct *vma;
>> +	struct mm_struct *local_mm = (mm ? mm : current->mm);
> 
> it'd be clearer if you call this variable as 'mm' while the earlier input parameter
> as 'local_mm'.
> 

Like I mentioned above, 'local' here is for local variable in this
function.

>>  	int ret = -EFAULT;
>>
>> -	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
>> +	if (mm) {
>> +		down_read(&local_mm->mmap_sem);
>> +		ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
>> +					!!(prot & IOMMU_WRITE), 0, page, NULL);
>> +		up_read(&local_mm->mmap_sem);
>> +	} else
>> +		ret = get_user_pages_fast(vaddr, 1,
>> +					  !!(prot & IOMMU_WRITE), page);
>> +
>> +	if (ret == 1) {
>>  		*pfn = page_to_pfn(page[0]);
>>  		return 0;
>>  	}
>>
>> -	down_read(&current->mm->mmap_sem);
>> +	down_read(&local_mm->mmap_sem);
>>
>> -	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
>> +	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
>>
>>  	if (vma && vma->vm_flags & VM_PFNMAP) {
>>  		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> 
> [...]
>> +static long __vfio_pin_pages_local(struct vfio_domain *domain,
>> +				   unsigned long vaddr, int prot,
>> +				   unsigned long *pfn_base,
>> +				   bool do_accounting)
> 
> 'pages' -> 'page' since only one page is handled here.
> 
> [...]
>> +
>> +static void __vfio_unpin_pages_local(struct vfio_domain *domain,
>> +				     unsigned long pfn, int prot,
>> +				     bool do_accounting)
> 
> ditto
> 

Ok.

>> +{
>> +	put_pfn(pfn, prot);
>> +
>> +	if (do_accounting)
>> +		vfio_lock_acct(domain->local_addr_space->task, -1);
>> +}
>> +
>> +static int vfio_unpin_pfn(struct vfio_domain *domain,
>> +			  struct vfio_pfn *vpfn, bool do_accounting)
>> +{
>> +	__vfio_unpin_pages_local(domain, vpfn->pfn, vpfn->prot,
>> +				 do_accounting);
>> +
>> +	if (atomic_dec_and_test(&vpfn->ref_count))
>> +		vfio_remove_from_pfn_list(domain, vpfn);
>> +
>> +	return 1;
>> +}
>> +
>> +static long vfio_iommu_type1_pin_pages(void *iommu_data,
>> +				       unsigned long *user_pfn,
>> +				       long npage, int prot,
>> +				       unsigned long *phys_pfn)
>> +{
>> +	struct vfio_iommu *iommu = iommu_data;
>> +	struct vfio_domain *domain;
>> +	int i, j, ret;
>> +	long retpage;
>> +	unsigned long remote_vaddr;
>> +	unsigned long *pfn = phys_pfn;
>> +	struct vfio_dma *dma;
>> +	bool do_accounting = false;
>> +
>> +	if (!iommu || !user_pfn || !phys_pfn)
>> +		return -EINVAL;
>> +
>> +	mutex_lock(&iommu->lock);
>> +
>> +	if (!iommu->local_domain) {
>> +		ret = -EINVAL;
>> +		goto pin_done;
>> +	}
>> +
>> +	domain = iommu->local_domain;
>> +
>> +	/*
>> +	 * If iommu capable domain exist in the container then all pages are
>> +	 * already pinned and accounted. Accouting should be done if there is no
>> +	 * iommu capable domain in the container.
>> +	 */
>> +	do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
>> +
>> +	for (i = 0; i < npage; i++) {
>> +		struct vfio_pfn *p;
>> +		dma_addr_t iova;
>> +
>> +		iova = user_pfn[i] << PAGE_SHIFT;
>> +
>> +		dma = vfio_find_dma(iommu, iova, 0);
>> +		if (!dma) {
>> +			ret = -EINVAL;
>> +			goto pin_unwind;
>> +		}
>> +
>> +		remote_vaddr = dma->vaddr + iova - dma->iova;
> 
> again, why "remote"_vaddr on a 'local' function?
> 

It's not a 'local' function; the 'local' refers to local_domain.
__vfio_pin_pages_local() pins pages for local_domain. When this function is
called from a process other than the QEMU process, the vaddr from the QEMU
process is a remote vaddr for the caller.


>> +
>> +		retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
>> +						 &pfn[i], do_accounting);
>> +		if (retpage <= 0) {
>> +			WARN_ON(!retpage);
>> +			ret = (int)retpage;
>> +			goto pin_unwind;
>> +		}
>> +
>> +		mutex_lock(&domain->local_addr_space->pfn_list_lock);
>> +
>> +		/* search if pfn exist */
>> +		p = vfio_find_pfn(domain, pfn[i]);
>> +		if (p) {
>> +			atomic_inc(&p->ref_count);
>> +			mutex_unlock(&domain->local_addr_space->pfn_list_lock);
>> +			continue;
>> +		}
>> +
>> +		ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova,
>> +					   pfn[i], prot);
>> +		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
>> +
>> +		if (ret) {
>> +			__vfio_unpin_pages_local(domain, pfn[i], prot,
>> +						 do_accounting);
>> +			goto pin_unwind;
>> +		}
>> +	}
>> +
>> +	ret = i;
>> +	goto pin_done;
>> +
>> +pin_unwind:
>> +	pfn[i] = 0;
>> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
>> +	for (j = 0; j < i; j++) {
>> +		struct vfio_pfn *p;
>> +
>> +		p = vfio_find_pfn(domain, pfn[j]);
>> +		if (p)
>> +			vfio_unpin_pfn(domain, p, do_accounting);
>> +
>> +		pfn[j] = 0;
>> +	}
>> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
>> +
>> +pin_done:
>> +	mutex_unlock(&iommu->lock);
>> +	return ret;
>> +}
>> +
>> +static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned long *pfn,
>> +					 long npage)
>> +{
>> +	struct vfio_iommu *iommu = iommu_data;
>> +	struct vfio_domain *domain = NULL;
>> +	long unlocked = 0;
>> +	int i;
>> +
>> +	if (!iommu || !pfn)
>> +		return -EINVAL;
>> +
> 
> acquire iommu lock...
> 

Yes, Alex has pointed this out and I'm going to fix it in v9.

>> +	domain = iommu->local_domain;
>> +
>> +	for (i = 0; i < npage; i++) {
>> +		struct vfio_pfn *p;
>> +
>> +		mutex_lock(&domain->local_addr_space->pfn_list_lock);
>> +
>> +		/* verify if pfn exist in pfn_list */
>> +		p = vfio_find_pfn(domain, pfn[i]);
>> +		if (p)
>> +			unlocked += vfio_unpin_pfn(domain, p, true);
> 
> Should we force update accounting here even when there is an iommu capable
> domain? It's not consistent with the earlier pin_pages.
> 

Yes, fixing this.

>> +
>> +		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
>> +	}
>>
>>  	return unlocked;
>>  }
>> @@ -341,6 +636,12 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct
>> vfio_dma *dma)
>>
>>  	if (!dma->size)
>>  		return;
>> +
>> +	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
>> +		return;
> 
> Is the above check redundant with the following dma->iommu_mapped check?
> 

I'm going to remove dma->iommu_mapped and change the accounting code as
per Alex's comments and the problem that he pointed out.

Thanks,
Kirti

>> +
>> +	if (!dma->iommu_mapped)
>> +		return;
>>  	/*
>>  	 * We use the IOMMU to track the physical addresses, otherwise we'd
>>  	 * need a much more complicated tracking system.  Unfortunately that
> 
> Thanks
> Kevin
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tian, Kevin Oct. 14, 2016, 12:29 p.m. UTC | #7
> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> Sent: Friday, October 14, 2016 7:36 PM
> 
> 
> On 10/12/2016 4:01 PM, Tian, Kevin wrote:
> >> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> >> Sent: Tuesday, October 11, 2016 4:29 AM
> >>
> > [...]
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 2ba19424e4a1..ce6d6dcbd9a8 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -55,18 +55,26 @@ MODULE_PARM_DESC(disable_hugepages,
> >>
> >>  struct vfio_iommu {
> >>  	struct list_head	domain_list;
> >> +	struct vfio_domain	*local_domain;
> >
> > Hi, Kirti, can you help explain the meaning of 'local' here? I have a hard time
> > understanding its intention... In your later change of vaddr_get_pfn, it's
> > even more confusing where get_user_pages_remote is used on a 'local_mm':
> >
> 
> 'local' in local_domain is to describe that the domain for local page
> tracking. 'local_mm' in vaddr_get_pfn() is local variable in
> vaddr_get_pfn() function.
>     struct mm_struct *local_mm = (mm ? mm : current->mm);
> 

Does 'local page tracking' mean tracking logic that is local to VFIO? Then
when we say 'remote page tracking', who is remote? I would appreciate a code
comment describing this definition, otherwise it's easily confusing when
'local' sometimes means who does the page tracking, while other times
it just means a local variable. At least when I read this patch, the
immediate impression is that local_mm belongs to local_domain. :-)

Thanks
Kevin
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 6fd6fa5469de..e3e342861e04 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1782,6 +1782,123 @@  void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
 }
 EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
 
+static struct vfio_group *vfio_group_from_dev(struct device *dev)
+{
+	struct vfio_device *device;
+	struct vfio_group *group;
+	int ret;
+
+	device = vfio_device_get_from_dev(dev);
+	if (!device)
+		return ERR_PTR(-EINVAL);
+
+	group = device->group;
+	if (!atomic_inc_not_zero(&group->container_users)) {
+		ret = -EINVAL;
+		goto err_ret;
+	}
+
+	if (group->noiommu) {
+		atomic_dec(&group->container_users);
+		ret = -EPERM;
+		goto err_ret;
+	}
+
+	if (!group->container->iommu_driver ||
+	    !vfio_group_viable(group)) {
+		atomic_dec(&group->container_users);
+		ret = -EINVAL;
+		goto err_ret;
+	}
+
+	vfio_device_put(device);
+	return group;
+
+err_ret:
+	vfio_device_put(device);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Pin a set of guest PFNs and return their associated host PFNs for local
+ * domain only.
+ * @dev [in] : device
+ * @user_pfn [in]: array of user/guest PFNs
+ * @npage [in]: count of array elements
+ * @prot [in] : protection flags
+ * @phys_pfn[out] : array of host PFNs
+ */
+long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
+		    long npage, int prot, unsigned long *phys_pfn)
+{
+	struct vfio_container *container;
+	struct vfio_group *group;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret = -EINVAL;
+
+	if (!dev || !user_pfn || !phys_pfn)
+		return -EINVAL;
+
+	group = vfio_group_from_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	container = group->container;
+	if (IS_ERR(container))
+		return PTR_ERR(container);
+
+	down_read(&container->group_lock);
+
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->pin_pages))
+		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
+					     npage, prot, phys_pfn);
+
+	up_read(&container->group_lock);
+	vfio_group_try_dissolve_container(group);
+
+	return ret;
+
+}
+EXPORT_SYMBOL(vfio_pin_pages);
+
+/*
+ * Unpin set of host PFNs for local domain only.
+ * @dev [in] : device
+ * @pfn [in] : array of host PFNs to be unpinned.
+ * @npage [in] :count of elements in array, that is number of pages.
+ */
+long vfio_unpin_pages(struct device *dev, unsigned long *pfn, long npage)
+{
+	struct vfio_container *container;
+	struct vfio_group *group;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret = -EINVAL;
+
+	if (!dev || !pfn)
+		return -EINVAL;
+
+	group = vfio_group_from_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	container = group->container;
+	if (IS_ERR(container))
+		return PTR_ERR(container);
+
+	down_read(&container->group_lock);
+
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->unpin_pages))
+		ret = driver->ops->unpin_pages(container->iommu_data, pfn,
+					       npage);
+
+	up_read(&container->group_lock);
+	vfio_group_try_dissolve_container(group);
+	return ret;
+}
+EXPORT_SYMBOL(vfio_unpin_pages);
+
 /**
  * Module/class support
  */
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 2ba19424e4a1..ce6d6dcbd9a8 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -55,18 +55,26 @@  MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
 	struct list_head	domain_list;
+	struct vfio_domain	*local_domain;
 	struct mutex		lock;
 	struct rb_root		dma_list;
 	bool			v2;
 	bool			nesting;
 };
 
+struct local_addr_space {
+	struct task_struct	*task;
+	struct rb_root		pfn_list;	/* pinned Host pfn list */
+	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
+};
+
 struct vfio_domain {
 	struct iommu_domain	*domain;
 	struct list_head	next;
 	struct list_head	group_list;
 	int			prot;		/* IOMMU_CACHE */
 	bool			fgsp;		/* Fine-grained super pages */
+	struct local_addr_space	*local_addr_space;
 };
 
 struct vfio_dma {
@@ -75,6 +83,7 @@  struct vfio_dma {
 	unsigned long		vaddr;		/* Process virtual addr */
 	size_t			size;		/* Map size (bytes) */
 	int			prot;		/* IOMMU_READ/WRITE */
+	bool			iommu_mapped;
 };
 
 struct vfio_group {
@@ -83,6 +92,22 @@  struct vfio_group {
 };
 
 /*
+ * Guest RAM pinning working set or DMA target
+ */
+struct vfio_pfn {
+	struct rb_node		node;
+	unsigned long		vaddr;		/* virtual addr */
+	dma_addr_t		iova;		/* IOVA */
+	unsigned long		pfn;		/* Host pfn */
+	size_t			prot;
+	atomic_t		ref_count;
+};
+
+
+#define IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)	\
+			 (list_empty(&iommu->domain_list) ? false : true)
+
+/*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
  */
@@ -130,6 +155,84 @@  static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 	rb_erase(&old->node, &iommu->dma_list);
 }
 
+/*
+ * Helper Functions for host pfn list
+ */
+
+static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
+				      unsigned long pfn)
+{
+	struct rb_node *node;
+	struct vfio_pfn *vpfn, *ret = NULL;
+
+	node = domain->local_addr_space->pfn_list.rb_node;
+
+	while (node) {
+		vpfn = rb_entry(node, struct vfio_pfn, node);
+
+		if (pfn < vpfn->pfn)
+			node = node->rb_left;
+		else if (pfn > vpfn->pfn)
+			node = node->rb_right;
+		else {
+			ret = vpfn;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
+{
+	struct rb_node **link, *parent = NULL;
+	struct vfio_pfn *vpfn;
+
+	link = &domain->local_addr_space->pfn_list.rb_node;
+	while (*link) {
+		parent = *link;
+		vpfn = rb_entry(parent, struct vfio_pfn, node);
+
+		if (new->pfn < vpfn->pfn)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &domain->local_addr_space->pfn_list);
+}
+
+static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
+{
+	rb_erase(&old->node, &domain->local_addr_space->pfn_list);
+}
+
+static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr,
+				dma_addr_t iova, unsigned long pfn, size_t prot)
+{
+	struct vfio_pfn *vpfn;
+
+	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
+	if (!vpfn)
+		return -ENOMEM;
+
+	vpfn->vaddr = vaddr;
+	vpfn->iova = iova;
+	vpfn->pfn = pfn;
+	vpfn->prot = prot;
+	atomic_set(&vpfn->ref_count, 1);
+	vfio_link_pfn(domain, vpfn);
+	return 0;
+}
+
+static void vfio_remove_from_pfn_list(struct vfio_domain *domain,
+				      struct vfio_pfn *vpfn)
+{
+	vfio_unlink_pfn(domain, vpfn);
+	kfree(vpfn);
+}
+
 struct vwork {
 	struct mm_struct	*mm;
 	long			npage;
@@ -150,17 +253,17 @@  static void vfio_lock_acct_bg(struct work_struct *work)
 	kfree(vwork);
 }
 
-static void vfio_lock_acct(long npage)
+static void vfio_lock_acct(struct task_struct *task, long npage)
 {
 	struct vwork *vwork;
 	struct mm_struct *mm;
 
-	if (!current->mm || !npage)
+	if (!task->mm || !npage)
 		return; /* process exited or nothing to do */
 
-	if (down_write_trylock(&current->mm->mmap_sem)) {
-		current->mm->locked_vm += npage;
-		up_write(&current->mm->mmap_sem);
+	if (down_write_trylock(&task->mm->mmap_sem)) {
+		task->mm->locked_vm += npage;
+		up_write(&task->mm->mmap_sem);
 		return;
 	}
 
@@ -172,7 +275,7 @@  static void vfio_lock_acct(long npage)
 	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
 	if (!vwork)
 		return;
-	mm = get_task_mm(current);
+	mm = get_task_mm(task);
 	if (!mm) {
 		kfree(vwork);
 		return;
@@ -228,20 +331,31 @@  static int put_pfn(unsigned long pfn, int prot)
 	return 0;
 }
 
-static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
+static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
+			 int prot, unsigned long *pfn)
 {
 	struct page *page[1];
 	struct vm_area_struct *vma;
+	struct mm_struct *local_mm = (mm ? mm : current->mm);
 	int ret = -EFAULT;
 
-	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
+	if (mm) {
+		down_read(&local_mm->mmap_sem);
+		ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
+					!!(prot & IOMMU_WRITE), 0, page, NULL);
+		up_read(&local_mm->mmap_sem);
+	} else
+		ret = get_user_pages_fast(vaddr, 1,
+					  !!(prot & IOMMU_WRITE), page);
+
+	if (ret == 1) {
 		*pfn = page_to_pfn(page[0]);
 		return 0;
 	}
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&local_mm->mmap_sem);
 
-	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
 
 	if (vma && vma->vm_flags & VM_PFNMAP) {
 		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -249,7 +363,7 @@  static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 			ret = 0;
 	}
 
-	up_read(&current->mm->mmap_sem);
+	up_read(&local_mm->mmap_sem);
 
 	return ret;
 }
@@ -259,8 +373,8 @@  static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
  * the iommu can only map chunks of consecutive pfns anyway, so get the
  * first page and all consecutive pages with the same locking.
  */
-static long vfio_pin_pages(unsigned long vaddr, long npage,
-			   int prot, unsigned long *pfn_base)
+static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
+				    int prot, unsigned long *pfn_base)
 {
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	bool lock_cap = capable(CAP_IPC_LOCK);
@@ -270,7 +384,7 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 	if (!current->mm)
 		return -ENODEV;
 
-	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
+	ret = vaddr_get_pfn(NULL, vaddr, prot, pfn_base);
 	if (ret)
 		return ret;
 
@@ -285,7 +399,7 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 
 	if (unlikely(disable_hugepages)) {
 		if (!rsvd)
-			vfio_lock_acct(1);
+			vfio_lock_acct(current, 1);
 		return 1;
 	}
 
@@ -293,7 +407,7 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 		unsigned long pfn = 0;
 
-		ret = vaddr_get_pfn(vaddr, prot, &pfn);
+		ret = vaddr_get_pfn(NULL, vaddr, prot, &pfn);
 		if (ret)
 			break;
 
@@ -313,13 +427,13 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 	}
 
 	if (!rsvd)
-		vfio_lock_acct(i);
+		vfio_lock_acct(current, i);
 
 	return i;
 }
 
-static long vfio_unpin_pages(unsigned long pfn, long npage,
-			     int prot, bool do_accounting)
+static long __vfio_unpin_pages_remote(unsigned long pfn, long npage, int prot,
+				      bool do_accounting)
 {
 	unsigned long unlocked = 0;
 	long i;
@@ -328,7 +442,188 @@  static long vfio_unpin_pages(unsigned long pfn, long npage,
 		unlocked += put_pfn(pfn++, prot);
 
 	if (do_accounting)
-		vfio_lock_acct(-unlocked);
+		vfio_lock_acct(current, -unlocked);
+	return unlocked;
+}
+
+/*
+ * Pin a single page of the task cached in @domain's local address space.
+ *
+ * @domain:        local (mdev) domain whose cached task owns @vaddr
+ * @vaddr:         virtual address, in that task's address space, to pin
+ * @prot:          IOMMU_* protection flags for the pin
+ * @pfn_base:      filled with the pfn backing @vaddr on success
+ * @do_accounting: charge the page to the task's locked_vm when true
+ *
+ * Returns 1 on success, -ENODEV if the task no longer has an mm, -ENOMEM
+ * if the task's RLIMIT_MEMLOCK would be exceeded, or the error returned by
+ * vaddr_get_pfn().
+ */
+static long __vfio_pin_pages_local(struct vfio_domain *domain,
+				   unsigned long vaddr, int prot,
+				   unsigned long *pfn_base,
+				   bool do_accounting)
+{
+	struct task_struct *task = domain->local_addr_space->task;
+	/*
+	 * The pin is charged to @task, so the limit must be @task's limit
+	 * and not that of whichever thread happens to call us.
+	 */
+	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	/*
+	 * NOTE(review): capable() still checks the caller's credentials.
+	 * has_capability(task, CAP_IPC_LOCK) would be the per-task check;
+	 * confirm it is usable from this module before switching.
+	 */
+	bool lock_cap = capable(CAP_IPC_LOCK);
+	struct mm_struct *mm;
+	long ret;
+	bool rsvd;
+
+	/* Hold a reference so the mm cannot go away while we use it. */
+	mm = get_task_mm(task);
+	if (!mm)
+		return -ENODEV;
+
+	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
+	if (ret)
+		goto out_put_mm;
+
+	rsvd = is_invalid_reserved_pfn(*pfn_base);
+
+	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
+		put_pfn(*pfn_base, prot);
+		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
+			limit << PAGE_SHIFT);
+		ret = -ENOMEM;
+		goto out_put_mm;
+	}
+
+	if (!rsvd && do_accounting)
+		vfio_lock_acct(task, 1);
+
+	ret = 1;
+
+out_put_mm:
+	mmput(mm);
+	return ret;
+}
+
+/*
+ * Release one page that was pinned through the local (mdev) path.
+ *
+ * The page reference is dropped with put_pfn(); when @do_accounting is
+ * set, one page is also subtracted from the locked-memory accounting of
+ * the task cached in @domain's local address space.
+ */
+static void __vfio_unpin_pages_local(struct vfio_domain *domain,
+				     unsigned long pfn, int prot,
+				     bool do_accounting)
+{
+	put_pfn(pfn, prot);
+
+	if (!do_accounting)
+		return;
+
+	vfio_lock_acct(domain->local_addr_space->task, -1);
+}
+
+/*
+ * Drop one pin reference on the tracked page @vpfn.
+ *
+ * The underlying page is unpinned once per call; the tracking entry is
+ * removed from @domain's pfn list only when its reference count reaches
+ * zero.  Always returns 1, the number of pages unpinned.
+ */
+static int vfio_unpin_pfn(struct vfio_domain *domain,
+			  struct vfio_pfn *vpfn, bool do_accounting)
+{
+	bool last_ref;
+
+	__vfio_unpin_pages_local(domain, vpfn->pfn, vpfn->prot,
+				 do_accounting);
+
+	last_ref = atomic_dec_and_test(&vpfn->ref_count);
+	if (last_ref)
+		vfio_remove_from_pfn_list(domain, vpfn);
+
+	return 1;
+}
+
+/*
+ * Pin a set of user pfns on behalf of a mediated device.
+ *
+ * @iommu_data: the struct vfio_iommu of the container
+ * @user_pfn:   array of @npage user pfns (iova >> PAGE_SHIFT) to pin
+ * @npage:      number of entries in @user_pfn and @phys_pfn
+ * @prot:       IOMMU_* protection flags for the pins
+ * @phys_pfn:   output array, filled with the pfn backing each entry
+ *
+ * Every pinned pfn is tracked in the local domain's pfn list; pinning a
+ * pfn that is already tracked only bumps its reference count.
+ *
+ * Returns the number of pages pinned (@npage) on success.  On failure a
+ * negative errno is returned, every page pinned by this call is unpinned
+ * again and the corresponding @phys_pfn entries are zeroed.
+ */
+static long vfio_iommu_type1_pin_pages(void *iommu_data,
+				       unsigned long *user_pfn,
+				       long npage, int prot,
+				       unsigned long *phys_pfn)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain;
+	/* Indices are long to match @npage and avoid int overflow. */
+	long i, j, ret;
+	long retpage;
+	unsigned long remote_vaddr;
+	unsigned long *pfn = phys_pfn;
+	struct vfio_dma *dma;
+	bool do_accounting = false;
+
+	if (!iommu || !user_pfn || !phys_pfn)
+		return -EINVAL;
+
+	mutex_lock(&iommu->lock);
+
+	if (!iommu->local_domain) {
+		ret = -EINVAL;
+		goto pin_done;
+	}
+
+	domain = iommu->local_domain;
+
+	/*
+	 * If an IOMMU capable domain exists in the container then all pages
+	 * are already pinned and accounted. Accounting should only be done
+	 * here if there is no IOMMU capable domain in the container.
+	 */
+	do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p;
+		dma_addr_t iova;
+
+		iova = user_pfn[i] << PAGE_SHIFT;
+
+		/* The iova must fall inside a mapping set up by userspace. */
+		dma = vfio_find_dma(iommu, iova, 0);
+		if (!dma) {
+			ret = -EINVAL;
+			goto pin_unwind;
+		}
+
+		/* Translate the iova to the owning task's virtual address. */
+		remote_vaddr = dma->vaddr + iova - dma->iova;
+
+		retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
+						 &pfn[i], do_accounting);
+		if (retpage <= 0) {
+			WARN_ON(!retpage);
+			ret = retpage;
+			goto pin_unwind;
+		}
+
+		mutex_lock(&domain->local_addr_space->pfn_list_lock);
+
+		/* Already tracked: just take another reference. */
+		p = vfio_find_pfn(domain, pfn[i]);
+		if (p) {
+			atomic_inc(&p->ref_count);
+			mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+			continue;
+		}
+
+		ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova,
+					   pfn[i], prot);
+		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+
+		if (ret) {
+			/* Not tracked yet, so undo this pin by hand. */
+			__vfio_unpin_pages_local(domain, pfn[i], prot,
+						 do_accounting);
+			goto pin_unwind;
+		}
+	}
+
+	ret = i;
+	goto pin_done;
+
+pin_unwind:
+	/* Entry i failed; entries [0, i) were pinned and must be released. */
+	pfn[i] = 0;
+	mutex_lock(&domain->local_addr_space->pfn_list_lock);
+	for (j = 0; j < i; j++) {
+		struct vfio_pfn *p;
+
+		p = vfio_find_pfn(domain, pfn[j]);
+		if (p)
+			vfio_unpin_pfn(domain, p, do_accounting);
+
+		pfn[j] = 0;
+	}
+	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+
+pin_done:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+/*
+ * Unpin pages previously pinned through vfio_iommu_type1_pin_pages().
+ *
+ * @iommu_data: the struct vfio_iommu of the container
+ * @pfn:        array of @npage host pfns to unpin
+ * @npage:      number of entries in @pfn
+ *
+ * Only pfns found in the local domain's pfn list are unpinned; unknown
+ * pfns are silently skipped.
+ *
+ * Returns the number of pages unpinned, or -EINVAL on bad arguments or if
+ * the container has no local (mdev) domain.
+ */
+static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned long *pfn,
+					 long npage)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain = NULL;
+	long unlocked = 0;
+	bool do_accounting;
+	long i;
+
+	if (!iommu || !pfn)
+		return -EINVAL;
+
+	/* Keep the local domain from being torn down underneath us. */
+	mutex_lock(&iommu->lock);
+
+	domain = iommu->local_domain;
+	if (!domain) {
+		unlocked = -EINVAL;
+		goto unpin_exit;
+	}
+
+	/*
+	 * Mirror the pin path: pages are only accounted here when no IOMMU
+	 * capable domain exists in the container, so only then may they be
+	 * un-accounted.
+	 */
+	do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p;
+
+		mutex_lock(&domain->local_addr_space->pfn_list_lock);
+
+		/* verify if pfn exist in pfn_list */
+		p = vfio_find_pfn(domain, pfn[i]);
+		if (p)
+			unlocked += vfio_unpin_pfn(domain, p, do_accounting);
+
+		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+	}
+
+unpin_exit:
+	mutex_unlock(&iommu->lock);
 
 	return unlocked;
 }
@@ -341,6 +636,12 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 
 	if (!dma->size)
 		return;
+
+	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
+		return;
+
+	if (!dma->iommu_mapped)
+		return;
 	/*
 	 * We use the IOMMU to track the physical addresses, otherwise we'd
 	 * need a much more complicated tracking system.  Unfortunately that
@@ -382,15 +683,16 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		if (WARN_ON(!unmapped))
 			break;
 
-		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
-					     unmapped >> PAGE_SHIFT,
-					     dma->prot, false);
+		unlocked += __vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
+						      unmapped >> PAGE_SHIFT,
+						      dma->prot, false);
 		iova += unmapped;
 
 		cond_resched();
 	}
 
-	vfio_lock_acct(-unlocked);
+	dma->iommu_mapped = false;
+	vfio_lock_acct(current, -unlocked);
 }
 
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
@@ -558,17 +860,85 @@  unwind:
 	return ret;
 }
 
+/*
+ * Remove double accounting after @dma has been pinned through the IOMMU
+ * path.
+ *
+ * Counts the entries in the local domain's pfn list whose iova lies inside
+ * [@dma->iova, @dma->iova + @dma->size) and subtracts that many pages from
+ * the locked-memory accounting.  Does nothing if the container has no
+ * local domain.
+ *
+ * NOTE(review): the count is debited from current, whereas local pins are
+ * charged to local_addr_space->task; confirm these are always the same mm.
+ */
+static void vfio_update_accounting(struct vfio_iommu *iommu,
+				   struct vfio_dma *dma)
+{
+	struct vfio_domain *domain = iommu->local_domain;
+	struct rb_node *n;
+	long locked = 0;
+
+	if (!iommu->local_domain)
+		return;
+
+	mutex_lock(&domain->local_addr_space->pfn_list_lock);
+
+	n = rb_first(&domain->local_addr_space->pfn_list);
+
+	for (; n; n = rb_next(n)) {
+		struct vfio_pfn *vpfn;
+
+		vpfn = rb_entry(n, struct vfio_pfn, node);
+
+		if ((vpfn->iova >= dma->iova) &&
+		    (vpfn->iova < dma->iova + dma->size))
+			locked++;
+	}
+	vfio_lock_acct(current, -locked);
+	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+}
+
+/*
+ * Pin and IOMMU-map the first @map_size bytes of @dma.
+ *
+ * Memory is pinned in physically contiguous chunks, each of which is then
+ * mapped at the matching iova.
+ *
+ * Returns 0 on success.  On failure a negative errno is returned and @dma
+ * has been removed with vfio_remove_dma(), so the caller must not touch it
+ * again.
+ */
+static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			    size_t map_size)
+{
+	dma_addr_t iova = dma->iova;
+	unsigned long vaddr = dma->vaddr;
+	size_t size = map_size, dma_size = 0;
+	long npage;
+	unsigned long pfn;
+	int ret = 0;
+
+	while (size) {
+		/* Pin a contiguous chunk of memory */
+		npage = __vfio_pin_pages_remote(vaddr + dma_size,
+						size >> PAGE_SHIFT, dma->prot,
+						&pfn);
+		if (npage <= 0) {
+			WARN_ON(!npage);
+			ret = (int)npage;
+			break;
+		}
+
+		/* Map it! */
+		ret = vfio_iommu_map(iommu, iova + dma_size, pfn, npage,
+				     dma->prot);
+		if (ret) {
+			__vfio_unpin_pages_remote(pfn, npage, dma->prot, true);
+			break;
+		}
+
+		size -= npage << PAGE_SHIFT;
+		dma_size += npage << PAGE_SHIFT;
+	}
+
+	/*
+	 * Record what was actually mapped before deciding how to finish.
+	 * vfio_unmap_unpin() bails out early on a zero size or when the dma
+	 * is not marked iommu_mapped, so without this a failure part-way
+	 * through would leak every chunk pinned and mapped so far.
+	 */
+	dma->size = dma_size;
+	dma->iommu_mapped = true;
+
+	if (ret)
+		vfio_remove_dma(iommu, dma);
+	else
+		vfio_update_accounting(iommu, dma);
+
+	return ret;
+}
+
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
 			   struct vfio_iommu_type1_dma_map *map)
 {
 	dma_addr_t iova = map->iova;
 	unsigned long vaddr = map->vaddr;
 	size_t size = map->size;
-	long npage;
 	int ret = 0, prot = 0;
 	uint64_t mask;
 	struct vfio_dma *dma;
-	unsigned long pfn;
 
 	/* Verify that none of our __u64 fields overflow */
 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
@@ -611,29 +981,11 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	/* Insert zero-sized and grow as we map chunks of it */
 	vfio_link_dma(iommu, dma);
 
-	while (size) {
-		/* Pin a contiguous chunk of memory */
-		npage = vfio_pin_pages(vaddr + dma->size,
-				       size >> PAGE_SHIFT, prot, &pfn);
-		if (npage <= 0) {
-			WARN_ON(!npage);
-			ret = (int)npage;
-			break;
-		}
-
-		/* Map it! */
-		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
-		if (ret) {
-			vfio_unpin_pages(pfn, npage, prot, true);
-			break;
-		}
-
-		size -= npage << PAGE_SHIFT;
-		dma->size += npage << PAGE_SHIFT;
-	}
-
-	if (ret)
-		vfio_remove_dma(iommu, dma);
+	/* Don't pin and map if the container has no IOMMU capable domain */
+	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
+		dma->size = size;
+	else
+		ret = vfio_pin_map_dma(iommu, dma, size);
 
 	mutex_unlock(&iommu->lock);
 	return ret;
@@ -662,10 +1014,6 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
 	n = rb_first(&iommu->dma_list);
 
-	/* If there's not a domain, there better not be any mappings */
-	if (WARN_ON(n && !d))
-		return -EINVAL;
-
 	for (; n; n = rb_next(n)) {
 		struct vfio_dma *dma;
 		dma_addr_t iova;
@@ -674,20 +1022,43 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 		iova = dma->iova;
 
 		while (iova < dma->iova + dma->size) {
-			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
+			phys_addr_t phys;
 			size_t size;
 
-			if (WARN_ON(!phys)) {
-				iova += PAGE_SIZE;
-				continue;
-			}
+			if (dma->iommu_mapped) {
+				phys = iommu_iova_to_phys(d->domain, iova);
+
+				if (WARN_ON(!phys)) {
+					iova += PAGE_SIZE;
+					continue;
+				}
 
-			size = PAGE_SIZE;
+				size = PAGE_SIZE;
 
-			while (iova + size < dma->iova + dma->size &&
-			       phys + size == iommu_iova_to_phys(d->domain,
+				while (iova + size < dma->iova + dma->size &&
+				    phys + size == iommu_iova_to_phys(d->domain,
 								 iova + size))
-				size += PAGE_SIZE;
+					size += PAGE_SIZE;
+			} else {
+				unsigned long pfn;
+				unsigned long vaddr = dma->vaddr +
+						     (iova - dma->iova);
+				size_t n = dma->iova + dma->size - iova;
+				long npage;
+
+				npage = __vfio_pin_pages_remote(vaddr,
+								n >> PAGE_SHIFT,
+								dma->prot,
+								&pfn);
+				if (npage <= 0) {
+					WARN_ON(!npage);
+					ret = (int)npage;
+					return ret;
+				}
+
+				phys = pfn << PAGE_SHIFT;
+				size = npage << PAGE_SHIFT;
+			}
 
 			ret = iommu_map(domain->domain, iova, phys,
 					size, dma->prot | domain->prot);
@@ -696,6 +1067,11 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 
 			iova += size;
 		}
+
+		if (!dma->iommu_mapped) {
+			dma->iommu_mapped = true;
+			vfio_update_accounting(iommu, dma);
+		}
 	}
 
 	return 0;
@@ -734,11 +1110,24 @@  static void vfio_test_domain_fgsp(struct vfio_domain *domain)
 	__free_pages(pages, order);
 }
 
+/*
+ * Look up the vfio_group in @domain's group list that wraps @iommu_group.
+ *
+ * Returns the matching entry, or NULL if @iommu_group is not attached to
+ * @domain.
+ */
+static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
+				   struct iommu_group *iommu_group)
+{
+	struct vfio_group *entry;
+
+	list_for_each_entry(entry, &domain->group_list, next) {
+		if (entry->iommu_group != iommu_group)
+			continue;
+
+		return entry;
+	}
+
+	return NULL;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
 	struct vfio_iommu *iommu = iommu_data;
-	struct vfio_group *group, *g;
+	struct vfio_group *group;
 	struct vfio_domain *domain, *d;
 	struct bus_type *bus = NULL;
 	int ret;
@@ -746,10 +1135,14 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	mutex_lock(&iommu->lock);
 
 	list_for_each_entry(d, &iommu->domain_list, next) {
-		list_for_each_entry(g, &d->group_list, next) {
-			if (g->iommu_group != iommu_group)
-				continue;
+		if (find_iommu_group(d, iommu_group)) {
+			mutex_unlock(&iommu->lock);
+			return -EINVAL;
+		}
+	}
 
+	if (iommu->local_domain) {
+		if (find_iommu_group(iommu->local_domain, iommu_group)) {
 			mutex_unlock(&iommu->lock);
 			return -EINVAL;
 		}
@@ -769,6 +1162,34 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_free;
 
+	if (IS_ENABLED(CONFIG_VFIO_MDEV) && !iommu_present(bus) &&
+	    (bus == &mdev_bus_type)) {
+		if (iommu->local_domain) {
+			list_add(&group->next,
+				 &iommu->local_domain->group_list);
+			kfree(domain);
+			mutex_unlock(&iommu->lock);
+			return 0;
+		}
+
+		domain->local_addr_space =
+				      kzalloc(sizeof(*domain->local_addr_space),
+					      GFP_KERNEL);
+		if (!domain->local_addr_space) {
+			ret = -ENOMEM;
+			goto out_free;
+		}
+
+		domain->local_addr_space->task = current;
+		INIT_LIST_HEAD(&domain->group_list);
+		list_add(&group->next, &domain->group_list);
+		domain->local_addr_space->pfn_list = RB_ROOT;
+		mutex_init(&domain->local_addr_space->pfn_list_lock);
+		iommu->local_domain = domain;
+		mutex_unlock(&iommu->lock);
+		return 0;
+	}
+
 	domain->domain = iommu_domain_alloc(bus);
 	if (!domain->domain) {
 		ret = -EIO;
@@ -859,6 +1280,41 @@  static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
 }
 
+/*
+ * Tear down all IOMMU mappings while keeping the dma entries, then charge
+ * the pages still pinned through the local domain back to its task.
+ *
+ * Every vfio_dma in the container is passed to vfio_unmap_unpin(); after
+ * that, one page per entry in the local domain's pfn list is added to the
+ * locked-memory accounting of the local address space's task.
+ */
+static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
+{
+	struct vfio_domain *domain = iommu->local_domain;
+	struct vfio_dma *dma, *tdma;
+	struct rb_node *n;
+	long locked = 0;
+
+	rbtree_postorder_for_each_entry_safe(dma, tdma, &iommu->dma_list,
+					     node) {
+		vfio_unmap_unpin(iommu, dma);
+	}
+
+	mutex_lock(&domain->local_addr_space->pfn_list_lock);
+
+	/* Count the pages that remain pinned via the local path. */
+	n = rb_first(&domain->local_addr_space->pfn_list);
+	while (n) {
+		locked++;
+		n = rb_next(n);
+	}
+
+	vfio_lock_acct(domain->local_addr_space->task, locked);
+	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+}
+
+/*
+ * Unpin every page still tracked in @domain's pfn list, without touching
+ * the locked-memory accounting.
+ *
+ * vfio_unpin_pfn() drops one reference per call and only removes the entry
+ * once its count hits zero, so the first node is retried until the tree is
+ * empty.
+ */
+static void vfio_local_unpin_all(struct vfio_domain *domain)
+{
+	struct rb_node *node;
+
+	mutex_lock(&domain->local_addr_space->pfn_list_lock);
+
+	for (;;) {
+		node = rb_first(&domain->local_addr_space->pfn_list);
+		if (!node)
+			break;
+
+		vfio_unpin_pfn(domain,
+				rb_entry(node, struct vfio_pfn, node), false);
+	}
+
+	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
@@ -868,31 +1324,57 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 
 	mutex_lock(&iommu->lock);
 
-	list_for_each_entry(domain, &iommu->domain_list, next) {
-		list_for_each_entry(group, &domain->group_list, next) {
-			if (group->iommu_group != iommu_group)
-				continue;
-
-			iommu_detach_group(domain->domain, iommu_group);
+	if (iommu->local_domain) {
+		domain = iommu->local_domain;
+		group = find_iommu_group(domain, iommu_group);
+		if (group) {
 			list_del(&group->next);
 			kfree(group);
-			/*
-			 * Group ownership provides privilege, if the group
-			 * list is empty, the domain goes away.  If it's the
-			 * last domain, then all the mappings go away too.
-			 */
+
 			if (list_empty(&domain->group_list)) {
-				if (list_is_singular(&iommu->domain_list))
+				vfio_local_unpin_all(domain);
+				if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
 					vfio_iommu_unmap_unpin_all(iommu);
-				iommu_domain_free(domain->domain);
-				list_del(&domain->next);
 				kfree(domain);
+				iommu->local_domain = NULL;
+			}
+			goto detach_group_done;
+		}
+	}
+
+	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
+		goto detach_group_done;
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		group = find_iommu_group(domain, iommu_group);
+		if (!group)
+			continue;
+
+		iommu_detach_group(domain->domain, iommu_group);
+		list_del(&group->next);
+		kfree(group);
+		/*
+		 * Group ownership provides privilege; if the group list is
+		 * empty, the domain goes away. If it's the last IOMMU-backed
+		 * domain and no local domain exists, then all the mappings
+		 * go away too. If it's the last IOMMU-backed domain and a
+		 * local domain exists, update the accounting instead.
+		 */
+		if (list_empty(&domain->group_list)) {
+			if (list_is_singular(&iommu->domain_list)) {
+				if (!iommu->local_domain)
+					vfio_iommu_unmap_unpin_all(iommu);
+				else
+					vfio_iommu_unmap_unpin_reaccount(iommu);
 			}
-			goto done;
+			iommu_domain_free(domain->domain);
+			list_del(&domain->next);
+			kfree(domain);
 		}
+		break;
 	}
 
-done:
+detach_group_done:
 	mutex_unlock(&iommu->lock);
 }
 
@@ -924,27 +1406,48 @@  static void *vfio_iommu_type1_open(unsigned long arg)
 	return iommu;
 }
 
+/*
+ * Release everything hanging off @domain; the caller frees @domain itself.
+ *
+ * All groups are removed from the domain's group list and freed.  For an
+ * IOMMU-backed domain each group is detached first and the iommu_domain
+ * is freed afterwards.  For a local (mdev) domain any pages still pinned
+ * are unpinned and the local address space is freed.
+ */
+static void vfio_release_domain(struct vfio_domain *domain)
+{
+	struct vfio_group *group, *group_tmp;
+
+	list_for_each_entry_safe(group, group_tmp,
+				 &domain->group_list, next) {
+		/* A local domain has no iommu_domain to detach from. */
+		if (!domain->local_addr_space)
+			iommu_detach_group(domain->domain, group->iommu_group);
+		list_del(&group->next);
+		kfree(group);
+	}
+
+	if (domain->local_addr_space) {
+		vfio_local_unpin_all(domain);
+		/* Otherwise leaked once the caller kfree()s the domain. */
+		kfree(domain->local_addr_space);
+		domain->local_addr_space = NULL;
+	} else {
+		iommu_domain_free(domain->domain);
+	}
+}
+
 static void vfio_iommu_type1_release(void *iommu_data)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_domain *domain, *domain_tmp;
-	struct vfio_group *group, *group_tmp;
+
+	if (iommu->local_domain) {
+		vfio_release_domain(iommu->local_domain);
+		kfree(iommu->local_domain);
+		iommu->local_domain = NULL;
+	}
 
 	vfio_iommu_unmap_unpin_all(iommu);
 
+	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
+		goto release_exit;
+
 	list_for_each_entry_safe(domain, domain_tmp,
 				 &iommu->domain_list, next) {
-		list_for_each_entry_safe(group, group_tmp,
-					 &domain->group_list, next) {
-			iommu_detach_group(domain->domain, group->iommu_group);
-			list_del(&group->next);
-			kfree(group);
-		}
-		iommu_domain_free(domain->domain);
+		vfio_release_domain(domain);
 		list_del(&domain->next);
 		kfree(domain);
 	}
 
+release_exit:
 	kfree(iommu);
 }
 
@@ -1048,6 +1551,8 @@  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 	.ioctl		= vfio_iommu_type1_ioctl,
 	.attach_group	= vfio_iommu_type1_attach_group,
 	.detach_group	= vfio_iommu_type1_detach_group,
+	.pin_pages	= vfio_iommu_type1_pin_pages,
+	.unpin_pages	= vfio_iommu_type1_unpin_pages,
 };
 
 static int __init vfio_iommu_type1_init(void)
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0ecae0b1cd34..0bd25ba6223d 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -17,6 +17,7 @@ 
 #include <linux/workqueue.h>
 #include <linux/poll.h>
 #include <uapi/linux/vfio.h>
+#include <linux/mdev.h>
 
 /**
  * struct vfio_device_ops - VFIO bus driver device callbacks
@@ -75,7 +76,11 @@  struct vfio_iommu_driver_ops {
 					struct iommu_group *group);
 	void		(*detach_group)(void *iommu_data,
 					struct iommu_group *group);
-
+	long		(*pin_pages)(void *iommu_data, unsigned long *user_pfn,
+				     long npage, int prot,
+				     unsigned long *phys_pfn);
+	long		(*unpin_pages)(void *iommu_data, unsigned long *pfn,
+				       long npage);
 };
 
 extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
@@ -127,6 +132,12 @@  static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
 }
 #endif /* CONFIG_EEH */
 
+extern long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
+			   long npage, int prot, unsigned long *phys_pfn);
+
+extern long vfio_unpin_pages(struct device *dev, unsigned long *pfn,
+			     long npage);
+
 /*
  * IRQfd - generic
  */