
[v11,09/22] vfio iommu type1: Add task structure to vfio_dma

Message ID 1478293856-8191-10-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived

Commit Message

Kirti Wankhede Nov. 4, 2016, 9:10 p.m. UTC
Add a task structure pointer to vfio_dma.
Add an address space structure. Each vfio_dma structure points to the
address space of the task that mapped it.
The list of address spaces is maintained in the vfio_iommu structure.
On a DMA_MAP call, if the address space already exists in the address
space list, vfio_dma points to it. If the address space doesn't exist,
allocate a new address space, save the mm pointer in it, and point
vfio_dma to it.
Two tasks can share the same address space, so the address space
structure must be kept separate from the task in the vfio_dma structure.
vfio_dma keeps a pointer to its corresponding address space.
During DMA_UNMAP, the task that mapped the range, or any other task that
shares the same address space, is allowed to unmap it; otherwise the
unmap fails.
QEMU maps a few iova ranges initially, then spawns threads, and a child
thread calls DMA_UNMAP on a previously mapped iova. Since the child
shares the same address space, DMA_UNMAP succeeds.
This address space structure is used to track pages pinned by an
external user in later changes.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I7600f1bea6b384fd589fa72421ccf031bcfd9ac5
---
 drivers/vfio/vfio_iommu_type1.c | 182 +++++++++++++++++++++++++++++-----------
 1 file changed, 134 insertions(+), 48 deletions(-)
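
For illustration, a minimal userspace sketch of the thread-unmap scenario
described in the commit message above (a sketch under stated assumptions,
not code from this series: it assumes container_fd is a VFIO container
already set to a type1 IOMMU and attached to a group, and that buf is a
page-aligned mapping; container_fd, buf, BUF_SIZE and IOVA are
illustrative placeholders):

#include <pthread.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

#define BUF_SIZE	(2 * 1024 * 1024)
#define IOVA		0x100000000ULL

int container_fd;	/* VFIO container fd, set up elsewhere */
void *buf;		/* mmap'ed, page-aligned buffer, set up elsewhere */

static void *unmap_thread(void *arg)
{
	struct vfio_iommu_type1_dma_unmap unmap = {
		.argsz = sizeof(unmap),
		.iova  = IOVA,
		.size  = BUF_SIZE,
	};

	/* This thread shares the mapping task's mm, so the unmap is allowed. */
	return (void *)(long)ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
}

int map_then_unmap_from_thread(void)
{
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (__u64)(unsigned long)buf,
		.iova  = IOVA,
		.size  = BUF_SIZE,
	};
	pthread_t tid;
	void *ret;

	if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map))
		return -1;

	if (pthread_create(&tid, NULL, unmap_thread, NULL))
		return -1;
	pthread_join(tid, &ret);

	return (int)(long)ret;	/* 0 when the shared-mm unmap succeeds */
}

A task whose mm differs from the mapping task's would instead hit the new
dma->task->mm != current->mm check in vfio_dma_do_unmap() and the unmap
would fail.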

Comments

Alex Williamson Nov. 7, 2016, 9:03 p.m. UTC | #1
On Sat, 5 Nov 2016 02:40:43 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> Add a task structure pointer to vfio_dma.
> Add an address space structure. Each vfio_dma structure points to the
> address space of the task that mapped it.
> The list of address spaces is maintained in the vfio_iommu structure.
> On a DMA_MAP call, if the address space already exists in the address
> space list, vfio_dma points to it. If the address space doesn't exist,
> allocate a new address space, save the mm pointer in it, and point
> vfio_dma to it.
> Two tasks can share the same address space, so the address space
> structure must be kept separate from the task in the vfio_dma structure.
> vfio_dma keeps a pointer to its corresponding address space.
> During DMA_UNMAP, the task that mapped the range, or any other task that
> shares the same address space, is allowed to unmap it; otherwise the
> unmap fails.
> QEMU maps a few iova ranges initially, then spawns threads, and a child
> thread calls DMA_UNMAP on a previously mapped iova. Since the child
> shares the same address space, DMA_UNMAP succeeds.
> This address space structure is used to track pages pinned by an
> external user in later changes.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I7600f1bea6b384fd589fa72421ccf031bcfd9ac5
> ---
>  drivers/vfio/vfio_iommu_type1.c | 182 +++++++++++++++++++++++++++++-----------
>  1 file changed, 134 insertions(+), 48 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 422c8d198abb..8d64528dcc22 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -55,12 +55,20 @@ MODULE_PARM_DESC(disable_hugepages,
>  
>  struct vfio_iommu {
>  	struct list_head	domain_list;
> +	struct list_head	addr_space_list;
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
>  	bool			v2;
>  	bool			nesting;
>  };
>  
> +/* address space */
> +struct vfio_addr_space {
> +	struct mm_struct	*mm;
> +	struct list_head	next;
> +	atomic_t		ref_count;
> +};
> +
>  struct vfio_domain {
>  	struct iommu_domain	*domain;
>  	struct list_head	next;
> @@ -75,6 +83,9 @@ struct vfio_dma {
>  	unsigned long		vaddr;		/* Process virtual addr */
>  	size_t			size;		/* Map size (bytes) */
>  	int			prot;		/* IOMMU_READ/WRITE */
> +	struct vfio_addr_space	*addr_space;
> +	struct task_struct	*task;
> +	bool			mlock_cap;
>  };
>  
>  struct vfio_group {
> @@ -130,6 +141,18 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
> +						    struct mm_struct *mm)
> +{
> +	struct vfio_addr_space *as;
> +
> +	list_for_each_entry(as, &iommu->addr_space_list, next) {
> +		if (as->mm == mm)
> +			return as;
> +	}
> +	return NULL;
> +}
> +
>  struct vwork {
>  	struct mm_struct	*mm;
>  	long			npage;
> @@ -273,24 +296,24 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>   * first page and all consecutive pages with the same locking.
>   */
> -static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
> -				    int prot, unsigned long *pfn_base)
> +static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> +				    long npage, int prot,
> +				    unsigned long *pfn_base)
>  {
> -	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> -	bool lock_cap = capable(CAP_IPC_LOCK);
> +	struct task_struct *task = dma->task;
> +	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	bool lock_cap = dma->mlock_cap;
> +	struct mm_struct *mm = dma->addr_space->mm;
>  	long ret, i;
>  	bool rsvd;
>  
> -	if (!current->mm)
> -		return -ENODEV;
> -
> -	ret = vaddr_get_pfn(current->mm, vaddr, prot, pfn_base);
> +	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
>  	if (ret)
>  		return ret;
>  
>  	rsvd = is_invalid_reserved_pfn(*pfn_base);
>  
> -	if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
> +	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
>  		put_pfn(*pfn_base, prot);
>  		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
>  			limit << PAGE_SHIFT);
> @@ -299,7 +322,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
>  
>  	if (unlikely(disable_hugepages)) {
>  		if (!rsvd)
> -			vfio_lock_acct(current->mm, 1);
> +			vfio_lock_acct(mm, 1);
>  		return 1;
>  	}
>  
> @@ -307,7 +330,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
>  	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
>  		unsigned long pfn = 0;
>  
> -		ret = vaddr_get_pfn(current->mm, vaddr, prot, &pfn);
> +		ret = vaddr_get_pfn(mm, vaddr, prot, &pfn);
>  		if (ret)
>  			break;
>  
> @@ -318,7 +341,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
>  		}
>  
>  		if (!rsvd && !lock_cap &&
> -		    current->mm->locked_vm + i + 1 > limit) {
> +		    mm->locked_vm + i + 1 > limit) {
>  			put_pfn(pfn, prot);
>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>  				__func__, limit << PAGE_SHIFT);
> @@ -327,13 +350,13 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
>  	}
>  
>  	if (!rsvd)
> -		vfio_lock_acct(current->mm, i);
> +		vfio_lock_acct(mm, i);
>  
>  	return i;
>  }
>  
> -static long __vfio_unpin_pages_remote(unsigned long pfn, long npage,
> -				      int prot, bool do_accounting)
> +static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
> +				      long npage, int prot, bool do_accounting)
>  {
>  	unsigned long unlocked = 0;
>  	long i;
> @@ -342,7 +365,7 @@ static long __vfio_unpin_pages_remote(unsigned long pfn, long npage,
>  		unlocked += put_pfn(pfn++, prot);
>  
>  	if (do_accounting)
> -		vfio_lock_acct(current->mm, -unlocked);
> +		vfio_lock_acct(dma->addr_space->mm, -unlocked);
>  
>  	return unlocked;
>  }
> @@ -396,7 +419,7 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  		if (WARN_ON(!unmapped))
>  			break;
>  
> -		unlocked += __vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
> +		unlocked += __vfio_unpin_pages_remote(dma, phys >> PAGE_SHIFT,
>  						      unmapped >> PAGE_SHIFT,
>  						      dma->prot, false);
>  		iova += unmapped;
> @@ -404,13 +427,20 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  		cond_resched();
>  	}
>  
> -	vfio_lock_acct(current->mm, -unlocked);
> +	vfio_lock_acct(dma->addr_space->mm, -unlocked);
>  }
>  
>  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  {
>  	vfio_unmap_unpin(iommu, dma);
>  	vfio_unlink_dma(iommu, dma);
> +
> +	if (atomic_dec_and_test(&dma->addr_space->ref_count)) {
> +		mmput(dma->addr_space->mm);
> +		put_task_struct(dma->task);
> +		list_del(&dma->addr_space->next);
> +		kfree(dma->addr_space);
> +	}
>  	kfree(dma);
>  }
>  
> @@ -506,6 +536,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
>  		if (!iommu->v2 && unmap->iova > dma->iova)
>  			break;
> +		/*
> +		 * Task with same address space who mapped this iova range is
> +		 * allowed to unmap the iova range.
> +		 */
> +		if (dma->task->mm != current->mm)
> +			break;
>  		unmapped += dma->size;
>  		vfio_remove_dma(iommu, dma);
>  	}
> @@ -572,17 +608,58 @@ unwind:
>  	return ret;
>  }
>  
> +static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> +			    size_t map_size)
> +{
> +	dma_addr_t iova = dma->iova;
> +	unsigned long vaddr = dma->vaddr;
> +	size_t size = map_size;
> +	long npage;
> +	unsigned long pfn;
> +	int ret = 0;
> +
> +	while (size) {
> +		/* Pin a contiguous chunk of memory */
> +		npage = __vfio_pin_pages_remote(dma, vaddr + dma->size,
> +						size >> PAGE_SHIFT, dma->prot,
> +						&pfn);
> +		if (npage <= 0) {
> +			WARN_ON(!npage);
> +			ret = (int)npage;
> +			break;
> +		}
> +
> +		/* Map it! */
> +		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
> +				     dma->prot);
> +		if (ret) {
> +			__vfio_unpin_pages_remote(dma, pfn, npage, dma->prot,
> +						  true);
> +			break;
> +		}
> +
> +		size -= npage << PAGE_SHIFT;
> +		dma->size += npage << PAGE_SHIFT;
> +	}
> +
> +	if (ret)
> +		vfio_remove_dma(iommu, dma);
> +
> +	return ret;
> +}
> +
>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  			   struct vfio_iommu_type1_dma_map *map)
>  {
>  	dma_addr_t iova = map->iova;
>  	unsigned long vaddr = map->vaddr;
>  	size_t size = map->size;
> -	long npage;
>  	int ret = 0, prot = 0;
>  	uint64_t mask;
>  	struct vfio_dma *dma;
> -	unsigned long pfn;
> +	struct vfio_addr_space *addr_space;
> +	struct mm_struct *mm;
> +	bool free_addr_space_on_err = false;
>  
>  	/* Verify that none of our __u64 fields overflow */
>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> @@ -608,47 +685,56 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	mutex_lock(&iommu->lock);
>  
>  	if (vfio_find_dma(iommu, iova, size)) {
> -		mutex_unlock(&iommu->lock);
> -		return -EEXIST;
> +		ret = -EEXIST;
> +		goto do_map_err;
> +	}
> +
> +	mm = get_task_mm(current);
> +	if (!mm) {
> +		ret = -ENODEV;

-EFAULT?

> +		goto do_map_err;
> +	}
> +
> +	addr_space = vfio_find_addr_space(iommu, mm);
> +	if (addr_space) {
> +		atomic_inc(&addr_space->ref_count);
> +		mmput(mm);
> +	} else {
> +		addr_space = kzalloc(sizeof(*addr_space), GFP_KERNEL);
> +		if (!addr_space) {
> +			ret = -ENOMEM;
> +			goto do_map_err;
> +		}
> +		addr_space->mm = mm;
> +		atomic_set(&addr_space->ref_count, 1);
> +		list_add(&addr_space->next, &iommu->addr_space_list);
> +		free_addr_space_on_err = true;
>  	}
>  
>  	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
>  	if (!dma) {
> -		mutex_unlock(&iommu->lock);
> -		return -ENOMEM;
> +		if (free_addr_space_on_err) {
> +			mmput(mm);
> +			list_del(&addr_space->next);
> +			kfree(addr_space);
> +		}
> +		ret = -ENOMEM;
> +		goto do_map_err;
>  	}
>  
>  	dma->iova = iova;
>  	dma->vaddr = vaddr;
>  	dma->prot = prot;
> +	dma->addr_space = addr_space;
> +	get_task_struct(current);
> +	dma->task = current;
> +	dma->mlock_cap = capable(CAP_IPC_LOCK);


How do you reason we can cache this?  Does the fact that the process
had this capability at the time that it did a DMA_MAP imply that it
necessarily still has this capability when an external user (vendor
driver) tries to pin pages?  I don't see how we can make that
assumption.


>  
>  	/* Insert zero-sized and grow as we map chunks of it */
>  	vfio_link_dma(iommu, dma);
>  
> -	while (size) {
> -		/* Pin a contiguous chunk of memory */
> -		npage = __vfio_pin_pages_remote(vaddr + dma->size,
> -						size >> PAGE_SHIFT, prot, &pfn);
> -		if (npage <= 0) {
> -			WARN_ON(!npage);
> -			ret = (int)npage;
> -			break;
> -		}
> -
> -		/* Map it! */
> -		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
> -		if (ret) {
> -			__vfio_unpin_pages_remote(pfn, npage, prot, true);
> -			break;
> -		}
> -
> -		size -= npage << PAGE_SHIFT;
> -		dma->size += npage << PAGE_SHIFT;
> -	}
> -
> -	if (ret)
> -		vfio_remove_dma(iommu, dma);
> -
> +	ret = vfio_pin_map_dma(iommu, dma, size);
> +do_map_err:
>  	mutex_unlock(&iommu->lock);
>  	return ret;
>  }

Kirti Wankhede Nov. 8, 2016, 2:13 p.m. UTC | #2
On 11/8/2016 2:33 AM, Alex Williamson wrote:
> On Sat, 5 Nov 2016 02:40:43 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 

...

>>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>  			   struct vfio_iommu_type1_dma_map *map)
>>  {
>>  	dma_addr_t iova = map->iova;
>>  	unsigned long vaddr = map->vaddr;
>>  	size_t size = map->size;
>> -	long npage;
>>  	int ret = 0, prot = 0;
>>  	uint64_t mask;
>>  	struct vfio_dma *dma;
>> -	unsigned long pfn;
>> +	struct vfio_addr_space *addr_space;
>> +	struct mm_struct *mm;
>> +	bool free_addr_space_on_err = false;
>>  
>>  	/* Verify that none of our __u64 fields overflow */
>>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
>> @@ -608,47 +685,56 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>  	mutex_lock(&iommu->lock);
>>  
>>  	if (vfio_find_dma(iommu, iova, size)) {
>> -		mutex_unlock(&iommu->lock);
>> -		return -EEXIST;
>> +		ret = -EEXIST;
>> +		goto do_map_err;
>> +	}
>> +
>> +	mm = get_task_mm(current);
>> +	if (!mm) {
>> +		ret = -ENODEV;
> 
> -EFAULT?
>

The -ENODEV return comes from the original code in vfio_pin_pages():
        if (!current->mm)
                return -ENODEV;

At one point I thought of changing it to -EFAULT, but then went back to
-ENODEV to be consistent with the original error code.

Should I still change this return to -EFAULT?


>> +		goto do_map_err;
>> +	}
>> +
>> +	addr_space = vfio_find_addr_space(iommu, mm);
>> +	if (addr_space) {
>> +		atomic_inc(&addr_space->ref_count);
>> +		mmput(mm);
>> +	} else {
>> +		addr_space = kzalloc(sizeof(*addr_space), GFP_KERNEL);
>> +		if (!addr_space) {
>> +			ret = -ENOMEM;
>> +			goto do_map_err;
>> +		}
>> +		addr_space->mm = mm;
>> +		atomic_set(&addr_space->ref_count, 1);
>> +		list_add(&addr_space->next, &iommu->addr_space_list);
>> +		free_addr_space_on_err = true;
>>  	}
>>  
>>  	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
>>  	if (!dma) {
>> -		mutex_unlock(&iommu->lock);
>> -		return -ENOMEM;
>> +		if (free_addr_space_on_err) {
>> +			mmput(mm);
>> +			list_del(&addr_space->next);
>> +			kfree(addr_space);
>> +		}
>> +		ret = -ENOMEM;
>> +		goto do_map_err;
>>  	}
>>  
>>  	dma->iova = iova;
>>  	dma->vaddr = vaddr;
>>  	dma->prot = prot;
>> +	dma->addr_space = addr_space;
>> +	get_task_struct(current);
>> +	dma->task = current;
>> +	dma->mlock_cap = capable(CAP_IPC_LOCK);
> 
> 
> How do you reason we can cache this?  Does the fact that the process
> had this capability at the time that it did a DMA_MAP imply that it
> necessarily still has this capability when an external user (vendor
> driver) tries to pin pages?  I don't see how we can make that
> assumption.
> 
> 

Will a process change its MEMLOCK limit at runtime? I think it shouldn't;
correct me if I'm wrong. QEMU doesn't do that, right?

The function capable() checks the current task's capabilities. But
vfio_pin_pages() can be called from a different task, while pages are
pinned from the address space of the task that mapped them. So we can't
use capable() in vfio_pin_pages().

If this capability shouldn't be cached, we have to use has_capability()
with dma->task as the argument in vfio_pin_pages():

 bool has_capability(struct task_struct *t, int cap)

Thanks,
Kirti
Alex Williamson Nov. 8, 2016, 4:43 p.m. UTC | #3
On Tue, 8 Nov 2016 19:43:25 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 11/8/2016 2:33 AM, Alex Williamson wrote:
> > On Sat, 5 Nov 2016 02:40:43 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> 
> ...
> 
> >>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>  			   struct vfio_iommu_type1_dma_map *map)
> >>  {
> >>  	dma_addr_t iova = map->iova;
> >>  	unsigned long vaddr = map->vaddr;
> >>  	size_t size = map->size;
> >> -	long npage;
> >>  	int ret = 0, prot = 0;
> >>  	uint64_t mask;
> >>  	struct vfio_dma *dma;
> >> -	unsigned long pfn;
> >> +	struct vfio_addr_space *addr_space;
> >> +	struct mm_struct *mm;
> >> +	bool free_addr_space_on_err = false;
> >>  
> >>  	/* Verify that none of our __u64 fields overflow */
> >>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> >> @@ -608,47 +685,56 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>  	mutex_lock(&iommu->lock);
> >>  
> >>  	if (vfio_find_dma(iommu, iova, size)) {
> >> -		mutex_unlock(&iommu->lock);
> >> -		return -EEXIST;
> >> +		ret = -EEXIST;
> >> +		goto do_map_err;
> >> +	}
> >> +
> >> +	mm = get_task_mm(current);
> >> +	if (!mm) {
> >> +		ret = -ENODEV;  
> > 
> > -EFAULT?
> >  
> 
> -ENODEV return is in original code from vfio_pin_pages()
>         if (!current->mm)
>                 return -ENODEV;
> 
> Once I thought of changing it to -EFAULT, but then again changed to
> -ENODEV to be consistent with original error code.
> 
> Should I still change this return to -EFAULT?

Let's keep ENODEV for less code churn, I guess.
 
> >> +		goto do_map_err;
> >> +	}
> >> +
> >> +	addr_space = vfio_find_addr_space(iommu, mm);
> >> +	if (addr_space) {
> >> +		atomic_inc(&addr_space->ref_count);
> >> +		mmput(mm);
> >> +	} else {
> >> +		addr_space = kzalloc(sizeof(*addr_space), GFP_KERNEL);
> >> +		if (!addr_space) {
> >> +			ret = -ENOMEM;
> >> +			goto do_map_err;
> >> +		}
> >> +		addr_space->mm = mm;
> >> +		atomic_set(&addr_space->ref_count, 1);
> >> +		list_add(&addr_space->next, &iommu->addr_space_list);
> >> +		free_addr_space_on_err = true;
> >>  	}
> >>  
> >>  	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
> >>  	if (!dma) {
> >> -		mutex_unlock(&iommu->lock);
> >> -		return -ENOMEM;
> >> +		if (free_addr_space_on_err) {
> >> +			mmput(mm);
> >> +			list_del(&addr_space->next);
> >> +			kfree(addr_space);
> >> +		}
> >> +		ret = -ENOMEM;
> >> +		goto do_map_err;
> >>  	}
> >>  
> >>  	dma->iova = iova;
> >>  	dma->vaddr = vaddr;
> >>  	dma->prot = prot;
> >> +	dma->addr_space = addr_space;
> >> +	get_task_struct(current);
> >> +	dma->task = current;
> >> +	dma->mlock_cap = capable(CAP_IPC_LOCK);  
> > 
> > 
> > How do you reason we can cache this?  Does the fact that the process
> > had this capability at the time that it did a DMA_MAP imply that it
> > necessarily still has this capability when an external user (vendor
> > driver) tries to pin pages?  I don't see how we can make that
> > assumption.
> > 
> >   
> 
> Will a process change its MEMLOCK limit at runtime? I think it shouldn't;
> correct me if I'm wrong. QEMU doesn't do that, right?

What QEMU does or doesn't do isn't relevant; the question is whether a
process could change CAP_IPC_LOCK at runtime.  It seems plausible to me.

> The function capable() checks the current task's capabilities. But
> vfio_pin_pages() can be called from a different task, while pages are
> pinned from the address space of the task that mapped them. So we can't
> use capable() in vfio_pin_pages().
> 
> If this capability shouldn't be cached, we have to use has_capability()
> with dma->task as the argument in vfio_pin_pages():
> 
>  bool has_capability(struct task_struct *t, int cap)

Yep, that sounds better.  Thanks,

Alex
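
For reference, a minimal sketch of the agreed direction (an illustration
against this patch, not the hunk that was actually merged): drop the
cached mlock_cap and re-evaluate the mapping task's capability at pin
time, e.g. in __vfio_pin_pages_remote():

 {
 	struct task_struct *task = dma->task;
 	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	bool lock_cap = dma->mlock_cap;
+	/* check the capability of the task that mapped this range */
+	bool lock_cap = has_capability(task, CAP_IPC_LOCK);
 	struct mm_struct *mm = dma->addr_space->mm;

The mlock_cap field in struct vfio_dma and its initialization in
vfio_dma_do_map() would then go away.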

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 422c8d198abb..8d64528dcc22 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -55,12 +55,20 @@  MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
 	struct list_head	domain_list;
+	struct list_head	addr_space_list;
 	struct mutex		lock;
 	struct rb_root		dma_list;
 	bool			v2;
 	bool			nesting;
 };
 
+/* address space */
+struct vfio_addr_space {
+	struct mm_struct	*mm;
+	struct list_head	next;
+	atomic_t		ref_count;
+};
+
 struct vfio_domain {
 	struct iommu_domain	*domain;
 	struct list_head	next;
@@ -75,6 +83,9 @@  struct vfio_dma {
 	unsigned long		vaddr;		/* Process virtual addr */
 	size_t			size;		/* Map size (bytes) */
 	int			prot;		/* IOMMU_READ/WRITE */
+	struct vfio_addr_space	*addr_space;
+	struct task_struct	*task;
+	bool			mlock_cap;
 };
 
 struct vfio_group {
@@ -130,6 +141,18 @@  static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 	rb_erase(&old->node, &iommu->dma_list);
 }
 
+static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
+						    struct mm_struct *mm)
+{
+	struct vfio_addr_space *as;
+
+	list_for_each_entry(as, &iommu->addr_space_list, next) {
+		if (as->mm == mm)
+			return as;
+	}
+	return NULL;
+}
+
 struct vwork {
 	struct mm_struct	*mm;
 	long			npage;
@@ -273,24 +296,24 @@  static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
  * the iommu can only map chunks of consecutive pfns anyway, so get the
  * first page and all consecutive pages with the same locking.
  */
-static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
-				    int prot, unsigned long *pfn_base)
+static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
+				    long npage, int prot,
+				    unsigned long *pfn_base)
 {
-	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	bool lock_cap = capable(CAP_IPC_LOCK);
+	struct task_struct *task = dma->task;
+	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	bool lock_cap = dma->mlock_cap;
+	struct mm_struct *mm = dma->addr_space->mm;
 	long ret, i;
 	bool rsvd;
 
-	if (!current->mm)
-		return -ENODEV;
-
-	ret = vaddr_get_pfn(current->mm, vaddr, prot, pfn_base);
+	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
 	if (ret)
 		return ret;
 
 	rsvd = is_invalid_reserved_pfn(*pfn_base);
 
-	if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
+	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
 		put_pfn(*pfn_base, prot);
 		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 			limit << PAGE_SHIFT);
@@ -299,7 +322,7 @@  static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
 
 	if (unlikely(disable_hugepages)) {
 		if (!rsvd)
-			vfio_lock_acct(current->mm, 1);
+			vfio_lock_acct(mm, 1);
 		return 1;
 	}
 
@@ -307,7 +330,7 @@  static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
 	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 		unsigned long pfn = 0;
 
-		ret = vaddr_get_pfn(current->mm, vaddr, prot, &pfn);
+		ret = vaddr_get_pfn(mm, vaddr, prot, &pfn);
 		if (ret)
 			break;
 
@@ -318,7 +341,7 @@  static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
 		}
 
 		if (!rsvd && !lock_cap &&
-		    current->mm->locked_vm + i + 1 > limit) {
+		    mm->locked_vm + i + 1 > limit) {
 			put_pfn(pfn, prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 				__func__, limit << PAGE_SHIFT);
@@ -327,13 +350,13 @@  static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
 	}
 
 	if (!rsvd)
-		vfio_lock_acct(current->mm, i);
+		vfio_lock_acct(mm, i);
 
 	return i;
 }
 
-static long __vfio_unpin_pages_remote(unsigned long pfn, long npage,
-				      int prot, bool do_accounting)
+static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
+				      long npage, int prot, bool do_accounting)
 {
 	unsigned long unlocked = 0;
 	long i;
@@ -342,7 +365,7 @@  static long __vfio_unpin_pages_remote(unsigned long pfn, long npage,
 		unlocked += put_pfn(pfn++, prot);
 
 	if (do_accounting)
-		vfio_lock_acct(current->mm, -unlocked);
+		vfio_lock_acct(dma->addr_space->mm, -unlocked);
 
 	return unlocked;
 }
@@ -396,7 +419,7 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		if (WARN_ON(!unmapped))
 			break;
 
-		unlocked += __vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
+		unlocked += __vfio_unpin_pages_remote(dma, phys >> PAGE_SHIFT,
 						      unmapped >> PAGE_SHIFT,
 						      dma->prot, false);
 		iova += unmapped;
@@ -404,13 +427,20 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		cond_resched();
 	}
 
-	vfio_lock_acct(current->mm, -unlocked);
+	vfio_lock_acct(dma->addr_space->mm, -unlocked);
 }
 
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
 	vfio_unmap_unpin(iommu, dma);
 	vfio_unlink_dma(iommu, dma);
+
+	if (atomic_dec_and_test(&dma->addr_space->ref_count)) {
+		mmput(dma->addr_space->mm);
+		put_task_struct(dma->task);
+		list_del(&dma->addr_space->next);
+		kfree(dma->addr_space);
+	}
 	kfree(dma);
 }
 
@@ -506,6 +536,12 @@  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
 		if (!iommu->v2 && unmap->iova > dma->iova)
 			break;
+		/*
+		 * Task with same address space who mapped this iova range is
+		 * allowed to unmap the iova range.
+		 */
+		if (dma->task->mm != current->mm)
+			break;
 		unmapped += dma->size;
 		vfio_remove_dma(iommu, dma);
 	}
@@ -572,17 +608,58 @@  unwind:
 	return ret;
 }
 
+static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			    size_t map_size)
+{
+	dma_addr_t iova = dma->iova;
+	unsigned long vaddr = dma->vaddr;
+	size_t size = map_size;
+	long npage;
+	unsigned long pfn;
+	int ret = 0;
+
+	while (size) {
+		/* Pin a contiguous chunk of memory */
+		npage = __vfio_pin_pages_remote(dma, vaddr + dma->size,
+						size >> PAGE_SHIFT, dma->prot,
+						&pfn);
+		if (npage <= 0) {
+			WARN_ON(!npage);
+			ret = (int)npage;
+			break;
+		}
+
+		/* Map it! */
+		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
+				     dma->prot);
+		if (ret) {
+			__vfio_unpin_pages_remote(dma, pfn, npage, dma->prot,
+						  true);
+			break;
+		}
+
+		size -= npage << PAGE_SHIFT;
+		dma->size += npage << PAGE_SHIFT;
+	}
+
+	if (ret)
+		vfio_remove_dma(iommu, dma);
+
+	return ret;
+}
+
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
 			   struct vfio_iommu_type1_dma_map *map)
 {
 	dma_addr_t iova = map->iova;
 	unsigned long vaddr = map->vaddr;
 	size_t size = map->size;
-	long npage;
 	int ret = 0, prot = 0;
 	uint64_t mask;
 	struct vfio_dma *dma;
-	unsigned long pfn;
+	struct vfio_addr_space *addr_space;
+	struct mm_struct *mm;
+	bool free_addr_space_on_err = false;
 
 	/* Verify that none of our __u64 fields overflow */
 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
@@ -608,47 +685,56 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	mutex_lock(&iommu->lock);
 
 	if (vfio_find_dma(iommu, iova, size)) {
-		mutex_unlock(&iommu->lock);
-		return -EEXIST;
+		ret = -EEXIST;
+		goto do_map_err;
+	}
+
+	mm = get_task_mm(current);
+	if (!mm) {
+		ret = -ENODEV;
+		goto do_map_err;
+	}
+
+	addr_space = vfio_find_addr_space(iommu, mm);
+	if (addr_space) {
+		atomic_inc(&addr_space->ref_count);
+		mmput(mm);
+	} else {
+		addr_space = kzalloc(sizeof(*addr_space), GFP_KERNEL);
+		if (!addr_space) {
+			ret = -ENOMEM;
+			goto do_map_err;
+		}
+		addr_space->mm = mm;
+		atomic_set(&addr_space->ref_count, 1);
+		list_add(&addr_space->next, &iommu->addr_space_list);
+		free_addr_space_on_err = true;
 	}
 
 	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
 	if (!dma) {
-		mutex_unlock(&iommu->lock);
-		return -ENOMEM;
+		if (free_addr_space_on_err) {
+			mmput(mm);
+			list_del(&addr_space->next);
+			kfree(addr_space);
+		}
+		ret = -ENOMEM;
+		goto do_map_err;
 	}
 
 	dma->iova = iova;
 	dma->vaddr = vaddr;
 	dma->prot = prot;
+	dma->addr_space = addr_space;
+	get_task_struct(current);
+	dma->task = current;
+	dma->mlock_cap = capable(CAP_IPC_LOCK);
 
 	/* Insert zero-sized and grow as we map chunks of it */
 	vfio_link_dma(iommu, dma);
 
-	while (size) {
-		/* Pin a contiguous chunk of memory */
-		npage = __vfio_pin_pages_remote(vaddr + dma->size,
-						size >> PAGE_SHIFT, prot, &pfn);
-		if (npage <= 0) {
-			WARN_ON(!npage);
-			ret = (int)npage;
-			break;
-		}
-
-		/* Map it! */
-		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
-		if (ret) {
-			__vfio_unpin_pages_remote(pfn, npage, prot, true);
-			break;
-		}
-
-		size -= npage << PAGE_SHIFT;
-		dma->size += npage << PAGE_SHIFT;
-	}
-
-	if (ret)
-		vfio_remove_dma(iommu, dma);
-
+	ret = vfio_pin_map_dma(iommu, dma, size);
+do_map_err:
 	mutex_unlock(&iommu->lock);
 	return ret;
 }