diff mbox series

[v16,Kernel,4/7] vfio iommu: Implementation of ioctl for dirty pages tracking.

Message ID 1585084732-18473-1-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived
Headers show
Series None | expand

Commit Message

Kirti Wankhede March 24, 2020, 9:18 p.m. UTC
VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. It is the user space application's responsibility to
  copy the content of dirty pages from source to destination during migration.

To prevent a DoS attack, memory for the bitmap is allocated per vfio_dma
structure. The bitmap size is calculated using the smallest supported page
size. A bitmap is allocated for all vfio_dmas when dirty logging is enabled.

The bitmap is populated for already pinned pages when the bitmap is allocated
for a vfio_dma, using the smallest supported page size. The bitmap is updated
from the pinning functions when tracking is enabled. When the user application
queries the bitmap, check whether the requested page size is the same as the
page size used to populate the bitmap. If it is equal, copy the bitmap; if it
is not equal, return an error.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
---
 drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 260 insertions(+), 6 deletions(-)

Comments

Yan Zhao March 25, 2020, 2:11 a.m. UTC | #1
On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:
> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> - Start dirty pages tracking while migration is active
> - Stop dirty pages tracking.
> - Get dirty pages bitmap. Its user space application's responsibility to
>   copy content of dirty pages from source to destination during migration.
> 
> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> structure. Bitmap size is calculated considering smallest supported page
> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> 
> Bitmap is populated for already pinned pages when bitmap is allocated for
> a vfio_dma with the smallest supported page size. Update bitmap from
> pinning functions when tracking is enabled. When user application queries
> bitmap, check if requested page size is same as page size used to
> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> error.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 260 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 70aeab921d0f..874a1a7ae925 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -71,6 +71,7 @@ struct vfio_iommu {
>  	unsigned int		dma_avail;
>  	bool			v2;
>  	bool			nesting;
> +	bool			dirty_page_tracking;
>  };
>  
>  struct vfio_domain {
> @@ -91,6 +92,7 @@ struct vfio_dma {
>  	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
>  	struct task_struct	*task;
>  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> +	unsigned long		*bitmap;
>  };
>  
>  struct vfio_group {
> @@ -125,7 +127,21 @@ struct vfio_regions {
>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>  					(!list_empty(&iommu->domain_list))
>  
> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> +
> +/*
> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> + * further casts to signed integer for unaligned multi-bit operation,
> + * __bitmap_set().
> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> + * system.
> + */
> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> +
>  static int put_pfn(unsigned long pfn, int prot);
> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
>  
>  /*
>   * This code handles mapping and unmapping of user data buffers
> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +
> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> +{
> +	uint64_t npages = dma->size / pgsize;
> +
> +	if (npages > DIRTY_BITMAP_PAGES_MAX)
> +		return -EINVAL;
> +
> +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> +	if (!dma->bitmap)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static void vfio_dma_bitmap_free(struct vfio_dma *dma)
> +{
> +	kfree(dma->bitmap);
> +	dma->bitmap = NULL;
> +}
> +
> +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
> +{
> +	struct rb_node *p;
> +
> +	if (RB_EMPTY_ROOT(&dma->pfn_list))
> +		return;
> +
> +	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> +		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
> +
> +		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
> +	}
> +}
> +
> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
> +{
> +	struct rb_node *n = rb_first(&iommu->dma_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> +		int ret;
> +
> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> +		if (ret) {
> +			struct rb_node *p = rb_prev(n);
> +
> +			for (; p; p = rb_prev(p)) {
> +				struct vfio_dma *dma = rb_entry(n,
> +							struct vfio_dma, node);
> +
> +				vfio_dma_bitmap_free(dma);
> +			}
> +			return ret;
> +		}
> +		vfio_dma_populate_bitmap(dma, pgsize);
> +	}
> +	return 0;
> +}
> +
> +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
> +{
> +	struct rb_node *n = rb_first(&iommu->dma_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> +
> +		vfio_dma_bitmap_free(dma);
> +	}
> +}
> +
>  /*
>   * Helper Functions for host iova-pfn list
>   */
> @@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  			vfio_unpin_page_external(dma, iova, do_accounting);
>  			goto pin_unwind;
>  		}
> +
> +		if (iommu->dirty_page_tracking) {
> +			unsigned long pgshift =
> +					 __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			/*
> +			 * Bitmap populated with the smallest supported page
> +			 * size
> +			 */
> +			bitmap_set(dma->bitmap,
> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> +		}
>  	}
>  
>  	ret = i;
> @@ -801,6 +900,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  	vfio_unmap_unpin(iommu, dma, true);
>  	vfio_unlink_dma(iommu, dma);
>  	put_task_struct(dma->task);
> +	vfio_dma_bitmap_free(dma);
>  	kfree(dma);
>  	iommu->dma_avail++;
>  }
> @@ -831,6 +931,57 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>  	return bitmap;
>  }
>  
> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> +				  size_t size, uint64_t pgsize,
> +				  u64 __user *bitmap)
> +{
> +	struct vfio_dma *dma;
> +	unsigned long pgshift = __ffs(pgsize);
> +	unsigned int npages, bitmap_size;
> +
> +	dma = vfio_find_dma(iommu, iova, 1);
> +
> +	if (!dma)
> +		return -EINVAL;
> +
> +	if (dma->iova != iova || dma->size != size)
> +		return -EINVAL;
> +
I'm still not sure this is good practice.
I looked at the QEMU implementation.
QEMU just iterates over the whole IOVA address space,
and it needs to find an IOTLB entry for each IOVA:
(1) if it can find an IOTLB entry for an IOVA, it does the DIRTY_PAGES ioctl and
increments the IOVA by (iotlb.addr_mask + 1).

(2) if no existing IOTLB entry is found, imrc->translate needs to search the shadow
page table to try to generate one.
If that still fails (the most probable case, as the IOMMU only maps a small part of its
address space), it increments the IOVA by 1 page.

So, if the address space width is 39 bits and there is only one page
mapped, you still have to translate IOVAs around 2^27 times in each
query. Isn't that too inefficient?

So, IMHO, why could we not just keep an rb tree specifically for dirty pages, and then
generate a bitmap for each query?

> +	npages = dma->size >> pgshift;
> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> +
> +	/* mark all pages dirty if all pages are pinned and mapped. */
> +	if (dma->iommu_mapped)
> +		bitmap_set(dma->bitmap, 0, npages);
> +
> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> +		return -EFAULT;
> +
> +	/*
> +	 * Re-populate bitmap to include all pinned pages which are considered
> +	 * as dirty but exclude pages which are unpinned and pages which are
> +	 * marked dirty by vfio_dma_rw()
> +	 */
> +	bitmap_clear(dma->bitmap, 0, npages);
> +	vfio_dma_populate_bitmap(dma, pgsize);
Will this also repopulate the bitmap for pinned pages set by pass-through devices in
patch 07?


> +	return 0;
> +}
> +
> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> +{
> +	uint64_t bsize;
> +
> +	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX))
> +		return -EINVAL;
> +
> +	bsize = DIRTY_BITMAP_BYTES(npages);
> +
> +	if (bitmap_size < bsize)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  			     struct vfio_iommu_type1_dma_unmap *unmap)
>  {
> @@ -1038,16 +1189,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	unsigned long vaddr = map->vaddr;
>  	size_t size = map->size;
>  	int ret = 0, prot = 0;
> -	uint64_t mask;
> +	uint64_t pgsize;
>  	struct vfio_dma *dma;
>  
>  	/* Verify that none of our __u64 fields overflow */
>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
>  		return -EINVAL;
>  
> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> +	pgsize = (uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
>  
> -	WARN_ON(mask & PAGE_MASK);
> +	WARN_ON((pgsize - 1) & PAGE_MASK);
>  
>  	/* READ/WRITE from device perspective */
>  	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
> @@ -1055,7 +1206,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
>  		prot |= IOMMU_READ;
>  
> -	if (!prot || !size || (size | iova | vaddr) & mask)
> +	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1))
>  		return -EINVAL;
>  
>  	/* Don't allow IOVA or virtual address wrap */
> @@ -1130,6 +1281,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	else
>  		ret = vfio_pin_map_dma(iommu, dma, size);
>  
> +	if (!ret && iommu->dirty_page_tracking) {
> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> +		if (ret)
> +			vfio_remove_dma(iommu, dma);
> +	}
> +
>  out_unlock:
>  	mutex_unlock(&iommu->lock);
>  	return ret;
> @@ -2278,6 +2435,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>  			-EFAULT : 0;
> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> +		struct vfio_iommu_type1_dirty_bitmap dirty;
> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> +		int ret = 0;
> +
> +		if (!iommu->v2)
> +			return -EACCES;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> +				    flags);
> +
> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> +			return -EINVAL;
> +
> +		/* only one flag should be set at a time */
> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> +			return -EINVAL;
> +
> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			mutex_lock(&iommu->lock);
> +			if (!iommu->dirty_page_tracking) {
> +				ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
> +				if (!ret)
> +					iommu->dirty_page_tracking = true;
> +			}
> +			mutex_unlock(&iommu->lock);
> +			return ret;
> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> +			mutex_lock(&iommu->lock);
> +			if (iommu->dirty_page_tracking) {
> +				iommu->dirty_page_tracking = false;
> +				vfio_dma_bitmap_free_all(iommu);
> +			}
> +			mutex_unlock(&iommu->lock);
> +			return 0;
> +		} else if (dirty.flags &
> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> +			struct vfio_iommu_type1_dirty_bitmap_get range;
> +			unsigned long pgshift;
> +			size_t data_size = dirty.argsz - minsz;
> +			uint64_t iommu_pgsize =
> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			if (!data_size || data_size < sizeof(range))
> +				return -EINVAL;
> +
> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> +					   sizeof(range)))
> +				return -EFAULT;
> +
> +			/* allow only smallest supported pgsize */
> +			if (range.bitmap.pgsize != iommu_pgsize)
> +				return -EINVAL;
> +			if (range.iova & (iommu_pgsize - 1))
> +				return -EINVAL;
> +			if (!range.size || range.size & (iommu_pgsize - 1))
> +				return -EINVAL;
> +			if (range.iova + range.size < range.iova)
> +				return -EINVAL;
> +			if (!access_ok((void __user *)range.bitmap.data,
> +				       range.bitmap.size))
> +				return -EINVAL;
> +
> +			pgshift = __ffs(range.bitmap.pgsize);
> +			ret = verify_bitmap_size(range.size >> pgshift,
> +						 range.bitmap.size);
> +			if (ret)
> +				return ret;
> +
> +			mutex_lock(&iommu->lock);
> +			if (iommu->dirty_page_tracking)
> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> +						range.size, range.bitmap.pgsize,
> +						range.bitmap.data);
> +			else
> +				ret = -EINVAL;
> +			mutex_unlock(&iommu->lock);
> +
> +			return ret;
> +		}
>  	}
>  
>  	return -ENOTTY;
> @@ -2345,10 +2589,20 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>  
>  	vaddr = dma->vaddr + offset;
>  
> -	if (write)
> +	if (write) {
>  		*copied = __copy_to_user((void __user *)vaddr, data,
>  					 count) ? 0 : count;
> -	else
> +		if (*copied && iommu->dirty_page_tracking) {
> +			unsigned long pgshift =
> +				__ffs(vfio_pgsize_bitmap(iommu));
> +			/*
> +			 * Bitmap populated with the smallest supported page
> +			 * size
> +			 */
> +			bitmap_set(dma->bitmap, offset >> pgshift,
> +				   *copied >> pgshift);
> +		}
> +	} else
>  		*copied = __copy_from_user(data, (void __user *)vaddr,
>  					   count) ? 0 : count;
>  	if (kthread)
> -- 
> 2.7.0
>
Kirti Wankhede March 26, 2020, 9:39 p.m. UTC | #2
On 3/25/2020 7:41 AM, Yan Zhao wrote:
> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:
>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
>> - Start dirty pages tracking while migration is active
>> - Stop dirty pages tracking.
>> - Get dirty pages bitmap. Its user space application's responsibility to
>>    copy content of dirty pages from source to destination during migration.
>>
>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
>> structure. Bitmap size is calculated considering smallest supported page
>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
>>
>> Bitmap is populated for already pinned pages when bitmap is allocated for
>> a vfio_dma with the smallest supported page size. Update bitmap from
>> pinning functions when tracking is enabled. When user application queries
>> bitmap, check if requested page size is same as page size used to
>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
>> error.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> ---
>>   drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 260 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 70aeab921d0f..874a1a7ae925 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -71,6 +71,7 @@ struct vfio_iommu {
>>   	unsigned int		dma_avail;
>>   	bool			v2;
>>   	bool			nesting;
>> +	bool			dirty_page_tracking;
>>   };
>>   
>>   struct vfio_domain {
>> @@ -91,6 +92,7 @@ struct vfio_dma {
>>   	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
>>   	struct task_struct	*task;
>>   	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
>> +	unsigned long		*bitmap;
>>   };
>>   
>>   struct vfio_group {
>> @@ -125,7 +127,21 @@ struct vfio_regions {
>>   #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>>   					(!list_empty(&iommu->domain_list))
>>   
>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
>> +
>> +/*
>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
>> + * further casts to signed integer for unaligned multi-bit operation,
>> + * __bitmap_set().
>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
>> + * system.
>> + */
>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
>> +
>>   static int put_pfn(unsigned long pfn, int prot);
>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
>>   
>>   /*
>>    * This code handles mapping and unmapping of user data buffers
>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>>   	rb_erase(&old->node, &iommu->dma_list);
>>   }
>>   
>> +
>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
>> +{
>> +	uint64_t npages = dma->size / pgsize;
>> +
>> +	if (npages > DIRTY_BITMAP_PAGES_MAX)
>> +		return -EINVAL;
>> +
>> +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
>> +	if (!dma->bitmap)
>> +		return -ENOMEM;
>> +
>> +	return 0;
>> +}
>> +
>> +static void vfio_dma_bitmap_free(struct vfio_dma *dma)
>> +{
>> +	kfree(dma->bitmap);
>> +	dma->bitmap = NULL;
>> +}
>> +
>> +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
>> +{
>> +	struct rb_node *p;
>> +
>> +	if (RB_EMPTY_ROOT(&dma->pfn_list))
>> +		return;
>> +
>> +	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
>> +		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
>> +
>> +		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
>> +	}
>> +}
>> +
>> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
>> +{
>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>> +
>> +	for (; n; n = rb_next(n)) {
>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>> +		int ret;
>> +
>> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
>> +		if (ret) {
>> +			struct rb_node *p = rb_prev(n);
>> +
>> +			for (; p; p = rb_prev(p)) {
>> +				struct vfio_dma *dma = rb_entry(n,
>> +							struct vfio_dma, node);
>> +
>> +				vfio_dma_bitmap_free(dma);
>> +			}
>> +			return ret;
>> +		}
>> +		vfio_dma_populate_bitmap(dma, pgsize);
>> +	}
>> +	return 0;
>> +}
>> +
>> +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
>> +{
>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>> +
>> +	for (; n; n = rb_next(n)) {
>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>> +
>> +		vfio_dma_bitmap_free(dma);
>> +	}
>> +}
>> +
>>   /*
>>    * Helper Functions for host iova-pfn list
>>    */
>> @@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>   			vfio_unpin_page_external(dma, iova, do_accounting);
>>   			goto pin_unwind;
>>   		}
>> +
>> +		if (iommu->dirty_page_tracking) {
>> +			unsigned long pgshift =
>> +					 __ffs(vfio_pgsize_bitmap(iommu));
>> +
>> +			/*
>> +			 * Bitmap populated with the smallest supported page
>> +			 * size
>> +			 */
>> +			bitmap_set(dma->bitmap,
>> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
>> +		}
>>   	}
>>   
>>   	ret = i;
>> @@ -801,6 +900,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>>   	vfio_unmap_unpin(iommu, dma, true);
>>   	vfio_unlink_dma(iommu, dma);
>>   	put_task_struct(dma->task);
>> +	vfio_dma_bitmap_free(dma);
>>   	kfree(dma);
>>   	iommu->dma_avail++;
>>   }
>> @@ -831,6 +931,57 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>>   	return bitmap;
>>   }
>>   
>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
>> +				  size_t size, uint64_t pgsize,
>> +				  u64 __user *bitmap)
>> +{
>> +	struct vfio_dma *dma;
>> +	unsigned long pgshift = __ffs(pgsize);
>> +	unsigned int npages, bitmap_size;
>> +
>> +	dma = vfio_find_dma(iommu, iova, 1);
>> +
>> +	if (!dma)
>> +		return -EINVAL;
>> +
>> +	if (dma->iova != iova || dma->size != size)
>> +		return -EINVAL;
>> +
> Still don't sure if it's a good practice.
> I saw the qemu implementation.
> Qemu just iterates the whole IOVA address space,
> It needs to find IOTLB entry for an IOVA
> (1) if it can find an IOTLB for an IOVA, do the DIRTY_PAGES IOCTL and
> increment IOVA by (iotlb.addr_mask + 1)
> 
> (2) if no existing IOTLB found, the imrc->translate needs to go searching shadow
> page table to try to generate one.
> if it still fails,(most probably case, as IOMMU only maps a small part in its address
> space).  increment IOVA by 1 page.
> 
> So, if the address space width is 39bit, and if there's only one page
> mapped, you still have to translate IOVA for around 2^27 times in each
> query. Isn't it too inefficient?
> 

This is the QEMU-side implementation; let's discuss it on the QEMU patches.

> So, IMHO, why we could not just save an rb tree specific for dirty pages, then generate
> a bitmap for each query?

This is looping back to the implementation in the v10 - v12 versions. There are
problems with this approach that were discussed during the v10 to v12 versions of the patches:
- Populating the dirty bitmap at the time of query will add more CPU cycles.
- Saving these CPU cycles means dirty pages need to be tracked when
they are pinned or dirtied by the CPU, that is, a per-vfio_dma bitmap is
introduced. If ranges are not vfio_dma aligned, then copying the bitmap to user
space becomes complicated and inefficient.

So we decided to go with the approach implemented here.

> 
>> +	npages = dma->size >> pgshift;
>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
>> +
>> +	/* mark all pages dirty if all pages are pinned and mapped. */
>> +	if (dma->iommu_mapped)
>> +		bitmap_set(dma->bitmap, 0, npages);
>> +
>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
>> +		return -EFAULT;
>> +
>> +	/*
>> +	 * Re-populate bitmap to include all pinned pages which are considered
>> +	 * as dirty but exclude pages which are unpinned and pages which are
>> +	 * marked dirty by vfio_dma_rw()
>> +	 */
>> +	bitmap_clear(dma->bitmap, 0, npages);
>> +	vfio_dma_populate_bitmap(dma, pgsize);
> will this also repopulate bitmap for pinned pages set by pass-through devices in
> patch 07 ?
>

If a pass-through device's driver pins pages using vfio_pin_pages and all
devices in the group pin pages through vfio_pin_pages, then
iommu->pinned_page_dirty_scope is set to true and the bitmap is repopulated.

Thanks,
Kirti


> 
>> +	return 0;
>> +}
>> +
>> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
>> +{
>> +	uint64_t bsize;
>> +
>> +	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX))
>> +		return -EINVAL;
>> +
>> +	bsize = DIRTY_BITMAP_BYTES(npages);
>> +
>> +	if (bitmap_size < bsize)
>> +		return -EINVAL;
>> +
>> +	return 0;
>> +}
>> +
>>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>   			     struct vfio_iommu_type1_dma_unmap *unmap)
>>   {
>> @@ -1038,16 +1189,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>   	unsigned long vaddr = map->vaddr;
>>   	size_t size = map->size;
>>   	int ret = 0, prot = 0;
>> -	uint64_t mask;
>> +	uint64_t pgsize;
>>   	struct vfio_dma *dma;
>>   
>>   	/* Verify that none of our __u64 fields overflow */
>>   	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
>>   		return -EINVAL;
>>   
>> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
>> +	pgsize = (uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
>>   
>> -	WARN_ON(mask & PAGE_MASK);
>> +	WARN_ON((pgsize - 1) & PAGE_MASK);
>>   
>>   	/* READ/WRITE from device perspective */
>>   	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
>> @@ -1055,7 +1206,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>   	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
>>   		prot |= IOMMU_READ;
>>   
>> -	if (!prot || !size || (size | iova | vaddr) & mask)
>> +	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1))
>>   		return -EINVAL;
>>   
>>   	/* Don't allow IOVA or virtual address wrap */
>> @@ -1130,6 +1281,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>   	else
>>   		ret = vfio_pin_map_dma(iommu, dma, size);
>>   
>> +	if (!ret && iommu->dirty_page_tracking) {
>> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
>> +		if (ret)
>> +			vfio_remove_dma(iommu, dma);
>> +	}
>> +
>>   out_unlock:
>>   	mutex_unlock(&iommu->lock);
>>   	return ret;
>> @@ -2278,6 +2435,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>>   
>>   		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>>   			-EFAULT : 0;
>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
>> +		struct vfio_iommu_type1_dirty_bitmap dirty;
>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
>> +		int ret = 0;
>> +
>> +		if (!iommu->v2)
>> +			return -EACCES;
>> +
>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
>> +				    flags);
>> +
>> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
>> +			return -EINVAL;
>> +
>> +		/* only one flag should be set at a time */
>> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
>> +			return -EINVAL;
>> +
>> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
>> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
>> +
>> +			mutex_lock(&iommu->lock);
>> +			if (!iommu->dirty_page_tracking) {
>> +				ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
>> +				if (!ret)
>> +					iommu->dirty_page_tracking = true;
>> +			}
>> +			mutex_unlock(&iommu->lock);
>> +			return ret;
>> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
>> +			mutex_lock(&iommu->lock);
>> +			if (iommu->dirty_page_tracking) {
>> +				iommu->dirty_page_tracking = false;
>> +				vfio_dma_bitmap_free_all(iommu);
>> +			}
>> +			mutex_unlock(&iommu->lock);
>> +			return 0;
>> +		} else if (dirty.flags &
>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
>> +			struct vfio_iommu_type1_dirty_bitmap_get range;
>> +			unsigned long pgshift;
>> +			size_t data_size = dirty.argsz - minsz;
>> +			uint64_t iommu_pgsize =
>> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
>> +
>> +			if (!data_size || data_size < sizeof(range))
>> +				return -EINVAL;
>> +
>> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
>> +					   sizeof(range)))
>> +				return -EFAULT;
>> +
>> +			/* allow only smallest supported pgsize */
>> +			if (range.bitmap.pgsize != iommu_pgsize)
>> +				return -EINVAL;
>> +			if (range.iova & (iommu_pgsize - 1))
>> +				return -EINVAL;
>> +			if (!range.size || range.size & (iommu_pgsize - 1))
>> +				return -EINVAL;
>> +			if (range.iova + range.size < range.iova)
>> +				return -EINVAL;
>> +			if (!access_ok((void __user *)range.bitmap.data,
>> +				       range.bitmap.size))
>> +				return -EINVAL;
>> +
>> +			pgshift = __ffs(range.bitmap.pgsize);
>> +			ret = verify_bitmap_size(range.size >> pgshift,
>> +						 range.bitmap.size);
>> +			if (ret)
>> +				return ret;
>> +
>> +			mutex_lock(&iommu->lock);
>> +			if (iommu->dirty_page_tracking)
>> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
>> +						range.size, range.bitmap.pgsize,
>> +						range.bitmap.data);
>> +			else
>> +				ret = -EINVAL;
>> +			mutex_unlock(&iommu->lock);
>> +
>> +			return ret;
>> +		}
>>   	}
>>   
>>   	return -ENOTTY;
>> @@ -2345,10 +2589,20 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>>   
>>   	vaddr = dma->vaddr + offset;
>>   
>> -	if (write)
>> +	if (write) {
>>   		*copied = __copy_to_user((void __user *)vaddr, data,
>>   					 count) ? 0 : count;
>> -	else
>> +		if (*copied && iommu->dirty_page_tracking) {
>> +			unsigned long pgshift =
>> +				__ffs(vfio_pgsize_bitmap(iommu));
>> +			/*
>> +			 * Bitmap populated with the smallest supported page
>> +			 * size
>> +			 */
>> +			bitmap_set(dma->bitmap, offset >> pgshift,
>> +				   *copied >> pgshift);
>> +		}
>> +	} else
>>   		*copied = __copy_from_user(data, (void __user *)vaddr,
>>   					   count) ? 0 : count;
>>   	if (kthread)
>> -- 
>> 2.7.0
>>
Yan Zhao March 27, 2020, 12:30 a.m. UTC | #3
On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:
> 
> 
> On 3/25/2020 7:41 AM, Yan Zhao wrote:
> > On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:
> >> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> >> - Start dirty pages tracking while migration is active
> >> - Stop dirty pages tracking.
> >> - Get dirty pages bitmap. Its user space application's responsibility to
> >>    copy content of dirty pages from source to destination during migration.
> >>
> >> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> >> structure. Bitmap size is calculated considering smallest supported page
> >> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> >>
> >> Bitmap is populated for already pinned pages when bitmap is allocated for
> >> a vfio_dma with the smallest supported page size. Update bitmap from
> >> pinning functions when tracking is enabled. When user application queries
> >> bitmap, check if requested page size is same as page size used to
> >> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> >> error.
> >>
> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >> ---
> >>   drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
> >>   1 file changed, 260 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 70aeab921d0f..874a1a7ae925 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -71,6 +71,7 @@ struct vfio_iommu {
> >>   	unsigned int		dma_avail;
> >>   	bool			v2;
> >>   	bool			nesting;
> >> +	bool			dirty_page_tracking;
> >>   };
> >>   
> >>   struct vfio_domain {
> >> @@ -91,6 +92,7 @@ struct vfio_dma {
> >>   	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> >>   	struct task_struct	*task;
> >>   	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> >> +	unsigned long		*bitmap;
> >>   };
> >>   
> >>   struct vfio_group {
> >> @@ -125,7 +127,21 @@ struct vfio_regions {
> >>   #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >>   					(!list_empty(&iommu->domain_list))
> >>   
> >> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> >> +
> >> +/*
> >> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> >> + * further casts to signed integer for unaligned multi-bit operation,
> >> + * __bitmap_set().
> >> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> >> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> >> + * system.
> >> + */
> >> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> >> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> >> +
> >>   static int put_pfn(unsigned long pfn, int prot);
> >> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> >>   
> >>   /*
> >>    * This code handles mapping and unmapping of user data buffers
> >> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> >>   	rb_erase(&old->node, &iommu->dma_list);
> >>   }
> >>   
> >> +
> >> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> >> +{
> >> +	uint64_t npages = dma->size / pgsize;
> >> +
> >> +	if (npages > DIRTY_BITMAP_PAGES_MAX)
> >> +		return -EINVAL;
> >> +
> >> +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> >> +	if (!dma->bitmap)
> >> +		return -ENOMEM;
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static void vfio_dma_bitmap_free(struct vfio_dma *dma)
> >> +{
> >> +	kfree(dma->bitmap);
> >> +	dma->bitmap = NULL;
> >> +}
> >> +
> >> +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
> >> +{
> >> +	struct rb_node *p;
> >> +
> >> +	if (RB_EMPTY_ROOT(&dma->pfn_list))
> >> +		return;
> >> +
> >> +	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> >> +		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
> >> +
> >> +		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
> >> +	}
> >> +}
> >> +
> >> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
> >> +{
> >> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >> +
> >> +	for (; n; n = rb_next(n)) {
> >> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >> +		int ret;
> >> +
> >> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> >> +		if (ret) {
> >> +			struct rb_node *p = rb_prev(n);
> >> +
> >> +			for (; p; p = rb_prev(p)) {
> >> +				struct vfio_dma *dma = rb_entry(n,
> >> +							struct vfio_dma, node);
> >> +
> >> +				vfio_dma_bitmap_free(dma);
> >> +			}
> >> +			return ret;
> >> +		}
> >> +		vfio_dma_populate_bitmap(dma, pgsize);
> >> +	}
> >> +	return 0;
> >> +}
> >> +
> >> +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
> >> +{
> >> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >> +
> >> +	for (; n; n = rb_next(n)) {
> >> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >> +
> >> +		vfio_dma_bitmap_free(dma);
> >> +	}
> >> +}
> >> +
> >>   /*
> >>    * Helper Functions for host iova-pfn list
> >>    */
> >> @@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>   			vfio_unpin_page_external(dma, iova, do_accounting);
> >>   			goto pin_unwind;
> >>   		}
> >> +
> >> +		if (iommu->dirty_page_tracking) {
> >> +			unsigned long pgshift =
> >> +					 __ffs(vfio_pgsize_bitmap(iommu));
> >> +
> >> +			/*
> >> +			 * Bitmap populated with the smallest supported page
> >> +			 * size
> >> +			 */
> >> +			bitmap_set(dma->bitmap,
> >> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> >> +		}
> >>   	}
> >>   
> >>   	ret = i;
> >> @@ -801,6 +900,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> >>   	vfio_unmap_unpin(iommu, dma, true);
> >>   	vfio_unlink_dma(iommu, dma);
> >>   	put_task_struct(dma->task);
> >> +	vfio_dma_bitmap_free(dma);
> >>   	kfree(dma);
> >>   	iommu->dma_avail++;
> >>   }
> >> @@ -831,6 +931,57 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> >>   	return bitmap;
> >>   }
> >>   
> >> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> >> +				  size_t size, uint64_t pgsize,
> >> +				  u64 __user *bitmap)
> >> +{
> >> +	struct vfio_dma *dma;
> >> +	unsigned long pgshift = __ffs(pgsize);
> >> +	unsigned int npages, bitmap_size;
> >> +
> >> +	dma = vfio_find_dma(iommu, iova, 1);
> >> +
> >> +	if (!dma)
> >> +		return -EINVAL;
> >> +
> >> +	if (dma->iova != iova || dma->size != size)
> >> +		return -EINVAL;
> >> +
> > Still don't sure if it's a good practice.
> > I saw the qemu implementation.
> > Qemu just iterates the whole IOVA address space,
> > It needs to find IOTLB entry for an IOVA
> > (1) if it can find an IOTLB for an IOVA, do the DIRTY_PAGES IOCTL and
> > increment IOVA by (iotlb.addr_mask + 1)
> > 
> > (2) if no existing IOTLB found, the imrc->translate needs to go searching shadow
> > page table to try to generate one.
> > if it still fails,(most probably case, as IOMMU only maps a small part in its address
> > space).  increment IOVA by 1 page.
> > 
> > So, if the address space width is 39bit, and if there's only one page
> > mapped, you still have to translate IOVA for around 2^27 times in each
> > query. Isn't it too inefficient?
> > 
> 
> This is Qemu side implementation, let discuss it on QEMU patches.
>
But kernel has to support it first, right?

> > So, IMHO, why we could not just save an rb tree specific for dirty pages, then generate
> > a bitmap for each query?
> 
> This is looping back to implentation in v10 - v12 version. There are 
> problems discussed during v10 to v12 version of patches with this approach.
> - populating dirty bitmap at the time of query will add more CPU cycles.
> - If we save these CPU cyles means dirty pages need to be tracked when 
> they are pinned or dirtied by CPU, that is, inttoduced per vfio_dma 
> bitmap. If ranges are not vfio_dma aligned, then copying bitmap to user 
> space becomes complicated and unefficient.
> 
> So we decided to go with the approach implemented here.

I checked v12; it is not the same as what I am suggesting.
In v12, bitmaps are generated per vfio_dma, and the per-vfio_dma bitmaps
have to be combined in order to generate one big bitmap suitable for the
dirty query. That can cause problems when the offsets are not aligned.
What I propose here is to generate an rb tree orthogonal to the tree
of vfio_dma.

As for saving CPU cycles, I don't think iterating/translating page by page
would achieve that purpose.



> > 
> >> +	npages = dma->size >> pgshift;
> >> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> >> +
> >> +	/* mark all pages dirty if all pages are pinned and mapped. */
> >> +	if (dma->iommu_mapped)
> >> +		bitmap_set(dma->bitmap, 0, npages);
> >> +
> >> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> >> +		return -EFAULT;
> >> +
> >> +	/*
> >> +	 * Re-populate bitmap to include all pinned pages which are considered
> >> +	 * as dirty but exclude pages which are unpinned and pages which are
> >> +	 * marked dirty by vfio_dma_rw()
> >> +	 */
> >> +	bitmap_clear(dma->bitmap, 0, npages);
> >> +	vfio_dma_populate_bitmap(dma, pgsize);
> > will this also repopulate bitmap for pinned pages set by pass-through devices in
> > patch 07 ?
> >
> 
> If pass through device's driver pins pages using vfio_pin_pages and all 
> devices in the group pins pages through vfio_pin_pages, then 
> iommu->pinned_page_dirty_scope is set true, then bitmap is repolutated.
> 
>
Pass-through devices already have all guest memory pinned, so they would have
no reason to call vfio_pin_pages other than to mark pages dirty.
So if such a device calls vfio_pin_pages, it means "the pages are accessed,
please mark them dirty, feel free to clean them when you get them",
not "the pages will be accessed, please mark them dirty continuously".

Thanks
Yan

> 
> > 
> >> +	return 0;
> >> +}
> >> +
> >> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> >> +{
> >> +	uint64_t bsize;
> >> +
> >> +	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX))
> >> +		return -EINVAL;
> >> +
> >> +	bsize = DIRTY_BITMAP_BYTES(npages);
> >> +
> >> +	if (bitmap_size < bsize)
> >> +		return -EINVAL;
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>   			     struct vfio_iommu_type1_dma_unmap *unmap)
> >>   {
> >> @@ -1038,16 +1189,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>   	unsigned long vaddr = map->vaddr;
> >>   	size_t size = map->size;
> >>   	int ret = 0, prot = 0;
> >> -	uint64_t mask;
> >> +	uint64_t pgsize;
> >>   	struct vfio_dma *dma;
> >>   
> >>   	/* Verify that none of our __u64 fields overflow */
> >>   	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> >>   		return -EINVAL;
> >>   
> >> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> >> +	pgsize = (uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
> >>   
> >> -	WARN_ON(mask & PAGE_MASK);
> >> +	WARN_ON((pgsize - 1) & PAGE_MASK);
> >>   
> >>   	/* READ/WRITE from device perspective */
> >>   	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
> >> @@ -1055,7 +1206,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>   	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
> >>   		prot |= IOMMU_READ;
> >>   
> >> -	if (!prot || !size || (size | iova | vaddr) & mask)
> >> +	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1))
> >>   		return -EINVAL;
> >>   
> >>   	/* Don't allow IOVA or virtual address wrap */
> >> @@ -1130,6 +1281,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>   	else
> >>   		ret = vfio_pin_map_dma(iommu, dma, size);
> >>   
> >> +	if (!ret && iommu->dirty_page_tracking) {
> >> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> >> +		if (ret)
> >> +			vfio_remove_dma(iommu, dma);
> >> +	}
> >> +
> >>   out_unlock:
> >>   	mutex_unlock(&iommu->lock);
> >>   	return ret;
> >> @@ -2278,6 +2435,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >>   
> >>   		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> >>   			-EFAULT : 0;
> >> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> >> +		struct vfio_iommu_type1_dirty_bitmap dirty;
> >> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> >> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> >> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> >> +		int ret = 0;
> >> +
> >> +		if (!iommu->v2)
> >> +			return -EACCES;
> >> +
> >> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> >> +				    flags);
> >> +
> >> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +
> >> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> >> +			return -EINVAL;
> >> +
> >> +		/* only one flag should be set at a time */
> >> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> >> +			return -EINVAL;
> >> +
> >> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> >> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> >> +
> >> +			mutex_lock(&iommu->lock);
> >> +			if (!iommu->dirty_page_tracking) {
> >> +				ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
> >> +				if (!ret)
> >> +					iommu->dirty_page_tracking = true;
> >> +			}
> >> +			mutex_unlock(&iommu->lock);
> >> +			return ret;
> >> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> >> +			mutex_lock(&iommu->lock);
> >> +			if (iommu->dirty_page_tracking) {
> >> +				iommu->dirty_page_tracking = false;
> >> +				vfio_dma_bitmap_free_all(iommu);
> >> +			}
> >> +			mutex_unlock(&iommu->lock);
> >> +			return 0;
> >> +		} else if (dirty.flags &
> >> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> >> +			struct vfio_iommu_type1_dirty_bitmap_get range;
> >> +			unsigned long pgshift;
> >> +			size_t data_size = dirty.argsz - minsz;
> >> +			uint64_t iommu_pgsize =
> >> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> >> +
> >> +			if (!data_size || data_size < sizeof(range))
> >> +				return -EINVAL;
> >> +
> >> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> >> +					   sizeof(range)))
> >> +				return -EFAULT;
> >> +
> >> +			/* allow only smallest supported pgsize */
> >> +			if (range.bitmap.pgsize != iommu_pgsize)
> >> +				return -EINVAL;
> >> +			if (range.iova & (iommu_pgsize - 1))
> >> +				return -EINVAL;
> >> +			if (!range.size || range.size & (iommu_pgsize - 1))
> >> +				return -EINVAL;
> >> +			if (range.iova + range.size < range.iova)
> >> +				return -EINVAL;
> >> +			if (!access_ok((void __user *)range.bitmap.data,
> >> +				       range.bitmap.size))
> >> +				return -EINVAL;
> >> +
> >> +			pgshift = __ffs(range.bitmap.pgsize);
> >> +			ret = verify_bitmap_size(range.size >> pgshift,
> >> +						 range.bitmap.size);
> >> +			if (ret)
> >> +				return ret;
> >> +
> >> +			mutex_lock(&iommu->lock);
> >> +			if (iommu->dirty_page_tracking)
> >> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> >> +						range.size, range.bitmap.pgsize,
> >> +						range.bitmap.data);
> >> +			else
> >> +				ret = -EINVAL;
> >> +			mutex_unlock(&iommu->lock);
> >> +
> >> +			return ret;
> >> +		}
> >>   	}
> >>   
> >>   	return -ENOTTY;
> >> @@ -2345,10 +2589,20 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> >>   
> >>   	vaddr = dma->vaddr + offset;
> >>   
> >> -	if (write)
> >> +	if (write) {
> >>   		*copied = __copy_to_user((void __user *)vaddr, data,
> >>   					 count) ? 0 : count;
> >> -	else
> >> +		if (*copied && iommu->dirty_page_tracking) {
> >> +			unsigned long pgshift =
> >> +				__ffs(vfio_pgsize_bitmap(iommu));
> >> +			/*
> >> +			 * Bitmap populated with the smallest supported page
> >> +			 * size
> >> +			 */
> >> +			bitmap_set(dma->bitmap, offset >> pgshift,
> >> +				   *copied >> pgshift);
> >> +		}
> >> +	} else
> >>   		*copied = __copy_from_user(data, (void __user *)vaddr,
> >>   					   count) ? 0 : count;
> >>   	if (kthread)
> >> -- 
> >> 2.7.0
> >>
Kirti Wankhede March 27, 2020, 5:07 a.m. UTC | #4
On 3/27/2020 6:00 AM, Yan Zhao wrote:
> On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:
>>
>>
>> On 3/25/2020 7:41 AM, Yan Zhao wrote:
>>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:
>>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
>>>> - Start dirty pages tracking while migration is active
>>>> - Stop dirty pages tracking.
>>>> - Get dirty pages bitmap. Its user space application's responsibility to
>>>>     copy content of dirty pages from source to destination during migration.
>>>>
>>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
>>>> structure. Bitmap size is calculated considering smallest supported page
>>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
>>>>
>>>> Bitmap is populated for already pinned pages when bitmap is allocated for
>>>> a vfio_dma with the smallest supported page size. Update bitmap from
>>>> pinning functions when tracking is enabled. When user application queries
>>>> bitmap, check if requested page size is same as page size used to
>>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
>>>> error.
>>>>
>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>>>> ---
>>>>    drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
>>>>    1 file changed, 260 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>>> index 70aeab921d0f..874a1a7ae925 100644
>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
>>>>    	unsigned int		dma_avail;
>>>>    	bool			v2;
>>>>    	bool			nesting;
>>>> +	bool			dirty_page_tracking;
>>>>    };
>>>>    
>>>>    struct vfio_domain {
>>>> @@ -91,6 +92,7 @@ struct vfio_dma {
>>>>    	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
>>>>    	struct task_struct	*task;
>>>>    	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
>>>> +	unsigned long		*bitmap;
>>>>    };
>>>>    
>>>>    struct vfio_group {
>>>> @@ -125,7 +127,21 @@ struct vfio_regions {
>>>>    #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>>>>    					(!list_empty(&iommu->domain_list))
>>>>    
>>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
>>>> +
>>>> +/*
>>>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
>>>> + * further casts to signed integer for unaligned multi-bit operation,
>>>> + * __bitmap_set().
>>>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
>>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
>>>> + * system.
>>>> + */
>>>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
>>>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
>>>> +
>>>>    static int put_pfn(unsigned long pfn, int prot);
>>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
>>>>    
>>>>    /*
>>>>     * This code handles mapping and unmapping of user data buffers
>>>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>>>>    	rb_erase(&old->node, &iommu->dma_list);
>>>>    }
>>>>    
>>>> +
>>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
>>>> +{
>>>> +	uint64_t npages = dma->size / pgsize;
>>>> +
>>>> +	if (npages > DIRTY_BITMAP_PAGES_MAX)
>>>> +		return -EINVAL;
>>>> +
>>>> +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
>>>> +	if (!dma->bitmap)
>>>> +		return -ENOMEM;
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static void vfio_dma_bitmap_free(struct vfio_dma *dma)
>>>> +{
>>>> +	kfree(dma->bitmap);
>>>> +	dma->bitmap = NULL;
>>>> +}
>>>> +
>>>> +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
>>>> +{
>>>> +	struct rb_node *p;
>>>> +
>>>> +	if (RB_EMPTY_ROOT(&dma->pfn_list))
>>>> +		return;
>>>> +
>>>> +	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
>>>> +		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
>>>> +
>>>> +		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
>>>> +	}
>>>> +}
>>>> +
>>>> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
>>>> +{
>>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>>>> +
>>>> +	for (; n; n = rb_next(n)) {
>>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>>>> +		int ret;
>>>> +
>>>> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
>>>> +		if (ret) {
>>>> +			struct rb_node *p = rb_prev(n);
>>>> +
>>>> +			for (; p; p = rb_prev(p)) {
>>>> +				struct vfio_dma *dma = rb_entry(n,
>>>> +							struct vfio_dma, node);
>>>> +
>>>> +				vfio_dma_bitmap_free(dma);
>>>> +			}
>>>> +			return ret;
>>>> +		}
>>>> +		vfio_dma_populate_bitmap(dma, pgsize);
>>>> +	}
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
>>>> +{
>>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>>>> +
>>>> +	for (; n; n = rb_next(n)) {
>>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>>>> +
>>>> +		vfio_dma_bitmap_free(dma);
>>>> +	}
>>>> +}
>>>> +
>>>>    /*
>>>>     * Helper Functions for host iova-pfn list
>>>>     */
>>>> @@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>>>    			vfio_unpin_page_external(dma, iova, do_accounting);
>>>>    			goto pin_unwind;
>>>>    		}
>>>> +
>>>> +		if (iommu->dirty_page_tracking) {
>>>> +			unsigned long pgshift =
>>>> +					 __ffs(vfio_pgsize_bitmap(iommu));
>>>> +
>>>> +			/*
>>>> +			 * Bitmap populated with the smallest supported page
>>>> +			 * size
>>>> +			 */
>>>> +			bitmap_set(dma->bitmap,
>>>> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
>>>> +		}
>>>>    	}
>>>>    
>>>>    	ret = i;
>>>> @@ -801,6 +900,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>>>>    	vfio_unmap_unpin(iommu, dma, true);
>>>>    	vfio_unlink_dma(iommu, dma);
>>>>    	put_task_struct(dma->task);
>>>> +	vfio_dma_bitmap_free(dma);
>>>>    	kfree(dma);
>>>>    	iommu->dma_avail++;
>>>>    }
>>>> @@ -831,6 +931,57 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>>>>    	return bitmap;
>>>>    }
>>>>    
>>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
>>>> +				  size_t size, uint64_t pgsize,
>>>> +				  u64 __user *bitmap)
>>>> +{
>>>> +	struct vfio_dma *dma;
>>>> +	unsigned long pgshift = __ffs(pgsize);
>>>> +	unsigned int npages, bitmap_size;
>>>> +
>>>> +	dma = vfio_find_dma(iommu, iova, 1);
>>>> +
>>>> +	if (!dma)
>>>> +		return -EINVAL;
>>>> +
>>>> +	if (dma->iova != iova || dma->size != size)
>>>> +		return -EINVAL;
>>>> +
>>> Still don't sure if it's a good practice.
>>> I saw the qemu implementation.
>>> Qemu just iterates the whole IOVA address space,
>>> It needs to find IOTLB entry for an IOVA
>>> (1) if it can find an IOTLB for an IOVA, do the DIRTY_PAGES IOCTL and
>>> increment IOVA by (iotlb.addr_mask + 1)
>>>
>>> (2) if no existing IOTLB found, the imrc->translate needs to go searching shadow
>>> page table to try to generate one.
>>> if it still fails,(most probably case, as IOMMU only maps a small part in its address
>>> space).  increment IOVA by 1 page.
>>>
>>> So, if the address space width is 39bit, and if there's only one page
>>> mapped, you still have to translate IOVA for around 2^27 times in each
>>> query. Isn't it too inefficient?
>>>
>>
>> This is Qemu side implementation, let discuss it on QEMU patches.
>>
> But kernel has to support it first, right?
> 

The shadow page table will be in QEMU (?). As long as we support map and
unmap in kernel space, the QEMU part of the changes should work. That
shouldn't block the kernel-side patches.

>>> So, IMHO, why we could not just save an rb tree specific for dirty pages, then generate
>>> a bitmap for each query?
>>
>> This is looping back to implentation in v10 - v12 version. There are
>> problems discussed during v10 to v12 version of patches with this approach.
>> - populating dirty bitmap at the time of query will add more CPU cycles.
>> - If we save these CPU cyles means dirty pages need to be tracked when
>> they are pinned or dirtied by CPU, that is, inttoduced per vfio_dma
>> bitmap. If ranges are not vfio_dma aligned, then copying bitmap to user
>> space becomes complicated and unefficient.
>>
>> So we decided to go with the approach implemented here.
> 
> I checked v12, it's not like what I said.
> In v12, bitmaps are generated per vfio_dma, and combination of the
> bitmaps are required in order to generate a big bitmap suiting for dirty
> query. It can cause problem when offset not aligning.
> But what I propose here is to generate an rb tree orthogonal to the tree
> of vfio_dma.
> 
> as to CPU cycles saving, I don't think iterating/translating page by page
> would achieve that purpose.
> 
> 



> 
>>>
>>>> +	npages = dma->size >> pgshift;
>>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
>>>> +
>>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
>>>> +	if (dma->iommu_mapped)
>>>> +		bitmap_set(dma->bitmap, 0, npages);
>>>> +
>>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
>>>> +		return -EFAULT;
>>>> +
>>>> +	/*
>>>> +	 * Re-populate bitmap to include all pinned pages which are considered
>>>> +	 * as dirty but exclude pages which are unpinned and pages which are
>>>> +	 * marked dirty by vfio_dma_rw()
>>>> +	 */
>>>> +	bitmap_clear(dma->bitmap, 0, npages);
>>>> +	vfio_dma_populate_bitmap(dma, pgsize);
>>> will this also repopulate bitmap for pinned pages set by pass-through devices in
>>> patch 07 ?
>>>
>>
>> If pass through device's driver pins pages using vfio_pin_pages and all
>> devices in the group pins pages through vfio_pin_pages, then
>> iommu->pinned_page_dirty_scope is set true, then bitmap is repolutated.
>>
>>
> pass-through devices already have all guest memory pinned, it would have
> no reason to call vfio_pin_pages if not attempting to mark page dirty.
> Then if it calls vfio_pin_pages, it means "the pages are accessed, please
> mark them dirty, feel free to clean it when you get it",

If you look at the vfio_dma_populate_bitmap() function: if vfio_pin_pages
has been called, the dma->pfn_list rb tree will be non-empty and the bitmap
gets populated according to the pinned pages.

> not "the pages will be accesses, please mark them dirty continuously"
>

If vfio_pin_pages is not called, dma->pfn_list is empty, and the function
returns early.
Suppose there are 2 devices in the group: one is an IOMMU backed device
and the other is a non-IOMMU mdev device. In that case, all pages are pinned
and iommu->pinned_page_dirty_scope is false, but dma->pfn_list is also not
empty, since the non-IOMMU backed device pins pages using the external API.
We still have to populate the bitmap according to dma->pfn_list here, because
in the pre-copy phase, on the first bitmap query, the IOMMU backed device
might pin pages using the external API - with that,
iommu->pinned_page_dirty_scope will get updated to 'true', which means that
during the next iteration only the pages pinned through the external API are
reported.

Thanks,
Kirti
Kirti Wankhede March 27, 2020, 5:28 a.m. UTC | #5
Hit the send button a little early.

 >
 > I checked v12, it's not like what I said.
 > In v12, bitmaps are generated per vfio_dma, and combination of the
 > bitmaps are required in order to generate a big bitmap suiting for dirty
 > query. It can cause problem when offset not aligning.
 > But what I propose here is to generate an rb tree orthogonal to the tree
 > of vfio_dma.
 >
 > as to CPU cycles saving, I don't think iterating/translating page by page
 > would achieve that purpose.
 >

Instead of creating one extra rb tree for dirty pages tracking, in v10 we
tried to use dma->pfn_list itself. We tried changes in v10, v11 and v12; the
latest version is the evolved version, with the best possible approach after
discussion. It is probably worth going through v11 as well.
https://patchwork.kernel.org/patch/11298335/

Thanks,
Kirti

On 3/27/2020 6:00 AM, Yan Zhao wrote:
> On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:
>>
>>
>> On 3/25/2020 7:41 AM, Yan Zhao wrote:
>>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:
>>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
>>>> - Start dirty pages tracking while migration is active
>>>> - Stop dirty pages tracking.
>>>> - Get dirty pages bitmap. Its user space application's responsibility to
>>>>     copy content of dirty pages from source to destination during migration.
>>>>
>>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
>>>> structure. Bitmap size is calculated considering smallest supported page
>>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
>>>>
>>>> Bitmap is populated for already pinned pages when bitmap is allocated for
>>>> a vfio_dma with the smallest supported page size. Update bitmap from
>>>> pinning functions when tracking is enabled. When user application queries
>>>> bitmap, check if requested page size is same as page size used to
>>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
>>>> error.
>>>>
>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>>>> ---
>>>>    drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
>>>>    1 file changed, 260 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>>> index 70aeab921d0f..874a1a7ae925 100644
>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
>>>>    	unsigned int		dma_avail;
>>>>    	bool			v2;
>>>>    	bool			nesting;
>>>> +	bool			dirty_page_tracking;
>>>>    };
>>>>    
>>>>    struct vfio_domain {
>>>> @@ -91,6 +92,7 @@ struct vfio_dma {
>>>>    	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
>>>>    	struct task_struct	*task;
>>>>    	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
>>>> +	unsigned long		*bitmap;
>>>>    };
>>>>    
>>>>    struct vfio_group {
>>>> @@ -125,7 +127,21 @@ struct vfio_regions {
>>>>    #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>>>>    					(!list_empty(&iommu->domain_list))
>>>>    
>>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
>>>> +
>>>> +/*
>>>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
>>>> + * further casts to signed integer for unaligned multi-bit operation,
>>>> + * __bitmap_set().
>>>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
>>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
>>>> + * system.
>>>> + */
>>>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
>>>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
>>>> +
>>>>    static int put_pfn(unsigned long pfn, int prot);
>>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
>>>>    
>>>>    /*
>>>>     * This code handles mapping and unmapping of user data buffers
>>>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>>>>    	rb_erase(&old->node, &iommu->dma_list);
>>>>    }
>>>>    
>>>> +
>>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
>>>> +{
>>>> +	uint64_t npages = dma->size / pgsize;
>>>> +
>>>> +	if (npages > DIRTY_BITMAP_PAGES_MAX)
>>>> +		return -EINVAL;
>>>> +
>>>> +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
>>>> +	if (!dma->bitmap)
>>>> +		return -ENOMEM;
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static void vfio_dma_bitmap_free(struct vfio_dma *dma)
>>>> +{
>>>> +	kfree(dma->bitmap);
>>>> +	dma->bitmap = NULL;
>>>> +}
>>>> +
>>>> +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
>>>> +{
>>>> +	struct rb_node *p;
>>>> +
>>>> +	if (RB_EMPTY_ROOT(&dma->pfn_list))
>>>> +		return;
>>>> +
>>>> +	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
>>>> +		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
>>>> +
>>>> +		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
>>>> +	}
>>>> +}
>>>> +
>>>> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
>>>> +{
>>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>>>> +
>>>> +	for (; n; n = rb_next(n)) {
>>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>>>> +		int ret;
>>>> +
>>>> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
>>>> +		if (ret) {
>>>> +			struct rb_node *p = rb_prev(n);
>>>> +
>>>> +			for (; p; p = rb_prev(p)) {
>>>> +				struct vfio_dma *dma = rb_entry(n,
>>>> +							struct vfio_dma, node);
>>>> +
>>>> +				vfio_dma_bitmap_free(dma);
>>>> +			}
>>>> +			return ret;
>>>> +		}
>>>> +		vfio_dma_populate_bitmap(dma, pgsize);
>>>> +	}
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
>>>> +{
>>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>>>> +
>>>> +	for (; n; n = rb_next(n)) {
>>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>>>> +
>>>> +		vfio_dma_bitmap_free(dma);
>>>> +	}
>>>> +}
>>>> +
>>>>    /*
>>>>     * Helper Functions for host iova-pfn list
>>>>     */
>>>> @@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>>>    			vfio_unpin_page_external(dma, iova, do_accounting);
>>>>    			goto pin_unwind;
>>>>    		}
>>>> +
>>>> +		if (iommu->dirty_page_tracking) {
>>>> +			unsigned long pgshift =
>>>> +					 __ffs(vfio_pgsize_bitmap(iommu));
>>>> +
>>>> +			/*
>>>> +			 * Bitmap populated with the smallest supported page
>>>> +			 * size
>>>> +			 */
>>>> +			bitmap_set(dma->bitmap,
>>>> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
>>>> +		}
>>>>    	}
>>>>    
>>>>    	ret = i;
>>>> @@ -801,6 +900,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>>>>    	vfio_unmap_unpin(iommu, dma, true);
>>>>    	vfio_unlink_dma(iommu, dma);
>>>>    	put_task_struct(dma->task);
>>>> +	vfio_dma_bitmap_free(dma);
>>>>    	kfree(dma);
>>>>    	iommu->dma_avail++;
>>>>    }
>>>> @@ -831,6 +931,57 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>>>>    	return bitmap;
>>>>    }
>>>>    
>>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
>>>> +				  size_t size, uint64_t pgsize,
>>>> +				  u64 __user *bitmap)
>>>> +{
>>>> +	struct vfio_dma *dma;
>>>> +	unsigned long pgshift = __ffs(pgsize);
>>>> +	unsigned int npages, bitmap_size;
>>>> +
>>>> +	dma = vfio_find_dma(iommu, iova, 1);
>>>> +
>>>> +	if (!dma)
>>>> +		return -EINVAL;
>>>> +
>>>> +	if (dma->iova != iova || dma->size != size)
>>>> +		return -EINVAL;
>>>> +
>>> I'm still not sure this is good practice.
>>> I looked at the QEMU implementation.
>>> QEMU just iterates over the whole IOVA address space.
>>> It needs to find an IOTLB entry for each IOVA:
>>> (1) if it can find an IOTLB entry for an IOVA, it does the DIRTY_PAGES ioctl and
>>> increments the IOVA by (iotlb.addr_mask + 1);
>>>
>>> (2) if no existing IOTLB entry is found, imrc->translate needs to search the shadow
>>> page table to try to generate one.
>>> If that still fails (the most probable case, as the IOMMU only maps a small part of its address
>>> space), it increments the IOVA by 1 page.
>>>
>>> So, if the address space width is 39 bits and there's only one page
>>> mapped, you still have to translate an IOVA around 2^27 times in each
>>> query. Isn't that too inefficient?
>>>
>>
>> This is the QEMU-side implementation; let's discuss it on the QEMU patches.
>>
> But kernel has to support it first, right?
> 
>>> So, IMHO, why couldn't we just keep an rb tree specifically for dirty pages, and then generate
>>> a bitmap for each query?
>>
>> This is looping back to the implementation in the v10 - v12 versions. There were
>> problems discussed during the v10 to v12 versions of the patches with this approach.
>> - Populating the dirty bitmap at the time of query will add more CPU cycles.
>> - If we save these CPU cycles, it means dirty pages need to be tracked when
>> they are pinned or dirtied by the CPU, that is, a per-vfio_dma bitmap is
>> introduced. If ranges are not vfio_dma aligned, then copying the bitmap to user
>> space becomes complicated and inefficient.
>>
>> So we decided to go with the approach implemented here.
> 
> I checked v12; it's not like what I described.
> In v12, bitmaps are generated per vfio_dma, and a combination of the
> bitmaps is required in order to generate a big bitmap suitable for a dirty
> query. That can cause problems when the offsets are not aligned.
> But what I propose here is to generate an rb tree orthogonal to the tree
> of vfio_dma.
> 
> As to saving CPU cycles, I don't think iterating/translating page by page
> would achieve that purpose.
> 
> 
> 
>>>
>>>> +	npages = dma->size >> pgshift;
>>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
>>>> +
>>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
>>>> +	if (dma->iommu_mapped)
>>>> +		bitmap_set(dma->bitmap, 0, npages);
>>>> +
>>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
>>>> +		return -EFAULT;
>>>> +
>>>> +	/*
>>>> +	 * Re-populate bitmap to include all pinned pages which are considered
>>>> +	 * as dirty but exclude pages which are unpinned and pages which are
>>>> +	 * marked dirty by vfio_dma_rw()
>>>> +	 */
>>>> +	bitmap_clear(dma->bitmap, 0, npages);
>>>> +	vfio_dma_populate_bitmap(dma, pgsize);
>>> will this also repopulate bitmap for pinned pages set by pass-through devices in
>>> patch 07 ?
>>>
>>
>> If a pass-through device's driver pins pages using vfio_pin_pages and all
>> devices in the group pin pages through vfio_pin_pages, then
>> iommu->pinned_page_dirty_scope is set to true, and the bitmap is repopulated.
>>
>>
> Pass-through devices already have all guest memory pinned, so they would have
> no reason to call vfio_pin_pages unless they are attempting to mark pages dirty.
> So if one calls vfio_pin_pages, it means "the pages are accessed, please
> mark them dirty, feel free to clear them when you get them",
> not "the pages will be accessed, please mark them dirty continuously".
> 
> Thanks
> Yan
> 
>>
>>>
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
>>>> +{
>>>> +	uint64_t bsize;
>>>> +
>>>> +	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX))
>>>> +		return -EINVAL;
>>>> +
>>>> +	bsize = DIRTY_BITMAP_BYTES(npages);
>>>> +
>>>> +	if (bitmap_size < bsize)
>>>> +		return -EINVAL;
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>>    static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>>>    			     struct vfio_iommu_type1_dma_unmap *unmap)
>>>>    {
>>>> @@ -1038,16 +1189,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>>>    	unsigned long vaddr = map->vaddr;
>>>>    	size_t size = map->size;
>>>>    	int ret = 0, prot = 0;
>>>> -	uint64_t mask;
>>>> +	uint64_t pgsize;
>>>>    	struct vfio_dma *dma;
>>>>    
>>>>    	/* Verify that none of our __u64 fields overflow */
>>>>    	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
>>>>    		return -EINVAL;
>>>>    
>>>> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
>>>> +	pgsize = (uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
>>>>    
>>>> -	WARN_ON(mask & PAGE_MASK);
>>>> +	WARN_ON((pgsize - 1) & PAGE_MASK);
>>>>    
>>>>    	/* READ/WRITE from device perspective */
>>>>    	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
>>>> @@ -1055,7 +1206,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>>>    	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
>>>>    		prot |= IOMMU_READ;
>>>>    
>>>> -	if (!prot || !size || (size | iova | vaddr) & mask)
>>>> +	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1))
>>>>    		return -EINVAL;
>>>>    
>>>>    	/* Don't allow IOVA or virtual address wrap */
>>>> @@ -1130,6 +1281,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>>>    	else
>>>>    		ret = vfio_pin_map_dma(iommu, dma, size);
>>>>    
>>>> +	if (!ret && iommu->dirty_page_tracking) {
>>>> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
>>>> +		if (ret)
>>>> +			vfio_remove_dma(iommu, dma);
>>>> +	}
>>>> +
>>>>    out_unlock:
>>>>    	mutex_unlock(&iommu->lock);
>>>>    	return ret;
>>>> @@ -2278,6 +2435,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>>>>    
>>>>    		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>>>>    			-EFAULT : 0;
>>>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
>>>> +		struct vfio_iommu_type1_dirty_bitmap dirty;
>>>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
>>>> +		int ret = 0;
>>>> +
>>>> +		if (!iommu->v2)
>>>> +			return -EACCES;
>>>> +
>>>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
>>>> +				    flags);
>>>> +
>>>> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
>>>> +			return -EFAULT;
>>>> +
>>>> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
>>>> +			return -EINVAL;
>>>> +
>>>> +		/* only one flag should be set at a time */
>>>> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
>>>> +			return -EINVAL;
>>>> +
>>>> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
>>>> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
>>>> +
>>>> +			mutex_lock(&iommu->lock);
>>>> +			if (!iommu->dirty_page_tracking) {
>>>> +				ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
>>>> +				if (!ret)
>>>> +					iommu->dirty_page_tracking = true;
>>>> +			}
>>>> +			mutex_unlock(&iommu->lock);
>>>> +			return ret;
>>>> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
>>>> +			mutex_lock(&iommu->lock);
>>>> +			if (iommu->dirty_page_tracking) {
>>>> +				iommu->dirty_page_tracking = false;
>>>> +				vfio_dma_bitmap_free_all(iommu);
>>>> +			}
>>>> +			mutex_unlock(&iommu->lock);
>>>> +			return 0;
>>>> +		} else if (dirty.flags &
>>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
>>>> +			struct vfio_iommu_type1_dirty_bitmap_get range;
>>>> +			unsigned long pgshift;
>>>> +			size_t data_size = dirty.argsz - minsz;
>>>> +			uint64_t iommu_pgsize =
>>>> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
>>>> +
>>>> +			if (!data_size || data_size < sizeof(range))
>>>> +				return -EINVAL;
>>>> +
>>>> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
>>>> +					   sizeof(range)))
>>>> +				return -EFAULT;
>>>> +
>>>> +			/* allow only smallest supported pgsize */
>>>> +			if (range.bitmap.pgsize != iommu_pgsize)
>>>> +				return -EINVAL;
>>>> +			if (range.iova & (iommu_pgsize - 1))
>>>> +				return -EINVAL;
>>>> +			if (!range.size || range.size & (iommu_pgsize - 1))
>>>> +				return -EINVAL;
>>>> +			if (range.iova + range.size < range.iova)
>>>> +				return -EINVAL;
>>>> +			if (!access_ok((void __user *)range.bitmap.data,
>>>> +				       range.bitmap.size))
>>>> +				return -EINVAL;
>>>> +
>>>> +			pgshift = __ffs(range.bitmap.pgsize);
>>>> +			ret = verify_bitmap_size(range.size >> pgshift,
>>>> +						 range.bitmap.size);
>>>> +			if (ret)
>>>> +				return ret;
>>>> +
>>>> +			mutex_lock(&iommu->lock);
>>>> +			if (iommu->dirty_page_tracking)
>>>> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
>>>> +						range.size, range.bitmap.pgsize,
>>>> +						range.bitmap.data);
>>>> +			else
>>>> +				ret = -EINVAL;
>>>> +			mutex_unlock(&iommu->lock);
>>>> +
>>>> +			return ret;
>>>> +		}
>>>>    	}
>>>>    
>>>>    	return -ENOTTY;
>>>> @@ -2345,10 +2589,20 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>>>>    
>>>>    	vaddr = dma->vaddr + offset;
>>>>    
>>>> -	if (write)
>>>> +	if (write) {
>>>>    		*copied = __copy_to_user((void __user *)vaddr, data,
>>>>    					 count) ? 0 : count;
>>>> -	else
>>>> +		if (*copied && iommu->dirty_page_tracking) {
>>>> +			unsigned long pgshift =
>>>> +				__ffs(vfio_pgsize_bitmap(iommu));
>>>> +			/*
>>>> +			 * Bitmap populated with the smallest supported page
>>>> +			 * size
>>>> +			 */
>>>> +			bitmap_set(dma->bitmap, offset >> pgshift,
>>>> +				   *copied >> pgshift);
>>>> +		}
>>>> +	} else
>>>>    		*copied = __copy_from_user(data, (void __user *)vaddr,
>>>>    					   count) ? 0 : count;
>>>>    	if (kthread)
>>>> -- 
>>>> 2.7.0
>>>>
Dr. David Alan Gilbert March 27, 2020, 11:57 a.m. UTC | #6
* Kirti Wankhede (kwankhede@nvidia.com) wrote:
> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> - Start dirty pages tracking while migration is active
> - Stop dirty pages tracking.
> - Get dirty pages bitmap. Its user space application's responsibility to
>   copy content of dirty pages from source to destination during migration.
> 
> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> structure. Bitmap size is calculated considering smallest supported page
> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> 
> Bitmap is populated for already pinned pages when bitmap is allocated for
> a vfio_dma with the smallest supported page size. Update bitmap from
> pinning functions when tracking is enabled. When user application queries
> bitmap, check if requested page size is same as page size used to
> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> error.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 260 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 70aeab921d0f..874a1a7ae925 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -71,6 +71,7 @@ struct vfio_iommu {
>  	unsigned int		dma_avail;
>  	bool			v2;
>  	bool			nesting;
> +	bool			dirty_page_tracking;
>  };
>  
>  struct vfio_domain {
> @@ -91,6 +92,7 @@ struct vfio_dma {
>  	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
>  	struct task_struct	*task;
>  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> +	unsigned long		*bitmap;
>  };
>  
>  struct vfio_group {
> @@ -125,7 +127,21 @@ struct vfio_regions {
>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>  					(!list_empty(&iommu->domain_list))
>  
> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> +
> +/*
> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> + * further casts to signed integer for unaligned multi-bit operation,
> + * __bitmap_set().
> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> + * system.
> + */

Can you explain to me what that size is the limit of?  People are
already running 12TB VMs.

Dave

> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> +
>  static int put_pfn(unsigned long pfn, int prot);
> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
>  
>  /*
>   * This code handles mapping and unmapping of user data buffers
> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +
> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> +{
> +	uint64_t npages = dma->size / pgsize;
> +
> +	if (npages > DIRTY_BITMAP_PAGES_MAX)
> +		return -EINVAL;
> +
> +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> +	if (!dma->bitmap)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static void vfio_dma_bitmap_free(struct vfio_dma *dma)
> +{
> +	kfree(dma->bitmap);
> +	dma->bitmap = NULL;
> +}
> +
> +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
> +{
> +	struct rb_node *p;
> +
> +	if (RB_EMPTY_ROOT(&dma->pfn_list))
> +		return;
> +
> +	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> +		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
> +
> +		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
> +	}
> +}
> +
> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
> +{
> +	struct rb_node *n = rb_first(&iommu->dma_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> +		int ret;
> +
> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> +		if (ret) {
> +			struct rb_node *p = rb_prev(n);
> +
> +			for (; p; p = rb_prev(p)) {
> +				struct vfio_dma *dma = rb_entry(n,
> +							struct vfio_dma, node);
> +
> +				vfio_dma_bitmap_free(dma);
> +			}
> +			return ret;
> +		}
> +		vfio_dma_populate_bitmap(dma, pgsize);
> +	}
> +	return 0;
> +}
> +
> +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
> +{
> +	struct rb_node *n = rb_first(&iommu->dma_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> +
> +		vfio_dma_bitmap_free(dma);
> +	}
> +}
> +
>  /*
>   * Helper Functions for host iova-pfn list
>   */
> @@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  			vfio_unpin_page_external(dma, iova, do_accounting);
>  			goto pin_unwind;
>  		}
> +
> +		if (iommu->dirty_page_tracking) {
> +			unsigned long pgshift =
> +					 __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			/*
> +			 * Bitmap populated with the smallest supported page
> +			 * size
> +			 */
> +			bitmap_set(dma->bitmap,
> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> +		}
>  	}
>  
>  	ret = i;
> @@ -801,6 +900,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  	vfio_unmap_unpin(iommu, dma, true);
>  	vfio_unlink_dma(iommu, dma);
>  	put_task_struct(dma->task);
> +	vfio_dma_bitmap_free(dma);
>  	kfree(dma);
>  	iommu->dma_avail++;
>  }
> @@ -831,6 +931,57 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>  	return bitmap;
>  }
>  
> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> +				  size_t size, uint64_t pgsize,
> +				  u64 __user *bitmap)
> +{
> +	struct vfio_dma *dma;
> +	unsigned long pgshift = __ffs(pgsize);
> +	unsigned int npages, bitmap_size;
> +
> +	dma = vfio_find_dma(iommu, iova, 1);
> +
> +	if (!dma)
> +		return -EINVAL;
> +
> +	if (dma->iova != iova || dma->size != size)
> +		return -EINVAL;
> +
> +	npages = dma->size >> pgshift;
> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> +
> +	/* mark all pages dirty if all pages are pinned and mapped. */
> +	if (dma->iommu_mapped)
> +		bitmap_set(dma->bitmap, 0, npages);
> +
> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> +		return -EFAULT;
> +
> +	/*
> +	 * Re-populate bitmap to include all pinned pages which are considered
> +	 * as dirty but exclude pages which are unpinned and pages which are
> +	 * marked dirty by vfio_dma_rw()
> +	 */
> +	bitmap_clear(dma->bitmap, 0, npages);
> +	vfio_dma_populate_bitmap(dma, pgsize);
> +	return 0;
> +}
> +
> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> +{
> +	uint64_t bsize;
> +
> +	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX))
> +		return -EINVAL;
> +
> +	bsize = DIRTY_BITMAP_BYTES(npages);
> +
> +	if (bitmap_size < bsize)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  			     struct vfio_iommu_type1_dma_unmap *unmap)
>  {
> @@ -1038,16 +1189,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	unsigned long vaddr = map->vaddr;
>  	size_t size = map->size;
>  	int ret = 0, prot = 0;
> -	uint64_t mask;
> +	uint64_t pgsize;
>  	struct vfio_dma *dma;
>  
>  	/* Verify that none of our __u64 fields overflow */
>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
>  		return -EINVAL;
>  
> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> +	pgsize = (uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
>  
> -	WARN_ON(mask & PAGE_MASK);
> +	WARN_ON((pgsize - 1) & PAGE_MASK);
>  
>  	/* READ/WRITE from device perspective */
>  	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
> @@ -1055,7 +1206,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
>  		prot |= IOMMU_READ;
>  
> -	if (!prot || !size || (size | iova | vaddr) & mask)
> +	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1))
>  		return -EINVAL;
>  
>  	/* Don't allow IOVA or virtual address wrap */
> @@ -1130,6 +1281,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	else
>  		ret = vfio_pin_map_dma(iommu, dma, size);
>  
> +	if (!ret && iommu->dirty_page_tracking) {
> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> +		if (ret)
> +			vfio_remove_dma(iommu, dma);
> +	}
> +
>  out_unlock:
>  	mutex_unlock(&iommu->lock);
>  	return ret;
> @@ -2278,6 +2435,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>  			-EFAULT : 0;
> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> +		struct vfio_iommu_type1_dirty_bitmap dirty;
> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> +		int ret = 0;
> +
> +		if (!iommu->v2)
> +			return -EACCES;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> +				    flags);
> +
> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> +			return -EINVAL;
> +
> +		/* only one flag should be set at a time */
> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> +			return -EINVAL;
> +
> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			mutex_lock(&iommu->lock);
> +			if (!iommu->dirty_page_tracking) {
> +				ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
> +				if (!ret)
> +					iommu->dirty_page_tracking = true;
> +			}
> +			mutex_unlock(&iommu->lock);
> +			return ret;
> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> +			mutex_lock(&iommu->lock);
> +			if (iommu->dirty_page_tracking) {
> +				iommu->dirty_page_tracking = false;
> +				vfio_dma_bitmap_free_all(iommu);
> +			}
> +			mutex_unlock(&iommu->lock);
> +			return 0;
> +		} else if (dirty.flags &
> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> +			struct vfio_iommu_type1_dirty_bitmap_get range;
> +			unsigned long pgshift;
> +			size_t data_size = dirty.argsz - minsz;
> +			uint64_t iommu_pgsize =
> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			if (!data_size || data_size < sizeof(range))
> +				return -EINVAL;
> +
> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> +					   sizeof(range)))
> +				return -EFAULT;
> +
> +			/* allow only smallest supported pgsize */
> +			if (range.bitmap.pgsize != iommu_pgsize)
> +				return -EINVAL;
> +			if (range.iova & (iommu_pgsize - 1))
> +				return -EINVAL;
> +			if (!range.size || range.size & (iommu_pgsize - 1))
> +				return -EINVAL;
> +			if (range.iova + range.size < range.iova)
> +				return -EINVAL;
> +			if (!access_ok((void __user *)range.bitmap.data,
> +				       range.bitmap.size))
> +				return -EINVAL;
> +
> +			pgshift = __ffs(range.bitmap.pgsize);
> +			ret = verify_bitmap_size(range.size >> pgshift,
> +						 range.bitmap.size);
> +			if (ret)
> +				return ret;
> +
> +			mutex_lock(&iommu->lock);
> +			if (iommu->dirty_page_tracking)
> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> +						range.size, range.bitmap.pgsize,
> +						range.bitmap.data);
> +			else
> +				ret = -EINVAL;
> +			mutex_unlock(&iommu->lock);
> +
> +			return ret;
> +		}
>  	}
>  
>  	return -ENOTTY;
> @@ -2345,10 +2589,20 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>  
>  	vaddr = dma->vaddr + offset;
>  
> -	if (write)
> +	if (write) {
>  		*copied = __copy_to_user((void __user *)vaddr, data,
>  					 count) ? 0 : count;
> -	else
> +		if (*copied && iommu->dirty_page_tracking) {
> +			unsigned long pgshift =
> +				__ffs(vfio_pgsize_bitmap(iommu));
> +			/*
> +			 * Bitmap populated with the smallest supported page
> +			 * size
> +			 */
> +			bitmap_set(dma->bitmap, offset >> pgshift,
> +				   *copied >> pgshift);
> +		}
> +	} else
>  		*copied = __copy_from_user(data, (void __user *)vaddr,
>  					   count) ? 0 : count;
>  	if (kthread)
> -- 
> 2.7.0
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Alex Williamson March 27, 2020, 1:57 p.m. UTC | #7
On Fri, 27 Mar 2020 11:57:45 +0000
"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:

> * Kirti Wankhede (kwankhede@nvidia.com) wrote:
> > VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > - Start dirty pages tracking while migration is active
> > - Stop dirty pages tracking.
> > - Get dirty pages bitmap. Its user space application's responsibility to
> >   copy content of dirty pages from source to destination during migration.
> > 
> > To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > structure. Bitmap size is calculated considering smallest supported page
> > size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > 
> > Bitmap is populated for already pinned pages when bitmap is allocated for
> > a vfio_dma with the smallest supported page size. Update bitmap from
> > pinning functions when tracking is enabled. When user application queries
> > bitmap, check if requested page size is same as page size used to
> > populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > error.
> > 
> > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > Reviewed-by: Neo Jia <cjia@nvidia.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 260 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 70aeab921d0f..874a1a7ae925 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -71,6 +71,7 @@ struct vfio_iommu {
> >  	unsigned int		dma_avail;
> >  	bool			v2;
> >  	bool			nesting;
> > +	bool			dirty_page_tracking;
> >  };
> >  
> >  struct vfio_domain {
> > @@ -91,6 +92,7 @@ struct vfio_dma {
> >  	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> >  	struct task_struct	*task;
> >  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > +	unsigned long		*bitmap;
> >  };
> >  
> >  struct vfio_group {
> > @@ -125,7 +127,21 @@ struct vfio_regions {
> >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >  					(!list_empty(&iommu->domain_list))
> >  
> > +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > +
> > +/*
> > + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> > + * further casts to signed integer for unaligned multi-bit operation,
> > + * __bitmap_set().
> > + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> > + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> > + * system.
> > + */  
> 
> Can you explain to me what that size is the limit of?  People are
> already running 12TB VMs.

It's the limit of a single DMA mapping range.  KVM has the same
limitation for memory slots.  People are running large VMs, but they
need to use hotpluggable DIMMs or NUMA configuration such that any
single KVM memory slot is less than 8TB.  The same should be sufficient
here.  Thanks,

Alex

> > +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> > +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> > +
> >  static int put_pfn(unsigned long pfn, int prot);
> > +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> >  
> >  /*
> >   * This code handles mapping and unmapping of user data buffers
> > @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> >  	rb_erase(&old->node, &iommu->dma_list);
> >  }
> >  
> > +
> > +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> > +{
> > +	uint64_t npages = dma->size / pgsize;
> > +
> > +	if (npages > DIRTY_BITMAP_PAGES_MAX)
> > +		return -EINVAL;
> > +
> > +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> > +	if (!dma->bitmap)
> > +		return -ENOMEM;
> > +
> > +	return 0;
> > +}
> > +
> > +static void vfio_dma_bitmap_free(struct vfio_dma *dma)
> > +{
> > +	kfree(dma->bitmap);
> > +	dma->bitmap = NULL;
> > +}
> > +
> > +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
> > +{
> > +	struct rb_node *p;
> > +
> > +	if (RB_EMPTY_ROOT(&dma->pfn_list))
> > +		return;
> > +
> > +	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> > +		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
> > +
> > +		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
> > +	}
> > +}
> > +
> > +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
> > +{
> > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > +
> > +	for (; n; n = rb_next(n)) {
> > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > +		int ret;
> > +
> > +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> > +		if (ret) {
> > +			struct rb_node *p = rb_prev(n);
> > +
> > +			for (; p; p = rb_prev(p)) {
> > +				struct vfio_dma *dma = rb_entry(n,
> > +							struct vfio_dma, node);
> > +
> > +				vfio_dma_bitmap_free(dma);
> > +			}
> > +			return ret;
> > +		}
> > +		vfio_dma_populate_bitmap(dma, pgsize);
> > +	}
> > +	return 0;
> > +}
> > +
> > +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
> > +{
> > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > +
> > +	for (; n; n = rb_next(n)) {
> > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > +
> > +		vfio_dma_bitmap_free(dma);
> > +	}
> > +}
> > +
> >  /*
> >   * Helper Functions for host iova-pfn list
> >   */
> > @@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >  			vfio_unpin_page_external(dma, iova, do_accounting);
> >  			goto pin_unwind;
> >  		}
> > +
> > +		if (iommu->dirty_page_tracking) {
> > +			unsigned long pgshift =
> > +					 __ffs(vfio_pgsize_bitmap(iommu));
> > +
> > +			/*
> > +			 * Bitmap populated with the smallest supported page
> > +			 * size
> > +			 */
> > +			bitmap_set(dma->bitmap,
> > +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> > +		}
> >  	}
> >  
> >  	ret = i;
> > @@ -801,6 +900,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> >  	vfio_unmap_unpin(iommu, dma, true);
> >  	vfio_unlink_dma(iommu, dma);
> >  	put_task_struct(dma->task);
> > +	vfio_dma_bitmap_free(dma);
> >  	kfree(dma);
> >  	iommu->dma_avail++;
> >  }
> > @@ -831,6 +931,57 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> >  	return bitmap;
> >  }
> >  
> > +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > +				  size_t size, uint64_t pgsize,
> > +				  u64 __user *bitmap)
> > +{
> > +	struct vfio_dma *dma;
> > +	unsigned long pgshift = __ffs(pgsize);
> > +	unsigned int npages, bitmap_size;
> > +
> > +	dma = vfio_find_dma(iommu, iova, 1);
> > +
> > +	if (!dma)
> > +		return -EINVAL;
> > +
> > +	if (dma->iova != iova || dma->size != size)
> > +		return -EINVAL;
> > +
> > +	npages = dma->size >> pgshift;
> > +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > +
> > +	/* mark all pages dirty if all pages are pinned and mapped. */
> > +	if (dma->iommu_mapped)
> > +		bitmap_set(dma->bitmap, 0, npages);
> > +
> > +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> > +		return -EFAULT;
> > +
> > +	/*
> > +	 * Re-populate bitmap to include all pinned pages which are considered
> > +	 * as dirty but exclude pages which are unpinned and pages which are
> > +	 * marked dirty by vfio_dma_rw()
> > +	 */
> > +	bitmap_clear(dma->bitmap, 0, npages);
> > +	vfio_dma_populate_bitmap(dma, pgsize);
> > +	return 0;
> > +}
> > +
> > +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> > +{
> > +	uint64_t bsize;
> > +
> > +	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX))
> > +		return -EINVAL;
> > +
> > +	bsize = DIRTY_BITMAP_BYTES(npages);
> > +
> > +	if (bitmap_size < bsize)
> > +		return -EINVAL;
> > +
> > +	return 0;
> > +}
> > +
> >  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >  			     struct vfio_iommu_type1_dma_unmap *unmap)
> >  {
> > @@ -1038,16 +1189,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >  	unsigned long vaddr = map->vaddr;
> >  	size_t size = map->size;
> >  	int ret = 0, prot = 0;
> > -	uint64_t mask;
> > +	uint64_t pgsize;
> >  	struct vfio_dma *dma;
> >  
> >  	/* Verify that none of our __u64 fields overflow */
> >  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> >  		return -EINVAL;
> >  
> > -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> > +	pgsize = (uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
> >  
> > -	WARN_ON(mask & PAGE_MASK);
> > +	WARN_ON((pgsize - 1) & PAGE_MASK);
> >  
> >  	/* READ/WRITE from device perspective */
> >  	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
> > @@ -1055,7 +1206,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >  	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
> >  		prot |= IOMMU_READ;
> >  
> > -	if (!prot || !size || (size | iova | vaddr) & mask)
> > +	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1))
> >  		return -EINVAL;
> >  
> >  	/* Don't allow IOVA or virtual address wrap */
> > @@ -1130,6 +1281,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >  	else
> >  		ret = vfio_pin_map_dma(iommu, dma, size);
> >  
> > +	if (!ret && iommu->dirty_page_tracking) {
> > +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> > +		if (ret)
> > +			vfio_remove_dma(iommu, dma);
> > +	}
> > +
> >  out_unlock:
> >  	mutex_unlock(&iommu->lock);
> >  	return ret;
> > @@ -2278,6 +2435,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  
> >  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> >  			-EFAULT : 0;
> > +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> > +		struct vfio_iommu_type1_dirty_bitmap dirty;
> > +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> > +		int ret = 0;
> > +
> > +		if (!iommu->v2)
> > +			return -EACCES;
> > +
> > +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> > +				    flags);
> > +
> > +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> > +			return -EFAULT;
> > +
> > +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> > +			return -EINVAL;
> > +
> > +		/* only one flag should be set at a time */
> > +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> > +			return -EINVAL;
> > +
> > +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> > +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > +
> > +			mutex_lock(&iommu->lock);
> > +			if (!iommu->dirty_page_tracking) {
> > +				ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
> > +				if (!ret)
> > +					iommu->dirty_page_tracking = true;
> > +			}
> > +			mutex_unlock(&iommu->lock);
> > +			return ret;
> > +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> > +			mutex_lock(&iommu->lock);
> > +			if (iommu->dirty_page_tracking) {
> > +				iommu->dirty_page_tracking = false;
> > +				vfio_dma_bitmap_free_all(iommu);
> > +			}
> > +			mutex_unlock(&iommu->lock);
> > +			return 0;
> > +		} else if (dirty.flags &
> > +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> > +			struct vfio_iommu_type1_dirty_bitmap_get range;
> > +			unsigned long pgshift;
> > +			size_t data_size = dirty.argsz - minsz;
> > +			uint64_t iommu_pgsize =
> > +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > +
> > +			if (!data_size || data_size < sizeof(range))
> > +				return -EINVAL;
> > +
> > +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> > +					   sizeof(range)))
> > +				return -EFAULT;
> > +
> > +			/* allow only smallest supported pgsize */
> > +			if (range.bitmap.pgsize != iommu_pgsize)
> > +				return -EINVAL;
> > +			if (range.iova & (iommu_pgsize - 1))
> > +				return -EINVAL;
> > +			if (!range.size || range.size & (iommu_pgsize - 1))
> > +				return -EINVAL;
> > +			if (range.iova + range.size < range.iova)
> > +				return -EINVAL;
> > +			if (!access_ok((void __user *)range.bitmap.data,
> > +				       range.bitmap.size))
> > +				return -EINVAL;
> > +
> > +			pgshift = __ffs(range.bitmap.pgsize);
> > +			ret = verify_bitmap_size(range.size >> pgshift,
> > +						 range.bitmap.size);
> > +			if (ret)
> > +				return ret;
> > +
> > +			mutex_lock(&iommu->lock);
> > +			if (iommu->dirty_page_tracking)
> > +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> > +						range.size, range.bitmap.pgsize,
> > +						range.bitmap.data);
> > +			else
> > +				ret = -EINVAL;
> > +			mutex_unlock(&iommu->lock);
> > +
> > +			return ret;
> > +		}
> >  	}
> >  
> >  	return -ENOTTY;
> > @@ -2345,10 +2589,20 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> >  
> >  	vaddr = dma->vaddr + offset;
> >  
> > -	if (write)
> > +	if (write) {
> >  		*copied = __copy_to_user((void __user *)vaddr, data,
> >  					 count) ? 0 : count;
> > -	else
> > +		if (*copied && iommu->dirty_page_tracking) {
> > +			unsigned long pgshift =
> > +				__ffs(vfio_pgsize_bitmap(iommu));
> > +			/*
> > +			 * Bitmap populated with the smallest supported page
> > +			 * size
> > +			 */
> > +			bitmap_set(dma->bitmap, offset >> pgshift,
> > +				   *copied >> pgshift);
> > +		}
> > +	} else
> >  		*copied = __copy_from_user(data, (void __user *)vaddr,
> >  					   count) ? 0 : count;
> >  	if (kthread)
> > -- 
> > 2.7.0
> >   
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert March 27, 2020, 2:09 p.m. UTC | #8
* Alex Williamson (alex.williamson@redhat.com) wrote:
> On Fri, 27 Mar 2020 11:57:45 +0000
> "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> 
> > * Kirti Wankhede (kwankhede@nvidia.com) wrote:
> > > VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > > - Start dirty pages tracking while migration is active
> > > - Stop dirty pages tracking.
> > > - Get dirty pages bitmap. Its user space application's responsibility to
> > >   copy content of dirty pages from source to destination during migration.
> > > 
> > > To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > > structure. Bitmap size is calculated considering smallest supported page
> > > size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > > 
> > > Bitmap is populated for already pinned pages when bitmap is allocated for
> > > a vfio_dma with the smallest supported page size. Update bitmap from
> > > pinning functions when tracking is enabled. When user application queries
> > > bitmap, check if requested page size is same as page size used to
> > > populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > > error.
> > > 
> > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > ---
> > >  drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
> > >  1 file changed, 260 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > index 70aeab921d0f..874a1a7ae925 100644
> > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > @@ -71,6 +71,7 @@ struct vfio_iommu {
> > >  	unsigned int		dma_avail;
> > >  	bool			v2;
> > >  	bool			nesting;
> > > +	bool			dirty_page_tracking;
> > >  };
> > >  
> > >  struct vfio_domain {
> > > @@ -91,6 +92,7 @@ struct vfio_dma {
> > >  	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> > >  	struct task_struct	*task;
> > >  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > > +	unsigned long		*bitmap;
> > >  };
> > >  
> > >  struct vfio_group {
> > > @@ -125,7 +127,21 @@ struct vfio_regions {
> > >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > >  					(!list_empty(&iommu->domain_list))
> > >  
> > > +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > > +
> > > +/*
> > > + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> > > + * further casts to signed integer for unaligned multi-bit operation,
> > > + * __bitmap_set().
> > > + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> > > + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> > > + * system.
> > > + */  
> > 
> > Can you explain to me what that size is the limit of?  People are
> > already running 12TB VMs.
> 
> It's the limit of a single DMA mapping range.  KVM has the same
> limitation for memory slots.  People are running large VMs, but they
> need to use hotpluggable DIMMs or NUMA configuration such that any
> single KVM memory slot is less than 8TB.  The same should be sufficient
> here.  Thanks,

Ah OK, that's fine.

Dave

> Alex
> 
> > > +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> > > +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> > > +
> > >  static int put_pfn(unsigned long pfn, int prot);
> > > +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> > >  
> > >  /*
> > >   * This code handles mapping and unmapping of user data buffers
> > > @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> > >  	rb_erase(&old->node, &iommu->dma_list);
> > >  }
> > >  
> > > +
> > > +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> > > +{
> > > +	uint64_t npages = dma->size / pgsize;
> > > +
> > > +	if (npages > DIRTY_BITMAP_PAGES_MAX)
> > > +		return -EINVAL;
> > > +
> > > +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> > > +	if (!dma->bitmap)
> > > +		return -ENOMEM;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static void vfio_dma_bitmap_free(struct vfio_dma *dma)
> > > +{
> > > +	kfree(dma->bitmap);
> > > +	dma->bitmap = NULL;
> > > +}
> > > +
> > > +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
> > > +{
> > > +	struct rb_node *p;
> > > +
> > > +	if (RB_EMPTY_ROOT(&dma->pfn_list))
> > > +		return;
> > > +
> > > +	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> > > +		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
> > > +
> > > +		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
> > > +	}
> > > +}
> > > +
> > > +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
> > > +{
> > > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > > +
> > > +	for (; n; n = rb_next(n)) {
> > > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > > +		int ret;
> > > +
> > > +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> > > +		if (ret) {
> > > +			struct rb_node *p = rb_prev(n);
> > > +
> > > +			for (; p; p = rb_prev(p)) {
> > > +				struct vfio_dma *dma = rb_entry(n,
> > > +							struct vfio_dma, node);
> > > +
> > > +				vfio_dma_bitmap_free(dma);
> > > +			}
> > > +			return ret;
> > > +		}
> > > +		vfio_dma_populate_bitmap(dma, pgsize);
> > > +	}
> > > +	return 0;
> > > +}
> > > +
> > > +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
> > > +{
> > > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > > +
> > > +	for (; n; n = rb_next(n)) {
> > > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > > +
> > > +		vfio_dma_bitmap_free(dma);
> > > +	}
> > > +}
> > > +
> > >  /*
> > >   * Helper Functions for host iova-pfn list
> > >   */
> > > @@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> > >  			vfio_unpin_page_external(dma, iova, do_accounting);
> > >  			goto pin_unwind;
> > >  		}
> > > +
> > > +		if (iommu->dirty_page_tracking) {
> > > +			unsigned long pgshift =
> > > +					 __ffs(vfio_pgsize_bitmap(iommu));
> > > +
> > > +			/*
> > > +			 * Bitmap populated with the smallest supported page
> > > +			 * size
> > > +			 */
> > > +			bitmap_set(dma->bitmap,
> > > +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> > > +		}
> > >  	}
> > >  
> > >  	ret = i;
> > > @@ -801,6 +900,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> > >  	vfio_unmap_unpin(iommu, dma, true);
> > >  	vfio_unlink_dma(iommu, dma);
> > >  	put_task_struct(dma->task);
> > > +	vfio_dma_bitmap_free(dma);
> > >  	kfree(dma);
> > >  	iommu->dma_avail++;
> > >  }
> > > @@ -831,6 +931,57 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> > >  	return bitmap;
> > >  }
> > >  
> > > +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > > +				  size_t size, uint64_t pgsize,
> > > +				  u64 __user *bitmap)
> > > +{
> > > +	struct vfio_dma *dma;
> > > +	unsigned long pgshift = __ffs(pgsize);
> > > +	unsigned int npages, bitmap_size;
> > > +
> > > +	dma = vfio_find_dma(iommu, iova, 1);
> > > +
> > > +	if (!dma)
> > > +		return -EINVAL;
> > > +
> > > +	if (dma->iova != iova || dma->size != size)
> > > +		return -EINVAL;
> > > +
> > > +	npages = dma->size >> pgshift;
> > > +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > > +
> > > +	/* mark all pages dirty if all pages are pinned and mapped. */
> > > +	if (dma->iommu_mapped)
> > > +		bitmap_set(dma->bitmap, 0, npages);
> > > +
> > > +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> > > +		return -EFAULT;
> > > +
> > > +	/*
> > > +	 * Re-populate bitmap to include all pinned pages which are considered
> > > +	 * as dirty but exclude pages which are unpinned and pages which are
> > > +	 * marked dirty by vfio_dma_rw()
> > > +	 */
> > > +	bitmap_clear(dma->bitmap, 0, npages);
> > > +	vfio_dma_populate_bitmap(dma, pgsize);
> > > +	return 0;
> > > +}
> > > +
> > > +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> > > +{
> > > +	uint64_t bsize;
> > > +
> > > +	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX))
> > > +		return -EINVAL;
> > > +
> > > +	bsize = DIRTY_BITMAP_BYTES(npages);
> > > +
> > > +	if (bitmap_size < bsize)
> > > +		return -EINVAL;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > >  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > >  			     struct vfio_iommu_type1_dma_unmap *unmap)
> > >  {
> > > @@ -1038,16 +1189,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> > >  	unsigned long vaddr = map->vaddr;
> > >  	size_t size = map->size;
> > >  	int ret = 0, prot = 0;
> > > -	uint64_t mask;
> > > +	uint64_t pgsize;
> > >  	struct vfio_dma *dma;
> > >  
> > >  	/* Verify that none of our __u64 fields overflow */
> > >  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> > >  		return -EINVAL;
> > >  
> > > -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> > > +	pgsize = (uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
> > >  
> > > -	WARN_ON(mask & PAGE_MASK);
> > > +	WARN_ON((pgsize - 1) & PAGE_MASK);
> > >  
> > >  	/* READ/WRITE from device perspective */
> > >  	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
> > > @@ -1055,7 +1206,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> > >  	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
> > >  		prot |= IOMMU_READ;
> > >  
> > > -	if (!prot || !size || (size | iova | vaddr) & mask)
> > > +	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1))
> > >  		return -EINVAL;
> > >  
> > >  	/* Don't allow IOVA or virtual address wrap */
> > > @@ -1130,6 +1281,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> > >  	else
> > >  		ret = vfio_pin_map_dma(iommu, dma, size);
> > >  
> > > +	if (!ret && iommu->dirty_page_tracking) {
> > > +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> > > +		if (ret)
> > > +			vfio_remove_dma(iommu, dma);
> > > +	}
> > > +
> > >  out_unlock:
> > >  	mutex_unlock(&iommu->lock);
> > >  	return ret;
> > > @@ -2278,6 +2435,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> > >  
> > >  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> > >  			-EFAULT : 0;
> > > +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> > > +		struct vfio_iommu_type1_dirty_bitmap dirty;
> > > +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> > > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> > > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> > > +		int ret = 0;
> > > +
> > > +		if (!iommu->v2)
> > > +			return -EACCES;
> > > +
> > > +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> > > +				    flags);
> > > +
> > > +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> > > +			return -EFAULT;
> > > +
> > > +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> > > +			return -EINVAL;
> > > +
> > > +		/* only one flag should be set at a time */
> > > +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> > > +			return -EINVAL;
> > > +
> > > +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> > > +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > > +
> > > +			mutex_lock(&iommu->lock);
> > > +			if (!iommu->dirty_page_tracking) {
> > > +				ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
> > > +				if (!ret)
> > > +					iommu->dirty_page_tracking = true;
> > > +			}
> > > +			mutex_unlock(&iommu->lock);
> > > +			return ret;
> > > +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> > > +			mutex_lock(&iommu->lock);
> > > +			if (iommu->dirty_page_tracking) {
> > > +				iommu->dirty_page_tracking = false;
> > > +				vfio_dma_bitmap_free_all(iommu);
> > > +			}
> > > +			mutex_unlock(&iommu->lock);
> > > +			return 0;
> > > +		} else if (dirty.flags &
> > > +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> > > +			struct vfio_iommu_type1_dirty_bitmap_get range;
> > > +			unsigned long pgshift;
> > > +			size_t data_size = dirty.argsz - minsz;
> > > +			uint64_t iommu_pgsize =
> > > +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > > +
> > > +			if (!data_size || data_size < sizeof(range))
> > > +				return -EINVAL;
> > > +
> > > +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> > > +					   sizeof(range)))
> > > +				return -EFAULT;
> > > +
> > > +			/* allow only smallest supported pgsize */
> > > +			if (range.bitmap.pgsize != iommu_pgsize)
> > > +				return -EINVAL;
> > > +			if (range.iova & (iommu_pgsize - 1))
> > > +				return -EINVAL;
> > > +			if (!range.size || range.size & (iommu_pgsize - 1))
> > > +				return -EINVAL;
> > > +			if (range.iova + range.size < range.iova)
> > > +				return -EINVAL;
> > > +			if (!access_ok((void __user *)range.bitmap.data,
> > > +				       range.bitmap.size))
> > > +				return -EINVAL;
> > > +
> > > +			pgshift = __ffs(range.bitmap.pgsize);
> > > +			ret = verify_bitmap_size(range.size >> pgshift,
> > > +						 range.bitmap.size);
> > > +			if (ret)
> > > +				return ret;
> > > +
> > > +			mutex_lock(&iommu->lock);
> > > +			if (iommu->dirty_page_tracking)
> > > +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> > > +						range.size, range.bitmap.pgsize,
> > > +						range.bitmap.data);
> > > +			else
> > > +				ret = -EINVAL;
> > > +			mutex_unlock(&iommu->lock);
> > > +
> > > +			return ret;
> > > +		}
> > >  	}
> > >  
> > >  	return -ENOTTY;
> > > @@ -2345,10 +2589,20 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> > >  
> > >  	vaddr = dma->vaddr + offset;
> > >  
> > > -	if (write)
> > > +	if (write) {
> > >  		*copied = __copy_to_user((void __user *)vaddr, data,
> > >  					 count) ? 0 : count;
> > > -	else
> > > +		if (*copied && iommu->dirty_page_tracking) {
> > > +			unsigned long pgshift =
> > > +				__ffs(vfio_pgsize_bitmap(iommu));
> > > +			/*
> > > +			 * Bitmap populated with the smallest supported page
> > > +			 * size
> > > +			 */
> > > +			bitmap_set(dma->bitmap, offset >> pgshift,
> > > +				   *copied >> pgshift);
> > > +		}
> > > +	} else
> > >  		*copied = __copy_from_user(data, (void __user *)vaddr,
> > >  					   count) ? 0 : count;
> > >  	if (kthread)
> > > -- 
> > > 2.7.0
> > >   
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Yan Zhao March 30, 2020, 2:07 a.m. UTC | #9
On Fri, Mar 27, 2020 at 01:07:38PM +0800, Kirti Wankhede wrote:
> 
> 
> On 3/27/2020 6:00 AM, Yan Zhao wrote:
> > On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:
> >>
> >>
> >> On 3/25/2020 7:41 AM, Yan Zhao wrote:
> >>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:
> >>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> >>>> - Start dirty pages tracking while migration is active
> >>>> - Stop dirty pages tracking.
> >>>> - Get dirty pages bitmap. Its user space application's responsibility to
> >>>>     copy content of dirty pages from source to destination during migration.
> >>>>
> >>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> >>>> structure. Bitmap size is calculated considering smallest supported page
> >>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> >>>>
> >>>> Bitmap is populated for already pinned pages when bitmap is allocated for
> >>>> a vfio_dma with the smallest supported page size. Update bitmap from
> >>>> pinning functions when tracking is enabled. When user application queries
> >>>> bitmap, check if requested page size is same as page size used to
> >>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> >>>> error.
> >>>>
> >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >>>> ---
> >>>>    drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
> >>>>    1 file changed, 260 insertions(+), 6 deletions(-)
> >>>>
> >>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >>>> index 70aeab921d0f..874a1a7ae925 100644
> >>>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
> >>>>    	unsigned int		dma_avail;
> >>>>    	bool			v2;
> >>>>    	bool			nesting;
> >>>> +	bool			dirty_page_tracking;
> >>>>    };
> >>>>    
> >>>>    struct vfio_domain {
> >>>> @@ -91,6 +92,7 @@ struct vfio_dma {
> >>>>    	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> >>>>    	struct task_struct	*task;
> >>>>    	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> >>>> +	unsigned long		*bitmap;
> >>>>    };
> >>>>    
> >>>>    struct vfio_group {
> >>>> @@ -125,7 +127,21 @@ struct vfio_regions {
> >>>>    #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >>>>    					(!list_empty(&iommu->domain_list))
> >>>>    
> >>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> >>>> +
> >>>> +/*
> >>>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> >>>> + * further casts to signed integer for unaligned multi-bit operation,
> >>>> + * __bitmap_set().
> >>>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> >>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> >>>> + * system.
> >>>> + */
> >>>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> >>>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> >>>> +
> >>>>    static int put_pfn(unsigned long pfn, int prot);
> >>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> >>>>    
> >>>>    /*
> >>>>     * This code handles mapping and unmapping of user data buffers
> >>>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> >>>>    	rb_erase(&old->node, &iommu->dma_list);
> >>>>    }
> >>>>    
> >>>> +
> >>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> >>>> +{
> >>>> +	uint64_t npages = dma->size / pgsize;
> >>>> +
> >>>> +	if (npages > DIRTY_BITMAP_PAGES_MAX)
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> >>>> +	if (!dma->bitmap)
> >>>> +		return -ENOMEM;
> >>>> +
> >>>> +	return 0;
> >>>> +}
> >>>> +
> >>>> +static void vfio_dma_bitmap_free(struct vfio_dma *dma)
> >>>> +{
> >>>> +	kfree(dma->bitmap);
> >>>> +	dma->bitmap = NULL;
> >>>> +}
> >>>> +
> >>>> +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
> >>>> +{
> >>>> +	struct rb_node *p;
> >>>> +
> >>>> +	if (RB_EMPTY_ROOT(&dma->pfn_list))
> >>>> +		return;
> >>>> +
> >>>> +	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> >>>> +		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
> >>>> +
> >>>> +		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
> >>>> +	}
> >>>> +}
> >>>> +
> >>>> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
> >>>> +{
> >>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >>>> +
> >>>> +	for (; n; n = rb_next(n)) {
> >>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >>>> +		int ret;
> >>>> +
> >>>> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> >>>> +		if (ret) {
> >>>> +			struct rb_node *p = rb_prev(n);
> >>>> +
> >>>> +			for (; p; p = rb_prev(p)) {
> >>>> +				struct vfio_dma *dma = rb_entry(n,
> >>>> +							struct vfio_dma, node);
> >>>> +
> >>>> +				vfio_dma_bitmap_free(dma);
> >>>> +			}
> >>>> +			return ret;
> >>>> +		}
> >>>> +		vfio_dma_populate_bitmap(dma, pgsize);
> >>>> +	}
> >>>> +	return 0;
> >>>> +}
> >>>> +
> >>>> +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
> >>>> +{
> >>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >>>> +
> >>>> +	for (; n; n = rb_next(n)) {
> >>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >>>> +
> >>>> +		vfio_dma_bitmap_free(dma);
> >>>> +	}
> >>>> +}
> >>>> +
> >>>>    /*
> >>>>     * Helper Functions for host iova-pfn list
> >>>>     */
> >>>> @@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>>>    			vfio_unpin_page_external(dma, iova, do_accounting);
> >>>>    			goto pin_unwind;
> >>>>    		}
> >>>> +
> >>>> +		if (iommu->dirty_page_tracking) {
> >>>> +			unsigned long pgshift =
> >>>> +					 __ffs(vfio_pgsize_bitmap(iommu));
> >>>> +
> >>>> +			/*
> >>>> +			 * Bitmap populated with the smallest supported page
> >>>> +			 * size
> >>>> +			 */
> >>>> +			bitmap_set(dma->bitmap,
> >>>> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> >>>> +		}
> >>>>    	}
> >>>>    
> >>>>    	ret = i;
> >>>> @@ -801,6 +900,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> >>>>    	vfio_unmap_unpin(iommu, dma, true);
> >>>>    	vfio_unlink_dma(iommu, dma);
> >>>>    	put_task_struct(dma->task);
> >>>> +	vfio_dma_bitmap_free(dma);
> >>>>    	kfree(dma);
> >>>>    	iommu->dma_avail++;
> >>>>    }
> >>>> @@ -831,6 +931,57 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> >>>>    	return bitmap;
> >>>>    }
> >>>>    
> >>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> >>>> +				  size_t size, uint64_t pgsize,
> >>>> +				  u64 __user *bitmap)
> >>>> +{
> >>>> +	struct vfio_dma *dma;
> >>>> +	unsigned long pgshift = __ffs(pgsize);
> >>>> +	unsigned int npages, bitmap_size;
> >>>> +
> >>>> +	dma = vfio_find_dma(iommu, iova, 1);
> >>>> +
> >>>> +	if (!dma)
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	if (dma->iova != iova || dma->size != size)
> >>>> +		return -EINVAL;
> >>>> +
> >>> I'm still not sure if it's a good practice.
> >>> I saw the qemu implementation.
> >>> Qemu just iterates the whole IOVA address space,
> >>> It needs to find IOTLB entry for an IOVA
> >>> (1) if it can find an IOTLB for an IOVA, do the DIRTY_PAGES IOCTL and
> >>> increment IOVA by (iotlb.addr_mask + 1)
> >>>
> >>> (2) if no existing IOTLB found, the imrc->translate needs to go searching shadow
> >>> page table to try to generate one.
> >>> if it still fails,(most probably case, as IOMMU only maps a small part in its address
> >>> space).  increment IOVA by 1 page.
> >>>
> >>> So, if the address space width is 39bit, and if there's only one page
> >>> mapped, you still have to translate IOVA for around 2^27 times in each
> >>> query. Isn't it too inefficient?
> >>>
> >>
> >> This is the QEMU side implementation, let's discuss it on the QEMU patches.
> >>
> > But kernel has to support it first, right?
> > 
> 
> Shadow page table will be in QEMU (?), as long as we support map and 
Yes, shadow page table in QEMU.

> unmap in kernel space, QEMU part of changes should work. That shouldn't 
> block kernel side patches.
I'm not sure whether this assertion is right :)
I just want to raise the issue.

> 
> >>> So, IMHO, why we could not just save an rb tree specific for dirty pages, then generate
> >>> a bitmap for each query?
> >>
> >> This is looping back to the implementation in the v10 - v12 versions. There
> >> are problems with this approach that were discussed during v10 to v12 of the patches.
> >> - populating the dirty bitmap at the time of query will add more CPU cycles.
> >> - If we want to save these CPU cycles, dirty pages need to be tracked when
> >> they are pinned or dirtied by the CPU, that is, by the introduced per vfio_dma
> >> bitmap. If ranges are not vfio_dma aligned, then copying the bitmap to user
> >> space becomes complicated and inefficient.
> >>
> >> So we decided to go with the approach implemented here.
> > 
> > I checked v12, it's not like what I said.
> > In v12, bitmaps are generated per vfio_dma, and combination of the
> > bitmaps are required in order to generate a big bitmap suiting for dirty
> > query. It can cause problem when offset not aligning.
> > But what I propose here is to generate an rb tree orthogonal to the tree
> > of vfio_dma.
> > 
> > as to CPU cycles saving, I don't think iterating/translating page by page
> > would achieve that purpose.
> > 
> > 
> 
> 
> 
> > 
> >>>
> >>>> +	npages = dma->size >> pgshift;
> >>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> >>>> +
> >>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
> >>>> +	if (dma->iommu_mapped)
> >>>> +		bitmap_set(dma->bitmap, 0, npages);
> >>>> +
> >>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> >>>> +		return -EFAULT;
> >>>> +
> >>>> +	/*
> >>>> +	 * Re-populate bitmap to include all pinned pages which are considered
> >>>> +	 * as dirty but exclude pages which are unpinned and pages which are
> >>>> +	 * marked dirty by vfio_dma_rw()
> >>>> +	 */
> >>>> +	bitmap_clear(dma->bitmap, 0, npages);
> >>>> +	vfio_dma_populate_bitmap(dma, pgsize);
> >>> will this also repopulate bitmap for pinned pages set by pass-through devices in
> >>> patch 07 ?
> >>>
> >>
> >> If the pass-through device's driver pins pages using vfio_pin_pages and all
> >> devices in the group pin pages through vfio_pin_pages, then
> >> iommu->pinned_page_dirty_scope is set to true, and the bitmap is repopulated.
> >>
> >>
> > pass-through devices already have all guest memory pinned, it would have
> > no reason to call vfio_pin_pages if not attempting to mark page dirty.
> > Then if it calls vfio_pin_pages, it means "the pages are accessed, please
> > mark them dirty, feel free to clean it when you get it",
> 
> If you look at the vfio_dma_populate_bitmap() function: if vfio_pin_pages
> is called, the dma->pfn_list rb_tree will be non-empty and the bitmap gets
> populated as per the pinned pages.
> 
> > not "the pages will be accesses, please mark them dirty continuously"
> >
> 
> If vfio_pin_pages is not called, dma->pfn_list is empty, and it returns
> early.
> Suppose there are 2 devices in the group: one is an IOMMU backed device
> and the other a non-IOMMU mdev device. In that case, all pages are pinned,
> iommu->pinned_page_dirty_scope is false, but dma->pfn_list is also not
> empty since the non-IOMMU backed device pins pages using the external API. We
> still have to populate the bitmap according to dma->pfn_list here, because
> in the pre-copy phase, on the first bitmap query, the IOMMU backed device might pin
> pages using the external API - with that, iommu->pinned_page_dirty_scope will
> get updated to 'true', which means that during the next iteration only pages
> pinned by the external API are reported.
>
OK, I previously thought vfio_pin_pages for an IOMMU backed device was meant to
mark pages dirty after it has had write access to them. It looks like your
intention here is to presume pinned pages are dirty, so you have to re-fill
them until they are unpinned.
Maybe you can leave it as is, and we can add a mark-dirty interface later for
the purpose I described above (mark dirty after write access).

Thanks
Yan
Yan Zhao March 30, 2020, 3:24 a.m. UTC | #10
On Fri, Mar 27, 2020 at 01:28:13PM +0800, Kirti Wankhede wrote:
> Hit send button little early.
> 
>  >
>  > I checked v12, it's not like what I said.
>  > In v12, bitmaps are generated per vfio_dma, and combination of the
>  > bitmaps are required in order to generate a big bitmap suiting for dirty
>  > query. It can cause problem when offset not aligning.
>  > But what I propose here is to generate an rb tree orthogonal to the tree
>  > of vfio_dma.
>  >
>  > as to CPU cycles saving, I don't think iterating/translating page by page
>  > would achieve that purpose.
>  >
> 
> Instead of creating one extra rb tree for dirty pages tracking in v10 
> tried to use dma->pfn_list itself, we tried changes in v10, v11 and v12, 
> latest version is evolved version with best possible approach after 
> discussion. Probably, go through v11 as well.
> https://patchwork.kernel.org/patch/11298335/
>
I'm not sure why all those previous implementations are bound to
vfio_dma. With vIOMMU on, in most cases a vfio_dma covers only a single page,
so are we generating a one-byte bitmap for a single page in each vfio_dma?
Is it possible to create one extra rb tree to keep dirty ranges, and
one fixed-length kernel bitmap whose content is generated on query,
serving as a bounce buffer for copy_to_user?

> 
> On 3/27/2020 6:00 AM, Yan Zhao wrote:
> > On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:
> >>
> >>
> >> On 3/25/2020 7:41 AM, Yan Zhao wrote:
> >>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:
> >>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> >>>> - Start dirty pages tracking while migration is active
> >>>> - Stop dirty pages tracking.
> >>>> - Get dirty pages bitmap. Its user space application's responsibility to
> >>>>     copy content of dirty pages from source to destination during migration.
> >>>>
> >>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> >>>> structure. Bitmap size is calculated considering smallest supported page
> >>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> >>>>
> >>>> Bitmap is populated for already pinned pages when bitmap is allocated for
> >>>> a vfio_dma with the smallest supported page size. Update bitmap from
> >>>> pinning functions when tracking is enabled. When user application queries
> >>>> bitmap, check if requested page size is same as page size used to
> >>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> >>>> error.
> >>>>
> >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >>>> ---
> >>>>    drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
> >>>>    1 file changed, 260 insertions(+), 6 deletions(-)
> >>>>
> >>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >>>> index 70aeab921d0f..874a1a7ae925 100644
> >>>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
> >>>>    	unsigned int		dma_avail;
> >>>>    	bool			v2;
> >>>>    	bool			nesting;
> >>>> +	bool			dirty_page_tracking;
> >>>>    };
> >>>>    
> >>>>    struct vfio_domain {
> >>>> @@ -91,6 +92,7 @@ struct vfio_dma {
> >>>>    	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> >>>>    	struct task_struct	*task;
> >>>>    	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> >>>> +	unsigned long		*bitmap;
> >>>>    };
> >>>>    
> >>>>    struct vfio_group {
> >>>> @@ -125,7 +127,21 @@ struct vfio_regions {
> >>>>    #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >>>>    					(!list_empty(&iommu->domain_list))
> >>>>    
> >>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> >>>> +
> >>>> +/*
> >>>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> >>>> + * further casts to signed integer for unaligned multi-bit operation,
> >>>> + * __bitmap_set().
> >>>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> >>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> >>>> + * system.
> >>>> + */
> >>>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> >>>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> >>>> +
> >>>>    static int put_pfn(unsigned long pfn, int prot);
> >>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> >>>>    
> >>>>    /*
> >>>>     * This code handles mapping and unmapping of user data buffers
> >>>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> >>>>    	rb_erase(&old->node, &iommu->dma_list);
> >>>>    }
> >>>>    
> >>>> +
> >>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> >>>> +{
> >>>> +	uint64_t npages = dma->size / pgsize;
> >>>> +
If pgsize > dma->size, npages = 0.
Wouldn't that cause a problem?


> >>>> +	if (npages > DIRTY_BITMAP_PAGES_MAX)
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> >>>> +	if (!dma->bitmap)
> >>>> +		return -ENOMEM;
> >>>> +
> >>>> +	return 0;
> >>>> +}
> >>>> +
> >>>> +static void vfio_dma_bitmap_free(struct vfio_dma *dma)
> >>>> +{
> >>>> +	kfree(dma->bitmap);
> >>>> +	dma->bitmap = NULL;
> >>>> +}
> >>>> +
> >>>> +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
> >>>> +{
> >>>> +	struct rb_node *p;
> >>>> +
> >>>> +	if (RB_EMPTY_ROOT(&dma->pfn_list))
> >>>> +		return;
> >>>> +
> >>>> +	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> >>>> +		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
> >>>> +
> >>>> +		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
> >>>> +	}
> >>>> +}
> >>>> +
> >>>> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
> >>>> +{
> >>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >>>> +
> >>>> +	for (; n; n = rb_next(n)) {
> >>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >>>> +		int ret;
> >>>> +
> >>>> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> >>>> +		if (ret) {
> >>>> +			struct rb_node *p = rb_prev(n);
> >>>> +
> >>>> +			for (; p; p = rb_prev(p)) {
> >>>> +				struct vfio_dma *dma = rb_entry(n,
> >>>> +							struct vfio_dma, node);
> >>>> +
> >>>> +				vfio_dma_bitmap_free(dma);
> >>>> +			}
> >>>> +			return ret;
> >>>> +		}
> >>>> +		vfio_dma_populate_bitmap(dma, pgsize);
> >>>> +	}
> >>>> +	return 0;
> >>>> +}
> >>>> +
> >>>> +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
> >>>> +{
> >>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >>>> +
> >>>> +	for (; n; n = rb_next(n)) {
> >>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >>>> +
> >>>> +		vfio_dma_bitmap_free(dma);
> >>>> +	}
> >>>> +}
> >>>> +
> >>>>    /*
> >>>>     * Helper Functions for host iova-pfn list
> >>>>     */
> >>>> @@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>>>    			vfio_unpin_page_external(dma, iova, do_accounting);
> >>>>    			goto pin_unwind;
> >>>>    		}
> >>>> +
> >>>> +		if (iommu->dirty_page_tracking) {
> >>>> +			unsigned long pgshift =
> >>>> +					 __ffs(vfio_pgsize_bitmap(iommu));
> >>>> +
> >>>> +			/*
> >>>> +			 * Bitmap populated with the smallest supported page
> >>>> +			 * size
> >>>> +			 */
> >>>> +			bitmap_set(dma->bitmap,
> >>>> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> >>>> +		}
> >>>>    	}
> >>>>    
> >>>>    	ret = i;
> >>>> @@ -801,6 +900,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> >>>>    	vfio_unmap_unpin(iommu, dma, true);
> >>>>    	vfio_unlink_dma(iommu, dma);
> >>>>    	put_task_struct(dma->task);
> >>>> +	vfio_dma_bitmap_free(dma);
> >>>>    	kfree(dma);
> >>>>    	iommu->dma_avail++;
> >>>>    }
> >>>> @@ -831,6 +931,57 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> >>>>    	return bitmap;
> >>>>    }
> >>>>    
> >>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> >>>> +				  size_t size, uint64_t pgsize,
> >>>> +				  u64 __user *bitmap)
> >>>> +{
> >>>> +	struct vfio_dma *dma;
> >>>> +	unsigned long pgshift = __ffs(pgsize);
> >>>> +	unsigned int npages, bitmap_size;
> >>>> +
> >>>> +	dma = vfio_find_dma(iommu, iova, 1);
> >>>> +
> >>>> +	if (!dma)
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	if (dma->iova != iova || dma->size != size)
> >>>> +		return -EINVAL;
> >>>> +
> >>> Still don't sure if it's a good practice.
> >>> I saw the qemu implementation.
> >>> Qemu just iterates the whole IOVA address space,
> >>> It needs to find IOTLB entry for an IOVA
> >>> (1) if it can find an IOTLB for an IOVA, do the DIRTY_PAGES IOCTL and
> >>> increment IOVA by (iotlb.addr_mask + 1)
> >>>
> >>> (2) if no existing IOTLB found, the imrc->translate needs to go searching shadow
> >>> page table to try to generate one.
> >>> if it still fails,(most probably case, as IOMMU only maps a small part in its address
> >>> space).  increment IOVA by 1 page.
> >>>
> >>> So, if the address space width is 39bit, and if there's only one page
> >>> mapped, you still have to translate IOVA for around 2^27 times in each
> >>> query. Isn't it too inefficient?
> >>>
> >>
> >> This is Qemu side implementation, let discuss it on QEMU patches.
> >>
> > But kernel has to support it first, right?
> > 
> >>> So, IMHO, why we could not just save an rb tree specific for dirty pages, then generate
> >>> a bitmap for each query?
> >>
> >> This is looping back to implentation in v10 - v12 version. There are
> >> problems discussed during v10 to v12 version of patches with this approach.
> >> - populating dirty bitmap at the time of query will add more CPU cycles.
> >> - If we save these CPU cyles means dirty pages need to be tracked when
> >> they are pinned or dirtied by CPU, that is, inttoduced per vfio_dma
> >> bitmap. If ranges are not vfio_dma aligned, then copying bitmap to user
> >> space becomes complicated and unefficient.
> >>
> >> So we decided to go with the approach implemented here.
> > 
> > I checked v12, it's not like what I said.
> > In v12, bitmaps are generated per vfio_dma, and combination of the
> > bitmaps are required in order to generate a big bitmap suiting for dirty
> > query. It can cause problem when offset not aligning.
> > But what I propose here is to generate an rb tree orthogonal to the tree
> > of vfio_dma.
> > 
> > as to CPU cycles saving, I don't think iterating/translating page by page
> > would achieve that purpose.
> > 
> > 
> > 
> >>>
> >>>> +	npages = dma->size >> pgshift;
> >>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> >>>> +
> >>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
> >>>> +	if (dma->iommu_mapped)
> >>>> +		bitmap_set(dma->bitmap, 0, npages);
> >>>> +
> >>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> >>>> +		return -EFAULT;
> >>>> +
> >>>> +	/*
> >>>> +	 * Re-populate bitmap to include all pinned pages which are considered
> >>>> +	 * as dirty but exclude pages which are unpinned and pages which are
> >>>> +	 * marked dirty by vfio_dma_rw()
> >>>> +	 */
> >>>> +	bitmap_clear(dma->bitmap, 0, npages);
> >>>> +	vfio_dma_populate_bitmap(dma, pgsize);
> >>> will this also repopulate bitmap for pinned pages set by pass-through devices in
> >>> patch 07 ?
> >>>
> >>
> >> If pass through device's driver pins pages using vfio_pin_pages and all
> >> devices in the group pins pages through vfio_pin_pages, then
> >> iommu->pinned_page_dirty_scope is set true, then bitmap is repolutated.
> >>
> >>
> > pass-through devices already have all guest memory pinned, it would have
> > no reason to call vfio_pin_pages if not attempting to mark page dirty.
> > Then if it calls vfio_pin_pages, it means "the pages are accessed, please
> > mark them dirty, feel free to clean it when you get it",
> > not "the pages will be accesses, please mark them dirty continuously"
> > 
> > Thanks
> > Yan
> > 
> >>
> >>>
> >>>> +	return 0;
> >>>> +}
> >>>> +
> >>>> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> >>>> +{
> >>>> +	uint64_t bsize;
> >>>> +
> >>>> +	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX))
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	bsize = DIRTY_BITMAP_BYTES(npages);
> >>>> +
> >>>> +	if (bitmap_size < bsize)
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	return 0;
> >>>> +}
> >>>> +
> >>>>    static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>>>    			     struct vfio_iommu_type1_dma_unmap *unmap)
> >>>>    {
> >>>> @@ -1038,16 +1189,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>>>    	unsigned long vaddr = map->vaddr;
> >>>>    	size_t size = map->size;
> >>>>    	int ret = 0, prot = 0;
> >>>> -	uint64_t mask;
> >>>> +	uint64_t pgsize;
> >>>>    	struct vfio_dma *dma;
> >>>>    
> >>>>    	/* Verify that none of our __u64 fields overflow */
> >>>>    	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> >>>>    		return -EINVAL;
> >>>>    
> >>>> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> >>>> +	pgsize = (uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
> >>>>    
> >>>> -	WARN_ON(mask & PAGE_MASK);
> >>>> +	WARN_ON((pgsize - 1) & PAGE_MASK);
> >>>>    
> >>>>    	/* READ/WRITE from device perspective */
> >>>>    	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
> >>>> @@ -1055,7 +1206,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>>>    	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
> >>>>    		prot |= IOMMU_READ;
> >>>>    
> >>>> -	if (!prot || !size || (size | iova | vaddr) & mask)
> >>>> +	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1))
> >>>>    		return -EINVAL;
> >>>>    
> >>>>    	/* Don't allow IOVA or virtual address wrap */
> >>>> @@ -1130,6 +1281,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>>>    	else
> >>>>    		ret = vfio_pin_map_dma(iommu, dma, size);
> >>>>    
> >>>> +	if (!ret && iommu->dirty_page_tracking) {
> >>>> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> >>>> +		if (ret)
> >>>> +			vfio_remove_dma(iommu, dma);
> >>>> +	}
> >>>> +
> >>>>    out_unlock:
> >>>>    	mutex_unlock(&iommu->lock);
> >>>>    	return ret;
> >>>> @@ -2278,6 +2435,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >>>>    
> >>>>    		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> >>>>    			-EFAULT : 0;
> >>>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> >>>> +		struct vfio_iommu_type1_dirty_bitmap dirty;
> >>>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> >>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> >>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> >>>> +		int ret = 0;
> >>>> +
> >>>> +		if (!iommu->v2)
> >>>> +			return -EACCES;
> >>>> +
> >>>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> >>>> +				    flags);
> >>>> +
> >>>> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> >>>> +			return -EFAULT;
> >>>> +
> >>>> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> >>>> +			return -EINVAL;
> >>>> +
> >>>> +		/* only one flag should be set at a time */
> >>>> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> >>>> +			return -EINVAL;
> >>>> +
> >>>> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> >>>> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> >>>> +
> >>>> +			mutex_lock(&iommu->lock);
> >>>> +			if (!iommu->dirty_page_tracking) {
> >>>> +				ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
> >>>> +				if (!ret)
> >>>> +					iommu->dirty_page_tracking = true;
> >>>> +			}
> >>>> +			mutex_unlock(&iommu->lock);
> >>>> +			return ret;
> >>>> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> >>>> +			mutex_lock(&iommu->lock);
> >>>> +			if (iommu->dirty_page_tracking) {
> >>>> +				iommu->dirty_page_tracking = false;
> >>>> +				vfio_dma_bitmap_free_all(iommu);
> >>>> +			}
> >>>> +			mutex_unlock(&iommu->lock);
> >>>> +			return 0;
> >>>> +		} else if (dirty.flags &
> >>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> >>>> +			struct vfio_iommu_type1_dirty_bitmap_get range;
> >>>> +			unsigned long pgshift;
> >>>> +			size_t data_size = dirty.argsz - minsz;
> >>>> +			uint64_t iommu_pgsize =
> >>>> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> >>>> +
> >>>> +			if (!data_size || data_size < sizeof(range))
> >>>> +				return -EINVAL;
> >>>> +
> >>>> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> >>>> +					   sizeof(range)))
> >>>> +				return -EFAULT;
> >>>> +
> >>>> +			/* allow only smallest supported pgsize */
> >>>> +			if (range.bitmap.pgsize != iommu_pgsize)
> >>>> +				return -EINVAL;
> >>>> +			if (range.iova & (iommu_pgsize - 1))
> >>>> +				return -EINVAL;
> >>>> +			if (!range.size || range.size & (iommu_pgsize - 1))
> >>>> +				return -EINVAL;
> >>>> +			if (range.iova + range.size < range.iova)
> >>>> +				return -EINVAL;
> >>>> +			if (!access_ok((void __user *)range.bitmap.data,
> >>>> +				       range.bitmap.size))
> >>>> +				return -EINVAL;
> >>>> +
> >>>> +			pgshift = __ffs(range.bitmap.pgsize);
> >>>> +			ret = verify_bitmap_size(range.size >> pgshift,
> >>>> +						 range.bitmap.size);
> >>>> +			if (ret)
> >>>> +				return ret;
> >>>> +
> >>>> +			mutex_lock(&iommu->lock);
> >>>> +			if (iommu->dirty_page_tracking)
> >>>> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> >>>> +						range.size, range.bitmap.pgsize,
> >>>> +						range.bitmap.data);
> >>>> +			else
> >>>> +				ret = -EINVAL;
> >>>> +			mutex_unlock(&iommu->lock);
> >>>> +
> >>>> +			return ret;
> >>>> +		}
> >>>>    	}
> >>>>    
> >>>>    	return -ENOTTY;
> >>>> @@ -2345,10 +2589,20 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> >>>>    
> >>>>    	vaddr = dma->vaddr + offset;
> >>>>    
> >>>> -	if (write)
> >>>> +	if (write) {
> >>>>    		*copied = __copy_to_user((void __user *)vaddr, data,
> >>>>    					 count) ? 0 : count;
> >>>> -	else
> >>>> +		if (*copied && iommu->dirty_page_tracking) {
> >>>> +			unsigned long pgshift =
> >>>> +				__ffs(vfio_pgsize_bitmap(iommu));
> >>>> +			/*
> >>>> +			 * Bitmap populated with the smallest supported page
> >>>> +			 * size
> >>>> +			 */
> >>>> +			bitmap_set(dma->bitmap, offset >> pgshift,
> >>>> +				   *copied >> pgshift);
> >>>> +		}
> >>>> +	} else
> >>>>    		*copied = __copy_from_user(data, (void __user *)vaddr,
> >>>>    					   count) ? 0 : count;
> >>>>    	if (kthread)
> >>>> -- 
> >>>> 2.7.0
> >>>>
Kirti Wankhede March 30, 2020, 1:49 p.m. UTC | #11
On 3/30/2020 8:54 AM, Yan Zhao wrote:
> On Fri, Mar 27, 2020 at 01:28:13PM +0800, Kirti Wankhede wrote:
>> Hit send button little early.
>>
>>   >
>>   > I checked v12, it's not like what I said.
>>   > In v12, bitmaps are generated per vfio_dma, and combination of the
>>   > bitmaps are required in order to generate a big bitmap suiting for dirty
>>   > query. It can cause problem when offset not aligning.
>>   > But what I propose here is to generate an rb tree orthogonal to the tree
>>   > of vfio_dma.
>>   >
>>   > as to CPU cycles saving, I don't think iterating/translating page by page
>>   > would achieve that purpose.
>>   >
>>
>> Instead of creating one extra rb tree for dirty pages tracking in v10
>> tried to use dma->pfn_list itself, we tried changes in v10, v11 and v12,
>> latest version is evolved version with best possible approach after
>> discussion. Probably, go through v11 as well.
>> https://patchwork.kernel.org/patch/11298335/
>>
> I'm not sure why all those previous implementations are bound to
> vfio_dma. for vIOMMU on, in most cases, a vfio_dma is only for a page,
> so generating a one-byte bitmap for a single page in each vfio_dma ?
> is it possible to creating one extra rb tree to keep dirty ranges, and
> one fixed length kernel bitmap whose content is generated on query,
> serving as a bouncing buffer for copy_to_user
> 

One fixed length? What should the fixed value be? Then isn't it better to
fix the size to dma->size?

This is also to prevent a DoS attack, since a user space application can
query a very large range.

>>
>> On 3/27/2020 6:00 AM, Yan Zhao wrote:
>>> On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:
>>>>
>>>>
>>>> On 3/25/2020 7:41 AM, Yan Zhao wrote:
>>>>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:
>>>>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
>>>>>> - Start dirty pages tracking while migration is active
>>>>>> - Stop dirty pages tracking.
>>>>>> - Get dirty pages bitmap. Its user space application's responsibility to
>>>>>>      copy content of dirty pages from source to destination during migration.
>>>>>>
>>>>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
>>>>>> structure. Bitmap size is calculated considering smallest supported page
>>>>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
>>>>>>
>>>>>> Bitmap is populated for already pinned pages when bitmap is allocated for
>>>>>> a vfio_dma with the smallest supported page size. Update bitmap from
>>>>>> pinning functions when tracking is enabled. When user application queries
>>>>>> bitmap, check if requested page size is same as page size used to
>>>>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
>>>>>> error.
>>>>>>
>>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>>>>>> ---
>>>>>>     drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
>>>>>>     1 file changed, 260 insertions(+), 6 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>>>>> index 70aeab921d0f..874a1a7ae925 100644
>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
>>>>>>     	unsigned int		dma_avail;
>>>>>>     	bool			v2;
>>>>>>     	bool			nesting;
>>>>>> +	bool			dirty_page_tracking;
>>>>>>     };
>>>>>>     
>>>>>>     struct vfio_domain {
>>>>>> @@ -91,6 +92,7 @@ struct vfio_dma {
>>>>>>     	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
>>>>>>     	struct task_struct	*task;
>>>>>>     	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
>>>>>> +	unsigned long		*bitmap;
>>>>>>     };
>>>>>>     
>>>>>>     struct vfio_group {
>>>>>> @@ -125,7 +127,21 @@ struct vfio_regions {
>>>>>>     #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>>>>>>     					(!list_empty(&iommu->domain_list))
>>>>>>     
>>>>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
>>>>>> +
>>>>>> +/*
>>>>>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
>>>>>> + * further casts to signed integer for unaligned multi-bit operation,
>>>>>> + * __bitmap_set().
>>>>>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
>>>>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
>>>>>> + * system.
>>>>>> + */
>>>>>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
>>>>>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
>>>>>> +
>>>>>>     static int put_pfn(unsigned long pfn, int prot);
>>>>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
>>>>>>     
>>>>>>     /*
>>>>>>      * This code handles mapping and unmapping of user data buffers
>>>>>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>>>>>>     	rb_erase(&old->node, &iommu->dma_list);
>>>>>>     }
>>>>>>     
>>>>>> +
>>>>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
>>>>>> +{
>>>>>> +	uint64_t npages = dma->size / pgsize;
>>>>>> +
> If pgsize > dma->size, npages = 0.
> wouldn't it cause problem?
> 

This patch set supports a bitmap for the smallest supported page size,
i.e. PAGE_SIZE. vfio_dma_do_map() validates dma->size accordingly, so this
case will not happen.

Thanks,
Kirti
Alex Williamson March 30, 2020, 8:47 p.m. UTC | #12
On Sun, 29 Mar 2020 22:07:08 -0400
Yan Zhao <yan.y.zhao@intel.com> wrote:

> On Fri, Mar 27, 2020 at 01:07:38PM +0800, Kirti Wankhede wrote:
> > 
> > 
> > On 3/27/2020 6:00 AM, Yan Zhao wrote:  
> > > On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:  
> > >>
> > >>
> > >> On 3/25/2020 7:41 AM, Yan Zhao wrote:  
> > >>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:  
> > >>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > >>>> - Start dirty pages tracking while migration is active
> > >>>> - Stop dirty pages tracking.
> > >>>> - Get dirty pages bitmap. Its user space application's responsibility to
> > >>>>     copy content of dirty pages from source to destination during migration.
> > >>>>
> > >>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > >>>> structure. Bitmap size is calculated considering smallest supported page
> > >>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > >>>>
> > >>>> Bitmap is populated for already pinned pages when bitmap is allocated for
> > >>>> a vfio_dma with the smallest supported page size. Update bitmap from
> > >>>> pinning functions when tracking is enabled. When user application queries
> > >>>> bitmap, check if requested page size is same as page size used to
> > >>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > >>>> error.
> > >>>>
> > >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > >>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > >>>> ---
> > >>>>    drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
> > >>>>    1 file changed, 260 insertions(+), 6 deletions(-)
> > >>>>
> > >>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > >>>> index 70aeab921d0f..874a1a7ae925 100644
> > >>>> --- a/drivers/vfio/vfio_iommu_type1.c
> > >>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > >>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
> > >>>>    	unsigned int		dma_avail;
> > >>>>    	bool			v2;
> > >>>>    	bool			nesting;
> > >>>> +	bool			dirty_page_tracking;
> > >>>>    };
> > >>>>    
> > >>>>    struct vfio_domain {
> > >>>> @@ -91,6 +92,7 @@ struct vfio_dma {
> > >>>>    	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> > >>>>    	struct task_struct	*task;
> > >>>>    	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > >>>> +	unsigned long		*bitmap;
> > >>>>    };
> > >>>>    
> > >>>>    struct vfio_group {
> > >>>> @@ -125,7 +127,21 @@ struct vfio_regions {
> > >>>>    #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > >>>>    					(!list_empty(&iommu->domain_list))
> > >>>>    
> > >>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > >>>> +
> > >>>> +/*
> > >>>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> > >>>> + * further casts to signed integer for unaligned multi-bit operation,
> > >>>> + * __bitmap_set().
> > >>>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> > >>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> > >>>> + * system.
> > >>>> + */
> > >>>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> > >>>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> > >>>> +
> > >>>>    static int put_pfn(unsigned long pfn, int prot);
> > >>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> > >>>>    
> > >>>>    /*
> > >>>>     * This code handles mapping and unmapping of user data buffers
> > >>>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> > >>>>    	rb_erase(&old->node, &iommu->dma_list);
> > >>>>    }
> > >>>>    
> > >>>> +
> > >>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> > >>>> +{
> > >>>> +	uint64_t npages = dma->size / pgsize;
> > >>>> +
> > >>>> +	if (npages > DIRTY_BITMAP_PAGES_MAX)
> > >>>> +		return -EINVAL;
> > >>>> +
> > >>>> +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> > >>>> +	if (!dma->bitmap)
> > >>>> +		return -ENOMEM;
> > >>>> +
> > >>>> +	return 0;
> > >>>> +}
> > >>>> +
> > >>>> +static void vfio_dma_bitmap_free(struct vfio_dma *dma)
> > >>>> +{
> > >>>> +	kfree(dma->bitmap);
> > >>>> +	dma->bitmap = NULL;
> > >>>> +}
> > >>>> +
> > >>>> +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
> > >>>> +{
> > >>>> +	struct rb_node *p;
> > >>>> +
> > >>>> +	if (RB_EMPTY_ROOT(&dma->pfn_list))
> > >>>> +		return;
> > >>>> +
> > >>>> +	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> > >>>> +		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
> > >>>> +
> > >>>> +		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
> > >>>> +	}
> > >>>> +}
> > >>>> +
> > >>>> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
> > >>>> +{
> > >>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
> > >>>> +
> > >>>> +	for (; n; n = rb_next(n)) {
> > >>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > >>>> +		int ret;
> > >>>> +
> > >>>> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> > >>>> +		if (ret) {
> > >>>> +			struct rb_node *p = rb_prev(n);
> > >>>> +
> > >>>> +			for (; p; p = rb_prev(p)) {
> > >>>> +				struct vfio_dma *dma = rb_entry(n,
> > >>>> +							struct vfio_dma, node);
> > >>>> +
> > >>>> +				vfio_dma_bitmap_free(dma);
> > >>>> +			}
> > >>>> +			return ret;
> > >>>> +		}
> > >>>> +		vfio_dma_populate_bitmap(dma, pgsize);
> > >>>> +	}
> > >>>> +	return 0;
> > >>>> +}
> > >>>> +
> > >>>> +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
> > >>>> +{
> > >>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
> > >>>> +
> > >>>> +	for (; n; n = rb_next(n)) {
> > >>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > >>>> +
> > >>>> +		vfio_dma_bitmap_free(dma);
> > >>>> +	}
> > >>>> +}
> > >>>> +
> > >>>>    /*
> > >>>>     * Helper Functions for host iova-pfn list
> > >>>>     */
> > >>>> @@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> > >>>>    			vfio_unpin_page_external(dma, iova, do_accounting);
> > >>>>    			goto pin_unwind;
> > >>>>    		}
> > >>>> +
> > >>>> +		if (iommu->dirty_page_tracking) {
> > >>>> +			unsigned long pgshift =
> > >>>> +					 __ffs(vfio_pgsize_bitmap(iommu));
> > >>>> +
> > >>>> +			/*
> > >>>> +			 * Bitmap populated with the smallest supported page
> > >>>> +			 * size
> > >>>> +			 */
> > >>>> +			bitmap_set(dma->bitmap,
> > >>>> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> > >>>> +		}
> > >>>>    	}
> > >>>>    
> > >>>>    	ret = i;
> > >>>> @@ -801,6 +900,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> > >>>>    	vfio_unmap_unpin(iommu, dma, true);
> > >>>>    	vfio_unlink_dma(iommu, dma);
> > >>>>    	put_task_struct(dma->task);
> > >>>> +	vfio_dma_bitmap_free(dma);
> > >>>>    	kfree(dma);
> > >>>>    	iommu->dma_avail++;
> > >>>>    }
> > >>>> @@ -831,6 +931,57 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> > >>>>    	return bitmap;
> > >>>>    }
> > >>>>    
> > >>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > >>>> +				  size_t size, uint64_t pgsize,
> > >>>> +				  u64 __user *bitmap)
> > >>>> +{
> > >>>> +	struct vfio_dma *dma;
> > >>>> +	unsigned long pgshift = __ffs(pgsize);
> > >>>> +	unsigned int npages, bitmap_size;
> > >>>> +
> > >>>> +	dma = vfio_find_dma(iommu, iova, 1);
> > >>>> +
> > >>>> +	if (!dma)
> > >>>> +		return -EINVAL;
> > >>>> +
> > >>>> +	if (dma->iova != iova || dma->size != size)
> > >>>> +		return -EINVAL;
> > >>>> +  
> > >>> Still don't sure if it's a good practice.
> > >>> I saw the qemu implementation.
> > >>> Qemu just iterates the whole IOVA address space,
> > >>> It needs to find IOTLB entry for an IOVA
> > >>> (1) if it can find an IOTLB for an IOVA, do the DIRTY_PAGES IOCTL and
> > >>> increment IOVA by (iotlb.addr_mask + 1)
> > >>>
> > >>> (2) if no existing IOTLB found, the imrc->translate needs to go searching shadow
> > >>> page table to try to generate one.
> > >>> if it still fails,(most probably case, as IOMMU only maps a small part in its address
> > >>> space).  increment IOVA by 1 page.
> > >>>
> > >>> So, if the address space width is 39bit, and if there's only one page
> > >>> mapped, you still have to translate IOVA for around 2^27 times in each
> > >>> query. Isn't it too inefficient?
> > >>>  
> > >>
> > >> This is Qemu side implementation, let discuss it on QEMU patches.
> > >>  
> > > But kernel has to support it first, right?
> > >   
> > 
> > Shadow page table will be in QEMU (?), as long as we support map and   
> Yes, shadow page table in QEMU.
> 
> > unmap in kernel space, QEMU part of changes should work. That shouldn't 
> > block kernel side patches.  
> Not sure whether this assertion is right:)
> I just want to raise the issue out.

And I think we need to make sure that we have a path to an efficient
userspace implementation.  Walking a shadow page table to unmap and
collect individual dirty bits is clearly better than blindly walking
every page of a 39 bit address space, but it would be an obvious
improvement if the QEMU code could zap entire levels at once.

The issues we raised before about combining multiple bitmaps are not
insurmountable, they're just complicated and potentially something that
we can defer for the initial implementation.  We can change the
implementation of the dirty bitmap without affecting the user, but we
would need to use another flag bit of the IOMMU_GET_INFO ioctl or expose
it via the CHECK_EXTENSION ioctl to indicate multi-mapping dirty bitmap
support.  In fact, the flags field on IOMMU_GET_INFO so far only
describes fields returned by the ioctl, not support for other ioctls.
Would the CHECK_EXTENSION ioctl be a better choice for both exposing
this initial support as well as a v2 when we have multi-mapping?

> > >>> So, IMHO, why we could not just save an rb tree specific for dirty pages, then generate
> > >>> a bitmap for each query?  

I'm curious to know how this might work, I was strongly encouraging
that we must have a bitmap mechanism that supports copy_to_user(),
otherwise we don't have an efficient way to push the bits to the user.
We'd need to copy_from_user() a chunk of their bitmap, set bits, then
push it back with copy_to_user().  If you're thinking of an rb-tree, do
we have a node per dirty page?  The overhead for that seems excessive.
I think we could support multi-mapping dirty bits using __bitmap_and(),
__bitmap_or(), and __bitmap_shift_left/right() to extract the unaligned
portion of the bitmap, or it into a previous segment, then shift the
remainder of the bitmap so that we could use copy_to_user() with it.

> > > >> This is looping back to the implementation in the v10 - v12 versions. There are
> > > >> problems discussed during the v10 to v12 versions of the patches with this approach.
> > > >> - populating the dirty bitmap at the time of query will add more CPU cycles.
> > > >> - If we save these CPU cycles, it means dirty pages need to be tracked when
> > > >> they are pinned or dirtied by CPU, that is, a per vfio_dma bitmap is
> > > >> introduced. If ranges are not vfio_dma aligned, then copying the bitmap to user
> > > >> space becomes complicated and inefficient.
> > >>
> > >> So we decided to go with the approach implemented here.  
> > > 
> > > I checked v12, it's not like what I said.
> > > In v12, bitmaps are generated per vfio_dma, and combination of the
> > > bitmaps are required in order to generate a big bitmap suiting for dirty
> > > query. It can cause problem when offset not aligning.
> > > But what I propose here is to generate an rb tree orthogonal to the tree
> > > of vfio_dma.
> > > 
> > > as to CPU cycles saving, I don't think iterating/translating page by page
> > > would achieve that purpose.
> > > 
> > >   
> > 
> > 
> >   
> > >   
> > >>>  
> > >>>> +	npages = dma->size >> pgshift;
> > >>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > >>>> +
> > >>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
> > >>>> +	if (dma->iommu_mapped)
> > >>>> +		bitmap_set(dma->bitmap, 0, npages);
> > >>>> +
> > >>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> > >>>> +		return -EFAULT;
> > >>>> +
> > >>>> +	/*
> > >>>> +	 * Re-populate bitmap to include all pinned pages which are considered
> > >>>> +	 * as dirty but exclude pages which are unpinned and pages which are
> > >>>> +	 * marked dirty by vfio_dma_rw()
> > >>>> +	 */
> > >>>> +	bitmap_clear(dma->bitmap, 0, npages);
> > >>>> +	vfio_dma_populate_bitmap(dma, pgsize);  
> > >>> will this also repopulate bitmap for pinned pages set by pass-through devices in
> > >>> patch 07 ?
> > >>>  
> > >>
> > > >> If a pass through device's driver pins pages using vfio_pin_pages and all
> > > >> devices in the group pin pages through vfio_pin_pages, then
> > > >> iommu->pinned_page_dirty_scope is set true, and then the bitmap is repopulated.
> > >>
> > >>  
> > > pass-through devices already have all guest memory pinned, it would have
> > > no reason to call vfio_pin_pages if not attempting to mark page dirty.
> > > Then if it calls vfio_pin_pages, it means "the pages are accessed, please
> > > mark them dirty, feel free to clean it when you get it",  
> > 
> > if you see vfio_dma_populate_bitmap() function, then if vfio_pin_pages 
> > is called, dma->pfn_list rb_tree will be non-empty and bitmap gets 
> > populates as per pinned pages.
> >   
> > > not "the pages will be accesses, please mark them dirty continuously"
> > >  
> > 
> > if vfio_pin_pages is not called, dma->pfn_list is empty, then it returns 
> > early.
> > > Suppose there are 2 devices in the group, one an IOMMU backed device
> > > and the other a non-IOMMU mdev device. In that case, all pages are pinned,
> > > iommu->pinned_page_dirty_scope is false, but dma->pfn_list is also not
> > > empty since the non-IOMMU backed device pins pages using the external API. We
> > > still have to populate the bitmap according to dma->pfn_list here, because
> > > in the pre-copy phase, on the first bitmap query, the IOMMU backed device might pin
> > > pages using the external API - with that, iommu->pinned_page_dirty_scope will
> > > get updated to 'true', which means that during the next iteration only pages
> > > pinned by the external API are reported.
> >  
> ok, I previously thought vfio_pin_pages for an IOMMU backed device is to set
> dirty pages after it has write access to them. It looks like your intention here
> is to presume pinned pages are dirty, so you have to re-fill them until they
> are unpinned.
> Maybe you can leave it as is, and we can add a mark dirty interface later for
> the purpose I described above (mark dirty after write access).

Yes, just as with non-iommu backed devices, pinned pages are assumed to
be continuously dirtied.  A pin followed by unpin could be used by a
driver to indicate a transient dirty page, but I think we'd want to
think about a lower overhead interface when we have such a driver.
We'd essentially need vfio_dma_rw with only the portion that sets the
dirty bit on write.  Thanks,

Alex
Yan Zhao March 30, 2020, 11:49 p.m. UTC | #13
On Tue, Mar 31, 2020 at 04:47:20AM +0800, Alex Williamson wrote:
> On Sun, 29 Mar 2020 22:07:08 -0400
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> > On Fri, Mar 27, 2020 at 01:07:38PM +0800, Kirti Wankhede wrote:
> > > 
> > > 
> > > On 3/27/2020 6:00 AM, Yan Zhao wrote:  
> > > > On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:  
> > > >>
> > > >>
> > > >> On 3/25/2020 7:41 AM, Yan Zhao wrote:  
> > > >>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:  
> > > >>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > > >>>> - Start dirty pages tracking while migration is active
> > > >>>> - Stop dirty pages tracking.
> > > >>>> - Get dirty pages bitmap. Its user space application's responsibility to
> > > >>>>     copy content of dirty pages from source to destination during migration.
> > > >>>>
> > > >>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > > >>>> structure. Bitmap size is calculated considering smallest supported page
> > > >>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > > >>>>
> > > >>>> Bitmap is populated for already pinned pages when bitmap is allocated for
> > > >>>> a vfio_dma with the smallest supported page size. Update bitmap from
> > > >>>> pinning functions when tracking is enabled. When user application queries
> > > >>>> bitmap, check if requested page size is same as page size used to
> > > >>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > > >>>> error.
> > > >>>>
> > > >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > >>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > >>>> ---
> > > >>>>    drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
> > > >>>>    1 file changed, 260 insertions(+), 6 deletions(-)
> > > >>>>
> > > >>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > >>>> index 70aeab921d0f..874a1a7ae925 100644
> > > >>>> --- a/drivers/vfio/vfio_iommu_type1.c
> > > >>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > > >>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
> > > >>>>    	unsigned int		dma_avail;
> > > >>>>    	bool			v2;
> > > >>>>    	bool			nesting;
> > > >>>> +	bool			dirty_page_tracking;
> > > >>>>    };
> > > >>>>    
> > > >>>>    struct vfio_domain {
> > > >>>> @@ -91,6 +92,7 @@ struct vfio_dma {
> > > >>>>    	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> > > >>>>    	struct task_struct	*task;
> > > >>>>    	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > > >>>> +	unsigned long		*bitmap;
> > > >>>>    };
> > > >>>>    
> > > >>>>    struct vfio_group {
> > > >>>> @@ -125,7 +127,21 @@ struct vfio_regions {
> > > >>>>    #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > > >>>>    					(!list_empty(&iommu->domain_list))
> > > >>>>    
> > > >>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > > >>>> +
> > > >>>> +/*
> > > >>>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> > > >>>> + * further casts to signed integer for unaligned multi-bit operation,
> > > >>>> + * __bitmap_set().
> > > >>>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> > > >>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> > > >>>> + * system.
> > > >>>> + */
> > > >>>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> > > >>>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> > > >>>> +
> > > >>>>    static int put_pfn(unsigned long pfn, int prot);
> > > >>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> > > >>>>    
> > > >>>>    /*
> > > >>>>     * This code handles mapping and unmapping of user data buffers
> > > >>>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> > > >>>>    	rb_erase(&old->node, &iommu->dma_list);
> > > >>>>    }
> > > >>>>    
> > > >>>> +
> > > >>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> > > >>>> +{
> > > >>>> +	uint64_t npages = dma->size / pgsize;
> > > >>>> +
> > > >>>> +	if (npages > DIRTY_BITMAP_PAGES_MAX)
> > > >>>> +		return -EINVAL;
> > > >>>> +
> > > >>>> +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> > > >>>> +	if (!dma->bitmap)
> > > >>>> +		return -ENOMEM;
> > > >>>> +
> > > >>>> +	return 0;
> > > >>>> +}
> > > >>>> +
> > > >>>> +static void vfio_dma_bitmap_free(struct vfio_dma *dma)
> > > >>>> +{
> > > >>>> +	kfree(dma->bitmap);
> > > >>>> +	dma->bitmap = NULL;
> > > >>>> +}
> > > >>>> +
> > > >>>> +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
> > > >>>> +{
> > > >>>> +	struct rb_node *p;
> > > >>>> +
> > > >>>> +	if (RB_EMPTY_ROOT(&dma->pfn_list))
> > > >>>> +		return;
> > > >>>> +
> > > >>>> +	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> > > >>>> +		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
> > > >>>> +
> > > >>>> +		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
> > > >>>> +	}
> > > >>>> +}
> > > >>>> +
> > > >>>> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
> > > >>>> +{
> > > >>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
> > > >>>> +
> > > >>>> +	for (; n; n = rb_next(n)) {
> > > >>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > > >>>> +		int ret;
> > > >>>> +
> > > >>>> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> > > >>>> +		if (ret) {
> > > >>>> +			struct rb_node *p = rb_prev(n);
> > > >>>> +
> > > >>>> +			for (; p; p = rb_prev(p)) {
> > > >>>> +				struct vfio_dma *dma = rb_entry(n,
> > > >>>> +							struct vfio_dma, node);
> > > >>>> +
> > > >>>> +				vfio_dma_bitmap_free(dma);
> > > >>>> +			}
> > > >>>> +			return ret;
> > > >>>> +		}
> > > >>>> +		vfio_dma_populate_bitmap(dma, pgsize);
> > > >>>> +	}
> > > >>>> +	return 0;
> > > >>>> +}
> > > >>>> +
> > > >>>> +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
> > > >>>> +{
> > > >>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
> > > >>>> +
> > > >>>> +	for (; n; n = rb_next(n)) {
> > > >>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > > >>>> +
> > > >>>> +		vfio_dma_bitmap_free(dma);
> > > >>>> +	}
> > > >>>> +}
> > > >>>> +
> > > >>>>    /*
> > > >>>>     * Helper Functions for host iova-pfn list
> > > >>>>     */
> > > >>>> @@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> > > >>>>    			vfio_unpin_page_external(dma, iova, do_accounting);
> > > >>>>    			goto pin_unwind;
> > > >>>>    		}
> > > >>>> +
> > > >>>> +		if (iommu->dirty_page_tracking) {
> > > >>>> +			unsigned long pgshift =
> > > >>>> +					 __ffs(vfio_pgsize_bitmap(iommu));
> > > >>>> +
> > > >>>> +			/*
> > > >>>> +			 * Bitmap populated with the smallest supported page
> > > >>>> +			 * size
> > > >>>> +			 */
> > > >>>> +			bitmap_set(dma->bitmap,
> > > >>>> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> > > >>>> +		}
> > > >>>>    	}
> > > >>>>    
> > > >>>>    	ret = i;
> > > >>>> @@ -801,6 +900,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> > > >>>>    	vfio_unmap_unpin(iommu, dma, true);
> > > >>>>    	vfio_unlink_dma(iommu, dma);
> > > >>>>    	put_task_struct(dma->task);
> > > >>>> +	vfio_dma_bitmap_free(dma);
> > > >>>>    	kfree(dma);
> > > >>>>    	iommu->dma_avail++;
> > > >>>>    }
> > > >>>> @@ -831,6 +931,57 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> > > >>>>    	return bitmap;
> > > >>>>    }
> > > >>>>    
> > > >>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > > >>>> +				  size_t size, uint64_t pgsize,
> > > >>>> +				  u64 __user *bitmap)
> > > >>>> +{
> > > >>>> +	struct vfio_dma *dma;
> > > >>>> +	unsigned long pgshift = __ffs(pgsize);
> > > >>>> +	unsigned int npages, bitmap_size;
> > > >>>> +
> > > >>>> +	dma = vfio_find_dma(iommu, iova, 1);
> > > >>>> +
> > > >>>> +	if (!dma)
> > > >>>> +		return -EINVAL;
> > > >>>> +
> > > >>>> +	if (dma->iova != iova || dma->size != size)
> > > >>>> +		return -EINVAL;
> > > >>>> +  
> > > >>> Still don't sure if it's a good practice.
> > > >>> I saw the qemu implementation.
> > > >>> Qemu just iterates the whole IOVA address space,
> > > >>> It needs to find IOTLB entry for an IOVA
> > > >>> (1) if it can find an IOTLB for an IOVA, do the DIRTY_PAGES IOCTL and
> > > >>> increment IOVA by (iotlb.addr_mask + 1)
> > > >>>
> > > >>> (2) if no existing IOTLB found, the imrc->translate needs to go searching shadow
> > > >>> page table to try to generate one.
> > > >>> if it still fails,(most probably case, as IOMMU only maps a small part in its address
> > > >>> space).  increment IOVA by 1 page.
> > > >>>
> > > >>> So, if the address space width is 39bit, and if there's only one page
> > > >>> mapped, you still have to translate IOVA for around 2^27 times in each
> > > >>> query. Isn't it too inefficient?
> > > >>>  
> > > >>
> > > >> This is Qemu side implementation, let discuss it on QEMU patches.
> > > >>  
> > > > But kernel has to support it first, right?
> > > >   
> > > 
> > > Shadow page table will be in QEMU (?), as long as we support map and   
> > Yes, shadow page table in QEMU.
> > 
> > > unmap in kernel space, QEMU part of changes should work. That shouldn't 
> > > block kernel side patches.  
> > Not sure whether this assertion is right:)
> > I just want to raise the issue out.
> 
> And I think we need to make sure that we have a path to an efficient
> userspace implementation.  Walking a shadow page table to unmap and
> collect individual dirty bits is clearly better than blindly walking
> every page of a 39 bit address space, but it would be an obvious
> improvement if the QEMU code could zap entire levels at once.
> 
> The issues we raised before about combining multiple bitmaps are not
> insurmountable, they're just complicated and potentially something that
> we can defer for the initial implementation.  We can change the
> implementation of the dirty bitmap without affecting the user, but we
> would need to use another flag bit of the IOMMU_GET_INFO ioctl or expose
> it via the CHECK_EXTENSION ioctl to indicate multi-mapping dirty bitmap
> support.  In fact, the flags field on IOMMU_GET_INFO so far only
> describes fields returned by the ioctl, not support for other ioctls.
> Would the CHECK_EXTENSION ioctl be a better choice for both exposing
> this initial support as well as a v2 when we have multi-mapping?
>
ok.

> > > >>> So, IMHO, why we could not just save an rb tree specific for dirty pages, then generate
> > > >>> a bitmap for each query?  
> 
> I'm curious to know how this might work, I was strongly encouraging
> that we must have a bitmap mechanism that supports copy_to_user(),
> otherwise we don't have an efficient way to push the bits to the user.
> We'd need to copy_from_user() a chunk of their bitmap, set bits, then
> push it back with copy_to_user().  If you're thinking of an rb-tree, do
> we have a node per dirty page?  The overhead for that seems excessive.
hmm, maybe the kernel can allocate a fixed-length buffer that works as a
bounce buffer for copy_to_user()?
The sequence is:
1. when dirty bit tracking is turned on, allocate a fixed-length buffer, say, 64k.
2. when a query ioctl comes, search the rb tree for the queried
range, fill the fixed-length buffer chunk by chunk, and copy_to_user()
each chunk as well.
3. when dirty bit tracking is turned off, free the fixed-length buffer.

yes, rb tree takes more memory than bitmap, but it only needs to alloc
nodes for dirty page ranges. e.g. node 1 for range starting with
address A, of size 0x200000, node 2 for range starting with address B,
of size 0x1000, node 3 for range starting with address C, of size
0x7000... as long as they are not overlapping.

for a vm with huge memory, is it still worthwhile?

just an idea, for your consideration :)

> I think we could support multi-mapping dirty bits using __bitmap_and(),
> __bitmap_or(), and __bitmap_shift_left/right() to extract the unaligned
> portion of the bitmap, or it into a previous segment, then shift the
> remainder of the bitmap so that we could use copy_to_user() with it.
> 

> > > > >> This is looping back to the implementation in the v10 - v12 versions. There are
> > > > >> problems discussed during the v10 to v12 versions of the patches with this approach.
> > > > >> - populating the dirty bitmap at the time of query will add more CPU cycles.
> > > > >> - If we save these CPU cycles, it means dirty pages need to be tracked when
> > > > >> they are pinned or dirtied by CPU, that is, a per vfio_dma bitmap is
> > > > >> introduced. If ranges are not vfio_dma aligned, then copying the bitmap to user
> > > > >> space becomes complicated and inefficient.
> > > >>
> > > >> So we decided to go with the approach implemented here.  
> > > > 
> > > > I checked v12, it's not like what I said.
> > > > In v12, bitmaps are generated per vfio_dma, and combination of the
> > > > bitmaps are required in order to generate a big bitmap suiting for dirty
> > > > query. It can cause problem when offset not aligning.
> > > > But what I propose here is to generate an rb tree orthogonal to the tree
> > > > of vfio_dma.
> > > > 
> > > > as to CPU cycles saving, I don't think iterating/translating page by page
> > > > would achieve that purpose.
> > > > 
> > > >   
> > > 
> > > 
> > >   
> > > >   
> > > >>>  
> > > >>>> +	npages = dma->size >> pgshift;
> > > >>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > > >>>> +
> > > >>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
> > > >>>> +	if (dma->iommu_mapped)
> > > >>>> +		bitmap_set(dma->bitmap, 0, npages);
> > > >>>> +
> > > >>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> > > >>>> +		return -EFAULT;
> > > >>>> +
> > > >>>> +	/*
> > > >>>> +	 * Re-populate bitmap to include all pinned pages which are considered
> > > >>>> +	 * as dirty but exclude pages which are unpinned and pages which are
> > > >>>> +	 * marked dirty by vfio_dma_rw()
> > > >>>> +	 */
> > > >>>> +	bitmap_clear(dma->bitmap, 0, npages);
> > > >>>> +	vfio_dma_populate_bitmap(dma, pgsize);  
> > > >>> will this also repopulate bitmap for pinned pages set by pass-through devices in
> > > >>> patch 07 ?
> > > >>>  
> > > >>
> > > > >> If a pass through device's driver pins pages using vfio_pin_pages and all
> > > > >> devices in the group pin pages through vfio_pin_pages, then
> > > > >> iommu->pinned_page_dirty_scope is set true, and then the bitmap is repopulated.
> > > >>
> > > >>  
> > > > pass-through devices already have all guest memory pinned, it would have
> > > > no reason to call vfio_pin_pages if not attempting to mark page dirty.
> > > > Then if it calls vfio_pin_pages, it means "the pages are accessed, please
> > > > mark them dirty, feel free to clean it when you get it",  
> > > 
> > > if you see vfio_dma_populate_bitmap() function, then if vfio_pin_pages 
> > > is called, dma->pfn_list rb_tree will be non-empty and bitmap gets 
> > > populates as per pinned pages.
> > >   
> > > > not "the pages will be accesses, please mark them dirty continuously"
> > > >  
> > > 
> > > if vfio_pin_pages is not called, dma->pfn_list is empty, then it returns 
> > > early.
> > > > Suppose there are 2 devices in the group, one an IOMMU backed device
> > > > and the other a non-IOMMU mdev device. In that case, all pages are pinned,
> > > > iommu->pinned_page_dirty_scope is false, but dma->pfn_list is also not
> > > > empty since the non-IOMMU backed device pins pages using the external API. We
> > > > still have to populate the bitmap according to dma->pfn_list here, because
> > > > in the pre-copy phase, on the first bitmap query, the IOMMU backed device might pin
> > > > pages using the external API - with that, iommu->pinned_page_dirty_scope will
> > > > get updated to 'true', which means that during the next iteration only pages
> > > > pinned by the external API are reported.
> > >  
> > ok, I previously thought vfio_pin_pages for IOMMU backed device is to set
> > dirty pages after it has write access to them. Looks your intention here
> > is presume pinned pages are dirty so you have to re-fill them until they
> > are unpinned.
> > Maybe you can leave it as is, and we can add mark dirty interface later for
> > the purpose I said above (mark dirty after write access).
> 
> Yes, just as with non-iommu backed devices, pinned pages are assumed to
> be continuously dirtied.  A pin followed by unpin could be used by a
> driver to indicate a transient dirty page, but I think we'd want to
> think about a lower overhead interface when we have such a driver.
> We'd essentially need vfio_dma_rw with only the portion that sets the
> dirty bit on write.  Thanks,
> 
ok. got it.

Thanks
Yan
Yan Zhao March 30, 2020, 11:51 p.m. UTC | #14
On Mon, Mar 30, 2020 at 09:49:21PM +0800, Kirti Wankhede wrote:
> 
> 
> On 3/30/2020 8:54 AM, Yan Zhao wrote:
> > On Fri, Mar 27, 2020 at 01:28:13PM +0800, Kirti Wankhede wrote:
> >> Hit send button little early.
> >>
> >>   >
> >>   > I checked v12, it's not like what I said.
> >>   > In v12, bitmaps are generated per vfio_dma, and combination of the
> >>   > bitmaps are required in order to generate a big bitmap suiting for dirty
> >>   > query. It can cause problem when offset not aligning.
> >>   > But what I propose here is to generate an rb tree orthogonal to the tree
> >>   > of vfio_dma.
> >>   >
> >>   > as to CPU cycles saving, I don't think iterating/translating page by page
> >>   > would achieve that purpose.
> >>   >
> >>
> >> Instead of creating one extra rb tree for dirty pages tracking in v10
> >> tried to use dma->pfn_list itself, we tried changes in v10, v11 and v12,
> >> latest version is evolved version with best possible approach after
> >> discussion. Probably, go through v11 as well.
> >> https://patchwork.kernel.org/patch/11298335/
> >>
> > I'm not sure why all those previous implementations are bound to
> > vfio_dma. for vIOMMU on, in most cases, a vfio_dma is only for a page,
> > so generating a one-byte bitmap for a single page in each vfio_dma ?
> > is it possible to creating one extra rb tree to keep dirty ranges, and
> > one fixed length kernel bitmap whose content is generated on query,
> > serving as a bouncing buffer for copy_to_user
> > 
> 
> One fixed length? what should be fixed value? then isn't it better to 
> fix the size to dma->size?
> 
> This is also to prevent DoS attack, user space application can query a 
> very large range.
> 
> >>
> >> On 3/27/2020 6:00 AM, Yan Zhao wrote:
> >>> On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:
> >>>>
> >>>>
> >>>> On 3/25/2020 7:41 AM, Yan Zhao wrote:
> >>>>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:
> >>>>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> >>>>>> - Start dirty pages tracking while migration is active
> >>>>>> - Stop dirty pages tracking.
> >>>>>> - Get dirty pages bitmap. Its user space application's responsibility to
> >>>>>>      copy content of dirty pages from source to destination during migration.
> >>>>>>
> >>>>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> >>>>>> structure. Bitmap size is calculated considering smallest supported page
> >>>>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> >>>>>>
> >>>>>> Bitmap is populated for already pinned pages when bitmap is allocated for
> >>>>>> a vfio_dma with the smallest supported page size. Update bitmap from
> >>>>>> pinning functions when tracking is enabled. When user application queries
> >>>>>> bitmap, check if requested page size is same as page size used to
> >>>>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> >>>>>> error.
> >>>>>>
> >>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >>>>>> ---
> >>>>>>     drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
> >>>>>>     1 file changed, 260 insertions(+), 6 deletions(-)
> >>>>>>
> >>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >>>>>> index 70aeab921d0f..874a1a7ae925 100644
> >>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>>>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
> >>>>>>     	unsigned int		dma_avail;
> >>>>>>     	bool			v2;
> >>>>>>     	bool			nesting;
> >>>>>> +	bool			dirty_page_tracking;
> >>>>>>     };
> >>>>>>     
> >>>>>>     struct vfio_domain {
> >>>>>> @@ -91,6 +92,7 @@ struct vfio_dma {
> >>>>>>     	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> >>>>>>     	struct task_struct	*task;
> >>>>>>     	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> >>>>>> +	unsigned long		*bitmap;
> >>>>>>     };
> >>>>>>     
> >>>>>>     struct vfio_group {
> >>>>>> @@ -125,7 +127,21 @@ struct vfio_regions {
> >>>>>>     #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >>>>>>     					(!list_empty(&iommu->domain_list))
> >>>>>>     
> >>>>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> >>>>>> +
> >>>>>> +/*
> >>>>>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> >>>>>> + * further casts to signed integer for unaligned multi-bit operation,
> >>>>>> + * __bitmap_set().
> >>>>>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> >>>>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> >>>>>> + * system.
> >>>>>> + */
> >>>>>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> >>>>>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> >>>>>> +
> >>>>>>     static int put_pfn(unsigned long pfn, int prot);
> >>>>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> >>>>>>     
> >>>>>>     /*
> >>>>>>      * This code handles mapping and unmapping of user data buffers
> >>>>>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> >>>>>>     	rb_erase(&old->node, &iommu->dma_list);
> >>>>>>     }
> >>>>>>     
> >>>>>> +
> >>>>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> >>>>>> +{
> >>>>>> +	uint64_t npages = dma->size / pgsize;
> >>>>>> +
> > If pgsize > dma->size, npages = 0.
> > wouldn't it cause problem?
> > 
> 
> This patch-set supports bitmap for smallest supported page size, i.e. 
> PAGE_SIZE. vfio_dma_do_map() validates dma->size accordingly. So this 
> case will not happen.
> 
As far as I know, qemu/kvm uses 4k as the unit for dirty page tracking.
So why is the smallest iommu page size used here?
Wouldn't it cause a problem?

Thanks
Yan
Yan Zhao March 31, 2020, 12:50 a.m. UTC | #15
On Tue, Mar 31, 2020 at 08:53:47AM +0800, Alex Williamson wrote:
> On Mon, 30 Mar 2020 19:51:31 -0400
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> > On Mon, Mar 30, 2020 at 09:49:21PM +0800, Kirti Wankhede wrote:
> > > 
> > > 
> > > On 3/30/2020 8:54 AM, Yan Zhao wrote:  
> > > > On Fri, Mar 27, 2020 at 01:28:13PM +0800, Kirti Wankhede wrote:  
> > > >> Hit send button little early.
> > > >>  
> > > >>   >
> > > >>   > I checked v12, it's not like what I said.
> > > >>   > In v12, bitmaps are generated per vfio_dma, and combination of the
> > > >>   > bitmaps are required in order to generate a big bitmap suiting for dirty
> > > >>   > query. It can cause problem when offset not aligning.
> > > >>   > But what I propose here is to generate an rb tree orthogonal to the tree
> > > >>   > of vfio_dma.
> > > >>   >
> > > >>   > as to CPU cycles saving, I don't think iterating/translating page by page
> > > >>   > would achieve that purpose.
> > > >>   >  
> > > >>
> > > >> Instead of creating one extra rb tree for dirty pages tracking in v10
> > > >> tried to use dma->pfn_list itself, we tried changes in v10, v11 and v12,
> > > >> latest version is evolved version with best possible approach after
> > > >> discussion. Probably, go through v11 as well.
> > > >> https://patchwork.kernel.org/patch/11298335/
> > > >>  
> > > > I'm not sure why all those previous implementations are bound to
> > > > vfio_dma. for vIOMMU on, in most cases, a vfio_dma is only for a page,
> > > > so generating a one-byte bitmap for a single page in each vfio_dma ?
> > > > is it possible to creating one extra rb tree to keep dirty ranges, and
> > > > one fixed length kernel bitmap whose content is generated on query,
> > > > serving as a bouncing buffer for copy_to_user
> > > >   
> > > 
> > > One fixed length? what should be fixed value? then isn't it better to 
> > > fix the size to dma->size?
> > > 
> > > This is also to prevent DoS attack, user space application can query a 
> > > very large range.
> > >   
> > > >>
> > > >> On 3/27/2020 6:00 AM, Yan Zhao wrote:  
> > > >>> On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:  
> > > >>>>
> > > >>>>
> > > >>>> On 3/25/2020 7:41 AM, Yan Zhao wrote:  
> > > >>>>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:  
> > > >>>>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > > >>>>>> - Start dirty pages tracking while migration is active
> > > >>>>>> - Stop dirty pages tracking.
> > > >>>>>> - Get dirty pages bitmap. Its user space application's responsibility to
> > > >>>>>>      copy content of dirty pages from source to destination during migration.
> > > >>>>>>
> > > >>>>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > > >>>>>> structure. Bitmap size is calculated considering smallest supported page
> > > >>>>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > > >>>>>>
> > > >>>>>> Bitmap is populated for already pinned pages when bitmap is allocated for
> > > >>>>>> a vfio_dma with the smallest supported page size. Update bitmap from
> > > >>>>>> pinning functions when tracking is enabled. When user application queries
> > > >>>>>> bitmap, check if requested page size is same as page size used to
> > > >>>>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > > >>>>>> error.
> > > >>>>>>
> > > >>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > >>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > >>>>>> ---
> > > >>>>>>     drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
> > > >>>>>>     1 file changed, 260 insertions(+), 6 deletions(-)
> > > >>>>>>
> > > >>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > >>>>>> index 70aeab921d0f..874a1a7ae925 100644
> > > >>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
> > > >>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > > >>>>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
> > > >>>>>>     	unsigned int		dma_avail;
> > > >>>>>>     	bool			v2;
> > > >>>>>>     	bool			nesting;
> > > >>>>>> +	bool			dirty_page_tracking;
> > > >>>>>>     };
> > > >>>>>>     
> > > >>>>>>     struct vfio_domain {
> > > >>>>>> @@ -91,6 +92,7 @@ struct vfio_dma {
> > > >>>>>>     	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> > > >>>>>>     	struct task_struct	*task;
> > > >>>>>>     	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > > >>>>>> +	unsigned long		*bitmap;
> > > >>>>>>     };
> > > >>>>>>     
> > > >>>>>>     struct vfio_group {
> > > >>>>>> @@ -125,7 +127,21 @@ struct vfio_regions {
> > > >>>>>>     #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > > >>>>>>     					(!list_empty(&iommu->domain_list))
> > > >>>>>>     
> > > >>>>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > > >>>>>> +
> > > >>>>>> +/*
> > > >>>>>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> > > >>>>>> + * further casts to signed integer for unaligned multi-bit operation,
> > > >>>>>> + * __bitmap_set().
> > > >>>>>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> > > >>>>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> > > >>>>>> + * system.
> > > >>>>>> + */
> > > >>>>>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> > > >>>>>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> > > >>>>>> +
> > > >>>>>>     static int put_pfn(unsigned long pfn, int prot);
> > > >>>>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> > > >>>>>>     
> > > >>>>>>     /*
> > > >>>>>>      * This code handles mapping and unmapping of user data buffers
> > > >>>>>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> > > >>>>>>     	rb_erase(&old->node, &iommu->dma_list);
> > > >>>>>>     }
> > > >>>>>>     
> > > >>>>>> +
> > > >>>>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> > > >>>>>> +{
> > > >>>>>> +	uint64_t npages = dma->size / pgsize;
> > > >>>>>> +  
> > > > If pgsize > dma->size, npages = 0.
> > > > wouldn't it cause problem?
> > > >   
> > > 
> > > This patch-set supports bitmap for smallest supported page size, i.e. 
> > > PAGE_SIZE. vfio_dma_do_map() validates dma->size accordingly. So this 
> > > case will not happen.
> > >   
> > as far as I know, qemu/kvm uses 4k as the unit for dirty page tracking.
> > so why smallest iommu page size is used here?
> > wouldn't it cause problem?
> 
> If your concern is that the IOMMU supports sub-4K page sizes, see
> vfio_pgsize_bitmap().  We actually only support PAGE_SIZE as our
> minimum mapping unit, even if the IOMMU supports less, so PAGE_SIZE is
> our lower bound.  Thanks,

If we always use PAGE_SIZE, why not use PAGE_SIZE directly?
Or return the dirty bitmap unit (e.g. 1 << __ffs(vfio_pgsize_bitmap(iommu)))
to QEMU in VFIO_IOMMU_DIRTY_PAGES_FLAG_START, so that QEMU can do any needed
conversion if it's not the same unit that QEMU uses.

Thanks
Yan
Alex Williamson March 31, 2020, 12:53 a.m. UTC | #16
On Mon, 30 Mar 2020 19:51:31 -0400
Yan Zhao <yan.y.zhao@intel.com> wrote:

> On Mon, Mar 30, 2020 at 09:49:21PM +0800, Kirti Wankhede wrote:
> > 
> > 
> > On 3/30/2020 8:54 AM, Yan Zhao wrote:  
> > > On Fri, Mar 27, 2020 at 01:28:13PM +0800, Kirti Wankhede wrote:  
> > >> Hit send button little early.
> > >>  
> > >>   >
> > >>   > I checked v12, it's not like what I said.
> > >>   > In v12, bitmaps are generated per vfio_dma, and combination of the
> > >>   > bitmaps are required in order to generate a big bitmap suiting for dirty
> > >>   > query. It can cause problem when offset not aligning.
> > >>   > But what I propose here is to generate an rb tree orthogonal to the tree
> > >>   > of vfio_dma.
> > >>   >
> > >>   > as to CPU cycles saving, I don't think iterating/translating page by page
> > >>   > would achieve that purpose.
> > >>   >  
> > >>
> > >> Instead of creating one extra rb tree for dirty pages tracking in v10
> > >> tried to use dma->pfn_list itself, we tried changes in v10, v11 and v12,
> > >> latest version is evolved version with best possible approach after
> > >> discussion. Probably, go through v11 as well.
> > >> https://patchwork.kernel.org/patch/11298335/
> > >>  
> > > I'm not sure why all those previous implementations are bound to
> > > vfio_dma. for vIOMMU on, in most cases, a vfio_dma is only for a page,
> > > so generating a one-byte bitmap for a single page in each vfio_dma ?
> > > is it possible to creating one extra rb tree to keep dirty ranges, and
> > > one fixed length kernel bitmap whose content is generated on query,
> > > serving as a bouncing buffer for copy_to_user
> > >   
> > 
> > One fixed length? what should be fixed value? then isn't it better to 
> > fix the size to dma->size?
> > 
> > This is also to prevent DoS attack, user space application can query a 
> > very large range.
> >   
> > >>
> > >> On 3/27/2020 6:00 AM, Yan Zhao wrote:  
> > >>> On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:  
> > >>>>
> > >>>>
> > >>>> On 3/25/2020 7:41 AM, Yan Zhao wrote:  
> > >>>>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:  
> > >>>>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > >>>>>> - Start dirty pages tracking while migration is active
> > >>>>>> - Stop dirty pages tracking.
> > >>>>>> - Get dirty pages bitmap. Its user space application's responsibility to
> > >>>>>>      copy content of dirty pages from source to destination during migration.
> > >>>>>>
> > >>>>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > >>>>>> structure. Bitmap size is calculated considering smallest supported page
> > >>>>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > >>>>>>
> > >>>>>> Bitmap is populated for already pinned pages when bitmap is allocated for
> > >>>>>> a vfio_dma with the smallest supported page size. Update bitmap from
> > >>>>>> pinning functions when tracking is enabled. When user application queries
> > >>>>>> bitmap, check if requested page size is same as page size used to
> > >>>>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > >>>>>> error.
> > >>>>>>
> > >>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > >>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > >>>>>> ---
> > >>>>>>     drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
> > >>>>>>     1 file changed, 260 insertions(+), 6 deletions(-)
> > >>>>>>
> > >>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > >>>>>> index 70aeab921d0f..874a1a7ae925 100644
> > >>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
> > >>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > >>>>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
> > >>>>>>     	unsigned int		dma_avail;
> > >>>>>>     	bool			v2;
> > >>>>>>     	bool			nesting;
> > >>>>>> +	bool			dirty_page_tracking;
> > >>>>>>     };
> > >>>>>>     
> > >>>>>>     struct vfio_domain {
> > >>>>>> @@ -91,6 +92,7 @@ struct vfio_dma {
> > >>>>>>     	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> > >>>>>>     	struct task_struct	*task;
> > >>>>>>     	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > >>>>>> +	unsigned long		*bitmap;
> > >>>>>>     };
> > >>>>>>     
> > >>>>>>     struct vfio_group {
> > >>>>>> @@ -125,7 +127,21 @@ struct vfio_regions {
> > >>>>>>     #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > >>>>>>     					(!list_empty(&iommu->domain_list))
> > >>>>>>     
> > >>>>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > >>>>>> +
> > >>>>>> +/*
> > >>>>>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> > >>>>>> + * further casts to signed integer for unaligned multi-bit operation,
> > >>>>>> + * __bitmap_set().
> > >>>>>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> > >>>>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> > >>>>>> + * system.
> > >>>>>> + */
> > >>>>>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> > >>>>>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> > >>>>>> +
> > >>>>>>     static int put_pfn(unsigned long pfn, int prot);
> > >>>>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> > >>>>>>     
> > >>>>>>     /*
> > >>>>>>      * This code handles mapping and unmapping of user data buffers
> > >>>>>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> > >>>>>>     	rb_erase(&old->node, &iommu->dma_list);
> > >>>>>>     }
> > >>>>>>     
> > >>>>>> +
> > >>>>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> > >>>>>> +{
> > >>>>>> +	uint64_t npages = dma->size / pgsize;
> > >>>>>> +  
> > > If pgsize > dma->size, npages = 0.
> > > wouldn't it cause problem?
> > >   
> > 
> > This patch-set supports bitmap for smallest supported page size, i.e. 
> > PAGE_SIZE. vfio_dma_do_map() validates dma->size accordingly. So this 
> > case will not happen.
> >   
> as far as I know, qemu/kvm uses 4k as the unit for dirty page tracking.
> so why smallest iommu page size is used here?
> wouldn't it cause problem?

If your concern is that the IOMMU supports sub-4K page sizes, see
vfio_pgsize_bitmap().  We actually only support PAGE_SIZE as our
minimum mapping unit, even if the IOMMU supports less, so PAGE_SIZE is
our lower bound.  Thanks,

Alex
Alex Williamson March 31, 2020, 1:12 a.m. UTC | #17
On Mon, 30 Mar 2020 20:50:47 -0400
Yan Zhao <yan.y.zhao@intel.com> wrote:

> On Tue, Mar 31, 2020 at 08:53:47AM +0800, Alex Williamson wrote:
> > On Mon, 30 Mar 2020 19:51:31 -0400
> > Yan Zhao <yan.y.zhao@intel.com> wrote:
> >   
> > > On Mon, Mar 30, 2020 at 09:49:21PM +0800, Kirti Wankhede wrote:  
> > > > 
> > > > 
> > > > On 3/30/2020 8:54 AM, Yan Zhao wrote:    
> > > > > On Fri, Mar 27, 2020 at 01:28:13PM +0800, Kirti Wankhede wrote:    
> > > > >> Hit send button little early.
> > > > >>    
> > > > >>   >
> > > > >>   > I checked v12, it's not like what I said.
> > > > >>   > In v12, bitmaps are generated per vfio_dma, and combination of the
> > > > >>   > bitmaps are required in order to generate a big bitmap suiting for dirty
> > > > >>   > query. It can cause problem when offset not aligning.
> > > > >>   > But what I propose here is to generate an rb tree orthogonal to the tree
> > > > >>   > of vfio_dma.
> > > > >>   >
> > > > >>   > as to CPU cycles saving, I don't think iterating/translating page by page
> > > > >>   > would achieve that purpose.
> > > > >>   >    
> > > > >>
> > > > >> Instead of creating one extra rb tree for dirty pages tracking in v10
> > > > >> tried to use dma->pfn_list itself, we tried changes in v10, v11 and v12,
> > > > >> latest version is evolved version with best possible approach after
> > > > >> discussion. Probably, go through v11 as well.
> > > > >> https://patchwork.kernel.org/patch/11298335/
> > > > >>    
> > > > > I'm not sure why all those previous implementations are bound to
> > > > > vfio_dma. for vIOMMU on, in most cases, a vfio_dma is only for a page,
> > > > > so generating a one-byte bitmap for a single page in each vfio_dma ?
> > > > > is it possible to creating one extra rb tree to keep dirty ranges, and
> > > > > one fixed length kernel bitmap whose content is generated on query,
> > > > > serving as a bouncing buffer for copy_to_user
> > > > >     
> > > > 
> > > > One fixed length? what should be fixed value? then isn't it better to 
> > > > fix the size to dma->size?
> > > > 
> > > > This is also to prevent DoS attack, user space application can query a 
> > > > very large range.
> > > >     
> > > > >>
> > > > >> On 3/27/2020 6:00 AM, Yan Zhao wrote:    
> > > > >>> On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:    
> > > > >>>>
> > > > >>>>
> > > > >>>> On 3/25/2020 7:41 AM, Yan Zhao wrote:    
> > > > >>>>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:    
> > > > >>>>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > > > >>>>>> - Start dirty pages tracking while migration is active
> > > > >>>>>> - Stop dirty pages tracking.
> > > > >>>>>> - Get dirty pages bitmap. Its user space application's responsibility to
> > > > >>>>>>      copy content of dirty pages from source to destination during migration.
> > > > >>>>>>
> > > > >>>>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > > > >>>>>> structure. Bitmap size is calculated considering smallest supported page
> > > > >>>>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > > > >>>>>>
> > > > >>>>>> Bitmap is populated for already pinned pages when bitmap is allocated for
> > > > >>>>>> a vfio_dma with the smallest supported page size. Update bitmap from
> > > > >>>>>> pinning functions when tracking is enabled. When user application queries
> > > > >>>>>> bitmap, check if requested page size is same as page size used to
> > > > >>>>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > > > >>>>>> error.
> > > > >>>>>>
> > > > >>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > > >>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > > >>>>>> ---
> > > > >>>>>>     drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
> > > > >>>>>>     1 file changed, 260 insertions(+), 6 deletions(-)
> > > > >>>>>>
> > > > >>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > > >>>>>> index 70aeab921d0f..874a1a7ae925 100644
> > > > >>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
> > > > >>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > >>>>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
> > > > >>>>>>     	unsigned int		dma_avail;
> > > > >>>>>>     	bool			v2;
> > > > >>>>>>     	bool			nesting;
> > > > >>>>>> +	bool			dirty_page_tracking;
> > > > >>>>>>     };
> > > > >>>>>>     
> > > > >>>>>>     struct vfio_domain {
> > > > >>>>>> @@ -91,6 +92,7 @@ struct vfio_dma {
> > > > >>>>>>     	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> > > > >>>>>>     	struct task_struct	*task;
> > > > >>>>>>     	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > > > >>>>>> +	unsigned long		*bitmap;
> > > > >>>>>>     };
> > > > >>>>>>     
> > > > >>>>>>     struct vfio_group {
> > > > >>>>>> @@ -125,7 +127,21 @@ struct vfio_regions {
> > > > >>>>>>     #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > > > >>>>>>     					(!list_empty(&iommu->domain_list))
> > > > >>>>>>     
> > > > >>>>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > > > >>>>>> +
> > > > >>>>>> +/*
> > > > >>>>>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> > > > >>>>>> + * further casts to signed integer for unaligned multi-bit operation,
> > > > >>>>>> + * __bitmap_set().
> > > > >>>>>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> > > > >>>>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> > > > >>>>>> + * system.
> > > > >>>>>> + */
> > > > >>>>>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> > > > >>>>>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> > > > >>>>>> +
> > > > >>>>>>     static int put_pfn(unsigned long pfn, int prot);
> > > > >>>>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> > > > >>>>>>     
> > > > >>>>>>     /*
> > > > >>>>>>      * This code handles mapping and unmapping of user data buffers
> > > > >>>>>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> > > > >>>>>>     	rb_erase(&old->node, &iommu->dma_list);
> > > > >>>>>>     }
> > > > >>>>>>     
> > > > >>>>>> +
> > > > >>>>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> > > > >>>>>> +{
> > > > >>>>>> +	uint64_t npages = dma->size / pgsize;
> > > > >>>>>> +    
> > > > > If pgsize > dma->size, npages = 0.
> > > > > wouldn't it cause problem?
> > > > >     
> > > > 
> > > > This patch-set supports bitmap for smallest supported page size, i.e. 
> > > > PAGE_SIZE. vfio_dma_do_map() validates dma->size accordingly. So this 
> > > > case will not happen.
> > > >     
> > > as far as I know, qemu/kvm uses 4k as the unit for dirty page tracking.
> > > so why smallest iommu page size is used here?
> > > wouldn't it cause problem?  
> > 
> > If your concern is that the IOMMU supports sub-4K page sizes, see
> > vfio_pgsize_bitmap().  We actually only support PAGE_SIZE as our
> > minimum mapping unit, even if the IOMMU supports less, so PAGE_SIZE is
> > our lower bound.  Thanks,  
> 
> if we always uses PAGE_SIZE, why not use PAGE_SIZE directly?
> or returning dirty bitmap unit (e.g. 1 << __ffs(vfio_pgsize_bitmap(iommu)))
> to QEMU in VFIO_IOMMU_DIRTY_PAGES_FLAG_START, so that qemu can do possible
> conversion if it's not the same unit that QEMU uses.

The vfio interface is essentially just an extension of the IOMMU API
via domain->pgsize_bitmap.  intel-iommu mostly made the bitmask
meaningless by reporting essentially PAGE_MASK, and we just expose the
common version of that across potentially all the IOMMUs used by the
domain, modulo minimum of PAGE_SIZE.  Thanks,

Alex
Yan Zhao March 31, 2020, 1:16 a.m. UTC | #18
On Tue, Mar 31, 2020 at 09:12:59AM +0800, Alex Williamson wrote:
> On Mon, 30 Mar 2020 20:50:47 -0400
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> > On Tue, Mar 31, 2020 at 08:53:47AM +0800, Alex Williamson wrote:
> > > On Mon, 30 Mar 2020 19:51:31 -0400
> > > Yan Zhao <yan.y.zhao@intel.com> wrote:
> > >   
> > > > On Mon, Mar 30, 2020 at 09:49:21PM +0800, Kirti Wankhede wrote:  
> > > > > 
> > > > > 
> > > > > On 3/30/2020 8:54 AM, Yan Zhao wrote:    
> > > > > > On Fri, Mar 27, 2020 at 01:28:13PM +0800, Kirti Wankhede wrote:    
> > > > > >> Hit send button little early.
> > > > > >>    
> > > > > >>   >
> > > > > >>   > I checked v12, it's not like what I said.
> > > > > >>   > In v12, bitmaps are generated per vfio_dma, and combination of the
> > > > > >>   > bitmaps are required in order to generate a big bitmap suiting for dirty
> > > > > >>   > query. It can cause problem when offset not aligning.
> > > > > >>   > But what I propose here is to generate an rb tree orthogonal to the tree
> > > > > >>   > of vfio_dma.
> > > > > >>   >
> > > > > >>   > as to CPU cycles saving, I don't think iterating/translating page by page
> > > > > >>   > would achieve that purpose.
> > > > > >>   >    
> > > > > >>
> > > > > >> Instead of creating one extra rb tree for dirty pages tracking in v10
> > > > > >> tried to use dma->pfn_list itself, we tried changes in v10, v11 and v12,
> > > > > >> latest version is evolved version with best possible approach after
> > > > > >> discussion. Probably, go through v11 as well.
> > > > > >> https://patchwork.kernel.org/patch/11298335/
> > > > > >>    
> > > > > > I'm not sure why all those previous implementations are bound to
> > > > > > vfio_dma. for vIOMMU on, in most cases, a vfio_dma is only for a page,
> > > > > > so generating a one-byte bitmap for a single page in each vfio_dma ?
> > > > > > is it possible to creating one extra rb tree to keep dirty ranges, and
> > > > > > one fixed length kernel bitmap whose content is generated on query,
> > > > > > serving as a bouncing buffer for copy_to_user
> > > > > >     
> > > > > 
> > > > > One fixed length? what should be fixed value? then isn't it better to 
> > > > > fix the size to dma->size?
> > > > > 
> > > > > This is also to prevent DoS attack, user space application can query a 
> > > > > very large range.
> > > > >     
> > > > > >>
> > > > > >> On 3/27/2020 6:00 AM, Yan Zhao wrote:    
> > > > > >>> On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:    
> > > > > >>>>
> > > > > >>>>
> > > > > >>>> On 3/25/2020 7:41 AM, Yan Zhao wrote:    
> > > > > >>>>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:    
> > > > > >>>>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > > > > >>>>>> - Start dirty pages tracking while migration is active
> > > > > >>>>>> - Stop dirty pages tracking.
> > > > > >>>>>> - Get dirty pages bitmap. Its user space application's responsibility to
> > > > > >>>>>>      copy content of dirty pages from source to destination during migration.
> > > > > >>>>>>
> > > > > >>>>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > > > > >>>>>> structure. Bitmap size is calculated considering smallest supported page
> > > > > >>>>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > > > > >>>>>>
> > > > > >>>>>> Bitmap is populated for already pinned pages when bitmap is allocated for
> > > > > >>>>>> a vfio_dma with the smallest supported page size. Update bitmap from
> > > > > >>>>>> pinning functions when tracking is enabled. When user application queries
> > > > > >>>>>> bitmap, check if requested page size is same as page size used to
> > > > > >>>>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > > > > >>>>>> error.
> > > > > >>>>>>
> > > > > >>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > > > >>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > > > >>>>>> ---
> > > > > >>>>>>     drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
> > > > > >>>>>>     1 file changed, 260 insertions(+), 6 deletions(-)
> > > > > >>>>>>
> > > > > >>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > > > >>>>>> index 70aeab921d0f..874a1a7ae925 100644
> > > > > >>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
> > > > > >>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > > >>>>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
> > > > > >>>>>>     	unsigned int		dma_avail;
> > > > > >>>>>>     	bool			v2;
> > > > > >>>>>>     	bool			nesting;
> > > > > >>>>>> +	bool			dirty_page_tracking;
> > > > > >>>>>>     };
> > > > > >>>>>>     
> > > > > >>>>>>     struct vfio_domain {
> > > > > >>>>>> @@ -91,6 +92,7 @@ struct vfio_dma {
> > > > > >>>>>>     	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> > > > > >>>>>>     	struct task_struct	*task;
> > > > > >>>>>>     	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > > > > >>>>>> +	unsigned long		*bitmap;
> > > > > >>>>>>     };
> > > > > >>>>>>     
> > > > > >>>>>>     struct vfio_group {
> > > > > >>>>>> @@ -125,7 +127,21 @@ struct vfio_regions {
> > > > > >>>>>>     #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > > > > >>>>>>     					(!list_empty(&iommu->domain_list))
> > > > > >>>>>>     
> > > > > >>>>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > > > > >>>>>> +
> > > > > >>>>>> +/*
> > > > > >>>>>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> > > > > >>>>>> + * further casts to signed integer for unaligned multi-bit operation,
> > > > > >>>>>> + * __bitmap_set().
> > > > > >>>>>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> > > > > >>>>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> > > > > >>>>>> + * system.
> > > > > >>>>>> + */
> > > > > >>>>>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> > > > > >>>>>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> > > > > >>>>>> +
> > > > > >>>>>>     static int put_pfn(unsigned long pfn, int prot);
> > > > > >>>>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> > > > > >>>>>>     
> > > > > >>>>>>     /*
> > > > > >>>>>>      * This code handles mapping and unmapping of user data buffers
> > > > > >>>>>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> > > > > >>>>>>     	rb_erase(&old->node, &iommu->dma_list);
> > > > > >>>>>>     }
> > > > > >>>>>>     
> > > > > >>>>>> +
> > > > > >>>>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> > > > > >>>>>> +{
> > > > > >>>>>> +	uint64_t npages = dma->size / pgsize;
> > > > > >>>>>> +    
> > > > > > If pgsize > dma->size, npages = 0.
> > > > > > wouldn't it cause problem?
> > > > > >     
> > > > > 
> > > > > This patch-set supports bitmap for smallest supported page size, i.e. 
> > > > > PAGE_SIZE. vfio_dma_do_map() validates dma->size accordingly. So this 
> > > > > case will not happen.
> > > > >     
> > > > as far as I know, qemu/kvm uses 4k as the unit for dirty page tracking.
> > > > so why smallest iommu page size is used here?
> > > > wouldn't it cause problem?  
> > > 
> > > If your concern is that the IOMMU supports sub-4K page sizes, see
> > > vfio_pgsize_bitmap().  We actually only support PAGE_SIZE as our
> > > minimum mapping unit, even if the IOMMU supports less, so PAGE_SIZE is
> > > our lower bound.  Thanks,  
> > 
> > if we always uses PAGE_SIZE, why not use PAGE_SIZE directly?
> > or returning dirty bitmap unit (e.g. 1 << __ffs(vfio_pgsize_bitmap(iommu)))
> > to QEMU in VFIO_IOMMU_DIRTY_PAGES_FLAG_START, so that qemu can do possible
> > conversion if it's not the same unit that QEMU uses.
> 
> The vfio interface is essentially just an extension of the IOMMU API
> via domain->pgsize_bitmap.  intel-iommu mostly made the bitmask
> meaningless by reporting essentially PAGE_MASK, and we just expose the
> common version of that across potentially all the IOMMUs used by the
> domain, modulo minimum of PAGE_SIZE.  Thanks,

OK, got it. Do you think it would be good to return this IOMMU page size
when turning on dirty page tracking? Then, when the GET_BITMAP ioctl
comes, we don't need to quit if range.bitmap.pgsize != iommu_pgsize.
Instead, GET_BITMAP can succeed with the IOMMU page size and QEMU does
the bitmap conversion afterwards.

Thanks
Yan
Alex Williamson March 31, 2020, 2:38 a.m. UTC | #19
On Mon, 30 Mar 2020 21:16:21 -0400
Yan Zhao <yan.y.zhao@intel.com> wrote:

> On Tue, Mar 31, 2020 at 09:12:59AM +0800, Alex Williamson wrote:
> > On Mon, 30 Mar 2020 20:50:47 -0400
> > Yan Zhao <yan.y.zhao@intel.com> wrote:
> >   
> > > On Tue, Mar 31, 2020 at 08:53:47AM +0800, Alex Williamson wrote:  
> > > > On Mon, 30 Mar 2020 19:51:31 -0400
> > > > Yan Zhao <yan.y.zhao@intel.com> wrote:
> > > >     
> > > > > On Mon, Mar 30, 2020 at 09:49:21PM +0800, Kirti Wankhede wrote:    
> > > > > > 
> > > > > > 
> > > > > > On 3/30/2020 8:54 AM, Yan Zhao wrote:      
> > > > > > > On Fri, Mar 27, 2020 at 01:28:13PM +0800, Kirti Wankhede wrote:      
> > > > > > >> Hit send button little early.
> > > > > > >>      
> > > > > > >>   >
> > > > > > >>   > I checked v12, it's not like what I said.
> > > > > > >>   > In v12, bitmaps are generated per vfio_dma, and combination of the
> > > > > > >>   > bitmaps are required in order to generate a big bitmap suiting for dirty
> > > > > > >>   > query. It can cause problem when offset not aligning.
> > > > > > >>   > But what I propose here is to generate an rb tree orthogonal to the tree
> > > > > > >>   > of vfio_dma.
> > > > > > >>   >
> > > > > > >>   > as to CPU cycles saving, I don't think iterating/translating page by page
> > > > > > >>   > would achieve that purpose.
> > > > > > >>   >      
> > > > > > >>
> > > > > > >> Instead of creating one extra rb tree for dirty pages tracking in v10
> > > > > > >> tried to use dma->pfn_list itself, we tried changes in v10, v11 and v12,
> > > > > > >> latest version is evolved version with best possible approach after
> > > > > > >> discussion. Probably, go through v11 as well.
> > > > > > >> https://patchwork.kernel.org/patch/11298335/
> > > > > > >>      
> > > > > > > I'm not sure why all those previous implementations are bound to
> > > > > > > vfio_dma. for vIOMMU on, in most cases, a vfio_dma is only for a page,
> > > > > > > so generating a one-byte bitmap for a single page in each vfio_dma ?
> > > > > > > is it possible to creating one extra rb tree to keep dirty ranges, and
> > > > > > > one fixed length kernel bitmap whose content is generated on query,
> > > > > > > serving as a bouncing buffer for copy_to_user
> > > > > > >       
> > > > > > 
> > > > > > One fixed length? what should be fixed value? then isn't it better to 
> > > > > > fix the size to dma->size?
> > > > > > 
> > > > > > This is also to prevent DoS attack, user space application can query a 
> > > > > > very large range.
> > > > > >       
> > > > > > >>
> > > > > > >> On 3/27/2020 6:00 AM, Yan Zhao wrote:      
> > > > > > >>> On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:      
> > > > > > >>>>
> > > > > > >>>>
> > > > > > >>>> On 3/25/2020 7:41 AM, Yan Zhao wrote:      
> > > > > > >>>>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:      
> > > > > > >>>>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > > > > > >>>>>> - Start dirty pages tracking while migration is active
> > > > > > >>>>>> - Stop dirty pages tracking.
> > > > > > >>>>>> - Get dirty pages bitmap. Its user space application's responsibility to
> > > > > > >>>>>>      copy content of dirty pages from source to destination during migration.
> > > > > > >>>>>>
> > > > > > >>>>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > > > > > >>>>>> structure. Bitmap size is calculated considering smallest supported page
> > > > > > >>>>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > > > > > >>>>>>
> > > > > > >>>>>> Bitmap is populated for already pinned pages when bitmap is allocated for
> > > > > > >>>>>> a vfio_dma with the smallest supported page size. Update bitmap from
> > > > > > >>>>>> pinning functions when tracking is enabled. When user application queries
> > > > > > >>>>>> bitmap, check if requested page size is same as page size used to
> > > > > > >>>>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > > > > > >>>>>> error.
> > > > > > >>>>>>
> > > > > > >>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > > > > >>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > > > > >>>>>> ---
> > > > > > >>>>>>     drivers/vfio/vfio_iommu_type1.c | 266 +++++++++++++++++++++++++++++++++++++++-
> > > > > > >>>>>>     1 file changed, 260 insertions(+), 6 deletions(-)
> > > > > > >>>>>>
> > > > > > >>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > > > > >>>>>> index 70aeab921d0f..874a1a7ae925 100644
> > > > > > >>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
> > > > > > >>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > > > >>>>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
> > > > > > >>>>>>     	unsigned int		dma_avail;
> > > > > > >>>>>>     	bool			v2;
> > > > > > >>>>>>     	bool			nesting;
> > > > > > >>>>>> +	bool			dirty_page_tracking;
> > > > > > >>>>>>     };
> > > > > > >>>>>>     
> > > > > > >>>>>>     struct vfio_domain {
> > > > > > >>>>>> @@ -91,6 +92,7 @@ struct vfio_dma {
> > > > > > >>>>>>     	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> > > > > > >>>>>>     	struct task_struct	*task;
> > > > > > >>>>>>     	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > > > > > >>>>>> +	unsigned long		*bitmap;
> > > > > > >>>>>>     };
> > > > > > >>>>>>     
> > > > > > >>>>>>     struct vfio_group {
> > > > > > >>>>>> @@ -125,7 +127,21 @@ struct vfio_regions {
> > > > > > >>>>>>     #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > > > > > >>>>>>     					(!list_empty(&iommu->domain_list))
> > > > > > >>>>>>     
> > > > > > >>>>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > > > > > >>>>>> +
> > > > > > >>>>>> +/*
> > > > > > >>>>>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> > > > > > >>>>>> + * further casts to signed integer for unaligned multi-bit operation,
> > > > > > >>>>>> + * __bitmap_set().
> > > > > > >>>>>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> > > > > > >>>>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> > > > > > >>>>>> + * system.
> > > > > > >>>>>> + */
> > > > > > >>>>>> +#define DIRTY_BITMAP_PAGES_MAX	(uint64_t)(INT_MAX - 1)
> > > > > > >>>>>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> > > > > > >>>>>> +
> > > > > > >>>>>>     static int put_pfn(unsigned long pfn, int prot);
> > > > > > >>>>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> > > > > > >>>>>>     
> > > > > > >>>>>>     /*
> > > > > > >>>>>>      * This code handles mapping and unmapping of user data buffers
> > > > > > >>>>>> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> > > > > > >>>>>>     	rb_erase(&old->node, &iommu->dma_list);
> > > > > > >>>>>>     }
> > > > > > >>>>>>     
> > > > > > >>>>>> +
> > > > > > >>>>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> > > > > > >>>>>> +{
> > > > > > >>>>>> +	uint64_t npages = dma->size / pgsize;
> > > > > > >>>>>> +      
> > > > > > > If pgsize > dma->size, npages = 0.
> > > > > > > wouldn't it cause problem?
> > > > > > >       
> > > > > > 
> > > > > > This patch-set supports bitmap for smallest supported page size, i.e. 
> > > > > > PAGE_SIZE. vfio_dma_do_map() validates dma->size accordingly. So this 
> > > > > > case will not happen.
> > > > > >       
> > > > > as far as I know, qemu/kvm uses 4k as the unit for dirty page tracking.
> > > > > so why smallest iommu page size is used here?
> > > > > wouldn't it cause problem?    
> > > > 
> > > > If your concern is that the IOMMU supports sub-4K page sizes, see
> > > > vfio_pgsize_bitmap().  We actually only support PAGE_SIZE as our
> > > > minimum mapping unit, even if the IOMMU supports less, so PAGE_SIZE is
> > > > our lower bound.  Thanks,    
> > > 
> > > if we always uses PAGE_SIZE, why not use PAGE_SIZE directly?
> > > or returning dirty bitmap unit (e.g. 1 << __ffs(vfio_pgsize_bitmap(iommu)))
> > > to QEMU in VFIO_IOMMU_DIRTY_PAGES_FLAG_START, so that qemu can do possible
> > > conversion if it's not the same unit that QEMU uses.  
> > 
> > The vfio interface is essentially just an extension of the IOMMU API
> > via domain->pgsize_bitmap.  intel-iommu mostly made the bitmask
> > meaningless by reporting essentially PAGE_MASK, and we just expose the
> > common version of that across potentially all the IOMMUs used by the
> > domain, modulo minimum of PAGE_SIZE.  Thanks,  
> 
> ok. got it. do you think it's good to return this iommu page size
> when turning on dirty page tracking? so when GET_BITMAP ioctl
> comes, we don't need to quit if range.bitmap.pgsize != iommu_pgsize.
> instead, the GET_BITMAP can success with iommu page size and qemu does
> the bitmap conversion afterwards.

The bitmap is already exposed to the user via
vfio_iommu_type1_info.iova_pgsizes in the VFIO_IOMMU_GET_INFO ioctl.
Our original intention was to allow the user to specify the dirty
bitmap page size, which is still enabled in the ioctl via
bitmap.pgsize, but for simplification we currently only support the
smallest page size.  This could be something else we expose via the
extension interface when more page sizes are supported.  Thanks,

Alex
Yan Zhao March 31, 2020, 2:40 a.m. UTC | #20
> -----Original Message-----
> From: kvm-owner@vger.kernel.org <kvm-owner@vger.kernel.org> On Behalf
> Of Alex Williamson
> Sent: Tuesday, March 31, 2020 10:38 AM
> To: Zhao, Yan Y <yan.y.zhao@intel.com>
> Cc: Kirti Wankhede <kwankhede@nvidia.com>; cjia@nvidia.com; Tian, Kevin
> <kevin.tian@intel.com>; Yang, Ziye <ziye.yang@intel.com>; Liu, Changpeng
> <changpeng.liu@intel.com>; Liu, Yi L <yi.l.liu@intel.com>;
> mlevitsk@redhat.com; eskultet@redhat.com; cohuck@redhat.com;
> dgilbert@redhat.com; jonathan.davies@nutanix.com; eauger@redhat.com;
> aik@ozlabs.ru; pasic@linux.ibm.com; felipe@nutanix.com;
> Zhengxiao.zx@Alibaba-inc.com; shuangtai.tst@alibaba-inc.com;
> Ken.Xue@amd.com; Wang, Zhi A <zhi.a.wang@intel.com>; qemu-
> devel@nongnu.org; kvm@vger.kernel.org
> Subject: Re: [PATCH v16 Kernel 4/7] vfio iommu: Implementation of ioctl for
> dirty pages tracking.
> 
> On Mon, 30 Mar 2020 21:16:21 -0400
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> > On Tue, Mar 31, 2020 at 09:12:59AM +0800, Alex Williamson wrote:
> > > On Mon, 30 Mar 2020 20:50:47 -0400
> > > Yan Zhao <yan.y.zhao@intel.com> wrote:
> > >
> > > > On Tue, Mar 31, 2020 at 08:53:47AM +0800, Alex Williamson wrote:
> > > > > On Mon, 30 Mar 2020 19:51:31 -0400 Yan Zhao
> > > > > <yan.y.zhao@intel.com> wrote:
> > > > >
> > > > > > On Mon, Mar 30, 2020 at 09:49:21PM +0800, Kirti Wankhede wrote:
> > > > > > >
> > > > > > >
> > > > > > > On 3/30/2020 8:54 AM, Yan Zhao wrote:
> > > > > > > > On Fri, Mar 27, 2020 at 01:28:13PM +0800, Kirti Wankhede wrote:
> > > > > > > >> Hit send button little early.
> > > > > > > >>
> > > > > > > >>   >
> > > > > > > >>   > I checked v12, it's not like what I said.
> > > > > > > >>   > In v12, bitmaps are generated per vfio_dma, and
> combination of the
> > > > > > > >>   > bitmaps are required in order to generate a big bitmap
> suiting for dirty
> > > > > > > >>   > query. It can cause problem when offset not aligning.
> > > > > > > >>   > But what I propose here is to generate an rb tree orthogonal
> to the tree
> > > > > > > >>   > of vfio_dma.
> > > > > > > >>   >
> > > > > > > >>   > as to CPU cycles saving, I don't think iterating/translating
> page by page
> > > > > > > >>   > would achieve that purpose.
> > > > > > > >>   >
> > > > > > > >>
> > > > > > > >> Instead of creating one extra rb tree for dirty pages
> > > > > > > >> tracking in v10 tried to use dma->pfn_list itself, we
> > > > > > > >> tried changes in v10, v11 and v12, latest version is
> > > > > > > >> evolved version with best possible approach after discussion.
> Probably, go through v11 as well.
> > > > > > > >> https://patchwork.kernel.org/patch/11298335/
> > > > > > > >>
> > > > > > > > I'm not sure why all those previous implementations are
> > > > > > > > bound to vfio_dma. for vIOMMU on, in most cases, a
> > > > > > > > vfio_dma is only for a page, so generating a one-byte bitmap for
> a single page in each vfio_dma ?
> > > > > > > > is it possible to creating one extra rb tree to keep dirty
> > > > > > > > ranges, and one fixed length kernel bitmap whose content
> > > > > > > > is generated on query, serving as a bouncing buffer for
> > > > > > > > copy_to_user
> > > > > > > >
> > > > > > >
> > > > > > > One fixed length? what should be fixed value? then isn't it
> > > > > > > better to fix the size to dma->size?
> > > > > > >
> > > > > > > This is also to prevent DoS attack, user space application
> > > > > > > can query a very large range.
> > > > > > >
> > > > > > > >>
> > > > > > > >> On 3/27/2020 6:00 AM, Yan Zhao wrote:
> > > > > > > >>> On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede
> wrote:
> > > > > > > >>>>
> > > > > > > >>>>
> > > > > > > >>>> On 3/25/2020 7:41 AM, Yan Zhao wrote:
> > > > > > > >>>>> On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti
> Wankhede wrote:
> > > > > > > >>>>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three
> operations:
> > > > > > > >>>>>> - Start dirty pages tracking while migration is
> > > > > > > >>>>>> active
> > > > > > > >>>>>> - Stop dirty pages tracking.
> > > > > > > >>>>>> - Get dirty pages bitmap. Its user space application's
> responsibility to
> > > > > > > >>>>>>      copy content of dirty pages from source to destination
> during migration.
> > > > > > > >>>>>>
> > > > > > > >>>>>> To prevent DoS attack, memory for bitmap is allocated
> > > > > > > >>>>>> per vfio_dma structure. Bitmap size is calculated
> > > > > > > >>>>>> considering smallest supported page size. Bitmap is
> > > > > > > >>>>>> allocated for all vfio_dmas when dirty logging is
> > > > > > > >>>>>> enabled
> > > > > > > >>>>>>
> > > > > > > >>>>>> Bitmap is populated for already pinned pages when
> > > > > > > >>>>>> bitmap is allocated for a vfio_dma with the smallest
> > > > > > > >>>>>> supported page size. Update bitmap from pinning
> > > > > > > >>>>>> functions when tracking is enabled. When user
> > > > > > > >>>>>> application queries bitmap, check if requested page
> > > > > > > >>>>>> size is same as page size used to populated bitmap. If it is
> equal, copy bitmap, but if not equal, return error.
> > > > > > > >>>>>>
> > > > > > > >>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > > > > > >>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > > > > > >>>>>> ---
> > > > > > > >>>>>>     drivers/vfio/vfio_iommu_type1.c | 266
> +++++++++++++++++++++++++++++++++++++++-
> > > > > > > >>>>>>     1 file changed, 260 insertions(+), 6 deletions(-)
> > > > > > > >>>>>>
> > > > > > > >>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c
> > > > > > > >>>>>> b/drivers/vfio/vfio_iommu_type1.c index
> > > > > > > >>>>>> 70aeab921d0f..874a1a7ae925 100644
> > > > > > > >>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
> > > > > > > >>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > > > > >>>>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
> > > > > > > >>>>>>     	unsigned int		dma_avail;
> > > > > > > >>>>>>     	bool			v2;
> > > > > > > >>>>>>     	bool			nesting;
> > > > > > > >>>>>> +	bool			dirty_page_tracking;
> > > > > > > >>>>>>     };
> > > > > > > >>>>>>
> > > > > > > >>>>>>     struct vfio_domain { @@ -91,6 +92,7 @@ struct
> > > > > > > >>>>>> vfio_dma {
> > > > > > > >>>>>>     	bool			lock_cap;	/*
> capable(CAP_IPC_LOCK) */
> > > > > > > >>>>>>     	struct task_struct	*task;
> > > > > > > >>>>>>     	struct rb_root		pfn_list;	/* Ex-user
> pinned pfn list */
> > > > > > > >>>>>> +	unsigned long		*bitmap;
> > > > > > > >>>>>>     };
> > > > > > > >>>>>>
> > > > > > > >>>>>>     struct vfio_group { @@ -125,7 +127,21 @@ struct
> > > > > > > >>>>>> vfio_regions {
> > > > > > > >>>>>>     #define
> IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > > > > > > >>>>>>
> 	(!list_empty(&iommu->domain_list))
> > > > > > > >>>>>>
> > > > > > > >>>>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n,
> BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > > > > > > >>>>>> +
> > > > > > > >>>>>> +/*
> > > > > > > >>>>>> + * Input argument of number of bits to bitmap_set()
> > > > > > > >>>>>> +is unsigned integer, which
> > > > > > > >>>>>> + * further casts to signed integer for unaligned
> > > > > > > >>>>>> +multi-bit operation,
> > > > > > > >>>>>> + * __bitmap_set().
> > > > > > > >>>>>> + * Then maximum bitmap size supported is 2^31 bits
> > > > > > > >>>>>> +divided by 2^3 bits/byte,
> > > > > > > >>>>>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 =
> > > > > > > >>>>>> +2^43 (8TB) on 4K page
> > > > > > > >>>>>> + * system.
> > > > > > > >>>>>> + */
> > > > > > > >>>>>> +#define DIRTY_BITMAP_PAGES_MAX
> 	(uint64_t)(INT_MAX - 1)
> > > > > > > >>>>>> +#define DIRTY_BITMAP_SIZE_MAX
> DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> > > > > > > >>>>>> +
> > > > > > > >>>>>>     static int put_pfn(unsigned long pfn, int prot);
> > > > > > > >>>>>> +static unsigned long vfio_pgsize_bitmap(struct
> > > > > > > >>>>>> +vfio_iommu *iommu);
> > > > > > > >>>>>>
> > > > > > > >>>>>>     /*
> > > > > > > >>>>>>      * This code handles mapping and unmapping of
> > > > > > > >>>>>> user data buffers @@ -175,6 +191,77 @@ static void
> vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> > > > > > > >>>>>>     	rb_erase(&old->node, &iommu->dma_list);
> > > > > > > >>>>>>     }
> > > > > > > >>>>>>
> > > > > > > >>>>>> +
> > > > > > > >>>>>> +static int vfio_dma_bitmap_alloc(struct vfio_dma
> > > > > > > >>>>>> +*dma, uint64_t pgsize) {
> > > > > > > >>>>>> +	uint64_t npages = dma->size / pgsize;
> > > > > > > >>>>>> +
> > > > > > > > If pgsize > dma->size, npages = 0.
> > > > > > > > wouldn't it cause problem?
> > > > > > > >
> > > > > > >
> > > > > > > This patch-set supports bitmap for smallest supported page size,
> i.e.
> > > > > > > PAGE_SIZE. vfio_dma_do_map() validates dma->size
> > > > > > > accordingly. So this case will not happen.
> > > > > > >
> > > > > > as far as I know, qemu/kvm uses 4k as the unit for dirty page
> tracking.
> > > > > > so why smallest iommu page size is used here?
> > > > > > wouldn't it cause problem?
> > > > >
> > > > > If your concern is that the IOMMU supports sub-4K page sizes,
> > > > > see vfio_pgsize_bitmap().  We actually only support PAGE_SIZE as
> > > > > our minimum mapping unit, even if the IOMMU supports less, so
> PAGE_SIZE is
> > > > > our lower bound.  Thanks,
> > > >
> > > > if we always uses PAGE_SIZE, why not use PAGE_SIZE directly?
> > > > or returning dirty bitmap unit (e.g. 1 <<
> > > > __ffs(vfio_pgsize_bitmap(iommu))) to QEMU in
> > > > VFIO_IOMMU_DIRTY_PAGES_FLAG_START, so that qemu can do
> possible conversion if it's not the same unit that QEMU uses.
> > >
> > > The vfio interface is essentially just an extension of the IOMMU API
> > > via domain->pgsize_bitmap.  intel-iommu mostly made the bitmask
> > > meaningless by reporting essentially PAGE_MASK, and we just expose
> > > the common version of that across potentially all the IOMMUs used by
> > > the domain, modulo minimum of PAGE_SIZE.  Thanks,
> >
> > ok. got it. do you think it's good to return this iommu page size when
> > turning on dirty page tracking? so when GET_BITMAP ioctl comes, we
> > don't need to quit if range.bitmap.pgsize != iommu_pgsize.
> > instead, the GET_BITMAP can success with iommu page size and qemu
> does
> > the bitmap conversion afterwards.
> 
> The bitmap is already exposed to the user via
> vfio_iommu_type1_info.iova_pgsizes in the VFIO_IOMMU_GET_INFO ioctl.
> Our original intention was to allow the user to specify the dirty bitmap page
> size, which is still enabled in the ioctl via bitmap.pgsize, but for simplification
> we currently only support the smallest page size.  This could be something
> else we expose via the extension interface when more page sizes are
> supported.  Thanks,
> 
Ok. Got it.  Thanks
diff mbox series

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 70aeab921d0f..874a1a7ae925 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -71,6 +71,7 @@  struct vfio_iommu {
 	unsigned int		dma_avail;
 	bool			v2;
 	bool			nesting;
+	bool			dirty_page_tracking;
 };
 
 struct vfio_domain {
@@ -91,6 +92,7 @@  struct vfio_dma {
 	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
 	struct task_struct	*task;
 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
+	unsigned long		*bitmap;
 };
 
 struct vfio_group {
@@ -125,7 +127,21 @@  struct vfio_regions {
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
 					(!list_empty(&iommu->domain_list))
 
+#define DIRTY_BITMAP_BYTES(n)	(ALIGN((n), BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
+
+/*
+ * bitmap_set() takes its bit count as an unsigned int and, for unaligned
+ * multi-bit operations, passes it on to __bitmap_set() as a signed int, so
+ * the number of pages tracked by one bitmap is capped just below 2^31.
+ * 2^31 bits / 2^3 bits per byte = 2^28 bytes (256 MB) of bitmap, which covers
+ * 2^31 pages * 2^12 bytes per page = 2^43 bytes (8 TB) of IOVA space with
+ * 4K pages.
+ */
+#define DIRTY_BITMAP_PAGES_MAX	((uint64_t)(INT_MAX - 1))
+#define DIRTY_BITMAP_SIZE_MAX	DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
 static int put_pfn(unsigned long pfn, int prot);
+static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
 
 /*
  * This code handles mapping and unmapping of user data buffers
@@ -175,6 +191,77 @@  static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 	rb_erase(&old->node, &iommu->dma_list);
 }
 
+
+/* Allocate a zeroed dirty bitmap for @dma holding one bit per @pgsize page. */
+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
+{
+	uint64_t nr_bits = dma->size / pgsize;
+	unsigned long *bits;
+
+	if (nr_bits > DIRTY_BITMAP_PAGES_MAX)
+		return -EINVAL;
+
+	bits = kvzalloc(DIRTY_BITMAP_BYTES(nr_bits), GFP_KERNEL);
+	dma->bitmap = bits;
+	return bits ? 0 : -ENOMEM;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+	kvfree(dma->bitmap);	/* allocated with kvzalloc(), so not kfree() */
+	dma->bitmap = NULL;	/* guards against double free and stale use */
+}
+
+/* Set the bitmap bit for the page of every entry in @dma's pfn_list. */
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
+{
+	struct rb_node *pos;
+
+	/* rb_first() yields NULL for an empty tree, so no emptiness check. */
+	for (pos = rb_first(&dma->pfn_list); pos; pos = rb_next(pos)) {
+		struct vfio_pfn *entry = rb_entry(pos, struct vfio_pfn, node);
+		uint64_t bit = (entry->iova - dma->iova) / pgsize;
+
+		bitmap_set(dma->bitmap, bit, 1);
+	}
+}
+
+/* Allocate and populate a bitmap for every vfio_dma; all-or-nothing. */
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
+{
+	struct rb_node *n, *p;
+
+	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+		int ret = vfio_dma_bitmap_alloc(dma, pgsize);
+
+		if (ret) {
+			/*
+			 * Unwind: free the bitmap of each vfio_dma that
+			 * precedes the failed one in the tree.
+			 */
+			for (p = rb_prev(n); p; p = rb_prev(p)) {
+				dma = rb_entry(p, struct vfio_dma, node);
+				vfio_dma_bitmap_free(dma);
+			}
+			return ret;
+		}
+		vfio_dma_populate_bitmap(dma, pgsize);
+	}
+	return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+	struct rb_node *pos;
+
+	/*
+	 * Release the bitmap of every vfio_dma linked into iommu->dma_list.
+	 */
+	for (pos = rb_first(&iommu->dma_list); pos; pos = rb_next(pos))
+		vfio_dma_bitmap_free(rb_entry(pos, struct vfio_dma, node));
+}
+
 /*
  * Helper Functions for host iova-pfn list
  */
@@ -567,6 +654,18 @@  static int vfio_iommu_type1_pin_pages(void *iommu_data,
 			vfio_unpin_page_external(dma, iova, do_accounting);
 			goto pin_unwind;
 		}
+
+		if (iommu->dirty_page_tracking) {
+			unsigned long pgshift =
+					 __ffs(vfio_pgsize_bitmap(iommu));
+
+			/*
+			 * Bitmap populated with the smallest supported page
+			 * size
+			 */
+			bitmap_set(dma->bitmap,
+				   (vpfn->iova - dma->iova) >> pgshift, 1);
+		}
 	}
 
 	ret = i;
@@ -801,6 +900,7 @@  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 	vfio_unmap_unpin(iommu, dma, true);
 	vfio_unlink_dma(iommu, dma);
 	put_task_struct(dma->task);
+	vfio_dma_bitmap_free(dma);
 	kfree(dma);
 	iommu->dma_avail++;
 }
@@ -831,6 +931,57 @@  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 	return bitmap;
 }
 
+/* Copy the dirty bitmap of the mapping [iova, iova + size) to user space. */
+static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
+				  size_t size, uint64_t pgsize,
+				  u64 __user *bitmap)
+{
+	struct vfio_dma *dma = vfio_find_dma(iommu, iova, 1);
+	unsigned int nr_pages, nr_bytes;
+
+	/* The requested range must match one existing mapping exactly. */
+	if (!dma || dma->iova != iova || dma->size != size)
+		return -EINVAL;
+
+	nr_pages = dma->size >> __ffs(pgsize);
+	nr_bytes = DIRTY_BITMAP_BYTES(nr_pages);
+
+	/*
+	 * An IOMMU-mapped vfio_dma has all of its pages pinned and mapped, so
+	 * report every page as dirty.
+	 */
+	if (dma->iommu_mapped)
+		bitmap_set(dma->bitmap, 0, nr_pages);
+
+	if (copy_to_user((void __user *)bitmap, dma->bitmap, nr_bytes))
+		return -EFAULT;
+
+	/*
+	 * Reset the bitmap after a successful copy, then seed it again from the
+	 * pinned-pages list: pinned pages stay dirty, while bits for unpinned
+	 * pages and for pages dirtied through vfio_dma_rw() are dropped.
+	 */
+	bitmap_clear(dma->bitmap, 0, nr_pages);
+	vfio_dma_populate_bitmap(dma, pgsize);
+
+	return 0;
+}
+
+/*
+ * Check that a user buffer of @bitmap_size bytes is non-empty, within
+ * DIRTY_BITMAP_SIZE_MAX, and big enough for a bitmap of @npages pages.
+ */
+static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
+{
+	if (npages == 0 || bitmap_size == 0)
+		return -EINVAL;
+
+	if (bitmap_size > DIRTY_BITMAP_SIZE_MAX)
+		return -EINVAL;
+
+	return bitmap_size >= DIRTY_BITMAP_BYTES(npages) ? 0 : -EINVAL;
+}
+
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			     struct vfio_iommu_type1_dma_unmap *unmap)
 {
@@ -1038,16 +1189,16 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	unsigned long vaddr = map->vaddr;
 	size_t size = map->size;
 	int ret = 0, prot = 0;
-	uint64_t mask;
+	uint64_t pgsize;
 	struct vfio_dma *dma;
 
 	/* Verify that none of our __u64 fields overflow */
 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
 		return -EINVAL;
 
-	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
+	pgsize = (uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
 
-	WARN_ON(mask & PAGE_MASK);
+	WARN_ON((pgsize - 1) & PAGE_MASK);
 
 	/* READ/WRITE from device perspective */
 	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
@@ -1055,7 +1206,7 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
 		prot |= IOMMU_READ;
 
-	if (!prot || !size || (size | iova | vaddr) & mask)
+	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1))
 		return -EINVAL;
 
 	/* Don't allow IOVA or virtual address wrap */
@@ -1130,6 +1281,12 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	else
 		ret = vfio_pin_map_dma(iommu, dma, size);
 
+	if (!ret && iommu->dirty_page_tracking) {
+		ret = vfio_dma_bitmap_alloc(dma, pgsize);
+		if (ret)
+			vfio_remove_dma(iommu, dma);
+	}
+
 out_unlock:
 	mutex_unlock(&iommu->lock);
 	return ret;
@@ -2278,6 +2435,93 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 
 		return copy_to_user((void __user *)arg, &unmap, minsz) ?
 			-EFAULT : 0;
+	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
+		struct vfio_iommu_type1_dirty_bitmap dirty;
+		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
+				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
+				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
+		int ret = 0;
+
+		if (!iommu->v2)
+			return -EACCES;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
+				    flags);
+
+		if (copy_from_user(&dirty, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (dirty.argsz < minsz || dirty.flags & ~mask)
+			return -EINVAL;
+
+		/* only one flag should be set at a time */
+		if (__ffs(dirty.flags) != __fls(dirty.flags))
+			return -EINVAL;
+
+		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
+			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
+
+			mutex_lock(&iommu->lock);
+			if (!iommu->dirty_page_tracking) {
+				ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
+				if (!ret)
+					iommu->dirty_page_tracking = true;
+			}
+			mutex_unlock(&iommu->lock);
+			return ret;
+		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
+			mutex_lock(&iommu->lock);
+			if (iommu->dirty_page_tracking) {
+				iommu->dirty_page_tracking = false;
+				vfio_dma_bitmap_free_all(iommu);
+			}
+			mutex_unlock(&iommu->lock);
+			return 0;
+		} else if (dirty.flags &
+				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
+			struct vfio_iommu_type1_dirty_bitmap_get range;
+			unsigned long pgshift;
+			size_t data_size = dirty.argsz - minsz;
+			uint64_t iommu_pgsize =
+					 1 << __ffs(vfio_pgsize_bitmap(iommu));
+
+			if (!data_size || data_size < sizeof(range))
+				return -EINVAL;
+
+			if (copy_from_user(&range, (void __user *)(arg + minsz),
+					   sizeof(range)))
+				return -EFAULT;
+
+			/* allow only smallest supported pgsize */
+			if (range.bitmap.pgsize != iommu_pgsize)
+				return -EINVAL;
+			if (range.iova & (iommu_pgsize - 1))
+				return -EINVAL;
+			if (!range.size || range.size & (iommu_pgsize - 1))
+				return -EINVAL;
+			if (range.iova + range.size < range.iova)
+				return -EINVAL;
+			if (!access_ok((void __user *)range.bitmap.data,
+				       range.bitmap.size))
+				return -EINVAL;
+
+			pgshift = __ffs(range.bitmap.pgsize);
+			ret = verify_bitmap_size(range.size >> pgshift,
+						 range.bitmap.size);
+			if (ret)
+				return ret;
+
+			mutex_lock(&iommu->lock);
+			if (iommu->dirty_page_tracking)
+				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
+						range.size, range.bitmap.pgsize,
+						range.bitmap.data);
+			else
+				ret = -EINVAL;
+			mutex_unlock(&iommu->lock);
+
+			return ret;
+		}
 	}
 
 	return -ENOTTY;
@@ -2345,10 +2589,20 @@  static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
 
 	vaddr = dma->vaddr + offset;
 
-	if (write)
+	if (write) {
 		*copied = __copy_to_user((void __user *)vaddr, data,
 					 count) ? 0 : count;
-	else
+		if (*copied && iommu->dirty_page_tracking) {
+			unsigned long pgshift =
+				__ffs(vfio_pgsize_bitmap(iommu));
+			/*
+			 * Bitmap populated with the smallest supported page
+			 * size
+			 */
+			bitmap_set(dma->bitmap, offset >> pgshift,
+				   *copied >> pgshift);
+		}
+	} else
 		*copied = __copy_from_user(data, (void __user *)vaddr,
 					   count) ? 0 : count;
 	if (kthread)