[v15,Kernel,4/7] vfio iommu: Implementation of ioctl for dirty pages tracking.

Message ID 1584649004-8285-5-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived
Series KABIs to support migration for VFIO devices

Commit Message

Kirti Wankhede March 19, 2020, 8:16 p.m. UTC
The VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active.
- Stop dirty pages tracking.
- Get the dirty pages bitmap. It is the user space application's
  responsibility to copy the content of dirty pages from source to
  destination during migration.

To prevent a DoS attack, memory for the bitmap is allocated per vfio_dma
structure. The bitmap size is calculated considering the smallest supported
page size. Bitmaps are allocated for all vfio_dmas when dirty logging is
enabled.

The bitmap is populated for already pinned pages when it is allocated for
a vfio_dma, using the smallest supported page size. The bitmap is also
updated from the pinning functions while tracking is enabled. When the
user application queries the bitmap, check whether the requested page size
matches the page size used to populate the bitmap: if it matches, copy the
bitmap; otherwise, return an error.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
---
 drivers/vfio/vfio_iommu_type1.c | 242 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 236 insertions(+), 6 deletions(-)
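
For orientation, a minimal userspace sketch of the three operations might look
as follows. The struct and flag names come from this series' UAPI patch (not
shown here), so the exact field layout is an assumption based on how this
patch consumes it, and error handling is omitted:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static void dirty_tracking_example(int container_fd, uint64_t iova,
                                   uint64_t size, uint64_t pgsize,
                                   uint64_t *bitmap_buf)
{
        struct vfio_iommu_type1_dirty_bitmap hdr = {
                .argsz = sizeof(hdr),
                .flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START,
        };

        /* 1) Start tracking: the kernel allocates a bitmap per vfio_dma. */
        ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, &hdr);

        /*
         * 2) Get the bitmap for one mapping.  iova/size must exactly match
         * an existing DMA mapping, pgsize must be the minimum IOMMU page
         * size, and the _get payload immediately follows the 8-byte header.
         */
        struct vfio_iommu_type1_dirty_bitmap_get get = {
                .iova = iova,
                .size = size,
                .bitmap.pgsize = pgsize,
                /* one bit per page, rounded up to a multiple of 64 bits */
                .bitmap.size = ((size / pgsize + 63) / 64) * 8,
                .bitmap.data = (void *)bitmap_buf,
        };
        char buf[sizeof(hdr) + sizeof(get)];

        hdr.argsz = sizeof(buf);
        hdr.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
        memcpy(buf, &hdr, sizeof(hdr));
        memcpy(buf + sizeof(hdr), &get, sizeof(get));
        ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, buf);

        /* 3) Stop tracking: the kernel frees the per-vfio_dma bitmaps. */
        hdr.argsz = sizeof(hdr);
        hdr.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
        ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, &hdr);
}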

Comments

Alex Williamson March 19, 2020, 10:57 p.m. UTC | #1
On Fri, 20 Mar 2020 01:46:41 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> - Start dirty pages tracking while migration is active
> - Stop dirty pages tracking.
> - Get dirty pages bitmap. Its user space application's responsibility to
>   copy content of dirty pages from source to destination during migration.
> 
> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> structure. Bitmap size is calculated considering smallest supported page
> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> 
> Bitmap is populated for already pinned pages when bitmap is allocated for
> a vfio_dma with the smallest supported page size. Update bitmap from
> pinning functions when tracking is enabled. When user application queries
> bitmap, check if requested page size is same as page size used to
> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> error.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 242 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 236 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 70aeab921d0f..239f61764d03 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -71,6 +71,7 @@ struct vfio_iommu {
>  	unsigned int		dma_avail;
>  	bool			v2;
>  	bool			nesting;
> +	bool			dirty_page_tracking;
>  };
>  
>  struct vfio_domain {
> @@ -91,6 +92,7 @@ struct vfio_dma {
>  	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
>  	struct task_struct	*task;
>  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> +	unsigned long		*bitmap;
>  };
>  
>  struct vfio_group {
> @@ -125,7 +127,21 @@ struct vfio_regions {
>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>  					(!list_empty(&iommu->domain_list))
>  
> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> +
> +/*
> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> + * further casts to signed integer for unaligned multi-bit operation,
> + * __bitmap_set().
> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> + * system.
> + */
> +#define DIRTY_BITMAP_PAGES_MAX	((1UL << 31) - 1)
> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> +
>  static int put_pfn(unsigned long pfn, int prot);
> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
>  
>  /*
>   * This code handles mapping and unmapping of user data buffers
> @@ -175,6 +191,67 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +
> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> +{
> +	uint64_t npages = dma->size / pgsize;
> +

Shouldn't we test this against one of the MAX macros defined above?  It
would be bad if we could enable dirty tracking but not allow the user
to retrieve it.

> +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> +	if (!dma->bitmap)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
> +{
> +	struct rb_node *n = rb_first(&iommu->dma_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> +		struct rb_node *p;
> +		int ret;
> +
> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> +		if (ret) {
> +			struct rb_node *p = rb_prev(n);
> +
> +			for (; p; p = rb_prev(p)) {
> +				struct vfio_dma *dma = rb_entry(n,
> +							struct vfio_dma, node);
> +
> +				kfree(dma->bitmap);
> +				dma->bitmap = NULL;
> +			}
> +			return ret;
> +		}
> +
> +		if (RB_EMPTY_ROOT(&dma->pfn_list))
> +			continue;
> +
> +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> +							 node);
> +
> +			bitmap_set(dma->bitmap,
> +				   (vpfn->iova - dma->iova) / pgsize, 1);
> +		}
> +	}
> +	return 0;
> +}
> +
> +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
> +{
> +	struct rb_node *n = rb_first(&iommu->dma_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> +
> +		kfree(dma->bitmap);
> +		dma->bitmap = NULL;

Might be useful to have a vfio_dma_bitmap_free() for here and above.
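
For illustration, such a helper could be as simple as the sketch below
(hypothetical here, not part of this patch):

        static void vfio_dma_bitmap_free(struct vfio_dma *dma)
        {
                kfree(dma->bitmap);
                dma->bitmap = NULL;
        }

Both vfio_dma_bitmap_free_all() and the unwind path in
vfio_dma_bitmap_alloc_all() could then call it.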

> +	}
> +}
> +
>  /*
>   * Helper Functions for host iova-pfn list
>   */
> @@ -567,6 +644,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  			vfio_unpin_page_external(dma, iova, do_accounting);
>  			goto pin_unwind;
>  		}
> +
> +		if (iommu->dirty_page_tracking) {
> +			unsigned long pgshift =
> +					 __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			bitmap_set(dma->bitmap,
> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> +		}
>  	}
>  
>  	ret = i;
> @@ -801,6 +886,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  	vfio_unmap_unpin(iommu, dma, true);
>  	vfio_unlink_dma(iommu, dma);
>  	put_task_struct(dma->task);
> +	kfree(dma->bitmap);
>  	kfree(dma);
>  	iommu->dma_avail++;
>  }
> @@ -831,6 +917,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>  	return bitmap;
>  }
>  
> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> +				  size_t size, uint64_t pgsize,
> +				  u64 __user *bitmap)
> +{
> +	struct vfio_dma *dma;
> +	unsigned long pgshift = __ffs(pgsize);
> +	unsigned int npages, bitmap_size;
> +
> +	dma = vfio_find_dma(iommu, iova, 1);
> +
> +	if (!dma)
> +		return -EINVAL;
> +
> +	if (dma->iova != iova || dma->size != size)
> +		return -EINVAL;
> +
> +	npages = dma->size >> pgshift;
> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> +
> +	/* mark all pages dirty if all pages are pinned and mapped. */
> +	if (dma->iommu_mapped)
> +		bitmap_set(dma->bitmap, 0, npages);
> +
> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> +		return -EFAULT;

We still need to reset the bitmap here, clearing and re-adding the
pages that are still pinned.

https://lore.kernel.org/kvm/20200319070635.2ff5db56@x1.home/
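
A rough sketch of such a reset, assuming it sits in vfio_iova_dirty_bitmap()
right after the successful copy_to_user() and still under iommu->lock,
reusing this patch's own pfn_list walk:

        struct rb_node *p;

        /*
         * Clear the bitmap and re-add pages that are still pinned, so the
         * next query reports only pages dirtied since this one.
         */
        bitmap_clear(dma->bitmap, 0, npages);
        for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
                struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);

                bitmap_set(dma->bitmap,
                           (vpfn->iova - dma->iova) >> pgshift, 1);
        }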

> +
> +	return 0;
> +}
> +
> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> +{
> +	uint64_t bsize;
> +
> +	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX))
> +		return -EINVAL;
> +
> +	bsize = DIRTY_BITMAP_BYTES(npages);
> +
> +	if (bitmap_size < bsize)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  			     struct vfio_iommu_type1_dma_unmap *unmap)
>  {
> @@ -1038,16 +1168,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	unsigned long vaddr = map->vaddr;
>  	size_t size = map->size;
>  	int ret = 0, prot = 0;
> -	uint64_t mask;
> +	uint64_t pgsize;
>  	struct vfio_dma *dma;
>  
>  	/* Verify that none of our __u64 fields overflow */
>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
>  		return -EINVAL;
>  
> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> +	pgsize = (uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
>  
> -	WARN_ON(mask & PAGE_MASK);
> +	WARN_ON((pgsize - 1) & PAGE_MASK);
>  
>  	/* READ/WRITE from device perspective */
>  	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
> @@ -1055,7 +1185,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
>  		prot |= IOMMU_READ;
>  
> -	if (!prot || !size || (size | iova | vaddr) & mask)
> +	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1))
>  		return -EINVAL;
>  
>  	/* Don't allow IOVA or virtual address wrap */
> @@ -1130,6 +1260,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	else
>  		ret = vfio_pin_map_dma(iommu, dma, size);
>  
> +	if (!ret && iommu->dirty_page_tracking) {
> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> +		if (ret)
> +			vfio_remove_dma(iommu, dma);
> +	}
> +
>  out_unlock:
>  	mutex_unlock(&iommu->lock);
>  	return ret;
> @@ -2278,6 +2414,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>  			-EFAULT : 0;
> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> +		struct vfio_iommu_type1_dirty_bitmap dirty;
> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> +		int ret = 0;
> +
> +		if (!iommu->v2)
> +			return -EACCES;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> +				    flags);
> +
> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> +			return -EINVAL;
> +
> +		/* only one flag should be set at a time */
> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> +			return -EINVAL;
> +
> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			mutex_lock(&iommu->lock);
> +			if (!iommu->dirty_page_tracking) {
> +				ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
> +				if (!ret)
> +					iommu->dirty_page_tracking = true;
> +			}
> +			mutex_unlock(&iommu->lock);
> +			return ret;
> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> +			mutex_lock(&iommu->lock);
> +			if (iommu->dirty_page_tracking) {
> +				iommu->dirty_page_tracking = false;
> +				vfio_dma_bitmap_free_all(iommu);
> +			}
> +			mutex_unlock(&iommu->lock);
> +			return 0;
> +		} else if (dirty.flags &
> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> +			struct vfio_iommu_type1_dirty_bitmap_get range;
> +			unsigned long pgshift;
> +			size_t data_size = dirty.argsz - minsz;
> +			uint64_t iommu_pgsize =
> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			if (!data_size || data_size < sizeof(range))
> +				return -EINVAL;
> +
> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> +					   sizeof(range)))
> +				return -EFAULT;
> +
> +			/* allow only min supported pgsize */
> +			if (range.bitmap.pgsize != iommu_pgsize)
> +				return -EINVAL;
> +			if (range.iova & (iommu_pgsize - 1))
> +				return -EINVAL;
> +			if (!range.size || range.size & (iommu_pgsize - 1))
> +				return -EINVAL;
> +			if (range.iova + range.size < range.iova)
> +				return -EINVAL;
> +			if (!access_ok((void __user *)range.bitmap.data,
> +				       range.bitmap.size))
> +				return -EINVAL;
> +
> +			pgshift = __ffs(range.bitmap.pgsize);
> +			ret = verify_bitmap_size(range.size >> pgshift,
> +						 range.bitmap.size);
> +			if (ret)
> +				return ret;
> +
> +			mutex_lock(&iommu->lock);
> +			if (iommu->dirty_page_tracking)
> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> +						range.size, range.bitmap.pgsize,
> +						range.bitmap.data);
> +			else
> +				ret = -EINVAL;
> +			mutex_unlock(&iommu->lock);
> +
> +			return ret;
> +		}
>  	}
>  
>  	return -ENOTTY;
> @@ -2345,10 +2568,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>  
>  	vaddr = dma->vaddr + offset;
>  
> -	if (write)
> +	if (write) {
>  		*copied = __copy_to_user((void __user *)vaddr, data,
>  					 count) ? 0 : count;
> -	else
> +		if (*copied && iommu->dirty_page_tracking) {
> +			unsigned long pgshift =
> +				__ffs(vfio_pgsize_bitmap(iommu));
> +
> +			bitmap_set(dma->bitmap, offset >> pgshift,
> +				   *copied >> pgshift);
> +		}
> +	} else
>  		*copied = __copy_from_user(data, (void __user *)vaddr,
>  					   count) ? 0 : count;
>  	if (kthread)
Kirti Wankhede March 20, 2020, 5:49 p.m. UTC | #2
On 3/20/2020 4:27 AM, Alex Williamson wrote:
> On Fri, 20 Mar 2020 01:46:41 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
>> - Start dirty pages tracking while migration is active
>> - Stop dirty pages tracking.
>> - Get dirty pages bitmap. Its user space application's responsibility to
>>    copy content of dirty pages from source to destination during migration.
>>
>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
>> structure. Bitmap size is calculated considering smallest supported page
>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
>>
>> Bitmap is populated for already pinned pages when bitmap is allocated for
>> a vfio_dma with the smallest supported page size. Update bitmap from
>> pinning functions when tracking is enabled. When user application queries
>> bitmap, check if requested page size is same as page size used to
>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
>> error.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> ---
>>   drivers/vfio/vfio_iommu_type1.c | 242 +++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 236 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 70aeab921d0f..239f61764d03 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -71,6 +71,7 @@ struct vfio_iommu {
>>   	unsigned int		dma_avail;
>>   	bool			v2;
>>   	bool			nesting;
>> +	bool			dirty_page_tracking;
>>   };
>>   
>>   struct vfio_domain {
>> @@ -91,6 +92,7 @@ struct vfio_dma {
>>   	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
>>   	struct task_struct	*task;
>>   	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
>> +	unsigned long		*bitmap;
>>   };
>>   
>>   struct vfio_group {
>> @@ -125,7 +127,21 @@ struct vfio_regions {
>>   #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>>   					(!list_empty(&iommu->domain_list))
>>   
>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
>> +
>> +/*
>> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
>> + * further casts to signed integer for unaligned multi-bit operation,
>> + * __bitmap_set().
>> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
>> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
>> + * system.
>> + */
>> +#define DIRTY_BITMAP_PAGES_MAX	((1UL << 31) - 1)
>> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
>> +
>>   static int put_pfn(unsigned long pfn, int prot);
>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
>>   
>>   /*
>>    * This code handles mapping and unmapping of user data buffers
>> @@ -175,6 +191,67 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>>   	rb_erase(&old->node, &iommu->dma_list);
>>   }
>>   
>> +
>> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
>> +{
>> +	uint64_t npages = dma->size / pgsize;
>> +
> 
> Shouldn't we test this against one of the MAX macros defined above?  It
> would be bad if we could enabled dirty tracking but not allow the user
> to retrieve it.
> 

Yes, adding a check as below:

         if (npages > DIRTY_BITMAP_PAGES_MAX)
                 return -EINVAL;


>> +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
>> +	if (!dma->bitmap)
>> +		return -ENOMEM;
>> +
>> +	return 0;
>> +}
>> +
>> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
>> +{
>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>> +
>> +	for (; n; n = rb_next(n)) {
>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>> +		struct rb_node *p;
>> +		int ret;
>> +
>> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
>> +		if (ret) {
>> +			struct rb_node *p = rb_prev(n);
>> +
>> +			for (; p; p = rb_prev(p)) {
>> +				struct vfio_dma *dma = rb_entry(n,
>> +							struct vfio_dma, node);
>> +
>> +				kfree(dma->bitmap);
>> +				dma->bitmap = NULL;
>> +			}
>> +			return ret;
>> +		}
>> +
>> +		if (RB_EMPTY_ROOT(&dma->pfn_list))
>> +			continue;
>> +
>> +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
>> +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
>> +							 node);
>> +
>> +			bitmap_set(dma->bitmap,
>> +				   (vpfn->iova - dma->iova) / pgsize, 1);
>> +		}
>> +	}
>> +	return 0;
>> +}
>> +
>> +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
>> +{
>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>> +
>> +	for (; n; n = rb_next(n)) {
>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>> +
>> +		kfree(dma->bitmap);
>> +		dma->bitmap = NULL;
> 
> Might be useful to have a vfio_dma_bitmap_free() for here and above.
> 

Ok.

>> +	}
>> +}
>> +
>>   /*
>>    * Helper Functions for host iova-pfn list
>>    */
>> @@ -567,6 +644,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>   			vfio_unpin_page_external(dma, iova, do_accounting);
>>   			goto pin_unwind;
>>   		}
>> +
>> +		if (iommu->dirty_page_tracking) {
>> +			unsigned long pgshift =
>> +					 __ffs(vfio_pgsize_bitmap(iommu));
>> +
>> +			bitmap_set(dma->bitmap,
>> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
>> +		}
>>   	}
>>   
>>   	ret = i;
>> @@ -801,6 +886,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>>   	vfio_unmap_unpin(iommu, dma, true);
>>   	vfio_unlink_dma(iommu, dma);
>>   	put_task_struct(dma->task);
>> +	kfree(dma->bitmap);
>>   	kfree(dma);
>>   	iommu->dma_avail++;
>>   }
>> @@ -831,6 +917,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>>   	return bitmap;
>>   }
>>   
>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
>> +				  size_t size, uint64_t pgsize,
>> +				  u64 __user *bitmap)
>> +{
>> +	struct vfio_dma *dma;
>> +	unsigned long pgshift = __ffs(pgsize);
>> +	unsigned int npages, bitmap_size;
>> +
>> +	dma = vfio_find_dma(iommu, iova, 1);
>> +
>> +	if (!dma)
>> +		return -EINVAL;
>> +
>> +	if (dma->iova != iova || dma->size != size)
>> +		return -EINVAL;
>> +
>> +	npages = dma->size >> pgshift;
>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
>> +
>> +	/* mark all pages dirty if all pages are pinned and mapped. */
>> +	if (dma->iommu_mapped)
>> +		bitmap_set(dma->bitmap, 0, npages);
>> +
>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
>> +		return -EFAULT;
> 
> We still need to reset the bitmap here, clearing and re-adding the
> pages that are still pinned.
> 
> https://lore.kernel.org/kvm/20200319070635.2ff5db56@x1.home/
> 

I thought you agreed on my reply to it
https://lore.kernel.org/kvm/31621b70-02a9-2ea5-045f-f72b671fe703@nvidia.com/

 > Why re-populate when there will be no change since
 > vfio_iova_dirty_bitmap() is called holding iommu->lock? If there is any
 > pin request while vfio_iova_dirty_bitmap() is still working, it will
 > wait till iommu->lock is released. Bitmap will be populated when page is
 > pinned.

Thanks,
Kirti

>> +
>> +	return 0;
>> +}
>> +
>> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
>> +{
>> +	uint64_t bsize;
>> +
>> +	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX))
>> +		return -EINVAL;
>> +
>> +	bsize = DIRTY_BITMAP_BYTES(npages);
>> +
>> +	if (bitmap_size < bsize)
>> +		return -EINVAL;
>> +
>> +	return 0;
>> +}
>> +
>>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>   			     struct vfio_iommu_type1_dma_unmap *unmap)
>>   {
>> @@ -1038,16 +1168,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>   	unsigned long vaddr = map->vaddr;
>>   	size_t size = map->size;
>>   	int ret = 0, prot = 0;
>> -	uint64_t mask;
>> +	uint64_t pgsize;
>>   	struct vfio_dma *dma;
>>   
>>   	/* Verify that none of our __u64 fields overflow */
>>   	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
>>   		return -EINVAL;
>>   
>> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
>> +	pgsize = (uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
>>   
>> -	WARN_ON(mask & PAGE_MASK);
>> +	WARN_ON((pgsize - 1) & PAGE_MASK);
>>   
>>   	/* READ/WRITE from device perspective */
>>   	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
>> @@ -1055,7 +1185,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>   	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
>>   		prot |= IOMMU_READ;
>>   
>> -	if (!prot || !size || (size | iova | vaddr) & mask)
>> +	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1))
>>   		return -EINVAL;
>>   
>>   	/* Don't allow IOVA or virtual address wrap */
>> @@ -1130,6 +1260,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>   	else
>>   		ret = vfio_pin_map_dma(iommu, dma, size);
>>   
>> +	if (!ret && iommu->dirty_page_tracking) {
>> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
>> +		if (ret)
>> +			vfio_remove_dma(iommu, dma);
>> +	}
>> +
>>   out_unlock:
>>   	mutex_unlock(&iommu->lock);
>>   	return ret;
>> @@ -2278,6 +2414,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>>   
>>   		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>>   			-EFAULT : 0;
>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
>> +		struct vfio_iommu_type1_dirty_bitmap dirty;
>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
>> +		int ret = 0;
>> +
>> +		if (!iommu->v2)
>> +			return -EACCES;
>> +
>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
>> +				    flags);
>> +
>> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
>> +			return -EINVAL;
>> +
>> +		/* only one flag should be set at a time */
>> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
>> +			return -EINVAL;
>> +
>> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
>> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
>> +
>> +			mutex_lock(&iommu->lock);
>> +			if (!iommu->dirty_page_tracking) {
>> +				ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
>> +				if (!ret)
>> +					iommu->dirty_page_tracking = true;
>> +			}
>> +			mutex_unlock(&iommu->lock);
>> +			return ret;
>> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
>> +			mutex_lock(&iommu->lock);
>> +			if (iommu->dirty_page_tracking) {
>> +				iommu->dirty_page_tracking = false;
>> +				vfio_dma_bitmap_free_all(iommu);
>> +			}
>> +			mutex_unlock(&iommu->lock);
>> +			return 0;
>> +		} else if (dirty.flags &
>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
>> +			struct vfio_iommu_type1_dirty_bitmap_get range;
>> +			unsigned long pgshift;
>> +			size_t data_size = dirty.argsz - minsz;
>> +			uint64_t iommu_pgsize =
>> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
>> +
>> +			if (!data_size || data_size < sizeof(range))
>> +				return -EINVAL;
>> +
>> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
>> +					   sizeof(range)))
>> +				return -EFAULT;
>> +
>> +			/* allow only min supported pgsize */
>> +			if (range.bitmap.pgsize != iommu_pgsize)
>> +				return -EINVAL;
>> +			if (range.iova & (iommu_pgsize - 1))
>> +				return -EINVAL;
>> +			if (!range.size || range.size & (iommu_pgsize - 1))
>> +				return -EINVAL;
>> +			if (range.iova + range.size < range.iova)
>> +				return -EINVAL;
>> +			if (!access_ok((void __user *)range.bitmap.data,
>> +				       range.bitmap.size))
>> +				return -EINVAL;
>> +
>> +			pgshift = __ffs(range.bitmap.pgsize);
>> +			ret = verify_bitmap_size(range.size >> pgshift,
>> +						 range.bitmap.size);
>> +			if (ret)
>> +				return ret;
>> +
>> +			mutex_lock(&iommu->lock);
>> +			if (iommu->dirty_page_tracking)
>> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
>> +						range.size, range.bitmap.pgsize,
>> +						range.bitmap.data);
>> +			else
>> +				ret = -EINVAL;
>> +			mutex_unlock(&iommu->lock);
>> +
>> +			return ret;
>> +		}
>>   	}
>>   
>>   	return -ENOTTY;
>> @@ -2345,10 +2568,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>>   
>>   	vaddr = dma->vaddr + offset;
>>   
>> -	if (write)
>> +	if (write) {
>>   		*copied = __copy_to_user((void __user *)vaddr, data,
>>   					 count) ? 0 : count;
>> -	else
>> +		if (*copied && iommu->dirty_page_tracking) {
>> +			unsigned long pgshift =
>> +				__ffs(vfio_pgsize_bitmap(iommu));
>> +
>> +			bitmap_set(dma->bitmap, offset >> pgshift,
>> +				   *copied >> pgshift);
>> +		}
>> +	} else
>>   		*copied = __copy_from_user(data, (void __user *)vaddr,
>>   					   count) ? 0 : count;
>>   	if (kthread)
>
Alex Williamson March 20, 2020, 6:01 p.m. UTC | #3
On Fri, 20 Mar 2020 23:19:14 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 3/20/2020 4:27 AM, Alex Williamson wrote:
> > On Fri, 20 Mar 2020 01:46:41 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> >> - Start dirty pages tracking while migration is active
> >> - Stop dirty pages tracking.
> >> - Get dirty pages bitmap. Its user space application's responsibility to
> >>    copy content of dirty pages from source to destination during migration.
> >>
> >> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> >> structure. Bitmap size is calculated considering smallest supported page
> >> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> >>
> >> Bitmap is populated for already pinned pages when bitmap is allocated for
> >> a vfio_dma with the smallest supported page size. Update bitmap from
> >> pinning functions when tracking is enabled. When user application queries
> >> bitmap, check if requested page size is same as page size used to
> >> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> >> error.
> >>
> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >> ---
> >>   drivers/vfio/vfio_iommu_type1.c | 242 +++++++++++++++++++++++++++++++++++++++-
> >>   1 file changed, 236 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 70aeab921d0f..239f61764d03 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -71,6 +71,7 @@ struct vfio_iommu {
> >>   	unsigned int		dma_avail;
> >>   	bool			v2;
> >>   	bool			nesting;
> >> +	bool			dirty_page_tracking;
> >>   };
> >>   
> >>   struct vfio_domain {
> >> @@ -91,6 +92,7 @@ struct vfio_dma {
> >>   	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> >>   	struct task_struct	*task;
> >>   	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> >> +	unsigned long		*bitmap;
> >>   };
> >>   
> >>   struct vfio_group {
> >> @@ -125,7 +127,21 @@ struct vfio_regions {
> >>   #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >>   					(!list_empty(&iommu->domain_list))
> >>   
> >> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> >> +
> >> +/*
> >> + * Input argument of number of bits to bitmap_set() is unsigned integer, which
> >> + * further casts to signed integer for unaligned multi-bit operation,
> >> + * __bitmap_set().
> >> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
> >> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> >> + * system.
> >> + */
> >> +#define DIRTY_BITMAP_PAGES_MAX	((1UL << 31) - 1)
> >> +#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> >> +
> >>   static int put_pfn(unsigned long pfn, int prot);
> >> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> >>   
> >>   /*
> >>    * This code handles mapping and unmapping of user data buffers
> >> @@ -175,6 +191,67 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> >>   	rb_erase(&old->node, &iommu->dma_list);
> >>   }
> >>   
> >> +
> >> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> >> +{
> >> +	uint64_t npages = dma->size / pgsize;
> >> +  
> > 
> > Shouldn't we test this against one of the MAX macros defined above?  It
> > would be bad if we could enabled dirty tracking but not allow the user
> > to retrieve it.
> >   
> 
> Yes, adding check as below:
> 
>          if (npages > DIRTY_BITMAP_PAGES_MAX)
>                  -EINVAL;
> 
> 
> >> +	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> >> +	if (!dma->bitmap)
> >> +		return -ENOMEM;
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
> >> +{
> >> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >> +
> >> +	for (; n; n = rb_next(n)) {
> >> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >> +		struct rb_node *p;
> >> +		int ret;
> >> +
> >> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> >> +		if (ret) {
> >> +			struct rb_node *p = rb_prev(n);
> >> +
> >> +			for (; p; p = rb_prev(p)) {
> >> +				struct vfio_dma *dma = rb_entry(n,
> >> +							struct vfio_dma, node);
> >> +
> >> +				kfree(dma->bitmap);
> >> +				dma->bitmap = NULL;
> >> +			}
> >> +			return ret;
> >> +		}
> >> +
> >> +		if (RB_EMPTY_ROOT(&dma->pfn_list))
> >> +			continue;
> >> +
> >> +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> >> +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> >> +							 node);
> >> +
> >> +			bitmap_set(dma->bitmap,
> >> +				   (vpfn->iova - dma->iova) / pgsize, 1);
> >> +		}
> >> +	}
> >> +	return 0;
> >> +}
> >> +
> >> +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
> >> +{
> >> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >> +
> >> +	for (; n; n = rb_next(n)) {
> >> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >> +
> >> +		kfree(dma->bitmap);
> >> +		dma->bitmap = NULL;  
> > 
> > Might be useful to have a vfio_dma_bitmap_free() for here and above.
> >   
> 
> Ok.
> 
> >> +	}
> >> +}
> >> +
> >>   /*
> >>    * Helper Functions for host iova-pfn list
> >>    */
> >> @@ -567,6 +644,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>   			vfio_unpin_page_external(dma, iova, do_accounting);
> >>   			goto pin_unwind;
> >>   		}
> >> +
> >> +		if (iommu->dirty_page_tracking) {
> >> +			unsigned long pgshift =
> >> +					 __ffs(vfio_pgsize_bitmap(iommu));
> >> +
> >> +			bitmap_set(dma->bitmap,
> >> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> >> +		}
> >>   	}
> >>   
> >>   	ret = i;
> >> @@ -801,6 +886,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> >>   	vfio_unmap_unpin(iommu, dma, true);
> >>   	vfio_unlink_dma(iommu, dma);
> >>   	put_task_struct(dma->task);
> >> +	kfree(dma->bitmap);
> >>   	kfree(dma);
> >>   	iommu->dma_avail++;
> >>   }
> >> @@ -831,6 +917,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> >>   	return bitmap;
> >>   }
> >>   
> >> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> >> +				  size_t size, uint64_t pgsize,
> >> +				  u64 __user *bitmap)
> >> +{
> >> +	struct vfio_dma *dma;
> >> +	unsigned long pgshift = __ffs(pgsize);
> >> +	unsigned int npages, bitmap_size;
> >> +
> >> +	dma = vfio_find_dma(iommu, iova, 1);
> >> +
> >> +	if (!dma)
> >> +		return -EINVAL;
> >> +
> >> +	if (dma->iova != iova || dma->size != size)
> >> +		return -EINVAL;
> >> +
> >> +	npages = dma->size >> pgshift;
> >> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> >> +
> >> +	/* mark all pages dirty if all pages are pinned and mapped. */
> >> +	if (dma->iommu_mapped)
> >> +		bitmap_set(dma->bitmap, 0, npages);
> >> +
> >> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> >> +		return -EFAULT;  
> > 
> > We still need to reset the bitmap here, clearing and re-adding the
> > pages that are still pinned.
> > 
> > https://lore.kernel.org/kvm/20200319070635.2ff5db56@x1.home/
> >   
> 
> I thought you agreed on my reply to it
> https://lore.kernel.org/kvm/31621b70-02a9-2ea5-045f-f72b671fe703@nvidia.com/
> 
>  > Why re-populate when there will be no change since
>  > vfio_iova_dirty_bitmap() is called holding iommu->lock? If there is any
>  > pin request while vfio_iova_dirty_bitmap() is still working, it will
>  > wait till iommu->lock is released. Bitmap will be populated when page is
>  > pinned.  

As coded, dirty bits are only ever set in the bitmap, never cleared.
If a page is unpinned between iterations of the user recording the
dirty bitmap, it should be marked dirty in the iteration immediately
after the unpinning and not marked dirty in the following iteration.
That doesn't happen here.  We're reporting cumulative dirty pages since
logging was enabled, we need to be reporting dirty pages since the user
last retrieved the dirty bitmap.  The bitmap should be cleared and
currently pinned pages re-added after copying to the user.  Thanks,

Alex

> >> +	return 0;
> >> +}
> >> +
> >> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> >> +{
> >> +	uint64_t bsize;
> >> +
> >> +	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX))
> >> +		return -EINVAL;
> >> +
> >> +	bsize = DIRTY_BITMAP_BYTES(npages);
> >> +
> >> +	if (bitmap_size < bsize)
> >> +		return -EINVAL;
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>   			     struct vfio_iommu_type1_dma_unmap *unmap)
> >>   {
> >> @@ -1038,16 +1168,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>   	unsigned long vaddr = map->vaddr;
> >>   	size_t size = map->size;
> >>   	int ret = 0, prot = 0;
> >> -	uint64_t mask;
> >> +	uint64_t pgsize;
> >>   	struct vfio_dma *dma;
> >>   
> >>   	/* Verify that none of our __u64 fields overflow */
> >>   	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> >>   		return -EINVAL;
> >>   
> >> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> >> +	pgsize = (uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
> >>   
> >> -	WARN_ON(mask & PAGE_MASK);
> >> +	WARN_ON((pgsize - 1) & PAGE_MASK);
> >>   
> >>   	/* READ/WRITE from device perspective */
> >>   	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
> >> @@ -1055,7 +1185,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>   	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
> >>   		prot |= IOMMU_READ;
> >>   
> >> -	if (!prot || !size || (size | iova | vaddr) & mask)
> >> +	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1))
> >>   		return -EINVAL;
> >>   
> >>   	/* Don't allow IOVA or virtual address wrap */
> >> @@ -1130,6 +1260,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> >>   	else
> >>   		ret = vfio_pin_map_dma(iommu, dma, size);
> >>   
> >> +	if (!ret && iommu->dirty_page_tracking) {
> >> +		ret = vfio_dma_bitmap_alloc(dma, pgsize);
> >> +		if (ret)
> >> +			vfio_remove_dma(iommu, dma);
> >> +	}
> >> +
> >>   out_unlock:
> >>   	mutex_unlock(&iommu->lock);
> >>   	return ret;
> >> @@ -2278,6 +2414,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >>   
> >>   		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> >>   			-EFAULT : 0;
> >> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> >> +		struct vfio_iommu_type1_dirty_bitmap dirty;
> >> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> >> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> >> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> >> +		int ret = 0;
> >> +
> >> +		if (!iommu->v2)
> >> +			return -EACCES;
> >> +
> >> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> >> +				    flags);
> >> +
> >> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +
> >> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> >> +			return -EINVAL;
> >> +
> >> +		/* only one flag should be set at a time */
> >> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> >> +			return -EINVAL;
> >> +
> >> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> >> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> >> +
> >> +			mutex_lock(&iommu->lock);
> >> +			if (!iommu->dirty_page_tracking) {
> >> +				ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
> >> +				if (!ret)
> >> +					iommu->dirty_page_tracking = true;
> >> +			}
> >> +			mutex_unlock(&iommu->lock);
> >> +			return ret;
> >> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> >> +			mutex_lock(&iommu->lock);
> >> +			if (iommu->dirty_page_tracking) {
> >> +				iommu->dirty_page_tracking = false;
> >> +				vfio_dma_bitmap_free_all(iommu);
> >> +			}
> >> +			mutex_unlock(&iommu->lock);
> >> +			return 0;
> >> +		} else if (dirty.flags &
> >> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> >> +			struct vfio_iommu_type1_dirty_bitmap_get range;
> >> +			unsigned long pgshift;
> >> +			size_t data_size = dirty.argsz - minsz;
> >> +			uint64_t iommu_pgsize =
> >> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> >> +
> >> +			if (!data_size || data_size < sizeof(range))
> >> +				return -EINVAL;
> >> +
> >> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> >> +					   sizeof(range)))
> >> +				return -EFAULT;
> >> +
> >> +			/* allow only min supported pgsize */
> >> +			if (range.bitmap.pgsize != iommu_pgsize)
> >> +				return -EINVAL;
> >> +			if (range.iova & (iommu_pgsize - 1))
> >> +				return -EINVAL;
> >> +			if (!range.size || range.size & (iommu_pgsize - 1))
> >> +				return -EINVAL;
> >> +			if (range.iova + range.size < range.iova)
> >> +				return -EINVAL;
> >> +			if (!access_ok((void __user *)range.bitmap.data,
> >> +				       range.bitmap.size))
> >> +				return -EINVAL;
> >> +
> >> +			pgshift = __ffs(range.bitmap.pgsize);
> >> +			ret = verify_bitmap_size(range.size >> pgshift,
> >> +						 range.bitmap.size);
> >> +			if (ret)
> >> +				return ret;
> >> +
> >> +			mutex_lock(&iommu->lock);
> >> +			if (iommu->dirty_page_tracking)
> >> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> >> +						range.size, range.bitmap.pgsize,
> >> +						range.bitmap.data);
> >> +			else
> >> +				ret = -EINVAL;
> >> +			mutex_unlock(&iommu->lock);
> >> +
> >> +			return ret;
> >> +		}
> >>   	}
> >>   
> >>   	return -ENOTTY;
> >> @@ -2345,10 +2568,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> >>   
> >>   	vaddr = dma->vaddr + offset;
> >>   
> >> -	if (write)
> >> +	if (write) {
> >>   		*copied = __copy_to_user((void __user *)vaddr, data,
> >>   					 count) ? 0 : count;
> >> -	else
> >> +		if (*copied && iommu->dirty_page_tracking) {
> >> +			unsigned long pgshift =
> >> +				__ffs(vfio_pgsize_bitmap(iommu));
> >> +
> >> +			bitmap_set(dma->bitmap, offset >> pgshift,
> >> +				   *copied >> pgshift);
> >> +		}
> >> +	} else
> >>   		*copied = __copy_from_user(data, (void __user *)vaddr,
> >>   					   count) ? 0 : count;
> >>   	if (kthread)  
> >   
>
Kirti Wankhede March 20, 2020, 6:42 p.m. UTC | #4
On 3/20/2020 11:31 PM, Alex Williamson wrote:
> On Fri, 20 Mar 2020 23:19:14 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 3/20/2020 4:27 AM, Alex Williamson wrote:
>>> On Fri, 20 Mar 2020 01:46:41 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>    

<snip>

>>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
>>>> +				  size_t size, uint64_t pgsize,
>>>> +				  u64 __user *bitmap)
>>>> +{
>>>> +	struct vfio_dma *dma;
>>>> +	unsigned long pgshift = __ffs(pgsize);
>>>> +	unsigned int npages, bitmap_size;
>>>> +
>>>> +	dma = vfio_find_dma(iommu, iova, 1);
>>>> +
>>>> +	if (!dma)
>>>> +		return -EINVAL;
>>>> +
>>>> +	if (dma->iova != iova || dma->size != size)
>>>> +		return -EINVAL;
>>>> +
>>>> +	npages = dma->size >> pgshift;
>>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
>>>> +
>>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
>>>> +	if (dma->iommu_mapped)
>>>> +		bitmap_set(dma->bitmap, 0, npages);
>>>> +
>>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
>>>> +		return -EFAULT;
>>>
>>> We still need to reset the bitmap here, clearing and re-adding the
>>> pages that are still pinned.
>>>
>>> https://lore.kernel.org/kvm/20200319070635.2ff5db56@x1.home/
>>>    
>>
>> I thought you agreed on my reply to it
>> https://lore.kernel.org/kvm/31621b70-02a9-2ea5-045f-f72b671fe703@nvidia.com/
>>
>>   > Why re-populate when there will be no change since
>>   > vfio_iova_dirty_bitmap() is called holding iommu->lock? If there is any
>>   > pin request while vfio_iova_dirty_bitmap() is still working, it will
>>   > wait till iommu->lock is released. Bitmap will be populated when page is
>>   > pinned.
> 
> As coded, dirty bits are only ever set in the bitmap, never cleared.
> If a page is unpinned between iterations of the user recording the
> dirty bitmap, it should be marked dirty in the iteration immediately
> after the unpinning and not marked dirty in the following iteration.
> That doesn't happen here.  We're reporting cumulative dirty pages since
> logging was enabled, we need to be reporting dirty pages since the user
> last retrieved the dirty bitmap.  The bitmap should be cleared and
> currently pinned pages re-added after copying to the user.  Thanks,
> 

Does that mean we have to track every iteration? Do we really need that 
tracking?

Generally the flow is:
- vendor driver pin x pages
- Enter pre-copy phase where vCPUs are running - user starts dirty pages 
tracking, then user asks dirty bitmap, x pages reported dirty by 
VFIO_IOMMU_DIRTY_PAGES ioctl with _GET flag
- In pre-copy phase, vendor driver pins y more pages, now bitmap 
consists of x+y bits set
- In pre-copy phase, vendor driver unpins z pages, but bitmap is not 
updated, so again bitmap consists of x+y bits set.
- Enter in stop-and-copy phase, vCPUs are stopped, mdev devices are stopped
- user asks dirty bitmap - Since here vCPU and mdev devices are stopped, 
pages should not get dirty by guest driver or the physical device. 
Hence, x+y dirty pages would be reported.

I don't think we need to track every iteration of bitmap reporting.

Thanks,
Kirti
Alex Williamson March 20, 2020, 6:59 p.m. UTC | #5
On Sat, 21 Mar 2020 00:12:04 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 3/20/2020 11:31 PM, Alex Williamson wrote:
> > On Fri, 20 Mar 2020 23:19:14 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 3/20/2020 4:27 AM, Alex Williamson wrote:  
> >>> On Fri, 20 Mar 2020 01:46:41 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>      
> 
> <snip>
> 
> >>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> >>>> +				  size_t size, uint64_t pgsize,
> >>>> +				  u64 __user *bitmap)
> >>>> +{
> >>>> +	struct vfio_dma *dma;
> >>>> +	unsigned long pgshift = __ffs(pgsize);
> >>>> +	unsigned int npages, bitmap_size;
> >>>> +
> >>>> +	dma = vfio_find_dma(iommu, iova, 1);
> >>>> +
> >>>> +	if (!dma)
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	if (dma->iova != iova || dma->size != size)
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	npages = dma->size >> pgshift;
> >>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> >>>> +
> >>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
> >>>> +	if (dma->iommu_mapped)
> >>>> +		bitmap_set(dma->bitmap, 0, npages);
> >>>> +
> >>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> >>>> +		return -EFAULT;  
> >>>
> >>> We still need to reset the bitmap here, clearing and re-adding the
> >>> pages that are still pinned.
> >>>
> >>> https://lore.kernel.org/kvm/20200319070635.2ff5db56@x1.home/
> >>>      
> >>
> >> I thought you agreed on my reply to it
> >> https://lore.kernel.org/kvm/31621b70-02a9-2ea5-045f-f72b671fe703@nvidia.com/
> >>  
> >>   > Why re-populate when there will be no change since
> >>   > vfio_iova_dirty_bitmap() is called holding iommu->lock? If there is any
> >>   > pin request while vfio_iova_dirty_bitmap() is still working, it will
> >>   > wait till iommu->lock is released. Bitmap will be populated when page is
> >>   > pinned.  
> > 
> > As coded, dirty bits are only ever set in the bitmap, never cleared.
> > If a page is unpinned between iterations of the user recording the
> > dirty bitmap, it should be marked dirty in the iteration immediately
> > after the unpinning and not marked dirty in the following iteration.
> > That doesn't happen here.  We're reporting cumulative dirty pages since
> > logging was enabled, we need to be reporting dirty pages since the user
> > last retrieved the dirty bitmap.  The bitmap should be cleared and
> > currently pinned pages re-added after copying to the user.  Thanks,
> >   
> 
> Does that mean, we have to track every iteration? do we really need that 
> tracking?
> 
> Generally the flow is:
> - vendor driver pin x pages
> - Enter pre-copy-phase where vCPUs are running - user starts dirty pages 
> tracking, then user asks dirty bitmap, x pages reported dirty by 
> VFIO_IOMMU_DIRTY_PAGES ioctl with _GET flag
> - In pre-copy phase, vendor driver pins y more pages, now bitmap 
> consists of x+y bits set
> - In pre-copy phase, vendor driver unpins z pages, but bitmap is not 
> updated, so again bitmap consists of x+y bits set.
> - Enter in stop-and-copy phase, vCPUs are stopped, mdev devices are stopped
> - user asks dirty bitmap - Since here vCPU and mdev devices are stopped, 
> pages should not get dirty by guest driver or the physical device. 
> Hence, x+y dirty pages would be reported.
> 
> I don't think we need to track every iteration of bitmap reporting.

Yes, once a bitmap is read, it's reset.  In your example, after
unpinning z pages the user should still see a bitmap with x+y pages,
but once they've read that bitmap, the next bitmap should be x+y-z.
Userspace can make decisions about when to switch from pre-copy to
stop-and-copy based on convergence, ie. the slope of the line recording
dirty pages per iteration.  The implementation here never allows an
inflection point, dirty pages reported through vfio would always either
be flat or climbing.  There might also be a case where an iommu backed
device could start pinning pages during the course of a migration; how
would the bitmap ever revert from fully populated to only tracking the
pinned pages?  Thanks,

Alex
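
(For concreteness, a made-up walk-through of the semantics described above:
with x = 8 pages pinned when logging starts, y = 4 pinned and z = 2 unpinned
during pre-copy, the first _GET reports 12 dirty pages and then resets the
bitmap to the 10 pages still pinned; if nothing else is pinned or written,
the next _GET reports 10 rather than 12, so userspace can see the dirty-page
rate falling.)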
Kirti Wankhede March 23, 2020, 5:54 p.m. UTC | #6
On 3/21/2020 12:29 AM, Alex Williamson wrote:
> On Sat, 21 Mar 2020 00:12:04 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 3/20/2020 11:31 PM, Alex Williamson wrote:
>>> On Fri, 20 Mar 2020 23:19:14 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>    
>>>> On 3/20/2020 4:27 AM, Alex Williamson wrote:
>>>>> On Fri, 20 Mar 2020 01:46:41 +0530
>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>       
>>
>> <snip>
>>
>>>>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
>>>>>> +				  size_t size, uint64_t pgsize,
>>>>>> +				  u64 __user *bitmap)
>>>>>> +{
>>>>>> +	struct vfio_dma *dma;
>>>>>> +	unsigned long pgshift = __ffs(pgsize);
>>>>>> +	unsigned int npages, bitmap_size;
>>>>>> +
>>>>>> +	dma = vfio_find_dma(iommu, iova, 1);
>>>>>> +
>>>>>> +	if (!dma)
>>>>>> +		return -EINVAL;
>>>>>> +
>>>>>> +	if (dma->iova != iova || dma->size != size)
>>>>>> +		return -EINVAL;
>>>>>> +
>>>>>> +	npages = dma->size >> pgshift;
>>>>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
>>>>>> +
>>>>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
>>>>>> +	if (dma->iommu_mapped)
>>>>>> +		bitmap_set(dma->bitmap, 0, npages);
>>>>>> +
>>>>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
>>>>>> +		return -EFAULT;
>>>>>
>>>>> We still need to reset the bitmap here, clearing and re-adding the
>>>>> pages that are still pinned.
>>>>>
>>>>> https://lore.kernel.org/kvm/20200319070635.2ff5db56@x1.home/
>>>>>       
>>>>
>>>> I thought you agreed on my reply to it
>>>> https://lore.kernel.org/kvm/31621b70-02a9-2ea5-045f-f72b671fe703@nvidia.com/
>>>>   
>>>>    > Why re-populate when there will be no change since
>>>>    > vfio_iova_dirty_bitmap() is called holding iommu->lock? If there is any
>>>>    > pin request while vfio_iova_dirty_bitmap() is still working, it will
>>>>    > wait till iommu->lock is released. Bitmap will be populated when page is
>>>>    > pinned.
>>>
>>> As coded, dirty bits are only ever set in the bitmap, never cleared.
>>> If a page is unpinned between iterations of the user recording the
>>> dirty bitmap, it should be marked dirty in the iteration immediately
>>> after the unpinning and not marked dirty in the following iteration.
>>> That doesn't happen here.  We're reporting cumulative dirty pages since
>>> logging was enabled, we need to be reporting dirty pages since the user
>>> last retrieved the dirty bitmap.  The bitmap should be cleared and
>>> currently pinned pages re-added after copying to the user.  Thanks,
>>>    
>>
>> Does that mean, we have to track every iteration? do we really need that
>> tracking?
>>
>> Generally the flow is:
>> - vendor driver pin x pages
>> - Enter pre-copy-phase where vCPUs are running - user starts dirty pages
>> tracking, then user asks dirty bitmap, x pages reported dirty by
>> VFIO_IOMMU_DIRTY_PAGES ioctl with _GET flag
>> - In pre-copy phase, vendor driver pins y more pages, now bitmap
>> consists of x+y bits set
>> - In pre-copy phase, vendor driver unpins z pages, but bitmap is not
>> updated, so again bitmap consists of x+y bits set.
>> - Enter in stop-and-copy phase, vCPUs are stopped, mdev devices are stopped
>> - user asks dirty bitmap - Since here vCPU and mdev devices are stopped,
>> pages should not get dirty by guest driver or the physical device.
>> Hence, x+y dirty pages would be reported.
>>
>> I don't think we need to track every iteration of bitmap reporting.
> 
> Yes, once a bitmap is read, it's reset.  In your example, after
> unpinning z pages the user should still see a bitmap with x+y pages,
> but once they've read that bitmap, the next bitmap should be x+y-z.
> Userspace can make decisions about when to switch from pre-copy to
> stop-and-copy based on convergence, ie. the slope of the line recording
> dirty pages per iteration.  The implementation here never allows an
> inflection point, dirty pages reported through vfio would always either
> be flat or climbing.  There might also be a case that an iommu backed
> device could start pinning pages during the course of a migration, how
> would the bitmap ever revert from fully populated to only tracking the
> pinned pages?  Thanks,
> 

At KVM forum we discussed this - if guest driver pins say 1024 pages 
before migration starts, during pre-copy phase device can dirty 0 pages 
in the best case and 1024 pages in the worst case. In that case, the user will 
transfer the content of those 1024 pages during the pre-copy phase and again 
in the stop-and-copy phase, i.e. those pages will be copied twice. So we 
decided to only get dirty pages bitmap at stop-and-copy phase. If user 
is going to get dirty pages in stop-and-copy phase only, then that will 
be single iteration.
There aren't any devices yet that can track sys memory dirty pages. So 
we can go ahead with this patch, and support for dirty pages tracking 
during the pre-copy phase can be added later when there are consumers of 
that functionality.

Thanks,
Kirti
Alex Williamson March 23, 2020, 6:44 p.m. UTC | #7
On Mon, 23 Mar 2020 23:24:37 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 3/21/2020 12:29 AM, Alex Williamson wrote:
> > On Sat, 21 Mar 2020 00:12:04 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 3/20/2020 11:31 PM, Alex Williamson wrote:  
> >>> On Fri, 20 Mar 2020 23:19:14 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>      
> >>>> On 3/20/2020 4:27 AM, Alex Williamson wrote:  
> >>>>> On Fri, 20 Mar 2020 01:46:41 +0530
> >>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>         
> >>
> >> <snip>
> >>  
> >>>>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> >>>>>> +				  size_t size, uint64_t pgsize,
> >>>>>> +				  u64 __user *bitmap)
> >>>>>> +{
> >>>>>> +	struct vfio_dma *dma;
> >>>>>> +	unsigned long pgshift = __ffs(pgsize);
> >>>>>> +	unsigned int npages, bitmap_size;
> >>>>>> +
> >>>>>> +	dma = vfio_find_dma(iommu, iova, 1);
> >>>>>> +
> >>>>>> +	if (!dma)
> >>>>>> +		return -EINVAL;
> >>>>>> +
> >>>>>> +	if (dma->iova != iova || dma->size != size)
> >>>>>> +		return -EINVAL;
> >>>>>> +
> >>>>>> +	npages = dma->size >> pgshift;
> >>>>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> >>>>>> +
> >>>>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
> >>>>>> +	if (dma->iommu_mapped)
> >>>>>> +		bitmap_set(dma->bitmap, 0, npages);
> >>>>>> +
> >>>>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> >>>>>> +		return -EFAULT;  
> >>>>>
> >>>>> We still need to reset the bitmap here, clearing and re-adding the
> >>>>> pages that are still pinned.
> >>>>>
> >>>>> https://lore.kernel.org/kvm/20200319070635.2ff5db56@x1.home/
> >>>>>         
> >>>>
> >>>> I thought you agreed on my reply to it
> >>>> https://lore.kernel.org/kvm/31621b70-02a9-2ea5-045f-f72b671fe703@nvidia.com/
> >>>>     
> >>>>    > Why re-populate when there will be no change since
> >>>>    > vfio_iova_dirty_bitmap() is called holding iommu->lock? If there is any
> >>>>    > pin request while vfio_iova_dirty_bitmap() is still working, it will
> >>>>    > wait till iommu->lock is released. Bitmap will be populated when page is
> >>>>    > pinned.  
> >>>
> >>> As coded, dirty bits are only ever set in the bitmap, never cleared.
> >>> If a page is unpinned between iterations of the user recording the
> >>> dirty bitmap, it should be marked dirty in the iteration immediately
> >>> after the unpinning and not marked dirty in the following iteration.
> >>> That doesn't happen here.  We're reporting cumulative dirty pages since
> >>> logging was enabled, we need to be reporting dirty pages since the user
> >>> last retrieved the dirty bitmap.  The bitmap should be cleared and
> >>> currently pinned pages re-added after copying to the user.  Thanks,
> >>>      
> >>
> >> Does that mean, we have to track every iteration? do we really need that
> >> tracking?
> >>
> >> Generally the flow is:
> >> - vendor driver pin x pages
> >> - Enter pre-copy-phase where vCPUs are running - user starts dirty pages
> >> tracking, then user asks dirty bitmap, x pages reported dirty by
> >> VFIO_IOMMU_DIRTY_PAGES ioctl with _GET flag
> >> - In pre-copy phase, vendor driver pins y more pages, now bitmap
> >> consists of x+y bits set
> >> - In pre-copy phase, vendor driver unpins z pages, but bitmap is not
> >> updated, so again bitmap consists of x+y bits set.
> >> - Enter in stop-and-copy phase, vCPUs are stopped, mdev devices are stopped
> >> - user asks dirty bitmap - Since here vCPU and mdev devices are stopped,
> >> pages should not get dirty by guest driver or the physical device.
> >> Hence, x+y dirty pages would be reported.
> >>
> >> I don't think we need to track every iteration of bitmap reporting.  
> > 
> > Yes, once a bitmap is read, it's reset.  In your example, after
> > unpinning z pages the user should still see a bitmap with x+y pages,
> > but once they've read that bitmap, the next bitmap should be x+y-z.
> > Userspace can make decisions about when to switch from pre-copy to
> > stop-and-copy based on convergence, ie. the slope of the line recording
> > dirty pages per iteration.  The implementation here never allows an
> > inflection point, dirty pages reported through vfio would always either
> > be flat or climbing.  There might also be a case that an iommu backed
> > device could start pinning pages during the course of a migration, how
> > would the bitmap ever revert from fully populated to only tracking the
> > pinned pages?  Thanks,
> >   
> 
> At KVM forum we discussed this - if guest driver pins say 1024 pages 
> before migration starts, during pre-copy phase device can dirty 0 pages 
> in best case and 1024 pages in worst case. In that case, user will 
> transfer content of 1024 pages during pre-copy phase and in 
> stop-and-copy phase also, that will be pages will be copied twice. So we 
> decided to only get dirty pages bitmap at stop-and-copy phase. If user 
> is going to get dirty pages in stop-and-copy phase only, then that will 
> be single iteration.
> There aren't any devices yet that can track sys memory dirty pages. So 
> we can go ahead with this patch and support for dirty pages tracking 
> during pre-copy phase can be added later when there will be consumers of 
> that functionality.

So if I understand this right, you're expecting the dirty bitmap to
accumulate dirty bits, in perpetuity, so that the user can only
retrieve them once at the end of migration?  But if that's the case,
the user could simply choose to not retrieve the bitmap until the end
of migration, and the result would be the same.  What we have here is that
dirty bits are never cleared, regardless of whether the user has seen
them, which is wrong.  Sorry, we had a lot of discussions at KVM forum,
I don't recall this specific one 5 months later and maybe we weren't
considering all aspects.  I see the behavior we have here as incorrect,
but it also seems relatively trivial to make correct.  I hope the QEMU
code isn't making us go through all this trouble to report a dirty
bitmap that gets thrown away because it expects the final one to be
cumulative since the beginning of dirty logging.  Thanks,

Alex
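
For reference, a minimal sketch of the reset being asked for here: clear the
bitmap once it has been copied to the user, then re-mark the pages that are
still pinned, so the next query reports only pages dirtied since this one.
The helper name is invented and the vfio_pfn/vfio_dma field names are
assumptions based on the quoted patch context, not code from the series.

/* Hypothetical sketch only, not part of this series. */
static void vfio_dma_reset_bitmap(struct vfio_dma *dma, unsigned long pgshift)
{
	struct rb_node *n;

	/* drop everything reported by the bitmap that was just copied out */
	bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);

	/* re-add pages that remain pinned; they stay dirty while pinned */
	for (n = rb_first(&dma->pfn_list); n; n = rb_next(n)) {
		struct vfio_pfn *vpfn = rb_entry(n, struct vfio_pfn, node);

		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
	}
}

Called right after the copy_to_user() in vfio_iova_dirty_bitmap(), still under
iommu->lock, something like this would give the per-iteration semantics
described above.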
Dr. David Alan Gilbert March 23, 2020, 6:51 p.m. UTC | #8
* Alex Williamson (alex.williamson@redhat.com) wrote:
> On Mon, 23 Mar 2020 23:24:37 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > On 3/21/2020 12:29 AM, Alex Williamson wrote:
> > > On Sat, 21 Mar 2020 00:12:04 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >   
> > >> On 3/20/2020 11:31 PM, Alex Williamson wrote:  
> > >>> On Fri, 20 Mar 2020 23:19:14 +0530
> > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >>>      
> > >>>> On 3/20/2020 4:27 AM, Alex Williamson wrote:  
> > >>>>> On Fri, 20 Mar 2020 01:46:41 +0530
> > >>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >>>>>         
> > >>
> > >> <snip>
> > >>  
> > >>>>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > >>>>>> +				  size_t size, uint64_t pgsize,
> > >>>>>> +				  u64 __user *bitmap)
> > >>>>>> +{
> > >>>>>> +	struct vfio_dma *dma;
> > >>>>>> +	unsigned long pgshift = __ffs(pgsize);
> > >>>>>> +	unsigned int npages, bitmap_size;
> > >>>>>> +
> > >>>>>> +	dma = vfio_find_dma(iommu, iova, 1);
> > >>>>>> +
> > >>>>>> +	if (!dma)
> > >>>>>> +		return -EINVAL;
> > >>>>>> +
> > >>>>>> +	if (dma->iova != iova || dma->size != size)
> > >>>>>> +		return -EINVAL;
> > >>>>>> +
> > >>>>>> +	npages = dma->size >> pgshift;
> > >>>>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > >>>>>> +
> > >>>>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
> > >>>>>> +	if (dma->iommu_mapped)
> > >>>>>> +		bitmap_set(dma->bitmap, 0, npages);
> > >>>>>> +
> > >>>>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> > >>>>>> +		return -EFAULT;  
> > >>>>>
> > >>>>> We still need to reset the bitmap here, clearing and re-adding the
> > >>>>> pages that are still pinned.
> > >>>>>
> > >>>>> https://lore.kernel.org/kvm/20200319070635.2ff5db56@x1.home/
> > >>>>>         
> > >>>>
> > >>>> I thought you agreed on my reply to it
> > >>>> https://lore.kernel.org/kvm/31621b70-02a9-2ea5-045f-f72b671fe703@nvidia.com/
> > >>>>     
> > >>>>    > Why re-populate when there will be no change since
> > >>>>    > vfio_iova_dirty_bitmap() is called holding iommu->lock? If there is any
> > >>>>    > pin request while vfio_iova_dirty_bitmap() is still working, it will
> > >>>>    > wait till iommu->lock is released. Bitmap will be populated when page is
> > >>>>    > pinned.  
> > >>>
> > >>> As coded, dirty bits are only ever set in the bitmap, never cleared.
> > >>> If a page is unpinned between iterations of the user recording the
> > >>> dirty bitmap, it should be marked dirty in the iteration immediately
> > >>> after the unpinning and not marked dirty in the following iteration.
> > >>> That doesn't happen here.  We're reporting cumulative dirty pages since
> > >>> logging was enabled, we need to be reporting dirty pages since the user
> > >>> last retrieved the dirty bitmap.  The bitmap should be cleared and
> > >>> currently pinned pages re-added after copying to the user.  Thanks,
> > >>>      
> > >>
> > >> Does that mean, we have to track every iteration? do we really need that
> > >> tracking?
> > >>
> > >> Generally the flow is:
> > >> - vendor driver pin x pages
> > >> - Enter pre-copy-phase where vCPUs are running - user starts dirty pages
> > >> tracking, then user asks dirty bitmap, x pages reported dirty by
> > >> VFIO_IOMMU_DIRTY_PAGES ioctl with _GET flag
> > >> - In pre-copy phase, vendor driver pins y more pages, now bitmap
> > >> consists of x+y bits set
> > >> - In pre-copy phase, vendor driver unpins z pages, but bitmap is not
> > >> updated, so again bitmap consists of x+y bits set.
> > >> - Enter in stop-and-copy phase, vCPUs are stopped, mdev devices are stopped
> > >> - user asks dirty bitmap - Since here vCPU and mdev devices are stopped,
> > >> pages should not get dirty by guest driver or the physical device.
> > >> Hence, x+y dirty pages would be reported.
> > >>
> > >> I don't think we need to track every iteration of bitmap reporting.  
> > > 
> > > Yes, once a bitmap is read, it's reset.  In your example, after
> > > unpinning z pages the user should still see a bitmap with x+y pages,
> > > but once they've read that bitmap, the next bitmap should be x+y-z.
> > > Userspace can make decisions about when to switch from pre-copy to
> > > stop-and-copy based on convergence, ie. the slope of the line recording
> > > dirty pages per iteration.  The implementation here never allows an
> > > inflection point, dirty pages reported through vfio would always either
> > > be flat or climbing.  There might also be a case that an iommu backed
> > > device could start pinning pages during the course of a migration, how
> > > would the bitmap ever revert from fully populated to only tracking the
> > > pinned pages?  Thanks,
> > >   
> > 
> > At KVM forum we discussed this - if guest driver pins say 1024 pages 
> > before migration starts, during pre-copy phase device can dirty 0 pages 
> > in best case and 1024 pages in worst case. In that case, user will 
> > transfer content of 1024 pages during pre-copy phase and in 
> > stop-and-copy phase also, that will be pages will be copied twice. So we 
> > decided to only get dirty pages bitmap at stop-and-copy phase. If user 
> > is going to get dirty pages in stop-and-copy phase only, then that will 
> > be single iteration.
> > There aren't any devices yet that can track sys memory dirty pages. So 
> > we can go ahead with this patch and support for dirty pages tracking 
> > during pre-copy phase can be added later when there will be consumers of 
> > that functionality.
> 
> So if I understand this right, you're expecting the dirty bitmap to
> accumulate dirty bits, in perpetuity, so that the user can only
> retrieve them once at the end of migration?  But if that's the case,
> the user could simply choose to not retrieve the bitmap until the end
> of migration, the result would be the same.  What we have here is that
> dirty bits are never cleared, regardless of whether the user has seen
> them, which is wrong.  Sorry, we had a lot of discussions at KVM forum,
> I don't recall this specific one 5 months later and maybe we weren't
> considering all aspects.  I see the behavior we have here as incorrect,
> but it also seems relatively trivial to make correct.  I hope the QEMU
> code isn't making us go through all this trouble to report a dirty
> bitmap that gets thrown away because it expects the final one to be
> cumulative since the beginning of dirty logging.  Thanks,

I remember the discussion that we couldn't track the system memory
dirtying with current hardware; so the question then is just to track
what has been pinned and then ideally put that memory off until the end.
(Which is interesting because I don't think we currently have a way
to delay RAM pages till the end in qemu).

[I still worry whether migration will be usable with any
significant amount of system ram that's pinned in this way; the
downside will very easily get above the threshold that people like]

Dave

> Alex
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Yan Zhao March 24, 2020, 3:01 a.m. UTC | #9
On Tue, Mar 24, 2020 at 02:51:14AM +0800, Dr. David Alan Gilbert wrote:
> * Alex Williamson (alex.williamson@redhat.com) wrote:
> > On Mon, 23 Mar 2020 23:24:37 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > 
> > > On 3/21/2020 12:29 AM, Alex Williamson wrote:
> > > > On Sat, 21 Mar 2020 00:12:04 +0530
> > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >   
> > > >> On 3/20/2020 11:31 PM, Alex Williamson wrote:  
> > > >>> On Fri, 20 Mar 2020 23:19:14 +0530
> > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >>>      
> > > >>>> On 3/20/2020 4:27 AM, Alex Williamson wrote:  
> > > >>>>> On Fri, 20 Mar 2020 01:46:41 +0530
> > > >>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >>>>>         
> > > >>
> > > >> <snip>
> > > >>  
> > > >>>>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > > >>>>>> +				  size_t size, uint64_t pgsize,
> > > >>>>>> +				  u64 __user *bitmap)
> > > >>>>>> +{
> > > >>>>>> +	struct vfio_dma *dma;
> > > >>>>>> +	unsigned long pgshift = __ffs(pgsize);
> > > >>>>>> +	unsigned int npages, bitmap_size;
> > > >>>>>> +
> > > >>>>>> +	dma = vfio_find_dma(iommu, iova, 1);
> > > >>>>>> +
> > > >>>>>> +	if (!dma)
> > > >>>>>> +		return -EINVAL;
> > > >>>>>> +
> > > >>>>>> +	if (dma->iova != iova || dma->size != size)
> > > >>>>>> +		return -EINVAL;
> > > >>>>>> +
> > > >>>>>> +	npages = dma->size >> pgshift;
> > > >>>>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > > >>>>>> +
> > > >>>>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
> > > >>>>>> +	if (dma->iommu_mapped)
> > > >>>>>> +		bitmap_set(dma->bitmap, 0, npages);
> > > >>>>>> +
> > > >>>>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> > > >>>>>> +		return -EFAULT;  
> > > >>>>>
> > > >>>>> We still need to reset the bitmap here, clearing and re-adding the
> > > >>>>> pages that are still pinned.
> > > >>>>>
> > > >>>>> https://lore.kernel.org/kvm/20200319070635.2ff5db56@x1.home/
> > > >>>>>         
> > > >>>>
> > > >>>> I thought you agreed on my reply to it
> > > >>>> https://lore.kernel.org/kvm/31621b70-02a9-2ea5-045f-f72b671fe703@nvidia.com/
> > > >>>>     
> > > >>>>    > Why re-populate when there will be no change since
> > > >>>>    > vfio_iova_dirty_bitmap() is called holding iommu->lock? If there is any
> > > >>>>    > pin request while vfio_iova_dirty_bitmap() is still working, it will
> > > >>>>    > wait till iommu->lock is released. Bitmap will be populated when page is
> > > >>>>    > pinned.  
> > > >>>
> > > >>> As coded, dirty bits are only ever set in the bitmap, never cleared.
> > > >>> If a page is unpinned between iterations of the user recording the
> > > >>> dirty bitmap, it should be marked dirty in the iteration immediately
> > > >>> after the unpinning and not marked dirty in the following iteration.
> > > >>> That doesn't happen here.  We're reporting cumulative dirty pages since
> > > >>> logging was enabled, we need to be reporting dirty pages since the user
> > > >>> last retrieved the dirty bitmap.  The bitmap should be cleared and
> > > >>> currently pinned pages re-added after copying to the user.  Thanks,
> > > >>>      
> > > >>
> > > >> Does that mean, we have to track every iteration? do we really need that
> > > >> tracking?
> > > >>
> > > >> Generally the flow is:
> > > >> - vendor driver pin x pages
> > > >> - Enter pre-copy-phase where vCPUs are running - user starts dirty pages
> > > >> tracking, then user asks dirty bitmap, x pages reported dirty by
> > > >> VFIO_IOMMU_DIRTY_PAGES ioctl with _GET flag
> > > >> - In pre-copy phase, vendor driver pins y more pages, now bitmap
> > > >> consists of x+y bits set
> > > >> - In pre-copy phase, vendor driver unpins z pages, but bitmap is not
> > > >> updated, so again bitmap consists of x+y bits set.
> > > >> - Enter in stop-and-copy phase, vCPUs are stopped, mdev devices are stopped
> > > >> - user asks dirty bitmap - Since here vCPU and mdev devices are stopped,
> > > >> pages should not get dirty by guest driver or the physical device.
> > > >> Hence, x+y dirty pages would be reported.
> > > >>
> > > >> I don't think we need to track every iteration of bitmap reporting.  
> > > > 
> > > > Yes, once a bitmap is read, it's reset.  In your example, after
> > > > unpinning z pages the user should still see a bitmap with x+y pages,
> > > > but once they've read that bitmap, the next bitmap should be x+y-z.
> > > > Userspace can make decisions about when to switch from pre-copy to
> > > > stop-and-copy based on convergence, ie. the slope of the line recording
> > > > dirty pages per iteration.  The implementation here never allows an
> > > > inflection point, dirty pages reported through vfio would always either
> > > > be flat or climbing.  There might also be a case that an iommu backed
> > > > device could start pinning pages during the course of a migration, how
> > > > would the bitmap ever revert from fully populated to only tracking the
> > > > pinned pages?  Thanks,
> > > >   
> > > 
> > > At KVM forum we discussed this - if guest driver pins say 1024 pages 
> > > before migration starts, during pre-copy phase device can dirty 0 pages 
> > > in best case and 1024 pages in worst case. In that case, user will 
> > > transfer content of 1024 pages during pre-copy phase and in 
> > > stop-and-copy phase also, that will be pages will be copied twice. So we 
> > > decided to only get dirty pages bitmap at stop-and-copy phase. If user 
> > > is going to get dirty pages in stop-and-copy phase only, then that will 
> > > be single iteration.
> > > There aren't any devices yet that can track sys memory dirty pages. So 
> > > we can go ahead with this patch and support for dirty pages tracking 
> > > during pre-copy phase can be added later when there will be consumers of 
> > > that functionality.
> > 
> > So if I understand this right, you're expecting the dirty bitmap to
> > accumulate dirty bits, in perpetuity, so that the user can only
> > retrieve them once at the end of migration?  But if that's the case,
> > the user could simply choose to not retrieve the bitmap until the end
> > of migration, the result would be the same.  What we have here is that
> > dirty bits are never cleared, regardless of whether the user has seen
> > them, which is wrong.  Sorry, we had a lot of discussions at KVM forum,
> > I don't recall this specific one 5 months later and maybe we weren't
> > considering all aspects.  I see the behavior we have here as incorrect,
> > but it also seems relatively trivial to make correct.  I hope the QEMU
> > code isn't making us go through all this trouble to report a dirty
> > bitmap that gets thrown away because it expects the final one to be
> > cumulative since the beginning of dirty logging.  Thanks,
> 
> I remember the discussion that we couldn't track the system memory
> dirtying with current hardware; so the question then is just to track
Hi Dave,
there are already devices that are able to track system memory,
in two ways:
(1) a software method, like VF support for the "Intel(R) Ethernet Controller
XL710 Family".
(2) a hardware method, through a hardware internal buffer (one Intel
hardware feature that is not yet public, but will be very soon) or through
the VT-d 3.0 IOMMU.

We already have code, verified using both ways, that tracks system memory
at a fine-grained level.


> what has been pinned and then ideally put that memory off until the end.
> (Which is interesting because I don't think we currently have  a way
> to delay RAM pages till the end in qemu).

I think the problem here is that we mixed pinned pages with dirty pages.
Yes, pinned pages for mdev devices are likely to stay dirty continuously
until the device is stopped.
But for devices that are able to report dirty pages, dirtied pages
will be marked again if the hardware writes to them later.

So, is it good to introduce a capability to let vfio/qemu know how to
treat the dirty pages?
(1) for devices that have no fine-grained dirty page tracking capability
  a. pinned pages are regarded as dirty pages. They are not cleared by
  a dirty page query.
  b. unpinned pages are regarded as dirty pages. They are cleared by
  a dirty page query or the UNMAP ioctl.
(2) for devices that have fine-grained dirty page tracking capability
   a. pinned/unpinned pages are not regarded as dirty pages.
   b. only pages they report are regarded as dirty pages, and those are to be
   cleared by a dirty page query and the UNMAP ioctl.
(3) for dirty page marking APIs, like vfio_dma_rw()...
   pages marked by them are regarded as dirty and are to be cleared by
   a dirty page query and the UNMAP ioctl.

For (1), QEMU VFIO only reports the dirty page amount and would not transfer
those pages until the last round.
For (2) and (3), QEMU VFIO should report and transfer them in each
round.
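
A rough illustration of the split proposed in (1)-(3) above, with invented
names; this is only a sketch of the idea, not code from the series. It assumes
a second, hypothetical bitmap (dma->pinned_bitmap) holding the category (1)
pages that stay dirty for as long as they are pinned.

/* Hypothetical sketch of the proposal above, not part of this series. */
enum vfio_dirty_source {
	VFIO_DIRTY_FROM_PIN,	/* (1) pinned pages, dirty for as long as pinned */
	VFIO_DIRTY_FROM_HW,	/* (2) fine-grained reports from device or IOMMU */
	VFIO_DIRTY_FROM_KERNEL,	/* (3) kernel-side writes, e.g. vfio_dma_rw() */
};

static void vfio_dirty_clear_on_read(struct vfio_dma *dma, unsigned int npages)
{
	/*
	 * Categories (2) and (3) are cleared by a query; category (1) bits
	 * are retained because the pages remain pinned and presumed dirty.
	 */
	bitmap_and(dma->bitmap, dma->bitmap, dma->pinned_bitmap, npages);
}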


> [I still worry whether migration will be usable with any
> significant amount of system ram that's pinned in this way; the
> downside will very easily get above the threshold that people like]
> 
Yes. That's why we have to do multi-round dirty page query and
transfer, and clear the dirty bitmaps in each round, for devices that are
able to track at a fine grain.
And that's why we have to report the amount of dirty pages before the
stop-and-copy phase for mdev devices, so that people are able to know
the real downtime as accurately as possible.

Thanks
Yan
Kirti Wankhede March 24, 2020, 9:45 a.m. UTC | #10
On 3/24/2020 8:31 AM, Yan Zhao wrote:
> On Tue, Mar 24, 2020 at 02:51:14AM +0800, Dr. David Alan Gilbert wrote:
>> * Alex Williamson (alex.williamson@redhat.com) wrote:
>>> On Mon, 23 Mar 2020 23:24:37 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>
>>>> On 3/21/2020 12:29 AM, Alex Williamson wrote:
>>>>> On Sat, 21 Mar 2020 00:12:04 +0530
>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>    
>>>>>> On 3/20/2020 11:31 PM, Alex Williamson wrote:
>>>>>>> On Fri, 20 Mar 2020 23:19:14 +0530
>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>>       
>>>>>>>> On 3/20/2020 4:27 AM, Alex Williamson wrote:
>>>>>>>>> On Fri, 20 Mar 2020 01:46:41 +0530
>>>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>>>>          
>>>>>>
>>>>>> <snip>
>>>>>>   
>>>>>>>>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
>>>>>>>>>> +				  size_t size, uint64_t pgsize,
>>>>>>>>>> +				  u64 __user *bitmap)
>>>>>>>>>> +{
>>>>>>>>>> +	struct vfio_dma *dma;
>>>>>>>>>> +	unsigned long pgshift = __ffs(pgsize);
>>>>>>>>>> +	unsigned int npages, bitmap_size;
>>>>>>>>>> +
>>>>>>>>>> +	dma = vfio_find_dma(iommu, iova, 1);
>>>>>>>>>> +
>>>>>>>>>> +	if (!dma)
>>>>>>>>>> +		return -EINVAL;
>>>>>>>>>> +
>>>>>>>>>> +	if (dma->iova != iova || dma->size != size)
>>>>>>>>>> +		return -EINVAL;
>>>>>>>>>> +
>>>>>>>>>> +	npages = dma->size >> pgshift;
>>>>>>>>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
>>>>>>>>>> +
>>>>>>>>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
>>>>>>>>>> +	if (dma->iommu_mapped)
>>>>>>>>>> +		bitmap_set(dma->bitmap, 0, npages);
>>>>>>>>>> +
>>>>>>>>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
>>>>>>>>>> +		return -EFAULT;
>>>>>>>>>
>>>>>>>>> We still need to reset the bitmap here, clearing and re-adding the
>>>>>>>>> pages that are still pinned.
>>>>>>>>>
>>>>>>>>> https://lore.kernel.org/kvm/20200319070635.2ff5db56@x1.home/
>>>>>>>>>          
>>>>>>>>
>>>>>>>> I thought you agreed on my reply to it
>>>>>>>> https://lore.kernel.org/kvm/31621b70-02a9-2ea5-045f-f72b671fe703@nvidia.com/
>>>>>>>>      
>>>>>>>>     > Why re-populate when there will be no change since
>>>>>>>>     > vfio_iova_dirty_bitmap() is called holding iommu->lock? If there is any
>>>>>>>>     > pin request while vfio_iova_dirty_bitmap() is still working, it will
>>>>>>>>     > wait till iommu->lock is released. Bitmap will be populated when page is
>>>>>>>>     > pinned.
>>>>>>>
>>>>>>> As coded, dirty bits are only ever set in the bitmap, never cleared.
>>>>>>> If a page is unpinned between iterations of the user recording the
>>>>>>> dirty bitmap, it should be marked dirty in the iteration immediately
>>>>>>> after the unpinning and not marked dirty in the following iteration.
>>>>>>> That doesn't happen here.  We're reporting cumulative dirty pages since
>>>>>>> logging was enabled, we need to be reporting dirty pages since the user
>>>>>>> last retrieved the dirty bitmap.  The bitmap should be cleared and
>>>>>>> currently pinned pages re-added after copying to the user.  Thanks,
>>>>>>>       
>>>>>>
>>>>>> Does that mean, we have to track every iteration? do we really need that
>>>>>> tracking?
>>>>>>
>>>>>> Generally the flow is:
>>>>>> - vendor driver pin x pages
>>>>>> - Enter pre-copy-phase where vCPUs are running - user starts dirty pages
>>>>>> tracking, then user asks dirty bitmap, x pages reported dirty by
>>>>>> VFIO_IOMMU_DIRTY_PAGES ioctl with _GET flag
>>>>>> - In pre-copy phase, vendor driver pins y more pages, now bitmap
>>>>>> consists of x+y bits set
>>>>>> - In pre-copy phase, vendor driver unpins z pages, but bitmap is not
>>>>>> updated, so again bitmap consists of x+y bits set.
>>>>>> - Enter in stop-and-copy phase, vCPUs are stopped, mdev devices are stopped
>>>>>> - user asks dirty bitmap - Since here vCPU and mdev devices are stopped,
>>>>>> pages should not get dirty by guest driver or the physical device.
>>>>>> Hence, x+y dirty pages would be reported.
>>>>>>
>>>>>> I don't think we need to track every iteration of bitmap reporting.
>>>>>
>>>>> Yes, once a bitmap is read, it's reset.  In your example, after
>>>>> unpinning z pages the user should still see a bitmap with x+y pages,
>>>>> but once they've read that bitmap, the next bitmap should be x+y-z.
>>>>> Userspace can make decisions about when to switch from pre-copy to
>>>>> stop-and-copy based on convergence, ie. the slope of the line recording
>>>>> dirty pages per iteration.  The implementation here never allows an
>>>>> inflection point, dirty pages reported through vfio would always either
>>>>> be flat or climbing.  There might also be a case that an iommu backed
>>>>> device could start pinning pages during the course of a migration, how
>>>>> would the bitmap ever revert from fully populated to only tracking the
>>>>> pinned pages?  Thanks,
>>>>>    
>>>>
>>>> At KVM forum we discussed this - if guest driver pins say 1024 pages
>>>> before migration starts, during pre-copy phase device can dirty 0 pages
>>>> in best case and 1024 pages in worst case. In that case, user will
>>>> transfer content of 1024 pages during pre-copy phase and in
>>>> stop-and-copy phase also, that will be pages will be copied twice. So we
>>>> decided to only get dirty pages bitmap at stop-and-copy phase. If user
>>>> is going to get dirty pages in stop-and-copy phase only, then that will
>>>> be single iteration.
>>>> There aren't any devices yet that can track sys memory dirty pages. So
>>>> we can go ahead with this patch and support for dirty pages tracking
>>>> during pre-copy phase can be added later when there will be consumers of
>>>> that functionality.
>>>
>>> So if I understand this right, you're expecting the dirty bitmap to
>>> accumulate dirty bits, in perpetuity, so that the user can only
>>> retrieve them once at the end of migration?  But if that's the case,
>>> the user could simply choose to not retrieve the bitmap until the end
>>> of migration, the result would be the same.  What we have here is that
>>> dirty bits are never cleared, regardless of whether the user has seen
>>> them, which is wrong.  Sorry, we had a lot of discussions at KVM forum,
>>> I don't recall this specific one 5 months later and maybe we weren't
>>> considering all aspects.  I see the behavior we have here as incorrect,
>>> but it also seems relatively trivial to make correct.  I hope the QEMU
>>> code isn't making us go through all this trouble to report a dirty
>>> bitmap that gets thrown away because it expects the final one to be
>>> cumulative since the beginning of dirty logging.  Thanks,
>>
>> I remember the discussion that we couldn't track the system memory
>> dirtying with current hardware; so the question then is just to track

> hi Dave
> there are already devices that are able to track the system memory,
> through two ways:
> (1) software method. like VFs for "Intel(R) Ethernet Controller XL710 Family
> support".
> (2) hardware method. through hardware internal buffer (as one Intel
> internal hardware not yet to public, but very soon) or through VTD-3.0
> IOMMU.
> 
> we have already had code verified using the two ways to track system memory
> in fine-grained level.
> 
> 
>> what has been pinned and then ideally put that memory off until the end.
>> (Which is interesting because I don't think we currently have  a way
>> to delay RAM pages till the end in qemu).
> 
> I think the problem here is that we mixed pinned pages with dirty pages.
> yes, pinned pages for mdev devices are continuously likely to be dirty
> until device stopped.
> But for devices that are able to report dirty pages, dirtied pages
> will be marked again if hardware writes them later.
> 
> So, is it good to introduce a capability to let vfio/qemu know how to
> treat the dirty pages?
> (1) for devices have no fine-grained dirty page tracking capability
>    a. pinned pages are regarded as dirty pages. they are not cleared by
>    dirty page query
>    b. unpinned pages are regarded as dirty pages. they are cleared by
>    dirty page query or UNMAP ioctl.
> (2) for devices that have fine-grained dirty page tracking capability
>     a. pinned/unpinned pages are not regarded as dirty pages
>     b. only pages they reported are regarded as dirty pages and are to be
>     cleared by dirty page query and UNMAP ioctl.
> (3) for dirty pages marking APIs, like vfio_dma_rw()...
>     pages marked by them are regared as dirty and are to be cleared by
>     dirty page query and UNMAP ioctl
> 
> For (1), qemu VFIO only reports dirty page amount and would not transfer
> those pages until last round.

(1) and (3) can be implemented now. I don't think QEMU has support to 
delay certain pages marked dirty as of now. If QEMU queries the dirty pages 
bitmap in the pre-copy phase, all pinned pages reported as dirty will be 
transferred in pre-copy and again in the stop-and-copy phase.

> for (2) and (3), qemu VFIO should report and transfer them in each
> round.
>

(2) can be added later

Thanks,
Kirti

> 
>> [I still worry whether migration will be usable with any
>> significant amount of system ram that's pinned in this way; the
>> downside will very easily get above the threshold that people like]
>>
> yes. that's why we have to do multi-round dirty page query and
> transfer and clear the dirty bitmaps in each round for devices that are
> able to track in fine grain.
> and that's why we have to report the amount of dirty pages before
> stop-and-copy phase for mdev devices, so that people are able to know
> the real downtime as much as possible.
> 
> Thanks
> Yan
>
Alex Williamson March 24, 2020, 2:36 p.m. UTC | #11
On Mon, 23 Mar 2020 23:01:18 -0400
Yan Zhao <yan.y.zhao@intel.com> wrote:

> On Tue, Mar 24, 2020 at 02:51:14AM +0800, Dr. David Alan Gilbert wrote:
> > * Alex Williamson (alex.williamson@redhat.com) wrote:  
> > > On Mon, 23 Mar 2020 23:24:37 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >   
> > > > On 3/21/2020 12:29 AM, Alex Williamson wrote:  
> > > > > On Sat, 21 Mar 2020 00:12:04 +0530
> > > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >     
> > > > >> On 3/20/2020 11:31 PM, Alex Williamson wrote:    
> > > > >>> On Fri, 20 Mar 2020 23:19:14 +0530
> > > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >>>        
> > > > >>>> On 3/20/2020 4:27 AM, Alex Williamson wrote:    
> > > > >>>>> On Fri, 20 Mar 2020 01:46:41 +0530
> > > > >>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >>>>>           
> > > > >>
> > > > >> <snip>
> > > > >>    
> > > > >>>>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > > > >>>>>> +				  size_t size, uint64_t pgsize,
> > > > >>>>>> +				  u64 __user *bitmap)
> > > > >>>>>> +{
> > > > >>>>>> +	struct vfio_dma *dma;
> > > > >>>>>> +	unsigned long pgshift = __ffs(pgsize);
> > > > >>>>>> +	unsigned int npages, bitmap_size;
> > > > >>>>>> +
> > > > >>>>>> +	dma = vfio_find_dma(iommu, iova, 1);
> > > > >>>>>> +
> > > > >>>>>> +	if (!dma)
> > > > >>>>>> +		return -EINVAL;
> > > > >>>>>> +
> > > > >>>>>> +	if (dma->iova != iova || dma->size != size)
> > > > >>>>>> +		return -EINVAL;
> > > > >>>>>> +
> > > > >>>>>> +	npages = dma->size >> pgshift;
> > > > >>>>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > > > >>>>>> +
> > > > >>>>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
> > > > >>>>>> +	if (dma->iommu_mapped)
> > > > >>>>>> +		bitmap_set(dma->bitmap, 0, npages);
> > > > >>>>>> +
> > > > >>>>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> > > > >>>>>> +		return -EFAULT;    
> > > > >>>>>
> > > > >>>>> We still need to reset the bitmap here, clearing and re-adding the
> > > > >>>>> pages that are still pinned.
> > > > >>>>>
> > > > >>>>> https://lore.kernel.org/kvm/20200319070635.2ff5db56@x1.home/
> > > > >>>>>           
> > > > >>>>
> > > > >>>> I thought you agreed on my reply to it
> > > > >>>> https://lore.kernel.org/kvm/31621b70-02a9-2ea5-045f-f72b671fe703@nvidia.com/
> > > > >>>>       
> > > > >>>>    > Why re-populate when there will be no change since
> > > > >>>>    > vfio_iova_dirty_bitmap() is called holding iommu->lock? If there is any
> > > > >>>>    > pin request while vfio_iova_dirty_bitmap() is still working, it will
> > > > >>>>    > wait till iommu->lock is released. Bitmap will be populated when page is
> > > > >>>>    > pinned.    
> > > > >>>
> > > > >>> As coded, dirty bits are only ever set in the bitmap, never cleared.
> > > > >>> If a page is unpinned between iterations of the user recording the
> > > > >>> dirty bitmap, it should be marked dirty in the iteration immediately
> > > > >>> after the unpinning and not marked dirty in the following iteration.
> > > > >>> That doesn't happen here.  We're reporting cumulative dirty pages since
> > > > >>> logging was enabled, we need to be reporting dirty pages since the user
> > > > >>> last retrieved the dirty bitmap.  The bitmap should be cleared and
> > > > >>> currently pinned pages re-added after copying to the user.  Thanks,
> > > > >>>        
> > > > >>
> > > > >> Does that mean, we have to track every iteration? do we really need that
> > > > >> tracking?
> > > > >>
> > > > >> Generally the flow is:
> > > > >> - vendor driver pin x pages
> > > > >> - Enter pre-copy-phase where vCPUs are running - user starts dirty pages
> > > > >> tracking, then user asks dirty bitmap, x pages reported dirty by
> > > > >> VFIO_IOMMU_DIRTY_PAGES ioctl with _GET flag
> > > > >> - In pre-copy phase, vendor driver pins y more pages, now bitmap
> > > > >> consists of x+y bits set
> > > > >> - In pre-copy phase, vendor driver unpins z pages, but bitmap is not
> > > > >> updated, so again bitmap consists of x+y bits set.
> > > > >> - Enter in stop-and-copy phase, vCPUs are stopped, mdev devices are stopped
> > > > >> - user asks dirty bitmap - Since here vCPU and mdev devices are stopped,
> > > > >> pages should not get dirty by guest driver or the physical device.
> > > > >> Hence, x+y dirty pages would be reported.
> > > > >>
> > > > >> I don't think we need to track every iteration of bitmap reporting.    
> > > > > 
> > > > > Yes, once a bitmap is read, it's reset.  In your example, after
> > > > > unpinning z pages the user should still see a bitmap with x+y pages,
> > > > > but once they've read that bitmap, the next bitmap should be x+y-z.
> > > > > Userspace can make decisions about when to switch from pre-copy to
> > > > > stop-and-copy based on convergence, ie. the slope of the line recording
> > > > > dirty pages per iteration.  The implementation here never allows an
> > > > > inflection point, dirty pages reported through vfio would always either
> > > > > be flat or climbing.  There might also be a case that an iommu backed
> > > > > device could start pinning pages during the course of a migration, how
> > > > > would the bitmap ever revert from fully populated to only tracking the
> > > > > pinned pages?  Thanks,
> > > > >     
> > > > 
> > > > At KVM forum we discussed this - if guest driver pins say 1024 pages 
> > > > before migration starts, during pre-copy phase device can dirty 0 pages 
> > > > in best case and 1024 pages in worst case. In that case, user will 
> > > > transfer content of 1024 pages during pre-copy phase and in 
> > > > stop-and-copy phase also, that will be pages will be copied twice. So we 
> > > > decided to only get dirty pages bitmap at stop-and-copy phase. If user 
> > > > is going to get dirty pages in stop-and-copy phase only, then that will 
> > > > be single iteration.
> > > > There aren't any devices yet that can track sys memory dirty pages. So 
> > > > we can go ahead with this patch and support for dirty pages tracking 
> > > > during pre-copy phase can be added later when there will be consumers of 
> > > > that functionality.  
> > > 
> > > So if I understand this right, you're expecting the dirty bitmap to
> > > accumulate dirty bits, in perpetuity, so that the user can only
> > > retrieve them once at the end of migration?  But if that's the case,
> > > the user could simply choose to not retrieve the bitmap until the end
> > > of migration, the result would be the same.  What we have here is that
> > > dirty bits are never cleared, regardless of whether the user has seen
> > > them, which is wrong.  Sorry, we had a lot of discussions at KVM forum,
> > > I don't recall this specific one 5 months later and maybe we weren't
> > > considering all aspects.  I see the behavior we have here as incorrect,
> > > but it also seems relatively trivial to make correct.  I hope the QEMU
> > > code isn't making us go through all this trouble to report a dirty
> > > bitmap that gets thrown away because it expects the final one to be
> > > cumulative since the beginning of dirty logging.  Thanks,  
> > 
> > I remember the discussion that we couldn't track the system memory
> > dirtying with current hardware; so the question then is just to track  
> hi Dave
> there are already devices that are able to track the system memory,
> through two ways:
> (1) software method. like VFs for "Intel(R) Ethernet Controller XL710 Family
> support".
> (2) hardware method. through hardware internal buffer (as one Intel
> internal hardware not yet to public, but very soon) or through VTD-3.0
> IOMMU.
> 
> we have already had code verified using the two ways to track system memory
> in fine-grained level.
> 
> 
> > what has been pinned and then ideally put that memory off until the end.
> > (Which is interesting because I don't think we currently have  a way
> > to delay RAM pages till the end in qemu).  
> 
> I think the problem here is that we mixed pinned pages with dirty pages.

We are reporting dirty pages; pinned pages are just assumed to be dirty.

> yes, pinned pages for mdev devices are continuously likely to be dirty
> until device stopped.
> But for devices that are able to report dirty pages, dirtied pages
> will be marked again if hardware writes them later.
> 
> So, is it good to introduce a capability to let vfio/qemu know how to
> treat the dirty pages?

Dirty pages are dirty; QEMU doesn't need any special flag. Instead we
need to evolve different mechanisms for the vendor driver so that we
can differentiate pages pinned for read vs. pages pinned for write.
Perhaps interfaces to pin pages without dirtying them, and a separate
mechanism to dirty a previously pinned page, i.e. promote it permanently
or transiently to a writable page.
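
As an illustration of what such a mechanism could look like on the IOMMU
side, here is a hypothetical promote-to-dirty helper; the helper itself is
invented, and only vfio_find_dma(), iommu->lock and the per-vfio_dma bitmap
come from the quoted patch.

/* Hypothetical sketch only: mark one already pinned page dirty on demand. */
static int vfio_dma_mark_dirty(struct vfio_iommu *iommu, dma_addr_t iova,
			       size_t pgsize)
{
	unsigned long pgshift = __ffs(pgsize);
	struct vfio_dma *dma;
	int ret = 0;

	mutex_lock(&iommu->lock);
	dma = vfio_find_dma(iommu, iova, pgsize);
	if (!dma)
		ret = -EINVAL;
	else if (iommu->dirty_page_tracking)
		bitmap_set(dma->bitmap, (iova - dma->iova) >> pgshift, 1);
	mutex_unlock(&iommu->lock);

	return ret;
}

Pages pinned through a read-only pin interface would then stay clean until the
vendor driver promotes them with something along these lines.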

> (1) for devices have no fine-grained dirty page tracking capability
>   a. pinned pages are regarded as dirty pages. they are not cleared by
>   dirty page query
>   b. unpinned pages are regarded as dirty pages. they are cleared by
>   dirty page query or UNMAP ioctl.
> (2) for devices that have fine-grained dirty page tracking capability
>    a. pinned/unpinned pages are not regarded as dirty pages

We need a pin-read-only interface for this.

>    b. only pages they reported are regarded as dirty pages and are to be
>    cleared by dirty page query and UNMAP ioctl.

We need a set-dirty or promote-writable interface for this.

> (3) for dirty pages marking APIs, like vfio_dma_rw()...
>    pages marked by them are regared as dirty and are to be cleared by
>    dirty page query and UNMAP ioctl
> 
> For (1), qemu VFIO only reports dirty page amount and would not transfer
> those pages until last round.
> for (2) and (3), qemu VFIO should report and transfer them in each
> round.

IMO, QEMU should not be aware of any of this.  Userspace has an
interface to retrieve dirtied pages (period).  We should adjust the
pages that we report as dirtied to be accurate based on the
capabilities of the vendor driver.  We can evolve those internal APIs
between the vendor driver and vfio iommu over time without modifying
this user interface.

> > [I still worry whether migration will be usable with any
> > significant amount of system ram that's pinned in this way; the
> > downside will very easily get above the threshold that people like]
> >   
> yes. that's why we have to do multi-round dirty page query and
> transfer and clear the dirty bitmaps in each round for devices that are
> able to track in fine grain.
> and that's why we have to report the amount of dirty pages before
> stop-and-copy phase for mdev devices, so that people are able to know
> the real downtime as much as possible.

Yes, the dirty bitmap should accurately report the pages dirtied
since it was last retrieved, and over time we can add internal
interfaces to give vendor drivers more granularity in marking pinned
pages dirty, perhaps even exposing the bitmap to the vendor drivers
to set pages themselves.  I don't necessarily think it's worthwhile to
create a new class of dirtied pages to transfer at the end; we're
fighting a losing battle at that point.  We should be focusing on
improving the granularity of page dirtying in order to reduce the pages
transferred at the end of migration.  Thanks,

Alex
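
From the userspace side, the per-iteration retrieval described here could look
roughly like the sketch below. The VFIO_IOMMU_DIRTY_PAGES ioctl name comes
from this series, but the get_dirty_bitmap() wrapper and its argument
marshalling are assumed, since the exact uapi layout is what the series is
still settling.

#include <stdint.h>
#include <stdbool.h>

/* Assumed wrapper around VFIO_IOMMU_DIRTY_PAGES with the _GET flag. */
extern int get_dirty_bitmap(int container_fd, uint64_t iova, uint64_t size,
			    uint64_t pgsize, uint64_t *bitmap);

/*
 * With reset-on-read semantics, each query returns only the pages dirtied
 * since the previous query, so the caller can watch the trend and decide
 * when to switch from pre-copy to stop-and-copy.
 */
static bool precopy_converged(int container_fd, uint64_t iova, uint64_t size,
			      uint64_t pgsize, uint64_t *bitmap,
			      uint64_t prev_dirty, uint64_t threshold)
{
	uint64_t dirty = 0, i, words = (size / pgsize + 63) / 64;

	if (get_dirty_bitmap(container_fd, iova, size, pgsize, bitmap))
		return false;

	for (i = 0; i < words; i++)
		dirty += __builtin_popcountll(bitmap[i]);

	return dirty <= threshold && dirty <= prev_dirty;
}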
Dr. David Alan Gilbert March 24, 2020, 8:23 p.m. UTC | #12
* Alex Williamson (alex.williamson@redhat.com) wrote:
> On Mon, 23 Mar 2020 23:01:18 -0400
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> > On Tue, Mar 24, 2020 at 02:51:14AM +0800, Dr. David Alan Gilbert wrote:
> > > * Alex Williamson (alex.williamson@redhat.com) wrote:  
> > > > On Mon, 23 Mar 2020 23:24:37 +0530
> > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >   
> > > > > On 3/21/2020 12:29 AM, Alex Williamson wrote:  
> > > > > > On Sat, 21 Mar 2020 00:12:04 +0530
> > > > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > > >     
> > > > > >> On 3/20/2020 11:31 PM, Alex Williamson wrote:    
> > > > > >>> On Fri, 20 Mar 2020 23:19:14 +0530
> > > > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > > >>>        
> > > > > >>>> On 3/20/2020 4:27 AM, Alex Williamson wrote:    
> > > > > >>>>> On Fri, 20 Mar 2020 01:46:41 +0530
> > > > > >>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > > >>>>>           
> > > > > >>
> > > > > >> <snip>
> > > > > >>    
> > > > > >>>>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > > > > >>>>>> +				  size_t size, uint64_t pgsize,
> > > > > >>>>>> +				  u64 __user *bitmap)
> > > > > >>>>>> +{
> > > > > >>>>>> +	struct vfio_dma *dma;
> > > > > >>>>>> +	unsigned long pgshift = __ffs(pgsize);
> > > > > >>>>>> +	unsigned int npages, bitmap_size;
> > > > > >>>>>> +
> > > > > >>>>>> +	dma = vfio_find_dma(iommu, iova, 1);
> > > > > >>>>>> +
> > > > > >>>>>> +	if (!dma)
> > > > > >>>>>> +		return -EINVAL;
> > > > > >>>>>> +
> > > > > >>>>>> +	if (dma->iova != iova || dma->size != size)
> > > > > >>>>>> +		return -EINVAL;
> > > > > >>>>>> +
> > > > > >>>>>> +	npages = dma->size >> pgshift;
> > > > > >>>>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > > > > >>>>>> +
> > > > > >>>>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
> > > > > >>>>>> +	if (dma->iommu_mapped)
> > > > > >>>>>> +		bitmap_set(dma->bitmap, 0, npages);
> > > > > >>>>>> +
> > > > > >>>>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> > > > > >>>>>> +		return -EFAULT;    
> > > > > >>>>>
> > > > > >>>>> We still need to reset the bitmap here, clearing and re-adding the
> > > > > >>>>> pages that are still pinned.
> > > > > >>>>>
> > > > > >>>>> https://lore.kernel.org/kvm/20200319070635.2ff5db56@x1.home/
> > > > > >>>>>           
> > > > > >>>>
> > > > > >>>> I thought you agreed on my reply to it
> > > > > >>>> https://lore.kernel.org/kvm/31621b70-02a9-2ea5-045f-f72b671fe703@nvidia.com/
> > > > > >>>>       
> > > > > >>>>    > Why re-populate when there will be no change since
> > > > > >>>>    > vfio_iova_dirty_bitmap() is called holding iommu->lock? If there is any
> > > > > >>>>    > pin request while vfio_iova_dirty_bitmap() is still working, it will
> > > > > >>>>    > wait till iommu->lock is released. Bitmap will be populated when page is
> > > > > >>>>    > pinned.    
> > > > > >>>
> > > > > >>> As coded, dirty bits are only ever set in the bitmap, never cleared.
> > > > > >>> If a page is unpinned between iterations of the user recording the
> > > > > >>> dirty bitmap, it should be marked dirty in the iteration immediately
> > > > > >>> after the unpinning and not marked dirty in the following iteration.
> > > > > >>> That doesn't happen here.  We're reporting cumulative dirty pages since
> > > > > >>> logging was enabled, we need to be reporting dirty pages since the user
> > > > > >>> last retrieved the dirty bitmap.  The bitmap should be cleared and
> > > > > >>> currently pinned pages re-added after copying to the user.  Thanks,
> > > > > >>>        
> > > > > >>
> > > > > >> Does that mean, we have to track every iteration? do we really need that
> > > > > >> tracking?
> > > > > >>
> > > > > >> Generally the flow is:
> > > > > >> - vendor driver pin x pages
> > > > > >> - Enter pre-copy-phase where vCPUs are running - user starts dirty pages
> > > > > >> tracking, then user asks dirty bitmap, x pages reported dirty by
> > > > > >> VFIO_IOMMU_DIRTY_PAGES ioctl with _GET flag
> > > > > >> - In pre-copy phase, vendor driver pins y more pages, now bitmap
> > > > > >> consists of x+y bits set
> > > > > >> - In pre-copy phase, vendor driver unpins z pages, but bitmap is not
> > > > > >> updated, so again bitmap consists of x+y bits set.
> > > > > >> - Enter in stop-and-copy phase, vCPUs are stopped, mdev devices are stopped
> > > > > >> - user asks dirty bitmap - Since here vCPU and mdev devices are stopped,
> > > > > >> pages should not get dirty by guest driver or the physical device.
> > > > > >> Hence, x+y dirty pages would be reported.
> > > > > >>
> > > > > >> I don't think we need to track every iteration of bitmap reporting.    
> > > > > > 
> > > > > > Yes, once a bitmap is read, it's reset.  In your example, after
> > > > > > unpinning z pages the user should still see a bitmap with x+y pages,
> > > > > > but once they've read that bitmap, the next bitmap should be x+y-z.
> > > > > > Userspace can make decisions about when to switch from pre-copy to
> > > > > > stop-and-copy based on convergence, ie. the slope of the line recording
> > > > > > dirty pages per iteration.  The implementation here never allows an
> > > > > > inflection point, dirty pages reported through vfio would always either
> > > > > > be flat or climbing.  There might also be a case that an iommu backed
> > > > > > device could start pinning pages during the course of a migration, how
> > > > > > would the bitmap ever revert from fully populated to only tracking the
> > > > > > pinned pages?  Thanks,
> > > > > >     
> > > > > 
> > > > > At KVM forum we discussed this - if guest driver pins say 1024 pages 
> > > > > before migration starts, during pre-copy phase device can dirty 0 pages 
> > > > > in best case and 1024 pages in worst case. In that case, user will 
> > > > > transfer content of 1024 pages during pre-copy phase and in 
> > > > > stop-and-copy phase also, that will be pages will be copied twice. So we 
> > > > > decided to only get dirty pages bitmap at stop-and-copy phase. If user 
> > > > > is going to get dirty pages in stop-and-copy phase only, then that will 
> > > > > be single iteration.
> > > > > There aren't any devices yet that can track sys memory dirty pages. So 
> > > > > we can go ahead with this patch and support for dirty pages tracking 
> > > > > during pre-copy phase can be added later when there will be consumers of 
> > > > > that functionality.  
> > > > 
> > > > So if I understand this right, you're expecting the dirty bitmap to
> > > > accumulate dirty bits, in perpetuity, so that the user can only
> > > > retrieve them once at the end of migration?  But if that's the case,
> > > > the user could simply choose to not retrieve the bitmap until the end
> > > > of migration, the result would be the same.  What we have here is that
> > > > dirty bits are never cleared, regardless of whether the user has seen
> > > > them, which is wrong.  Sorry, we had a lot of discussions at KVM forum,
> > > > I don't recall this specific one 5 months later and maybe we weren't
> > > > considering all aspects.  I see the behavior we have here as incorrect,
> > > > but it also seems relatively trivial to make correct.  I hope the QEMU
> > > > code isn't making us go through all this trouble to report a dirty
> > > > bitmap that gets thrown away because it expects the final one to be
> > > > cumulative since the beginning of dirty logging.  Thanks,  
> > > 
> > > I remember the discussion that we couldn't track the system memory
> > > dirtying with current hardware; so the question then is just to track  
> > hi Dave
> > there are already devices that are able to track the system memory,
> > through two ways:
> > (1) software method. like VFs for "Intel(R) Ethernet Controller XL710 Family
> > support".
> > (2) hardware method. through hardware internal buffer (as one Intel
> > internal hardware not yet to public, but very soon) or through VTD-3.0
> > IOMMU.
> > 
> > we have already had code verified using the two ways to track system memory
> > in fine-grained level.
> > 
> > 
> > > what has been pinned and then ideally put that memory off until the end.
> > > (Which is interesting because I don't think we currently have  a way
> > > to delay RAM pages till the end in qemu).  
> > 
> > I think the problem here is that we mixed pinned pages with dirty pages.
> 
> We are reporting dirty pages, pinned pages are just assumed to be dirty.
> 
> > yes, pinned pages for mdev devices are continuously likely to be dirty
> > until device stopped.
> > But for devices that are able to report dirty pages, dirtied pages
> > will be marked again if hardware writes them later.
> > 
> > So, is it good to introduce a capability to let vfio/qemu know how to
> > treat the dirty pages?
> 
> Dirty pages are dirty, QEMU doesn't need any special flag, instead we
> need to evolve different mechanisms for the vendor driver so that we
> can differentiate pages pinned for read vs pages pinned for write.
> Perhaps interfaces to pin pages without dirtying them, and a separate
> mechanism to dirty a previously pinned-page, ie. promote it permanently
> or transiently to a writable page.
> 
> > (1) for devices have no fine-grained dirty page tracking capability
> >   a. pinned pages are regarded as dirty pages. they are not cleared by
> >   dirty page query
> >   b. unpinned pages are regarded as dirty pages. they are cleared by
> >   dirty page query or UNMAP ioctl.
> > (2) for devices that have fine-grained dirty page tracking capability
> >    a. pinned/unpinned pages are not regarded as dirty pages
> 
> We need a pin-read-only interface for this.
> 
> >    b. only pages they reported are regarded as dirty pages and are to be
> >    cleared by dirty page query and UNMAP ioctl.
> 
> We need a set-dirty or promote-writable interface for this.
> 
> > (3) for dirty pages marking APIs, like vfio_dma_rw()...
> >    pages marked by them are regared as dirty and are to be cleared by
> >    dirty page query and UNMAP ioctl
> > 
> > For (1), qemu VFIO only reports dirty page amount and would not transfer
> > those pages until last round.
> > for (2) and (3), qemu VFIO should report and transfer them in each
> > round.
> 
> IMO, QEMU should not be aware of any of this.  Userspace has an
> interface to retrieve dirtied pages (period).  We should adjust the
> pages that we report as dirtied to be accurate based on the
> capabilities of the vendor driver.  We can evolve those internal APIs
> between the vendor driver and vfio iommu over time without modifying
> this user interface.

I'm not sure; if you have a block of memory that's constantly marked
dirty in (1), we need to avoid constantly retransmitting that memory to
the destination; there's no point in sending it until the end of the
iterations, so it shouldn't even get sent once during the iterations.
But at the same time, we can't ignore the fact that those pages are
going to be dirty, because that influences the downtime; so we need
to know we're going to be getting them later, even if we don't
initially mark them as dirty.
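
A toy illustration of the accounting being described, with entirely invented
names (this is not QEMU code): pages deferred to the final phase are not sent
during the iterations, but they still have to count toward the expected
stop-and-copy transfer.

#include <stdint.h>

/* Hypothetical illustration only, not QEMU code. */
struct dirty_estimate {
	uint64_t dirty_remaining;	/* pages still dirty from normal tracking */
	uint64_t pinned_deferred;	/* pinned pages deferred to stop-and-copy */
};

static uint64_t expected_final_bytes(const struct dirty_estimate *e,
				     uint64_t page_size)
{
	/* deferred pages dominate the final phase even though they are
	 * never transmitted during the iterations */
	return (e->dirty_remaining + e->pinned_deferred) * page_size;
}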

> > > [I still worry whether migration will be usable with any
> > > significant amount of system ram that's pinned in this way; the
> > > downside will very easily get above the threshold that people like]
> > >   
> > yes. that's why we have to do multi-round dirty page query and
> > transfer and clear the dirty bitmaps in each round for devices that are
> > able to track in fine grain.
> > and that's why we have to report the amount of dirty pages before
> > stop-and-copy phase for mdev devices, so that people are able to know
> > the real downtime as much as possible.
> 
> Yes, the dirty bitmap should be accurate to report the pages dirtied
> since it was last retrieved and over time we can add internal
> interfaces to give vendor drivers more granularity in marking pinned
> pages dirty and perhaps even exposing the bitmap to the vendor drivers
> to set pages themselves.  I don't necessarily think it's worthwhile to
> create a new class of dirtied pages to transfer at the end, we're
> fighting a losing battle at that point.  We should be focusing on
> improving the granularity of page dirtying in order to reduce the pages
> transferred at the end of migration.  Thanks,

Dave

> Alex
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Tian, Kevin March 25, 2020, 5:31 a.m. UTC | #13
> From: Dr. David Alan Gilbert <dgilbert@redhat.com>
> Sent: Wednesday, March 25, 2020 4:23 AM
> 
> * Alex Williamson (alex.williamson@redhat.com) wrote:
> > On Mon, 23 Mar 2020 23:01:18 -0400
> > Yan Zhao <yan.y.zhao@intel.com> wrote:
> >
> > > On Tue, Mar 24, 2020 at 02:51:14AM +0800, Dr. David Alan Gilbert wrote:
> > > > * Alex Williamson (alex.williamson@redhat.com) wrote:
> > > > > On Mon, 23 Mar 2020 23:24:37 +0530
> > > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >
> > > > > > On 3/21/2020 12:29 AM, Alex Williamson wrote:
> > > > > > > On Sat, 21 Mar 2020 00:12:04 +0530
> > > > > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > > > >
> > > > > > >> On 3/20/2020 11:31 PM, Alex Williamson wrote:
> > > > > > >>> On Fri, 20 Mar 2020 23:19:14 +0530
> > > > > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > > > >>>
> > > > > > >>>> On 3/20/2020 4:27 AM, Alex Williamson wrote:
> > > > > > >>>>> On Fri, 20 Mar 2020 01:46:41 +0530
> > > > > > >>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > > > >>>>>
> > > > > > >>
> > > > > > >> <snip>
> > > > > > >>
> > > > > > >>>>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu
> *iommu, dma_addr_t iova,
> > > > > > >>>>>> +				  size_t size, uint64_t pgsize,
> > > > > > >>>>>> +				  u64 __user *bitmap)
> > > > > > >>>>>> +{
> > > > > > >>>>>> +	struct vfio_dma *dma;
> > > > > > >>>>>> +	unsigned long pgshift = __ffs(pgsize);
> > > > > > >>>>>> +	unsigned int npages, bitmap_size;
> > > > > > >>>>>> +
> > > > > > >>>>>> +	dma = vfio_find_dma(iommu, iova, 1);
> > > > > > >>>>>> +
> > > > > > >>>>>> +	if (!dma)
> > > > > > >>>>>> +		return -EINVAL;
> > > > > > >>>>>> +
> > > > > > >>>>>> +	if (dma->iova != iova || dma->size != size)
> > > > > > >>>>>> +		return -EINVAL;
> > > > > > >>>>>> +
> > > > > > >>>>>> +	npages = dma->size >> pgshift;
> > > > > > >>>>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > > > > > >>>>>> +
> > > > > > >>>>>> +	/* mark all pages dirty if all pages are pinned and
> mapped. */
> > > > > > >>>>>> +	if (dma->iommu_mapped)
> > > > > > >>>>>> +		bitmap_set(dma->bitmap, 0, npages);
> > > > > > >>>>>> +
> > > > > > >>>>>> +	if (copy_to_user((void __user *)bitmap, dma-
> >bitmap, bitmap_size))
> > > > > > >>>>>> +		return -EFAULT;
> > > > > > >>>>>
> > > > > > >>>>> We still need to reset the bitmap here, clearing and re-adding
> the
> > > > > > >>>>> pages that are still pinned.
> > > > > > >>>>>
> > > > > > >>>>>
> https://lore.kernel.org/kvm/20200319070635.2ff5db56@x1.home/
> > > > > > >>>>>
> > > > > > >>>>
> > > > > > >>>> I thought you agreed on my reply to it
> > > > > > >>>> https://lore.kernel.org/kvm/31621b70-02a9-2ea5-045f-
> f72b671fe703@nvidia.com/
> > > > > > >>>>
> > > > > > >>>>    > Why re-populate when there will be no change since
> > > > > > >>>>    > vfio_iova_dirty_bitmap() is called holding iommu->lock? If
> there is any
> > > > > > >>>>    > pin request while vfio_iova_dirty_bitmap() is still working, it
> will
> > > > > > >>>>    > wait till iommu->lock is released. Bitmap will be populated
> when page is
> > > > > > >>>>    > pinned.
> > > > > > >>>
> > > > > > >>> As coded, dirty bits are only ever set in the bitmap, never
> cleared.
> > > > > > >>> If a page is unpinned between iterations of the user recording
> the
> > > > > > >>> dirty bitmap, it should be marked dirty in the iteration
> immediately
> > > > > > >>> after the unpinning and not marked dirty in the following
> iteration.
> > > > > > >>> That doesn't happen here.  We're reporting cumulative dirty
> pages since
> > > > > > >>> logging was enabled, we need to be reporting dirty pages since
> the user
> > > > > > >>> last retrieved the dirty bitmap.  The bitmap should be cleared
> and
> > > > > > >>> currently pinned pages re-added after copying to the user.
> Thanks,
> > > > > > >>>
> > > > > > >>
> > > > > > >> Does that mean, we have to track every iteration? do we really need that
> > > > > > >> tracking?
> > > > > > >>
> > > > > > >> Generally the flow is:
> > > > > > >> - vendor driver pin x pages
> > > > > > >> - Enter pre-copy-phase where vCPUs are running - user starts dirty pages
> > > > > > >> tracking, then user asks dirty bitmap, x pages reported dirty by
> > > > > > >> VFIO_IOMMU_DIRTY_PAGES ioctl with _GET flag
> > > > > > >> - In pre-copy phase, vendor driver pins y more pages, now bitmap
> > > > > > >> consists of x+y bits set
> > > > > > >> - In pre-copy phase, vendor driver unpins z pages, but bitmap is not
> > > > > > >> updated, so again bitmap consists of x+y bits set.
> > > > > > >> - Enter in stop-and-copy phase, vCPUs are stopped, mdev devices are stopped
> > > > > > >> - user asks dirty bitmap - Since here vCPU and mdev devices are stopped,
> > > > > > >> pages should not get dirty by guest driver or the physical device.
> > > > > > >> Hence, x+y dirty pages would be reported.
> > > > > > >>
> > > > > > >> I don't think we need to track every iteration of bitmap reporting.
> > > > > > >
> > > > > > > Yes, once a bitmap is read, it's reset.  In your example, after
> > > > > > > unpinning z pages the user should still see a bitmap with x+y pages,
> > > > > > > but once they've read that bitmap, the next bitmap should be x+y-z.
> > > > > > > Userspace can make decisions about when to switch from pre-copy to
> > > > > > > stop-and-copy based on convergence, ie. the slope of the line recording
> > > > > > > dirty pages per iteration.  The implementation here never allows an
> > > > > > > inflection point, dirty pages reported through vfio would always either
> > > > > > > be flat or climbing.  There might also be a case that an iommu backed
> > > > > > > device could start pinning pages during the course of a migration, how
> > > > > > > would the bitmap ever revert from fully populated to only tracking the
> > > > > > > pinned pages?  Thanks,
> > > > > > >
> > > > > >
> > > > > > At KVM forum we discussed this - if guest driver pins say 1024 pages
> > > > > > before migration starts, during pre-copy phase device can dirty 0 pages
> > > > > > in best case and 1024 pages in worst case. In that case, user will
> > > > > > transfer content of 1024 pages during pre-copy phase and in
> > > > > > stop-and-copy phase also, that will be pages will be copied twice. So we
> > > > > > decided to only get dirty pages bitmap at stop-and-copy phase. If user
> > > > > > is going to get dirty pages in stop-and-copy phase only, then that will
> > > > > > be single iteration.
> > > > > > There aren't any devices yet that can track sys memory dirty pages. So
> > > > > > we can go ahead with this patch and support for dirty pages tracking
> > > > > > during pre-copy phase can be added later when there will be consumers of
> > > > > > that functionality.
> > > > >
> > > > > So if I understand this right, you're expecting the dirty bitmap to
> > > > > accumulate dirty bits, in perpetuity, so that the user can only
> > > > > retrieve them once at the end of migration?  But if that's the case,
> > > > > the user could simply choose to not retrieve the bitmap until the end
> > > > > of migration, the result would be the same.  What we have here is that
> > > > > dirty bits are never cleared, regardless of whether the user has seen
> > > > > them, which is wrong.  Sorry, we had a lot of discussions at KVM forum,
> > > > > I don't recall this specific one 5 months later and maybe we weren't
> > > > > considering all aspects.  I see the behavior we have here as incorrect,
> > > > > but it also seems relatively trivial to make correct.  I hope the QEMU
> > > > > code isn't making us go through all this trouble to report a dirty
> > > > > bitmap that gets thrown away because it expects the final one to be
> > > > > cumulative since the beginning of dirty logging.  Thanks,
> > > >
> > > > I remember the discussion that we couldn't track the system memory
> > > > dirtying with current hardware; so the question then is just to track
> > > hi Dave
> > > there are already devices that are able to track the system memory,
> > > through two ways:
> > > (1) software method. like VFs for "Intel(R) Ethernet Controller XL710 Family
> > > support".
> > > (2) hardware method. through hardware internal buffer (as one Intel
> > > internal hardware not yet to public, but very soon) or through VTD-3.0
> > > IOMMU.
> > >
> > > we have already had code verified using the two ways to track system memory
> > > in fine-grained level.
> > >
> > >
> > > > what has been pinned and then ideally put that memory off until the end.
> > > > (Which is interesting because I don't think we currently have  a way
> > > > to delay RAM pages till the end in qemu).
> > >
> > > I think the problem here is that we mixed pinned pages with dirty pages.
> >
> > We are reporting dirty pages, pinned pages are just assumed to be dirty.
> >
> > > yes, pinned pages for mdev devices are continuously likely to be dirty
> > > until device stopped.
> > > But for devices that are able to report dirty pages, dirtied pages
> > > will be marked again if hardware writes them later.
> > >
> > > So, is it good to introduce a capability to let vfio/qemu know how to
> > > treat the dirty pages?
> >
> > Dirty pages are dirty, QEMU doesn't need any special flag, instead we
> > need to evolve different mechanisms for the vendor driver so that we
> > can differentiate pages pinned for read vs pages pinned for write.
> > Perhaps interfaces to pin pages without dirtying them, and a separate
> > mechanism to dirty a previously pinned-page, ie. promote it permanently
> > or transiently to a writable page.
> >
> > > (1) for devices have no fine-grained dirty page tracking capability
> > >   a. pinned pages are regarded as dirty pages. they are not cleared by
> > >   dirty page query
> > >   b. unpinned pages are regarded as dirty pages. they are cleared by
> > >   dirty page query or UNMAP ioctl.
> > > (2) for devices that have fine-grained dirty page tracking capability
> > >    a. pinned/unpinned pages are not regarded as dirty pages
> >
> > We need a pin-read-only interface for this.
> >
> > >    b. only pages they reported are regarded as dirty pages and are to be
> > >    cleared by dirty page query and UNMAP ioctl.
> >
> > We need a set-dirty or promote-writable interface for this.
> >
> > > (3) for dirty pages marking APIs, like vfio_dma_rw()...
> > >    pages marked by them are regared as dirty and are to be cleared by
> > >    dirty page query and UNMAP ioctl
> > >
> > > For (1), qemu VFIO only reports dirty page amount and would not transfer
> > > those pages until last round.
> > > for (2) and (3), qemu VFIO should report and transfer them in each
> > > round.
> >
> > IMO, QEMU should not be aware of any of this.  Userspace has an
> > interface to retrieve dirtied pages (period).  We should adjust the
> > pages that we report as dirtied to be accurate based on the
> > capabilities of the vendor driver.  We can evolve those internal APIs
> > between the vendor driver and vfio iommu over time without modifying
> > this user interface.
> 
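
The internal interfaces discussed above might look roughly like the prototypes
below.  None of these exist in this patch; the names and signatures are purely
hypothetical and only illustrate splitting "pin for read" from "promote to
dirty":

/* Pin a page the device will only read from; it is not marked in the
 * per-vfio_dma dirty bitmap. (hypothetical)
 */
int vfio_pin_page_readonly(struct vfio_iommu *iommu, dma_addr_t iova,
			   unsigned long *pfn);

/* Promote an already-pinned page to writable, i.e. mark it dirty from now
 * on, either permanently or until the next bitmap read. (hypothetical)
 */
int vfio_dma_mark_dirty(struct vfio_iommu *iommu, dma_addr_t iova,
			size_t size);
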
> I'm not sure;  if you have a block of memory that's constantly marked
> dirty in (1) - we need to avoid constantly retransmitting that memory to
> the destination; there's no point in sending it until the end of the
> iterations - so it shouldn't even get sent once in the iteration.
> But at the same time, we can't ignore the fact that those pages are
> going to be dirty - because that influences the downtime; so we need
> to know we're going to be getting them later, even if we don't
> initially mark them as dirty.

For that we possibly need a way for VFIO or the vendor driver to tell
userspace: I can report dirty pages to you, but it is better to do it at
the end since the set is fairly static and big, and thus not optimal to
transfer over multiple rounds; I can also report the number of
currently-tracked dirty pages so you can make an accurate prediction of
when to exit the precopy. But such a feature could be introduced orthogonal
to the standard bitmap interface, i.e. it doesn't necessarily need to
block this series for the baseline live migration support...

Thanks
Kevin

> 
> > > > [I still worry whether migration will be usable with any
> > > > significant amount of system ram that's pinned in this way; the
> > > > downside will very easily get above the threshold that people like]
> > > >
> > > yes. that's why we have to do multi-round dirty page query and
> > > transfer and clear the dirty bitmaps in each round for devices that are
> > > able to track in fine grain.
> > > and that's why we have to report the amount of dirty pages before
> > > stop-and-copy phase for mdev devices, so that people are able to know
> > > the real downtime as much as possible.
> >
> > Yes, the dirty bitmap should be accurate to report the pages dirtied
> > since it was last retrieved and over time we can add internal
> > interfaces to give vendor drivers more granularity in marking pinned
> > pages dirty and perhaps even exposing the bitmap to the vendor drivers
> > to set pages themselves.  I don't necessarily think it's worthwhile to
> > create a new class of dirtied pages to transfer at the end, we're
> > fighting a losing battle at that point.  We should be focusing on
> > improving the granularity of page dirtying in order to reduce the pages
> > transferred at the end of migration.  Thanks,
> 
> Dave
> 
> > Alex
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
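
For reference, a sketch of how userspace would drive the new ioctl across one
tracking round.  VFIO_IOMMU_DIRTY_PAGES and the _START/_STOP/_GET_BITMAP flags
are the ones added by this series; the dirty_get_arg layout is an assumption
mirroring the uapi patch elsewhere in the series (it matches the
iova/size/bitmap.pgsize/bitmap.size/bitmap.data accesses in the handler) and
it assumes a 64-bit process:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Assumed argument layout for _GET_BITMAP; the real definition lives in the
 * uapi patch of this series.  A 64-bit process is assumed so the bitmap
 * pointer fits the trailing __u64-sized field.
 */
struct dirty_get_arg {
	uint32_t argsz;
	uint32_t flags;
	uint64_t iova;		/* must match the mapping's iova exactly */
	uint64_t size;		/* must match the mapping's size exactly */
	uint64_t pgsize;	/* minimum IOMMU page size only */
	uint64_t bitmap_size;	/* bytes available at data */
	uint64_t *data;
};

static int dirty_round(int container, uint64_t iova, uint64_t size,
		       uint64_t pgsize)
{
	uint64_t npages = size / pgsize;
	uint64_t bytes = ((npages + 63) / 64) * sizeof(uint64_t);
	uint64_t *bitmap = calloc(1, bytes);
	struct { uint32_t argsz; uint32_t flags; } ctl;
	struct dirty_get_arg get;
	int ret;

	if (!bitmap)
		return -1;

	/* START: the kernel allocates a bitmap per vfio_dma and seeds it with
	 * the currently pinned pages.
	 */
	ctl.argsz = sizeof(ctl);
	ctl.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
	ret = ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &ctl);
	if (ret)
		goto out;

	/* ... precopy runs, the vendor driver pins/dirties pages ... */

	/* GET_BITMAP for one mapping; iova/size must match the vfio_dma */
	memset(&get, 0, sizeof(get));
	get.argsz = sizeof(get);
	get.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
	get.iova = iova;
	get.size = size;
	get.pgsize = pgsize;
	get.bitmap_size = bytes;
	get.data = bitmap;
	ret = ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &get);

	/* STOP: tracking is disabled and the per-vfio_dma bitmaps are freed */
	ctl.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
	ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &ctl);
out:
	free(bitmap);
	return ret;
}
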

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 70aeab921d0f..239f61764d03 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -71,6 +71,7 @@  struct vfio_iommu {
 	unsigned int		dma_avail;
 	bool			v2;
 	bool			nesting;
+	bool			dirty_page_tracking;
 };
 
 struct vfio_domain {
@@ -91,6 +92,7 @@  struct vfio_dma {
 	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
 	struct task_struct	*task;
 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
+	unsigned long		*bitmap;
 };
 
 struct vfio_group {
@@ -125,7 +127,21 @@  struct vfio_regions {
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
 					(!list_empty(&iommu->domain_list))
 
+#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX	((1UL << 31) - 1)
+#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
 static int put_pfn(unsigned long pfn, int prot);
+static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
 
 /*
  * This code handles mapping and unmapping of user data buffers
@@ -175,6 +191,67 @@  static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 	rb_erase(&old->node, &iommu->dma_list);
 }
 
+
+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
+{
+	uint64_t npages = dma->size / pgsize;
+
+	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+	if (!dma->bitmap)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
+{
+	struct rb_node *n = rb_first(&iommu->dma_list);
+
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+		struct rb_node *p;
+		int ret;
+
+		ret = vfio_dma_bitmap_alloc(dma, pgsize);
+		if (ret) {
+			struct rb_node *p = rb_prev(n);
+
+			for (; p; p = rb_prev(p)) {
+				struct vfio_dma *dma = rb_entry(p,
+							struct vfio_dma, node);
+
+				kfree(dma->bitmap);
+				dma->bitmap = NULL;
+			}
+			return ret;
+		}
+
+		if (RB_EMPTY_ROOT(&dma->pfn_list))
+			continue;
+
+		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
+							 node);
+
+			bitmap_set(dma->bitmap,
+				   (vpfn->iova - dma->iova) / pgsize, 1);
+		}
+	}
+	return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+	struct rb_node *n = rb_first(&iommu->dma_list);
+
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+		kfree(dma->bitmap);
+		dma->bitmap = NULL;
+	}
+}
+
 /*
  * Helper Functions for host iova-pfn list
  */
@@ -567,6 +644,14 @@  static int vfio_iommu_type1_pin_pages(void *iommu_data,
 			vfio_unpin_page_external(dma, iova, do_accounting);
 			goto pin_unwind;
 		}
+
+		if (iommu->dirty_page_tracking) {
+			unsigned long pgshift =
+					 __ffs(vfio_pgsize_bitmap(iommu));
+
+			bitmap_set(dma->bitmap,
+				   (vpfn->iova - dma->iova) >> pgshift, 1);
+		}
 	}
 
 	ret = i;
@@ -801,6 +886,7 @@  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 	vfio_unmap_unpin(iommu, dma, true);
 	vfio_unlink_dma(iommu, dma);
 	put_task_struct(dma->task);
+	kfree(dma->bitmap);
 	kfree(dma);
 	iommu->dma_avail++;
 }
@@ -831,6 +917,50 @@  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 	return bitmap;
 }
 
+static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
+				  size_t size, uint64_t pgsize,
+				  u64 __user *bitmap)
+{
+	struct vfio_dma *dma;
+	unsigned long pgshift = __ffs(pgsize);
+	unsigned int npages, bitmap_size;
+
+	dma = vfio_find_dma(iommu, iova, 1);
+
+	if (!dma)
+		return -EINVAL;
+
+	if (dma->iova != iova || dma->size != size)
+		return -EINVAL;
+
+	npages = dma->size >> pgshift;
+	bitmap_size = DIRTY_BITMAP_BYTES(npages);
+
+	/* mark all pages dirty if all pages are pinned and mapped. */
+	if (dma->iommu_mapped)
+		bitmap_set(dma->bitmap, 0, npages);
+
+	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
+{
+	uint64_t bsize;
+
+	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX))
+		return -EINVAL;
+
+	bsize = DIRTY_BITMAP_BYTES(npages);
+
+	if (bitmap_size < bsize)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			     struct vfio_iommu_type1_dma_unmap *unmap)
 {
@@ -1038,16 +1168,16 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	unsigned long vaddr = map->vaddr;
 	size_t size = map->size;
 	int ret = 0, prot = 0;
-	uint64_t mask;
+	uint64_t pgsize;
 	struct vfio_dma *dma;
 
 	/* Verify that none of our __u64 fields overflow */
 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
 		return -EINVAL;
 
-	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
+	pgsize = (uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
 
-	WARN_ON(mask & PAGE_MASK);
+	WARN_ON((pgsize - 1) & PAGE_MASK);
 
 	/* READ/WRITE from device perspective */
 	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
@@ -1055,7 +1185,7 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
 		prot |= IOMMU_READ;
 
-	if (!prot || !size || (size | iova | vaddr) & mask)
+	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1))
 		return -EINVAL;
 
 	/* Don't allow IOVA or virtual address wrap */
@@ -1130,6 +1260,12 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	else
 		ret = vfio_pin_map_dma(iommu, dma, size);
 
+	if (!ret && iommu->dirty_page_tracking) {
+		ret = vfio_dma_bitmap_alloc(dma, pgsize);
+		if (ret)
+			vfio_remove_dma(iommu, dma);
+	}
+
 out_unlock:
 	mutex_unlock(&iommu->lock);
 	return ret;
@@ -2278,6 +2414,93 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 
 		return copy_to_user((void __user *)arg, &unmap, minsz) ?
 			-EFAULT : 0;
+	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
+		struct vfio_iommu_type1_dirty_bitmap dirty;
+		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
+				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
+				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
+		int ret = 0;
+
+		if (!iommu->v2)
+			return -EACCES;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
+				    flags);
+
+		if (copy_from_user(&dirty, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (dirty.argsz < minsz || dirty.flags & ~mask)
+			return -EINVAL;
+
+		/* only one flag should be set at a time */
+		if (__ffs(dirty.flags) != __fls(dirty.flags))
+			return -EINVAL;
+
+		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
+			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
+
+			mutex_lock(&iommu->lock);
+			if (!iommu->dirty_page_tracking) {
+				ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
+				if (!ret)
+					iommu->dirty_page_tracking = true;
+			}
+			mutex_unlock(&iommu->lock);
+			return ret;
+		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
+			mutex_lock(&iommu->lock);
+			if (iommu->dirty_page_tracking) {
+				iommu->dirty_page_tracking = false;
+				vfio_dma_bitmap_free_all(iommu);
+			}
+			mutex_unlock(&iommu->lock);
+			return 0;
+		} else if (dirty.flags &
+				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
+			struct vfio_iommu_type1_dirty_bitmap_get range;
+			unsigned long pgshift;
+			size_t data_size = dirty.argsz - minsz;
+			uint64_t iommu_pgsize =
+					 1 << __ffs(vfio_pgsize_bitmap(iommu));
+
+			if (!data_size || data_size < sizeof(range))
+				return -EINVAL;
+
+			if (copy_from_user(&range, (void __user *)(arg + minsz),
+					   sizeof(range)))
+				return -EFAULT;
+
+			/* allow only min supported pgsize */
+			if (range.bitmap.pgsize != iommu_pgsize)
+				return -EINVAL;
+			if (range.iova & (iommu_pgsize - 1))
+				return -EINVAL;
+			if (!range.size || range.size & (iommu_pgsize - 1))
+				return -EINVAL;
+			if (range.iova + range.size < range.iova)
+				return -EINVAL;
+			if (!access_ok((void __user *)range.bitmap.data,
+				       range.bitmap.size))
+				return -EINVAL;
+
+			pgshift = __ffs(range.bitmap.pgsize);
+			ret = verify_bitmap_size(range.size >> pgshift,
+						 range.bitmap.size);
+			if (ret)
+				return ret;
+
+			mutex_lock(&iommu->lock);
+			if (iommu->dirty_page_tracking)
+				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
+						range.size, range.bitmap.pgsize,
+						range.bitmap.data);
+			else
+				ret = -EINVAL;
+			mutex_unlock(&iommu->lock);
+
+			return ret;
+		}
 	}
 
 	return -ENOTTY;
@@ -2345,10 +2568,17 @@  static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
 
 	vaddr = dma->vaddr + offset;
 
-	if (write)
+	if (write) {
 		*copied = __copy_to_user((void __user *)vaddr, data,
 					 count) ? 0 : count;
-	else
+		if (*copied && iommu->dirty_page_tracking) {
+			unsigned long pgshift =
+				__ffs(vfio_pgsize_bitmap(iommu));
+
+			bitmap_set(dma->bitmap, offset >> pgshift,
+				   *copied >> pgshift);
+		}
+	} else
 		*copied = __copy_from_user(data, (void __user *)vaddr,
 					   count) ? 0 : count;
 	if (kthread)