diff mbox series

[v14,Kernel,4/7] vfio iommu: Implementation of ioctl for dirty pages tracking.

Message ID 1584560474-19946-5-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived
Headers show
Series KABIs to support migration for VFIO devices | expand

Commit Message

Kirti Wankhede March 18, 2020, 7:41 p.m. UTC
VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. It is the user space application's responsibility
  to copy the content of dirty pages from source to destination during
  migration.

To prevent DoS attacks, memory for the bitmap is allocated per vfio_dma
structure. The bitmap size is calculated using the smallest supported page
size. A bitmap is allocated for all vfio_dmas when dirty logging is enabled.

When a bitmap is allocated for a vfio_dma, it is populated for already
pinned pages using the smallest supported page size. The pinning functions
update the bitmap while tracking is enabled. When the user application
queries the bitmap, check whether the requested page size is the same as the
page size used to populate the bitmap. If it is equal, copy the bitmap; if
not, return an error.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
---
 drivers/vfio/vfio_iommu_type1.c | 205 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 203 insertions(+), 2 deletions(-)

Comments

Yan Zhao March 19, 2020, 3:06 a.m. UTC | #1
On Thu, Mar 19, 2020 at 03:41:11AM +0800, Kirti Wankhede wrote:
> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> - Start dirty pages tracking while migration is active
> - Stop dirty pages tracking.
> - Get dirty pages bitmap. Its user space application's responsibility to
>   copy content of dirty pages from source to destination during migration.
> 
> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> structure. Bitmap size is calculated considering smallest supported page
> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> 
> Bitmap is populated for already pinned pages when bitmap is allocated for
> a vfio_dma with the smallest supported page size. Update bitmap from
> pinning functions when tracking is enabled. When user application queries
> bitmap, check if requested page size is same as page size used to
> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> error.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 205 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 203 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 70aeab921d0f..d6417fb02174 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -71,6 +71,7 @@ struct vfio_iommu {
>  	unsigned int		dma_avail;
>  	bool			v2;
>  	bool			nesting;
> +	bool			dirty_page_tracking;
>  };
>  
>  struct vfio_domain {
> @@ -91,6 +92,7 @@ struct vfio_dma {
>  	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
>  	struct task_struct	*task;
>  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> +	unsigned long		*bitmap;
>  };
>  
>  struct vfio_group {
> @@ -125,7 +127,10 @@ struct vfio_regions {
>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>  					(!list_empty(&iommu->domain_list))
>  
> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> +
>  static int put_pfn(unsigned long pfn, int prot);
> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
>  
>  /*
>   * This code handles mapping and unmapping of user data buffers
> @@ -175,6 +180,55 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +static int vfio_dma_bitmap_alloc(struct vfio_iommu *iommu, uint64_t pgsize)
> +{
> +	struct rb_node *n = rb_first(&iommu->dma_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> +		struct rb_node *p;
> +		unsigned long npages = dma->size / pgsize;
> +
> +		dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> +		if (!dma->bitmap) {
> +			struct rb_node *p = rb_prev(n);
> +
> +			for (; p; p = rb_prev(p)) {
> +				struct vfio_dma *dma = rb_entry(n,
> +							struct vfio_dma, node);
> +
> +				kfree(dma->bitmap);
> +				dma->bitmap = NULL;
> +			}
> +			return -ENOMEM;
> +		}
> +
> +		if (RB_EMPTY_ROOT(&dma->pfn_list))
> +			continue;
> +
> +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> +							 node);
> +
> +			bitmap_set(dma->bitmap,
> +					(vpfn->iova - dma->iova) / pgsize, 1);
> +		}
> +	}
> +	return 0;
> +}
> +
> +static void vfio_dma_bitmap_free(struct vfio_iommu *iommu)
> +{
> +	struct rb_node *n = rb_first(&iommu->dma_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> +
> +		kfree(dma->bitmap);
> +		dma->bitmap = NULL;
> +	}
> +}
> +
>  /*
>   * Helper Functions for host iova-pfn list
>   */
> @@ -567,6 +621,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  			vfio_unpin_page_external(dma, iova, do_accounting);
>  			goto pin_unwind;
>  		}
> +
> +		if (iommu->dirty_page_tracking) {
> +			unsigned long pgshift =
> +					 __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			bitmap_set(dma->bitmap,
> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> +		}
>  	}
>  
>  	ret = i;
> @@ -801,6 +863,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  	vfio_unmap_unpin(iommu, dma, true);
>  	vfio_unlink_dma(iommu, dma);
>  	put_task_struct(dma->task);
> +	kfree(dma->bitmap);
>  	kfree(dma);
>  	iommu->dma_avail++;
>  }
> @@ -831,6 +894,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>  	return bitmap;
>  }
>  
> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> +				  size_t size, uint64_t pgsize,
> +				  unsigned char __user *bitmap)
> +{
> +	struct vfio_dma *dma;
> +	unsigned long pgshift = __ffs(pgsize);
> +	unsigned int npages, bitmap_size;
> +
> +	dma = vfio_find_dma(iommu, iova, 1);
> +
> +	if (!dma)
> +		return -EINVAL;
> +
> +	if (dma->iova != iova || dma->size != size)
> +		return -EINVAL;
> +
It looks like this size is passed in from the user. How can we ensure that
it always equals dma->size?

Shouldn't we iterate over the dma tree to collect the dirty bits for the
whole range if a single dma cannot cover all of it?

> +	npages = dma->size >> pgshift;
> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> +
> +	/* mark all pages dirty if all pages are pinned and mapped. */
> +	if (dma->iommu_mapped)
> +		bitmap_set(dma->bitmap, 0, npages);
> +
> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> +{
> +	uint64_t bsize;
> +
> +	if (!npages || !bitmap_size || bitmap_size > UINT_MAX)
> +		return -EINVAL;
> +
> +	bsize = DIRTY_BITMAP_BYTES(npages);
> +
> +	if (bitmap_size < bsize)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  			     struct vfio_iommu_type1_dma_unmap *unmap)
>  {
> @@ -2278,6 +2385,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>  			-EFAULT : 0;
> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> +		struct vfio_iommu_type1_dirty_bitmap dirty;
> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> +		int ret = 0;
> +
> +		if (!iommu->v2)
> +			return -EACCES;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> +				    flags);
> +
> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> +			return -EINVAL;
> +
> +		/* only one flag should be set at a time */
> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> +			return -EINVAL;
> +
> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			mutex_lock(&iommu->lock);
> +			if (!iommu->dirty_page_tracking) {
> +				ret = vfio_dma_bitmap_alloc(iommu, pgsize);
> +				if (!ret)
> +					iommu->dirty_page_tracking = true;
> +			}
> +			mutex_unlock(&iommu->lock);
> +			return ret;
> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> +			mutex_lock(&iommu->lock);
> +			if (iommu->dirty_page_tracking) {
> +				iommu->dirty_page_tracking = false;
> +				vfio_dma_bitmap_free(iommu);
> +			}
> +			mutex_unlock(&iommu->lock);
> +			return 0;
> +		} else if (dirty.flags &
> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> +			struct vfio_iommu_type1_dirty_bitmap_get range;
> +			unsigned long pgshift;
> +			size_t data_size = dirty.argsz - minsz;
> +			uint64_t iommu_pgsize =
> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			if (!data_size || data_size < sizeof(range))
> +				return -EINVAL;
> +
> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> +					   sizeof(range)))
> +				return -EFAULT;
> +
> +			/* allow only min supported pgsize */
> +			if (range.bitmap.pgsize != iommu_pgsize)
> +				return -EINVAL;
> +			if (range.iova & (iommu_pgsize - 1))
> +				return -EINVAL;
> +			if (!range.size || range.size & (iommu_pgsize - 1))
> +				return -EINVAL;
> +			if (range.iova + range.size < range.iova)
> +				return -EINVAL;
> +			if (!access_ok((void __user *)range.bitmap.data,
> +				       range.bitmap.size))
> +				return -EINVAL;
> +
> +			pgshift = __ffs(range.bitmap.pgsize);
> +			ret = verify_bitmap_size(range.size >> pgshift,
> +						 range.bitmap.size);
> +			if (ret)
> +				return ret;
> +
> +			mutex_lock(&iommu->lock);
> +			if (iommu->dirty_page_tracking)
> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> +					 range.size, range.bitmap.pgsize,
> +				    (unsigned char __user *)range.bitmap.data);
> +			else
> +				ret = -EINVAL;
> +			mutex_unlock(&iommu->lock);
> +
> +			return ret;
> +		}
>  	}
>  
>  	return -ENOTTY;
> @@ -2345,10 +2539,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>  
>  	vaddr = dma->vaddr + offset;
>  
> -	if (write)
> +	if (write) {
>  		*copied = __copy_to_user((void __user *)vaddr, data,
>  					 count) ? 0 : count;
> -	else
> +		if (*copied && iommu->dirty_page_tracking) {
> +			unsigned long pgshift =
> +				__ffs(vfio_pgsize_bitmap(iommu));
> +
> +			bitmap_set(dma->bitmap, offset >> pgshift,
> +				   *copied >> pgshift);
> +		}
> +	} else
>  		*copied = __copy_from_user(data, (void __user *)vaddr,
>  					   count) ? 0 : count;
>  	if (kthread)
> -- 
> 2.7.0
>
Alex Williamson March 19, 2020, 3:45 a.m. UTC | #2
On Thu, 19 Mar 2020 01:11:11 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> - Start dirty pages tracking while migration is active
> - Stop dirty pages tracking.
> - Get dirty pages bitmap. Its user space application's responsibility to
>   copy content of dirty pages from source to destination during migration.
> 
> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> structure. Bitmap size is calculated considering smallest supported page
> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> 
> Bitmap is populated for already pinned pages when bitmap is allocated for
> a vfio_dma with the smallest supported page size. Update bitmap from
> pinning functions when tracking is enabled. When user application queries
> bitmap, check if requested page size is same as page size used to
> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> error.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 205 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 203 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 70aeab921d0f..d6417fb02174 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -71,6 +71,7 @@ struct vfio_iommu {
>  	unsigned int		dma_avail;
>  	bool			v2;
>  	bool			nesting;
> +	bool			dirty_page_tracking;
>  };
>  
>  struct vfio_domain {
> @@ -91,6 +92,7 @@ struct vfio_dma {
>  	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
>  	struct task_struct	*task;
>  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> +	unsigned long		*bitmap;

We've made the bitmap a width-invariant u64 elsewhere; it should be one here
as well.

>  };
>  
>  struct vfio_group {
> @@ -125,7 +127,10 @@ struct vfio_regions {
>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>  					(!list_empty(&iommu->domain_list))
>  
> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> +
>  static int put_pfn(unsigned long pfn, int prot);
> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
>  
>  /*
>   * This code handles mapping and unmapping of user data buffers
> @@ -175,6 +180,55 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +static int vfio_dma_bitmap_alloc(struct vfio_iommu *iommu, uint64_t pgsize)
> +{
> +	struct rb_node *n = rb_first(&iommu->dma_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> +		struct rb_node *p;
> +		unsigned long npages = dma->size / pgsize;
> +
> +		dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> +		if (!dma->bitmap) {
> +			struct rb_node *p = rb_prev(n);
> +
> +			for (; p; p = rb_prev(p)) {
> +				struct vfio_dma *dma = rb_entry(n,
> +							struct vfio_dma, node);
> +
> +				kfree(dma->bitmap);
> +				dma->bitmap = NULL;
> +			}
> +			return -ENOMEM;
> +		}
> +
> +		if (RB_EMPTY_ROOT(&dma->pfn_list))
> +			continue;
> +
> +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> +							 node);
> +
> +			bitmap_set(dma->bitmap,
> +					(vpfn->iova - dma->iova) / pgsize, 1);
> +		}
> +	}
> +	return 0;
> +}
> +
> +static void vfio_dma_bitmap_free(struct vfio_iommu *iommu)
> +{
> +	struct rb_node *n = rb_first(&iommu->dma_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> +
> +		kfree(dma->bitmap);
> +		dma->bitmap = NULL;
> +	}
> +}
> +
>  /*
>   * Helper Functions for host iova-pfn list
>   */
> @@ -567,6 +621,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  			vfio_unpin_page_external(dma, iova, do_accounting);
>  			goto pin_unwind;
>  		}
> +
> +		if (iommu->dirty_page_tracking) {
> +			unsigned long pgshift =
> +					 __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			bitmap_set(dma->bitmap,
> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> +		}
>  	}
>  
>  	ret = i;
> @@ -801,6 +863,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  	vfio_unmap_unpin(iommu, dma, true);
>  	vfio_unlink_dma(iommu, dma);
>  	put_task_struct(dma->task);
> +	kfree(dma->bitmap);
>  	kfree(dma);
>  	iommu->dma_avail++;
>  }
> @@ -831,6 +894,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>  	return bitmap;
>  }
>  
> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> +				  size_t size, uint64_t pgsize,
> +				  unsigned char __user *bitmap)

And here, why do callers cast to an unsigned char pointer when we're
going to cast to a void pointer anyway?  Should be a u64 __user pointer.

> +{
> +	struct vfio_dma *dma;
> +	unsigned long pgshift = __ffs(pgsize);
> +	unsigned int npages, bitmap_size;
> +
> +	dma = vfio_find_dma(iommu, iova, 1);
> +
> +	if (!dma)
> +		return -EINVAL;
> +
> +	if (dma->iova != iova || dma->size != size)
> +		return -EINVAL;
> +
> +	npages = dma->size >> pgshift;
> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> +
> +	/* mark all pages dirty if all pages are pinned and mapped. */
> +	if (dma->iommu_mapped)
> +		bitmap_set(dma->bitmap, 0, npages);
> +
> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> +{
> +	uint64_t bsize;
> +
> +	if (!npages || !bitmap_size || bitmap_size > UINT_MAX)

As commented previously, how do we derive this UINT_MAX limitation?

> +		return -EINVAL;
> +
> +	bsize = DIRTY_BITMAP_BYTES(npages);
> +
> +	if (bitmap_size < bsize)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  			     struct vfio_iommu_type1_dma_unmap *unmap)
>  {

We didn't address that vfio_dma_do_map() needs to kvzalloc a bitmap for
any new vfio_dma created while iommu->dirty_page_tracking = true.

> @@ -2278,6 +2385,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>  			-EFAULT : 0;
> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> +		struct vfio_iommu_type1_dirty_bitmap dirty;
> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> +		int ret = 0;
> +
> +		if (!iommu->v2)
> +			return -EACCES;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> +				    flags);
> +
> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> +			return -EINVAL;
> +
> +		/* only one flag should be set at a time */
> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> +			return -EINVAL;
> +
> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			mutex_lock(&iommu->lock);
> +			if (!iommu->dirty_page_tracking) {
> +				ret = vfio_dma_bitmap_alloc(iommu, pgsize);
> +				if (!ret)
> +					iommu->dirty_page_tracking = true;
> +			}
> +			mutex_unlock(&iommu->lock);
> +			return ret;
> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> +			mutex_lock(&iommu->lock);
> +			if (iommu->dirty_page_tracking) {
> +				iommu->dirty_page_tracking = false;
> +				vfio_dma_bitmap_free(iommu);
> +			}
> +			mutex_unlock(&iommu->lock);
> +			return 0;
> +		} else if (dirty.flags &
> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> +			struct vfio_iommu_type1_dirty_bitmap_get range;
> +			unsigned long pgshift;
> +			size_t data_size = dirty.argsz - minsz;
> +			uint64_t iommu_pgsize =
> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			if (!data_size || data_size < sizeof(range))
> +				return -EINVAL;
> +
> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> +					   sizeof(range)))
> +				return -EFAULT;
> +
> +			/* allow only min supported pgsize */
> +			if (range.bitmap.pgsize != iommu_pgsize)
> +				return -EINVAL;
> +			if (range.iova & (iommu_pgsize - 1))
> +				return -EINVAL;
> +			if (!range.size || range.size & (iommu_pgsize - 1))
> +				return -EINVAL;
> +			if (range.iova + range.size < range.iova)
> +				return -EINVAL;
> +			if (!access_ok((void __user *)range.bitmap.data,
> +				       range.bitmap.size))
> +				return -EINVAL;
> +
> +			pgshift = __ffs(range.bitmap.pgsize);
> +			ret = verify_bitmap_size(range.size >> pgshift,
> +						 range.bitmap.size);
> +			if (ret)
> +				return ret;
> +
> +			mutex_lock(&iommu->lock);
> +			if (iommu->dirty_page_tracking)
> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> +					 range.size, range.bitmap.pgsize,
> +				    (unsigned char __user *)range.bitmap.data);
> +			else
> +				ret = -EINVAL;
> +			mutex_unlock(&iommu->lock);
> +
> +			return ret;
> +		}
>  	}
>  
>  	return -ENOTTY;
> @@ -2345,10 +2539,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>  
>  	vaddr = dma->vaddr + offset;
>  
> -	if (write)
> +	if (write) {
>  		*copied = __copy_to_user((void __user *)vaddr, data,
>  					 count) ? 0 : count;
> -	else
> +		if (*copied && iommu->dirty_page_tracking) {
> +			unsigned long pgshift =
> +				__ffs(vfio_pgsize_bitmap(iommu));
> +
> +			bitmap_set(dma->bitmap, offset >> pgshift,
> +				   *copied >> pgshift);
> +		}
> +	} else

Great, thanks for adding this!

>  		*copied = __copy_from_user(data, (void __user *)vaddr,
>  					   count) ? 0 : count;
>  	if (kthread)
Alex Williamson March 19, 2020, 4:01 a.m. UTC | #3
On Wed, 18 Mar 2020 23:06:39 -0400
Yan Zhao <yan.y.zhao@intel.com> wrote:

> On Thu, Mar 19, 2020 at 03:41:11AM +0800, Kirti Wankhede wrote:
> > VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > - Start dirty pages tracking while migration is active
> > - Stop dirty pages tracking.
> > - Get dirty pages bitmap. Its user space application's responsibility to
> >   copy content of dirty pages from source to destination during migration.
> > 
> > To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > structure. Bitmap size is calculated considering smallest supported page
> > size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > 
> > Bitmap is populated for already pinned pages when bitmap is allocated for
> > a vfio_dma with the smallest supported page size. Update bitmap from
> > pinning functions when tracking is enabled. When user application queries
> > bitmap, check if requested page size is same as page size used to
> > populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > error.
> > 
> > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > Reviewed-by: Neo Jia <cjia@nvidia.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 205 +++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 203 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 70aeab921d0f..d6417fb02174 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -71,6 +71,7 @@ struct vfio_iommu {
> >  	unsigned int		dma_avail;
> >  	bool			v2;
> >  	bool			nesting;
> > +	bool			dirty_page_tracking;
> >  };
> >  
> >  struct vfio_domain {
> > @@ -91,6 +92,7 @@ struct vfio_dma {
> >  	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> >  	struct task_struct	*task;
> >  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > +	unsigned long		*bitmap;
> >  };
> >  
> >  struct vfio_group {
> > @@ -125,7 +127,10 @@ struct vfio_regions {
> >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >  					(!list_empty(&iommu->domain_list))
> >  
> > +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > +
> >  static int put_pfn(unsigned long pfn, int prot);
> > +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> >  
> >  /*
> >   * This code handles mapping and unmapping of user data buffers
> > @@ -175,6 +180,55 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> >  	rb_erase(&old->node, &iommu->dma_list);
> >  }
> >  
> > +static int vfio_dma_bitmap_alloc(struct vfio_iommu *iommu, uint64_t pgsize)
> > +{
> > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > +
> > +	for (; n; n = rb_next(n)) {
> > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > +		struct rb_node *p;
> > +		unsigned long npages = dma->size / pgsize;
> > +
> > +		dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> > +		if (!dma->bitmap) {
> > +			struct rb_node *p = rb_prev(n);
> > +
> > +			for (; p; p = rb_prev(p)) {
> > +				struct vfio_dma *dma = rb_entry(n,
> > +							struct vfio_dma, node);
> > +
> > +				kfree(dma->bitmap);
> > +				dma->bitmap = NULL;
> > +			}
> > +			return -ENOMEM;
> > +		}
> > +
> > +		if (RB_EMPTY_ROOT(&dma->pfn_list))
> > +			continue;
> > +
> > +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> > +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> > +							 node);
> > +
> > +			bitmap_set(dma->bitmap,
> > +					(vpfn->iova - dma->iova) / pgsize, 1);
> > +		}
> > +	}
> > +	return 0;
> > +}
> > +
> > +static void vfio_dma_bitmap_free(struct vfio_iommu *iommu)
> > +{
> > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > +
> > +	for (; n; n = rb_next(n)) {
> > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > +
> > +		kfree(dma->bitmap);
> > +		dma->bitmap = NULL;
> > +	}
> > +}
> > +
> >  /*
> >   * Helper Functions for host iova-pfn list
> >   */
> > @@ -567,6 +621,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >  			vfio_unpin_page_external(dma, iova, do_accounting);
> >  			goto pin_unwind;
> >  		}
> > +
> > +		if (iommu->dirty_page_tracking) {
> > +			unsigned long pgshift =
> > +					 __ffs(vfio_pgsize_bitmap(iommu));
> > +
> > +			bitmap_set(dma->bitmap,
> > +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> > +		}
> >  	}
> >  
> >  	ret = i;
> > @@ -801,6 +863,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> >  	vfio_unmap_unpin(iommu, dma, true);
> >  	vfio_unlink_dma(iommu, dma);
> >  	put_task_struct(dma->task);
> > +	kfree(dma->bitmap);
> >  	kfree(dma);
> >  	iommu->dma_avail++;
> >  }
> > @@ -831,6 +894,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> >  	return bitmap;
> >  }
> >  
> > +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > +				  size_t size, uint64_t pgsize,
> > +				  unsigned char __user *bitmap)
> > +{
> > +	struct vfio_dma *dma;
> > +	unsigned long pgshift = __ffs(pgsize);
> > +	unsigned int npages, bitmap_size;
> > +
> > +	dma = vfio_find_dma(iommu, iova, 1);
> > +
> > +	if (!dma)
> > +		return -EINVAL;
> > +
> > +	if (dma->iova != iova || dma->size != size)
> > +		return -EINVAL;
> > +  
> looks this size is passed from user. how can it ensure size always
> equals to dma->size ?
> 
> shouldn't we iterate dma tree to look for dirty for whole range if a
> single dma cannot meet them all?

Please see the discussion on v12[1], the problem is with the alignment
of DMA mapped regions versus the bitmap.  A DMA mapping only requires
page alignment, so for example imagine a user requests the bitmap from
page zero to 4GB, but we have a DMA mapping starting at 4KB.  We can't
efficiently copy the bitmap tracked by the vfio_dma structure to the
user buffer when it's shifted by 1 bit.  Adjacent mappings can also
make for a very complicated implementation.  In the linked discussion
we decided to compromise on a simpler implementation that requires
the user to ask for a bitmap which exactly matches a single DMA
mapping, which Kirti indicates is what we require to support QEMU.
Later in the series, the unmap operation also makes this requirement
when used with the flags to retrieve the dirty bitmap.  Thanks,

Alex

[1] https://lore.kernel.org/kvm/20200218215330.5bc8fc6a@w520.home/
 
> > +	npages = dma->size >> pgshift;
> > +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > +
> > +	/* mark all pages dirty if all pages are pinned and mapped. */
> > +	if (dma->iommu_mapped)
> > +		bitmap_set(dma->bitmap, 0, npages);
> > +
> > +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> > +		return -EFAULT;
> > +
> > +	return 0;
> > +}
> > +
> > +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> > +{
> > +	uint64_t bsize;
> > +
> > +	if (!npages || !bitmap_size || bitmap_size > UINT_MAX)
> > +		return -EINVAL;
> > +
> > +	bsize = DIRTY_BITMAP_BYTES(npages);
> > +
> > +	if (bitmap_size < bsize)
> > +		return -EINVAL;
> > +
> > +	return 0;
> > +}
> > +
> >  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >  			     struct vfio_iommu_type1_dma_unmap *unmap)
> >  {
> > @@ -2278,6 +2385,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  
> >  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> >  			-EFAULT : 0;
> > +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> > +		struct vfio_iommu_type1_dirty_bitmap dirty;
> > +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> > +		int ret = 0;
> > +
> > +		if (!iommu->v2)
> > +			return -EACCES;
> > +
> > +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> > +				    flags);
> > +
> > +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> > +			return -EFAULT;
> > +
> > +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> > +			return -EINVAL;
> > +
> > +		/* only one flag should be set at a time */
> > +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> > +			return -EINVAL;
> > +
> > +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> > +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > +
> > +			mutex_lock(&iommu->lock);
> > +			if (!iommu->dirty_page_tracking) {
> > +				ret = vfio_dma_bitmap_alloc(iommu, pgsize);
> > +				if (!ret)
> > +					iommu->dirty_page_tracking = true;
> > +			}
> > +			mutex_unlock(&iommu->lock);
> > +			return ret;
> > +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> > +			mutex_lock(&iommu->lock);
> > +			if (iommu->dirty_page_tracking) {
> > +				iommu->dirty_page_tracking = false;
> > +				vfio_dma_bitmap_free(iommu);
> > +			}
> > +			mutex_unlock(&iommu->lock);
> > +			return 0;
> > +		} else if (dirty.flags &
> > +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> > +			struct vfio_iommu_type1_dirty_bitmap_get range;
> > +			unsigned long pgshift;
> > +			size_t data_size = dirty.argsz - minsz;
> > +			uint64_t iommu_pgsize =
> > +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > +
> > +			if (!data_size || data_size < sizeof(range))
> > +				return -EINVAL;
> > +
> > +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> > +					   sizeof(range)))
> > +				return -EFAULT;
> > +
> > +			/* allow only min supported pgsize */
> > +			if (range.bitmap.pgsize != iommu_pgsize)
> > +				return -EINVAL;
> > +			if (range.iova & (iommu_pgsize - 1))
> > +				return -EINVAL;
> > +			if (!range.size || range.size & (iommu_pgsize - 1))
> > +				return -EINVAL;
> > +			if (range.iova + range.size < range.iova)
> > +				return -EINVAL;
> > +			if (!access_ok((void __user *)range.bitmap.data,
> > +				       range.bitmap.size))
> > +				return -EINVAL;
> > +
> > +			pgshift = __ffs(range.bitmap.pgsize);
> > +			ret = verify_bitmap_size(range.size >> pgshift,
> > +						 range.bitmap.size);
> > +			if (ret)
> > +				return ret;
> > +
> > +			mutex_lock(&iommu->lock);
> > +			if (iommu->dirty_page_tracking)
> > +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> > +					 range.size, range.bitmap.pgsize,
> > +				    (unsigned char __user *)range.bitmap.data);
> > +			else
> > +				ret = -EINVAL;
> > +			mutex_unlock(&iommu->lock);
> > +
> > +			return ret;
> > +		}
> >  	}
> >  
> >  	return -ENOTTY;
> > @@ -2345,10 +2539,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> >  
> >  	vaddr = dma->vaddr + offset;
> >  
> > -	if (write)
> > +	if (write) {
> >  		*copied = __copy_to_user((void __user *)vaddr, data,
> >  					 count) ? 0 : count;
> > -	else
> > +		if (*copied && iommu->dirty_page_tracking) {
> > +			unsigned long pgshift =
> > +				__ffs(vfio_pgsize_bitmap(iommu));
> > +
> > +			bitmap_set(dma->bitmap, offset >> pgshift,
> > +				   *copied >> pgshift);
> > +		}
> > +	} else
> >  		*copied = __copy_from_user(data, (void __user *)vaddr,
> >  					   count) ? 0 : count;
> >  	if (kthread)
> > -- 
> > 2.7.0
> >   
>
Yan Zhao March 19, 2020, 4:15 a.m. UTC | #4
On Thu, Mar 19, 2020 at 12:01:00PM +0800, Alex Williamson wrote:
> On Wed, 18 Mar 2020 23:06:39 -0400
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> > On Thu, Mar 19, 2020 at 03:41:11AM +0800, Kirti Wankhede wrote:
> > > VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > > - Start dirty pages tracking while migration is active
> > > - Stop dirty pages tracking.
> > > - Get dirty pages bitmap. Its user space application's responsibility to
> > >   copy content of dirty pages from source to destination during migration.
> > > 
> > > To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > > structure. Bitmap size is calculated considering smallest supported page
> > > size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > > 
> > > Bitmap is populated for already pinned pages when bitmap is allocated for
> > > a vfio_dma with the smallest supported page size. Update bitmap from
> > > pinning functions when tracking is enabled. When user application queries
> > > bitmap, check if requested page size is same as page size used to
> > > populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > > error.
> > > 
> > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > ---
> > >  drivers/vfio/vfio_iommu_type1.c | 205 +++++++++++++++++++++++++++++++++++++++-
> > >  1 file changed, 203 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > index 70aeab921d0f..d6417fb02174 100644
> > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > @@ -71,6 +71,7 @@ struct vfio_iommu {
> > >  	unsigned int		dma_avail;
> > >  	bool			v2;
> > >  	bool			nesting;
> > > +	bool			dirty_page_tracking;
> > >  };
> > >  
> > >  struct vfio_domain {
> > > @@ -91,6 +92,7 @@ struct vfio_dma {
> > >  	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> > >  	struct task_struct	*task;
> > >  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > > +	unsigned long		*bitmap;
> > >  };
> > >  
> > >  struct vfio_group {
> > > @@ -125,7 +127,10 @@ struct vfio_regions {
> > >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > >  					(!list_empty(&iommu->domain_list))
> > >  
> > > +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > > +
> > >  static int put_pfn(unsigned long pfn, int prot);
> > > +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> > >  
> > >  /*
> > >   * This code handles mapping and unmapping of user data buffers
> > > @@ -175,6 +180,55 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> > >  	rb_erase(&old->node, &iommu->dma_list);
> > >  }
> > >  
> > > +static int vfio_dma_bitmap_alloc(struct vfio_iommu *iommu, uint64_t pgsize)
> > > +{
> > > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > > +
> > > +	for (; n; n = rb_next(n)) {
> > > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > > +		struct rb_node *p;
> > > +		unsigned long npages = dma->size / pgsize;
> > > +
> > > +		dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> > > +		if (!dma->bitmap) {
> > > +			struct rb_node *p = rb_prev(n);
> > > +
> > > +			for (; p; p = rb_prev(p)) {
> > > +				struct vfio_dma *dma = rb_entry(n,
> > > +							struct vfio_dma, node);
> > > +
> > > +				kfree(dma->bitmap);
> > > +				dma->bitmap = NULL;
> > > +			}
> > > +			return -ENOMEM;
> > > +		}
> > > +
> > > +		if (RB_EMPTY_ROOT(&dma->pfn_list))
> > > +			continue;
> > > +
> > > +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> > > +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> > > +							 node);
> > > +
> > > +			bitmap_set(dma->bitmap,
> > > +					(vpfn->iova - dma->iova) / pgsize, 1);
> > > +		}
> > > +	}
> > > +	return 0;
> > > +}
> > > +
> > > +static void vfio_dma_bitmap_free(struct vfio_iommu *iommu)
> > > +{
> > > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > > +
> > > +	for (; n; n = rb_next(n)) {
> > > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > > +
> > > +		kfree(dma->bitmap);
> > > +		dma->bitmap = NULL;
> > > +	}
> > > +}
> > > +
> > >  /*
> > >   * Helper Functions for host iova-pfn list
> > >   */
> > > @@ -567,6 +621,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> > >  			vfio_unpin_page_external(dma, iova, do_accounting);
> > >  			goto pin_unwind;
> > >  		}
> > > +
> > > +		if (iommu->dirty_page_tracking) {
> > > +			unsigned long pgshift =
> > > +					 __ffs(vfio_pgsize_bitmap(iommu));
> > > +
> > > +			bitmap_set(dma->bitmap,
> > > +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> > > +		}
> > >  	}
> > >  
> > >  	ret = i;
> > > @@ -801,6 +863,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> > >  	vfio_unmap_unpin(iommu, dma, true);
> > >  	vfio_unlink_dma(iommu, dma);
> > >  	put_task_struct(dma->task);
> > > +	kfree(dma->bitmap);
> > >  	kfree(dma);
> > >  	iommu->dma_avail++;
> > >  }
> > > @@ -831,6 +894,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> > >  	return bitmap;
> > >  }
> > >  
> > > +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > > +				  size_t size, uint64_t pgsize,
> > > +				  unsigned char __user *bitmap)
> > > +{
> > > +	struct vfio_dma *dma;
> > > +	unsigned long pgshift = __ffs(pgsize);
> > > +	unsigned int npages, bitmap_size;
> > > +
> > > +	dma = vfio_find_dma(iommu, iova, 1);
> > > +
> > > +	if (!dma)
> > > +		return -EINVAL;
> > > +
> > > +	if (dma->iova != iova || dma->size != size)
> > > +		return -EINVAL;
> > > +  
> > looks this size is passed from user. how can it ensure size always
> > equals to dma->size ?
> > 
> > shouldn't we iterate dma tree to look for dirty for whole range if a
> > single dma cannot meet them all?
> 
> Please see the discussion on v12[1], the problem is with the alignment
> of DMA mapped regions versus the bitmap.  A DMA mapping only requires
> page alignment, so for example imagine a user requests the bitmap from
> page zero to 4GB, but we have a DMA mapping starting at 4KB.  We can't
> efficiently copy the bitmap tracked by the vfio_dma structure to the
> user buffer when it's shifted by 1 bit.  Adjacent mappings can also
> make for a very complicated implementation.  In the discussion linked
> we decided to compromise on a more simple implementation that requires
> the user to ask for a bitmap which exactly matches a single DMA
> mapping, which Kirti indicates is what we require to support QEMU.
> Later in the series, the unmap operation also makes this requirement
> when used with the flags to retrieve the dirty bitmap.  Thanks,
>

So, what about the case where vIOMMU is enabled?
If IOVAs are mapped per page and QEMU's log_sync is issued for the
whole range 0-U64MAX, does QEMU have to find out which ranges are
mapped and split them into pages before calling this ioctl?
And what if those IOVAs are mapped with a length of more than one page?

> 
> [1] https://lore.kernel.org/kvm/20200218215330.5bc8fc6a@w520.home/
>  
> > > +	npages = dma->size >> pgshift;
> > > +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > > +
> > > +	/* mark all pages dirty if all pages are pinned and mapped. */
> > > +	if (dma->iommu_mapped)
> > > +		bitmap_set(dma->bitmap, 0, npages);
> > > +
> > > +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> > > +		return -EFAULT;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> > > +{
> > > +	uint64_t bsize;
> > > +
> > > +	if (!npages || !bitmap_size || bitmap_size > UINT_MAX)
> > > +		return -EINVAL;
> > > +
> > > +	bsize = DIRTY_BITMAP_BYTES(npages);
> > > +
> > > +	if (bitmap_size < bsize)
> > > +		return -EINVAL;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > >  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > >  			     struct vfio_iommu_type1_dma_unmap *unmap)
> > >  {
> > > @@ -2278,6 +2385,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> > >  
> > >  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> > >  			-EFAULT : 0;
> > > +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> > > +		struct vfio_iommu_type1_dirty_bitmap dirty;
> > > +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> > > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> > > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> > > +		int ret = 0;
> > > +
> > > +		if (!iommu->v2)
> > > +			return -EACCES;
> > > +
> > > +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> > > +				    flags);
> > > +
> > > +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> > > +			return -EFAULT;
> > > +
> > > +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> > > +			return -EINVAL;
> > > +
> > > +		/* only one flag should be set at a time */
> > > +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> > > +			return -EINVAL;
> > > +
> > > +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> > > +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > > +
> > > +			mutex_lock(&iommu->lock);
> > > +			if (!iommu->dirty_page_tracking) {
> > > +				ret = vfio_dma_bitmap_alloc(iommu, pgsize);
> > > +				if (!ret)
> > > +					iommu->dirty_page_tracking = true;
> > > +			}
> > > +			mutex_unlock(&iommu->lock);
> > > +			return ret;
> > > +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> > > +			mutex_lock(&iommu->lock);
> > > +			if (iommu->dirty_page_tracking) {
> > > +				iommu->dirty_page_tracking = false;
> > > +				vfio_dma_bitmap_free(iommu);
> > > +			}
> > > +			mutex_unlock(&iommu->lock);
> > > +			return 0;
> > > +		} else if (dirty.flags &
> > > +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> > > +			struct vfio_iommu_type1_dirty_bitmap_get range;
> > > +			unsigned long pgshift;
> > > +			size_t data_size = dirty.argsz - minsz;
> > > +			uint64_t iommu_pgsize =
> > > +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > > +
> > > +			if (!data_size || data_size < sizeof(range))
> > > +				return -EINVAL;
> > > +
> > > +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> > > +					   sizeof(range)))
> > > +				return -EFAULT;
> > > +
> > > +			/* allow only min supported pgsize */
> > > +			if (range.bitmap.pgsize != iommu_pgsize)
> > > +				return -EINVAL;
> > > +			if (range.iova & (iommu_pgsize - 1))
> > > +				return -EINVAL;
> > > +			if (!range.size || range.size & (iommu_pgsize - 1))
> > > +				return -EINVAL;
> > > +			if (range.iova + range.size < range.iova)
> > > +				return -EINVAL;
> > > +			if (!access_ok((void __user *)range.bitmap.data,
> > > +				       range.bitmap.size))
> > > +				return -EINVAL;
> > > +
> > > +			pgshift = __ffs(range.bitmap.pgsize);
> > > +			ret = verify_bitmap_size(range.size >> pgshift,
> > > +						 range.bitmap.size);
> > > +			if (ret)
> > > +				return ret;
> > > +
> > > +			mutex_lock(&iommu->lock);
> > > +			if (iommu->dirty_page_tracking)
> > > +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> > > +					 range.size, range.bitmap.pgsize,
> > > +				    (unsigned char __user *)range.bitmap.data);
> > > +			else
> > > +				ret = -EINVAL;
> > > +			mutex_unlock(&iommu->lock);
> > > +
> > > +			return ret;
> > > +		}
> > >  	}
> > >  
> > >  	return -ENOTTY;
> > > @@ -2345,10 +2539,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> > >  
> > >  	vaddr = dma->vaddr + offset;
> > >  
> > > -	if (write)
> > > +	if (write) {
> > >  		*copied = __copy_to_user((void __user *)vaddr, data,
> > >  					 count) ? 0 : count;
> > > -	else
> > > +		if (*copied && iommu->dirty_page_tracking) {
> > > +			unsigned long pgshift =
> > > +				__ffs(vfio_pgsize_bitmap(iommu));
> > > +
> > > +			bitmap_set(dma->bitmap, offset >> pgshift,
> > > +				   *copied >> pgshift);
> > > +		}
> > > +	} else
> > >  		*copied = __copy_from_user(data, (void __user *)vaddr,
> > >  					   count) ? 0 : count;
> > >  	if (kthread)
> > > -- 
> > > 2.7.0
> > >   
> > 
>
Alex Williamson March 19, 2020, 4:40 a.m. UTC | #5
On Thu, 19 Mar 2020 00:15:33 -0400
Yan Zhao <yan.y.zhao@intel.com> wrote:

> On Thu, Mar 19, 2020 at 12:01:00PM +0800, Alex Williamson wrote:
> > On Wed, 18 Mar 2020 23:06:39 -0400
> > Yan Zhao <yan.y.zhao@intel.com> wrote:
> >   
> > > On Thu, Mar 19, 2020 at 03:41:11AM +0800, Kirti Wankhede wrote:  
> > > > VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > > > - Start dirty pages tracking while migration is active
> > > > - Stop dirty pages tracking.
> > > > - Get dirty pages bitmap. Its user space application's responsibility to
> > > >   copy content of dirty pages from source to destination during migration.
> > > > 
> > > > To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > > > structure. Bitmap size is calculated considering smallest supported page
> > > > size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > > > 
> > > > Bitmap is populated for already pinned pages when bitmap is allocated for
> > > > a vfio_dma with the smallest supported page size. Update bitmap from
> > > > pinning functions when tracking is enabled. When user application queries
> > > > bitmap, check if requested page size is same as page size used to
> > > > populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > > > error.
> > > > 
> > > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > > Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > > ---
> > > >  drivers/vfio/vfio_iommu_type1.c | 205 +++++++++++++++++++++++++++++++++++++++-
> > > >  1 file changed, 203 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > > index 70aeab921d0f..d6417fb02174 100644
> > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > @@ -71,6 +71,7 @@ struct vfio_iommu {
> > > >  	unsigned int		dma_avail;
> > > >  	bool			v2;
> > > >  	bool			nesting;
> > > > +	bool			dirty_page_tracking;
> > > >  };
> > > >  
> > > >  struct vfio_domain {
> > > > @@ -91,6 +92,7 @@ struct vfio_dma {
> > > >  	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> > > >  	struct task_struct	*task;
> > > >  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > > > +	unsigned long		*bitmap;
> > > >  };
> > > >  
> > > >  struct vfio_group {
> > > > @@ -125,7 +127,10 @@ struct vfio_regions {
> > > >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > > >  					(!list_empty(&iommu->domain_list))
> > > >  
> > > > +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > > > +
> > > >  static int put_pfn(unsigned long pfn, int prot);
> > > > +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> > > >  
> > > >  /*
> > > >   * This code handles mapping and unmapping of user data buffers
> > > > @@ -175,6 +180,55 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> > > >  	rb_erase(&old->node, &iommu->dma_list);
> > > >  }
> > > >  
> > > > +static int vfio_dma_bitmap_alloc(struct vfio_iommu *iommu, uint64_t pgsize)
> > > > +{
> > > > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > > > +
> > > > +	for (; n; n = rb_next(n)) {
> > > > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > > > +		struct rb_node *p;
> > > > +		unsigned long npages = dma->size / pgsize;
> > > > +
> > > > +		dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> > > > +		if (!dma->bitmap) {
> > > > +			struct rb_node *p = rb_prev(n);
> > > > +
> > > > +			for (; p; p = rb_prev(p)) {
> > > > +				struct vfio_dma *dma = rb_entry(n,
> > > > +							struct vfio_dma, node);
> > > > +
> > > > +				kfree(dma->bitmap);
> > > > +				dma->bitmap = NULL;
> > > > +			}
> > > > +			return -ENOMEM;
> > > > +		}
> > > > +
> > > > +		if (RB_EMPTY_ROOT(&dma->pfn_list))
> > > > +			continue;
> > > > +
> > > > +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> > > > +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> > > > +							 node);
> > > > +
> > > > +			bitmap_set(dma->bitmap,
> > > > +					(vpfn->iova - dma->iova) / pgsize, 1);
> > > > +		}
> > > > +	}
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static void vfio_dma_bitmap_free(struct vfio_iommu *iommu)
> > > > +{
> > > > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > > > +
> > > > +	for (; n; n = rb_next(n)) {
> > > > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > > > +
> > > > +		kfree(dma->bitmap);
> > > > +		dma->bitmap = NULL;
> > > > +	}
> > > > +}
> > > > +
> > > >  /*
> > > >   * Helper Functions for host iova-pfn list
> > > >   */
> > > > @@ -567,6 +621,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> > > >  			vfio_unpin_page_external(dma, iova, do_accounting);
> > > >  			goto pin_unwind;
> > > >  		}
> > > > +
> > > > +		if (iommu->dirty_page_tracking) {
> > > > +			unsigned long pgshift =
> > > > +					 __ffs(vfio_pgsize_bitmap(iommu));
> > > > +
> > > > +			bitmap_set(dma->bitmap,
> > > > +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> > > > +		}
> > > >  	}
> > > >  
> > > >  	ret = i;
> > > > @@ -801,6 +863,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> > > >  	vfio_unmap_unpin(iommu, dma, true);
> > > >  	vfio_unlink_dma(iommu, dma);
> > > >  	put_task_struct(dma->task);
> > > > +	kfree(dma->bitmap);
> > > >  	kfree(dma);
> > > >  	iommu->dma_avail++;
> > > >  }
> > > > @@ -831,6 +894,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> > > >  	return bitmap;
> > > >  }
> > > >  
> > > > +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > > > +				  size_t size, uint64_t pgsize,
> > > > +				  unsigned char __user *bitmap)
> > > > +{
> > > > +	struct vfio_dma *dma;
> > > > +	unsigned long pgshift = __ffs(pgsize);
> > > > +	unsigned int npages, bitmap_size;
> > > > +
> > > > +	dma = vfio_find_dma(iommu, iova, 1);
> > > > +
> > > > +	if (!dma)
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (dma->iova != iova || dma->size != size)
> > > > +		return -EINVAL;
> > > > +    
> > > looks this size is passed from user. how can it ensure size always
> > > equals to dma->size ?
> > > 
> > > shouldn't we iterate dma tree to look for dirty for whole range if a
> > > single dma cannot meet them all?  
> > 
> > Please see the discussion on v12[1], the problem is with the alignment
> > of DMA mapped regions versus the bitmap.  A DMA mapping only requires
> > page alignment, so for example imagine a user requests the bitmap from
> > page zero to 4GB, but we have a DMA mapping starting at 4KB.  We can't
> > efficiently copy the bitmap tracked by the vfio_dma structure to the
> > user buffer when it's shifted by 1 bit.  Adjacent mappings can also
> > make for a very complicated implementation.  In the discussion linked
> > we decided to compromise on a more simple implementation that requires
> > the user to ask for a bitmap which exactly matches a single DMA
> > mapping, which Kirti indicates is what we require to support QEMU.
> > Later in the series, the unmap operation also makes this requirement
> > when used with the flags to retrieve the dirty bitmap.  Thanks,
> >  
> 
> so, what about for vIOMMU enabling case?
> if IOVAs are mapped per page, then there's a log_sync in qemu,
> it's supposed for range from 0-U64MAX, qemu has to find out which
> ones are mapped and cut them into pages before calling this IOCTL?
> And what if those IOVAs are mapped for len more than one page?

Good question.  Kirti?

> > [1] https://lore.kernel.org/kvm/20200218215330.5bc8fc6a@w520.home/
> >    
> > > > +	npages = dma->size >> pgshift;
> > > > +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > > > +
> > > > +	/* mark all pages dirty if all pages are pinned and mapped. */
> > > > +	if (dma->iommu_mapped)
> > > > +		bitmap_set(dma->bitmap, 0, npages);
> > > > +
> > > > +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> > > > +		return -EFAULT;
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> > > > +{
> > > > +	uint64_t bsize;
> > > > +
> > > > +	if (!npages || !bitmap_size || bitmap_size > UINT_MAX)
> > > > +		return -EINVAL;
> > > > +
> > > > +	bsize = DIRTY_BITMAP_BYTES(npages);
> > > > +
> > > > +	if (bitmap_size < bsize)
> > > > +		return -EINVAL;
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > >  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > > >  			     struct vfio_iommu_type1_dma_unmap *unmap)
> > > >  {
> > > > @@ -2278,6 +2385,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > >  
> > > >  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> > > >  			-EFAULT : 0;
> > > > +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> > > > +		struct vfio_iommu_type1_dirty_bitmap dirty;
> > > > +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> > > > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> > > > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> > > > +		int ret = 0;
> > > > +
> > > > +		if (!iommu->v2)
> > > > +			return -EACCES;
> > > > +
> > > > +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> > > > +				    flags);
> > > > +
> > > > +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> > > > +			return -EFAULT;
> > > > +
> > > > +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> > > > +			return -EINVAL;
> > > > +
> > > > +		/* only one flag should be set at a time */
> > > > +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> > > > +			return -EINVAL;
> > > > +
> > > > +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> > > > +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > > > +
> > > > +			mutex_lock(&iommu->lock);
> > > > +			if (!iommu->dirty_page_tracking) {
> > > > +				ret = vfio_dma_bitmap_alloc(iommu, pgsize);
> > > > +				if (!ret)
> > > > +					iommu->dirty_page_tracking = true;
> > > > +			}
> > > > +			mutex_unlock(&iommu->lock);
> > > > +			return ret;
> > > > +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> > > > +			mutex_lock(&iommu->lock);
> > > > +			if (iommu->dirty_page_tracking) {
> > > > +				iommu->dirty_page_tracking = false;
> > > > +				vfio_dma_bitmap_free(iommu);
> > > > +			}
> > > > +			mutex_unlock(&iommu->lock);
> > > > +			return 0;
> > > > +		} else if (dirty.flags &
> > > > +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> > > > +			struct vfio_iommu_type1_dirty_bitmap_get range;
> > > > +			unsigned long pgshift;
> > > > +			size_t data_size = dirty.argsz - minsz;
> > > > +			uint64_t iommu_pgsize =
> > > > +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > > > +
> > > > +			if (!data_size || data_size < sizeof(range))
> > > > +				return -EINVAL;
> > > > +
> > > > +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> > > > +					   sizeof(range)))
> > > > +				return -EFAULT;
> > > > +
> > > > +			/* allow only min supported pgsize */
> > > > +			if (range.bitmap.pgsize != iommu_pgsize)
> > > > +				return -EINVAL;
> > > > +			if (range.iova & (iommu_pgsize - 1))
> > > > +				return -EINVAL;
> > > > +			if (!range.size || range.size & (iommu_pgsize - 1))
> > > > +				return -EINVAL;
> > > > +			if (range.iova + range.size < range.iova)
> > > > +				return -EINVAL;
> > > > +			if (!access_ok((void __user *)range.bitmap.data,
> > > > +				       range.bitmap.size))
> > > > +				return -EINVAL;
> > > > +
> > > > +			pgshift = __ffs(range.bitmap.pgsize);
> > > > +			ret = verify_bitmap_size(range.size >> pgshift,
> > > > +						 range.bitmap.size);
> > > > +			if (ret)
> > > > +				return ret;
> > > > +
> > > > +			mutex_lock(&iommu->lock);
> > > > +			if (iommu->dirty_page_tracking)
> > > > +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> > > > +					 range.size, range.bitmap.pgsize,
> > > > +				    (unsigned char __user *)range.bitmap.data);
> > > > +			else
> > > > +				ret = -EINVAL;
> > > > +			mutex_unlock(&iommu->lock);
> > > > +
> > > > +			return ret;
> > > > +		}
> > > >  	}
> > > >  
> > > >  	return -ENOTTY;
> > > > @@ -2345,10 +2539,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> > > >  
> > > >  	vaddr = dma->vaddr + offset;
> > > >  
> > > > -	if (write)
> > > > +	if (write) {
> > > >  		*copied = __copy_to_user((void __user *)vaddr, data,
> > > >  					 count) ? 0 : count;
> > > > -	else
> > > > +		if (*copied && iommu->dirty_page_tracking) {
> > > > +			unsigned long pgshift =
> > > > +				__ffs(vfio_pgsize_bitmap(iommu));
> > > > +
> > > > +			bitmap_set(dma->bitmap, offset >> pgshift,
> > > > +				   *copied >> pgshift);
> > > > +		}
> > > > +	} else
> > > >  		*copied = __copy_from_user(data, (void __user *)vaddr,
> > > >  					   count) ? 0 : count;
> > > >  	if (kthread)
> > > > -- 
> > > > 2.7.0
> > > >     
> > >   
> >   
>
Yan Zhao March 19, 2020, 6:15 a.m. UTC | #6
On Thu, Mar 19, 2020 at 12:40:53PM +0800, Alex Williamson wrote:
> On Thu, 19 Mar 2020 00:15:33 -0400
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> > On Thu, Mar 19, 2020 at 12:01:00PM +0800, Alex Williamson wrote:
> > > On Wed, 18 Mar 2020 23:06:39 -0400
> > > Yan Zhao <yan.y.zhao@intel.com> wrote:
> > >   
> > > > On Thu, Mar 19, 2020 at 03:41:11AM +0800, Kirti Wankhede wrote:  
> > > > > VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > > > > - Start dirty pages tracking while migration is active
> > > > > - Stop dirty pages tracking.
> > > > > - Get dirty pages bitmap. Its user space application's responsibility to
> > > > >   copy content of dirty pages from source to destination during migration.
> > > > > 
> > > > > To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > > > > structure. Bitmap size is calculated considering smallest supported page
> > > > > size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > > > > 
> > > > > Bitmap is populated for already pinned pages when bitmap is allocated for
> > > > > a vfio_dma with the smallest supported page size. Update bitmap from
> > > > > pinning functions when tracking is enabled. When user application queries
> > > > > bitmap, check if requested page size is same as page size used to
> > > > > populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > > > > error.
> > > > > 
> > > > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > > > Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > > > ---
> > > > >  drivers/vfio/vfio_iommu_type1.c | 205 +++++++++++++++++++++++++++++++++++++++-
> > > > >  1 file changed, 203 insertions(+), 2 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > > > index 70aeab921d0f..d6417fb02174 100644
> > > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > > @@ -71,6 +71,7 @@ struct vfio_iommu {
> > > > >  	unsigned int		dma_avail;
> > > > >  	bool			v2;
> > > > >  	bool			nesting;
> > > > > +	bool			dirty_page_tracking;
> > > > >  };
> > > > >  
> > > > >  struct vfio_domain {
> > > > > @@ -91,6 +92,7 @@ struct vfio_dma {
> > > > >  	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> > > > >  	struct task_struct	*task;
> > > > >  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > > > > +	unsigned long		*bitmap;
> > > > >  };
> > > > >  
> > > > >  struct vfio_group {
> > > > > @@ -125,7 +127,10 @@ struct vfio_regions {
> > > > >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > > > >  					(!list_empty(&iommu->domain_list))
> > > > >  
> > > > > +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > > > > +
> > > > >  static int put_pfn(unsigned long pfn, int prot);
> > > > > +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> > > > >  
> > > > >  /*
> > > > >   * This code handles mapping and unmapping of user data buffers
> > > > > @@ -175,6 +180,55 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> > > > >  	rb_erase(&old->node, &iommu->dma_list);
> > > > >  }
> > > > >  
> > > > > +static int vfio_dma_bitmap_alloc(struct vfio_iommu *iommu, uint64_t pgsize)
> > > > > +{
> > > > > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > > > > +
> > > > > +	for (; n; n = rb_next(n)) {
> > > > > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > > > > +		struct rb_node *p;
> > > > > +		unsigned long npages = dma->size / pgsize;
> > > > > +
> > > > > +		dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> > > > > +		if (!dma->bitmap) {
> > > > > +			struct rb_node *p = rb_prev(n);
> > > > > +
> > > > > +			for (; p; p = rb_prev(p)) {
> > > > > +				struct vfio_dma *dma = rb_entry(n,
> > > > > +							struct vfio_dma, node);
> > > > > +
> > > > > +				kfree(dma->bitmap);
> > > > > +				dma->bitmap = NULL;
> > > > > +			}
> > > > > +			return -ENOMEM;
> > > > > +		}
> > > > > +
> > > > > +		if (RB_EMPTY_ROOT(&dma->pfn_list))
> > > > > +			continue;
> > > > > +
> > > > > +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> > > > > +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> > > > > +							 node);
> > > > > +
> > > > > +			bitmap_set(dma->bitmap,
> > > > > +					(vpfn->iova - dma->iova) / pgsize, 1);
> > > > > +		}
> > > > > +	}
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +static void vfio_dma_bitmap_free(struct vfio_iommu *iommu)
> > > > > +{
> > > > > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > > > > +
> > > > > +	for (; n; n = rb_next(n)) {
> > > > > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > > > > +
> > > > > +		kfree(dma->bitmap);
> > > > > +		dma->bitmap = NULL;
> > > > > +	}
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * Helper Functions for host iova-pfn list
> > > > >   */
> > > > > @@ -567,6 +621,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> > > > >  			vfio_unpin_page_external(dma, iova, do_accounting);
> > > > >  			goto pin_unwind;
> > > > >  		}
> > > > > +
> > > > > +		if (iommu->dirty_page_tracking) {
> > > > > +			unsigned long pgshift =
> > > > > +					 __ffs(vfio_pgsize_bitmap(iommu));
> > > > > +
> > > > > +			bitmap_set(dma->bitmap,
> > > > > +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> > > > > +		}
> > > > >  	}
> > > > >  
> > > > >  	ret = i;
> > > > > @@ -801,6 +863,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> > > > >  	vfio_unmap_unpin(iommu, dma, true);
> > > > >  	vfio_unlink_dma(iommu, dma);
> > > > >  	put_task_struct(dma->task);
> > > > > +	kfree(dma->bitmap);
> > > > >  	kfree(dma);
> > > > >  	iommu->dma_avail++;
> > > > >  }
> > > > > @@ -831,6 +894,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> > > > >  	return bitmap;
> > > > >  }
> > > > >  
> > > > > +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > > > > +				  size_t size, uint64_t pgsize,
> > > > > +				  unsigned char __user *bitmap)
> > > > > +{
> > > > > +	struct vfio_dma *dma;
> > > > > +	unsigned long pgshift = __ffs(pgsize);
> > > > > +	unsigned int npages, bitmap_size;
> > > > > +
> > > > > +	dma = vfio_find_dma(iommu, iova, 1);
> > > > > +
> > > > > +	if (!dma)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (dma->iova != iova || dma->size != size)
> > > > > +		return -EINVAL;
> > > > > +    
> > > > looks this size is passed from user. how can it ensure size always
> > > > equals to dma->size ?
> > > > 
> > > > shouldn't we iterate dma tree to look for dirty for whole range if a
> > > > single dma cannot meet them all?  
> > > 
> > > Please see the discussion on v12[1], the problem is with the alignment
> > > of DMA mapped regions versus the bitmap.  A DMA mapping only requires
> > > page alignment, so for example imagine a user requests the bitmap from
> > > page zero to 4GB, but we have a DMA mapping starting at 4KB.  We can't
> > > efficiently copy the bitmap tracked by the vfio_dma structure to the
> > > user buffer when it's shifted by 1 bit.  Adjacent mappings can also
> > > make for a very complicated implementation.  In the discussion linked
> > > we decided to compromise on a more simple implementation that requires
> > > the user to ask for a bitmap which exactly matches a single DMA
> > > mapping, which Kirti indicates is what we require to support QEMU.
> > > Later in the series, the unmap operation also makes this requirement
> > > when used with the flags to retrieve the dirty bitmap.  Thanks,
> > >  
> > 
> > so, what about for vIOMMU enabling case?
> > if IOVAs are mapped per page, then there's a log_sync in qemu,
> > it's supposed for range from 0-U64MAX, qemu has to find out which
> > ones are mapped and cut them into pages before calling this IOCTL?
> > And what if those IOVAs are mapped for len more than one page?
> 
> Good question.  Kirti?
> 
> > > [1] https://lore.kernel.org/kvm/20200218215330.5bc8fc6a@w520.home/
> > >    
> > > > > +	npages = dma->size >> pgshift;
> > > > > +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > > > > +
> > > > > +	/* mark all pages dirty if all pages are pinned and mapped. */
> > > > > +	if (dma->iommu_mapped)
> > > > > +		bitmap_set(dma->bitmap, 0, npages);
> > > > > +
> > > > > +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> > > > > +		return -EFAULT;
> > > > > +
Here, dma->bitmap needs to be cleared, right?

> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> > > > > +{
> > > > > +	uint64_t bsize;
> > > > > +
> > > > > +	if (!npages || !bitmap_size || bitmap_size > UINT_MAX)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	bsize = DIRTY_BITMAP_BYTES(npages);
> > > > > +
> > > > > +	if (bitmap_size < bsize)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > >  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > > > >  			     struct vfio_iommu_type1_dma_unmap *unmap)
> > > > >  {
> > > > > @@ -2278,6 +2385,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > > >  
> > > > >  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> > > > >  			-EFAULT : 0;
> > > > > +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> > > > > +		struct vfio_iommu_type1_dirty_bitmap dirty;
> > > > > +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> > > > > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> > > > > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> > > > > +		int ret = 0;
> > > > > +
> > > > > +		if (!iommu->v2)
> > > > > +			return -EACCES;
> > > > > +
> > > > > +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> > > > > +				    flags);
> > > > > +
> > > > > +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> > > > > +			return -EFAULT;
> > > > > +
> > > > > +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> > > > > +			return -EINVAL;
> > > > > +
> > > > > +		/* only one flag should be set at a time */
> > > > > +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> > > > > +			return -EINVAL;
> > > > > +
> > > > > +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> > > > > +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > > > > +
> > > > > +			mutex_lock(&iommu->lock);
> > > > > +			if (!iommu->dirty_page_tracking) {
> > > > > +				ret = vfio_dma_bitmap_alloc(iommu, pgsize);
> > > > > +				if (!ret)
> > > > > +					iommu->dirty_page_tracking = true;
> > > > > +			}
> > > > > +			mutex_unlock(&iommu->lock);
> > > > > +			return ret;
> > > > > +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> > > > > +			mutex_lock(&iommu->lock);
> > > > > +			if (iommu->dirty_page_tracking) {
> > > > > +				iommu->dirty_page_tracking = false;
> > > > > +				vfio_dma_bitmap_free(iommu);
> > > > > +			}
> > > > > +			mutex_unlock(&iommu->lock);
> > > > > +			return 0;
> > > > > +		} else if (dirty.flags &
> > > > > +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> > > > > +			struct vfio_iommu_type1_dirty_bitmap_get range;
> > > > > +			unsigned long pgshift;
> > > > > +			size_t data_size = dirty.argsz - minsz;
> > > > > +			uint64_t iommu_pgsize =
> > > > > +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > > > > +
> > > > > +			if (!data_size || data_size < sizeof(range))
> > > > > +				return -EINVAL;
> > > > > +
> > > > > +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> > > > > +					   sizeof(range)))
> > > > > +				return -EFAULT;
> > > > > +
> > > > > +			/* allow only min supported pgsize */
> > > > > +			if (range.bitmap.pgsize != iommu_pgsize)
> > > > > +				return -EINVAL;
> > > > > +			if (range.iova & (iommu_pgsize - 1))
> > > > > +				return -EINVAL;
> > > > > +			if (!range.size || range.size & (iommu_pgsize - 1))
> > > > > +				return -EINVAL;
> > > > > +			if (range.iova + range.size < range.iova)
> > > > > +				return -EINVAL;
> > > > > +			if (!access_ok((void __user *)range.bitmap.data,
> > > > > +				       range.bitmap.size))
> > > > > +				return -EINVAL;
> > > > > +
> > > > > +			pgshift = __ffs(range.bitmap.pgsize);
> > > > > +			ret = verify_bitmap_size(range.size >> pgshift,
> > > > > +						 range.bitmap.size);
> > > > > +			if (ret)
> > > > > +				return ret;
> > > > > +
> > > > > +			mutex_lock(&iommu->lock);
> > > > > +			if (iommu->dirty_page_tracking)
> > > > > +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> > > > > +					 range.size, range.bitmap.pgsize,
> > > > > +				    (unsigned char __user *)range.bitmap.data);
> > > > > +			else
> > > > > +				ret = -EINVAL;
> > > > > +			mutex_unlock(&iommu->lock);
> > > > > +
> > > > > +			return ret;
> > > > > +		}
> > > > >  	}
> > > > >  
> > > > >  	return -ENOTTY;
> > > > > @@ -2345,10 +2539,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> > > > >  
> > > > >  	vaddr = dma->vaddr + offset;
> > > > >  
> > > > > -	if (write)
> > > > > +	if (write) {
> > > > >  		*copied = __copy_to_user((void __user *)vaddr, data,
> > > > >  					 count) ? 0 : count;
> > > > > -	else
> > > > > +		if (*copied && iommu->dirty_page_tracking) {
> > > > > +			unsigned long pgshift =
> > > > > +				__ffs(vfio_pgsize_bitmap(iommu));
> > > > > +
> > > > > +			bitmap_set(dma->bitmap, offset >> pgshift,
> > > > > +				   *copied >> pgshift);
> > > > > +		}
> > > > > +	} else
> > > > >  		*copied = __copy_from_user(data, (void __user *)vaddr,
> > > > >  					   count) ? 0 : count;
> > > > >  	if (kthread)
> > > > > -- 
> > > > > 2.7.0
> > > > >     
> > > >   
> > >   
> > 
>
Alex Williamson March 19, 2020, 1:06 p.m. UTC | #7
On Thu, 19 Mar 2020 02:15:34 -0400
Yan Zhao <yan.y.zhao@intel.com> wrote:

> On Thu, Mar 19, 2020 at 12:40:53PM +0800, Alex Williamson wrote:
> > On Thu, 19 Mar 2020 00:15:33 -0400
> > Yan Zhao <yan.y.zhao@intel.com> wrote:
> >   
> > > On Thu, Mar 19, 2020 at 12:01:00PM +0800, Alex Williamson wrote:  
> > > > On Wed, 18 Mar 2020 23:06:39 -0400
> > > > Yan Zhao <yan.y.zhao@intel.com> wrote:
> > > >     
> > > > > On Thu, Mar 19, 2020 at 03:41:11AM +0800, Kirti Wankhede wrote:    
> > > > > > VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > > > > > - Start dirty pages tracking while migration is active
> > > > > > - Stop dirty pages tracking.
> > > > > > - Get dirty pages bitmap. Its user space application's responsibility to
> > > > > >   copy content of dirty pages from source to destination during migration.
> > > > > > 
> > > > > > To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> > > > > > structure. Bitmap size is calculated considering smallest supported page
> > > > > > size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> > > > > > 
> > > > > > Bitmap is populated for already pinned pages when bitmap is allocated for
> > > > > > a vfio_dma with the smallest supported page size. Update bitmap from
> > > > > > pinning functions when tracking is enabled. When user application queries
> > > > > > bitmap, check if requested page size is same as page size used to
> > > > > > populated bitmap. If it is equal, copy bitmap, but if not equal, return
> > > > > > error.
> > > > > > 
> > > > > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > > > > Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > > > > ---
> > > > > >  drivers/vfio/vfio_iommu_type1.c | 205 +++++++++++++++++++++++++++++++++++++++-
> > > > > >  1 file changed, 203 insertions(+), 2 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > > > > index 70aeab921d0f..d6417fb02174 100644
> > > > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > > > @@ -71,6 +71,7 @@ struct vfio_iommu {
> > > > > >  	unsigned int		dma_avail;
> > > > > >  	bool			v2;
> > > > > >  	bool			nesting;
> > > > > > +	bool			dirty_page_tracking;
> > > > > >  };
> > > > > >  
> > > > > >  struct vfio_domain {
> > > > > > @@ -91,6 +92,7 @@ struct vfio_dma {
> > > > > >  	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> > > > > >  	struct task_struct	*task;
> > > > > >  	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> > > > > > +	unsigned long		*bitmap;
> > > > > >  };
> > > > > >  
> > > > > >  struct vfio_group {
> > > > > > @@ -125,7 +127,10 @@ struct vfio_regions {
> > > > > >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > > > > >  					(!list_empty(&iommu->domain_list))
> > > > > >  
> > > > > > +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> > > > > > +
> > > > > >  static int put_pfn(unsigned long pfn, int prot);
> > > > > > +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> > > > > >  
> > > > > >  /*
> > > > > >   * This code handles mapping and unmapping of user data buffers
> > > > > > @@ -175,6 +180,55 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> > > > > >  	rb_erase(&old->node, &iommu->dma_list);
> > > > > >  }
> > > > > >  
> > > > > > +static int vfio_dma_bitmap_alloc(struct vfio_iommu *iommu, uint64_t pgsize)
> > > > > > +{
> > > > > > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > > > > > +
> > > > > > +	for (; n; n = rb_next(n)) {
> > > > > > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > > > > > +		struct rb_node *p;
> > > > > > +		unsigned long npages = dma->size / pgsize;
> > > > > > +
> > > > > > +		dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> > > > > > +		if (!dma->bitmap) {
> > > > > > +			struct rb_node *p = rb_prev(n);
> > > > > > +
> > > > > > +			for (; p; p = rb_prev(p)) {
> > > > > > +				struct vfio_dma *dma = rb_entry(n,
> > > > > > +							struct vfio_dma, node);
> > > > > > +
> > > > > > +				kfree(dma->bitmap);
> > > > > > +				dma->bitmap = NULL;
> > > > > > +			}
> > > > > > +			return -ENOMEM;
> > > > > > +		}
> > > > > > +
> > > > > > +		if (RB_EMPTY_ROOT(&dma->pfn_list))
> > > > > > +			continue;
> > > > > > +
> > > > > > +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> > > > > > +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> > > > > > +							 node);
> > > > > > +
> > > > > > +			bitmap_set(dma->bitmap,
> > > > > > +					(vpfn->iova - dma->iova) / pgsize, 1);
> > > > > > +		}
> > > > > > +	}
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +static void vfio_dma_bitmap_free(struct vfio_iommu *iommu)
> > > > > > +{
> > > > > > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > > > > > +
> > > > > > +	for (; n; n = rb_next(n)) {
> > > > > > +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > > > > > +
> > > > > > +		kfree(dma->bitmap);
> > > > > > +		dma->bitmap = NULL;
> > > > > > +	}
> > > > > > +}
> > > > > > +
> > > > > >  /*
> > > > > >   * Helper Functions for host iova-pfn list
> > > > > >   */
> > > > > > @@ -567,6 +621,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> > > > > >  			vfio_unpin_page_external(dma, iova, do_accounting);
> > > > > >  			goto pin_unwind;
> > > > > >  		}
> > > > > > +
> > > > > > +		if (iommu->dirty_page_tracking) {
> > > > > > +			unsigned long pgshift =
> > > > > > +					 __ffs(vfio_pgsize_bitmap(iommu));
> > > > > > +
> > > > > > +			bitmap_set(dma->bitmap,
> > > > > > +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> > > > > > +		}
> > > > > >  	}
> > > > > >  
> > > > > >  	ret = i;
> > > > > > @@ -801,6 +863,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> > > > > >  	vfio_unmap_unpin(iommu, dma, true);
> > > > > >  	vfio_unlink_dma(iommu, dma);
> > > > > >  	put_task_struct(dma->task);
> > > > > > +	kfree(dma->bitmap);
> > > > > >  	kfree(dma);
> > > > > >  	iommu->dma_avail++;
> > > > > >  }
> > > > > > @@ -831,6 +894,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> > > > > >  	return bitmap;
> > > > > >  }
> > > > > >  
> > > > > > +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > > > > > +				  size_t size, uint64_t pgsize,
> > > > > > +				  unsigned char __user *bitmap)
> > > > > > +{
> > > > > > +	struct vfio_dma *dma;
> > > > > > +	unsigned long pgshift = __ffs(pgsize);
> > > > > > +	unsigned int npages, bitmap_size;
> > > > > > +
> > > > > > +	dma = vfio_find_dma(iommu, iova, 1);
> > > > > > +
> > > > > > +	if (!dma)
> > > > > > +		return -EINVAL;
> > > > > > +
> > > > > > +	if (dma->iova != iova || dma->size != size)
> > > > > > +		return -EINVAL;
> > > > > > +      
> > > > > looks this size is passed from user. how can it ensure size always
> > > > > equals to dma->size ?
> > > > > 
> > > > > shouldn't we iterate dma tree to look for dirty for whole range if a
> > > > > single dma cannot meet them all?    
> > > > 
> > > > Please see the discussion on v12[1], the problem is with the alignment
> > > > of DMA mapped regions versus the bitmap.  A DMA mapping only requires
> > > > page alignment, so for example imagine a user requests the bitmap from
> > > > page zero to 4GB, but we have a DMA mapping starting at 4KB.  We can't
> > > > efficiently copy the bitmap tracked by the vfio_dma structure to the
> > > > user buffer when it's shifted by 1 bit.  Adjacent mappings can also
> > > > make for a very complicated implementation.  In the discussion linked
> > > > we decided to compromise on a more simple implementation that requires
> > > > the user to ask for a bitmap which exactly matches a single DMA
> > > > mapping, which Kirti indicates is what we require to support QEMU.
> > > > Later in the series, the unmap operation also makes this requirement
> > > > when used with the flags to retrieve the dirty bitmap.  Thanks,
> > > >    
> > > 
> > > so, what about for vIOMMU enabling case?
> > > if IOVAs are mapped per page, then there's a log_sync in qemu,
> > > it's supposed for range from 0-U64MAX, qemu has to find out which
> > > ones are mapped and cut them into pages before calling this IOCTL?
> > > And what if those IOVAs are mapped for len more than one page?  
> > 
> > Good question.  Kirti?
> >   
> > > > [1] https://lore.kernel.org/kvm/20200218215330.5bc8fc6a@w520.home/
> > > >      
> > > > > > +	npages = dma->size >> pgshift;
> > > > > > +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> > > > > > +
> > > > > > +	/* mark all pages dirty if all pages are pinned and mapped. */
> > > > > > +	if (dma->iommu_mapped)
> > > > > > +		bitmap_set(dma->bitmap, 0, npages);
> > > > > > +
> > > > > > +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> > > > > > +		return -EFAULT;
> > > > > > +  
> Here, dma->bitmap needs to be cleared. right?

Ah, I missed re-checking this in my review.  v13 did clear it, but I
noted that we need to re-populate any currently pinned pages.  This
neither clears nor repopulates.  That's wrong.  Thanks,

Alex
 
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> > > > > > +{
> > > > > > +	uint64_t bsize;
> > > > > > +
> > > > > > +	if (!npages || !bitmap_size || bitmap_size > UINT_MAX)
> > > > > > +		return -EINVAL;
> > > > > > +
> > > > > > +	bsize = DIRTY_BITMAP_BYTES(npages);
> > > > > > +
> > > > > > +	if (bitmap_size < bsize)
> > > > > > +		return -EINVAL;
> > > > > > +
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > >  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > > > > >  			     struct vfio_iommu_type1_dma_unmap *unmap)
> > > > > >  {
> > > > > > @@ -2278,6 +2385,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > > > >  
> > > > > >  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> > > > > >  			-EFAULT : 0;
> > > > > > +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> > > > > > +		struct vfio_iommu_type1_dirty_bitmap dirty;
> > > > > > +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> > > > > > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> > > > > > +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> > > > > > +		int ret = 0;
> > > > > > +
> > > > > > +		if (!iommu->v2)
> > > > > > +			return -EACCES;
> > > > > > +
> > > > > > +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> > > > > > +				    flags);
> > > > > > +
> > > > > > +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> > > > > > +			return -EFAULT;
> > > > > > +
> > > > > > +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> > > > > > +			return -EINVAL;
> > > > > > +
> > > > > > +		/* only one flag should be set at a time */
> > > > > > +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> > > > > > +			return -EINVAL;
> > > > > > +
> > > > > > +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> > > > > > +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > > > > > +
> > > > > > +			mutex_lock(&iommu->lock);
> > > > > > +			if (!iommu->dirty_page_tracking) {
> > > > > > +				ret = vfio_dma_bitmap_alloc(iommu, pgsize);
> > > > > > +				if (!ret)
> > > > > > +					iommu->dirty_page_tracking = true;
> > > > > > +			}
> > > > > > +			mutex_unlock(&iommu->lock);
> > > > > > +			return ret;
> > > > > > +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> > > > > > +			mutex_lock(&iommu->lock);
> > > > > > +			if (iommu->dirty_page_tracking) {
> > > > > > +				iommu->dirty_page_tracking = false;
> > > > > > +				vfio_dma_bitmap_free(iommu);
> > > > > > +			}
> > > > > > +			mutex_unlock(&iommu->lock);
> > > > > > +			return 0;
> > > > > > +		} else if (dirty.flags &
> > > > > > +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> > > > > > +			struct vfio_iommu_type1_dirty_bitmap_get range;
> > > > > > +			unsigned long pgshift;
> > > > > > +			size_t data_size = dirty.argsz - minsz;
> > > > > > +			uint64_t iommu_pgsize =
> > > > > > +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > > > > > +
> > > > > > +			if (!data_size || data_size < sizeof(range))
> > > > > > +				return -EINVAL;
> > > > > > +
> > > > > > +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> > > > > > +					   sizeof(range)))
> > > > > > +				return -EFAULT;
> > > > > > +
> > > > > > +			/* allow only min supported pgsize */
> > > > > > +			if (range.bitmap.pgsize != iommu_pgsize)
> > > > > > +				return -EINVAL;
> > > > > > +			if (range.iova & (iommu_pgsize - 1))
> > > > > > +				return -EINVAL;
> > > > > > +			if (!range.size || range.size & (iommu_pgsize - 1))
> > > > > > +				return -EINVAL;
> > > > > > +			if (range.iova + range.size < range.iova)
> > > > > > +				return -EINVAL;
> > > > > > +			if (!access_ok((void __user *)range.bitmap.data,
> > > > > > +				       range.bitmap.size))
> > > > > > +				return -EINVAL;
> > > > > > +
> > > > > > +			pgshift = __ffs(range.bitmap.pgsize);
> > > > > > +			ret = verify_bitmap_size(range.size >> pgshift,
> > > > > > +						 range.bitmap.size);
> > > > > > +			if (ret)
> > > > > > +				return ret;
> > > > > > +
> > > > > > +			mutex_lock(&iommu->lock);
> > > > > > +			if (iommu->dirty_page_tracking)
> > > > > > +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> > > > > > +					 range.size, range.bitmap.pgsize,
> > > > > > +				    (unsigned char __user *)range.bitmap.data);
> > > > > > +			else
> > > > > > +				ret = -EINVAL;
> > > > > > +			mutex_unlock(&iommu->lock);
> > > > > > +
> > > > > > +			return ret;
> > > > > > +		}
> > > > > >  	}
> > > > > >  
> > > > > >  	return -ENOTTY;
> > > > > > @@ -2345,10 +2539,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> > > > > >  
> > > > > >  	vaddr = dma->vaddr + offset;
> > > > > >  
> > > > > > -	if (write)
> > > > > > +	if (write) {
> > > > > >  		*copied = __copy_to_user((void __user *)vaddr, data,
> > > > > >  					 count) ? 0 : count;
> > > > > > -	else
> > > > > > +		if (*copied && iommu->dirty_page_tracking) {
> > > > > > +			unsigned long pgshift =
> > > > > > +				__ffs(vfio_pgsize_bitmap(iommu));
> > > > > > +
> > > > > > +			bitmap_set(dma->bitmap, offset >> pgshift,
> > > > > > +				   *copied >> pgshift);
> > > > > > +		}
> > > > > > +	} else
> > > > > >  		*copied = __copy_from_user(data, (void __user *)vaddr,
> > > > > >  					   count) ? 0 : count;
> > > > > >  	if (kthread)
> > > > > > -- 
> > > > > > 2.7.0
> > > > > >       
> > > > >     
> > > >     
> > >   
> >   
>
Kirti Wankhede March 19, 2020, 2:52 p.m. UTC | #8
On 3/19/2020 9:15 AM, Alex Williamson wrote:
> On Thu, 19 Mar 2020 01:11:11 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
>> - Start dirty pages tracking while migration is active
>> - Stop dirty pages tracking.
>> - Get dirty pages bitmap. Its user space application's responsibility to
>>    copy content of dirty pages from source to destination during migration.
>>
>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
>> structure. Bitmap size is calculated considering smallest supported page
>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
>>
>> Bitmap is populated for already pinned pages when bitmap is allocated for
>> a vfio_dma with the smallest supported page size. Update bitmap from
>> pinning functions when tracking is enabled. When user application queries
>> bitmap, check if requested page size is same as page size used to
>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
>> error.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> ---
>>   drivers/vfio/vfio_iommu_type1.c | 205 +++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 203 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 70aeab921d0f..d6417fb02174 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -71,6 +71,7 @@ struct vfio_iommu {
>>   	unsigned int		dma_avail;
>>   	bool			v2;
>>   	bool			nesting;
>> +	bool			dirty_page_tracking;
>>   };
>>   
>>   struct vfio_domain {
>> @@ -91,6 +92,7 @@ struct vfio_dma {
>>   	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
>>   	struct task_struct	*task;
>>   	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
>> +	unsigned long		*bitmap;
> 
> We've made the bitmap a width invariant u64 else, should be here as
> well.
> 
>>   };
>>   
>>   struct vfio_group {
>> @@ -125,7 +127,10 @@ struct vfio_regions {
>>   #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>>   					(!list_empty(&iommu->domain_list))
>>   
>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
>> +
>>   static int put_pfn(unsigned long pfn, int prot);
>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
>>   
>>   /*
>>    * This code handles mapping and unmapping of user data buffers
>> @@ -175,6 +180,55 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>>   	rb_erase(&old->node, &iommu->dma_list);
>>   }
>>   
>> +static int vfio_dma_bitmap_alloc(struct vfio_iommu *iommu, uint64_t pgsize)
>> +{
>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>> +
>> +	for (; n; n = rb_next(n)) {
>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>> +		struct rb_node *p;
>> +		unsigned long npages = dma->size / pgsize;
>> +
>> +		dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
>> +		if (!dma->bitmap) {
>> +			struct rb_node *p = rb_prev(n);
>> +
>> +			for (; p; p = rb_prev(p)) {
>> +				struct vfio_dma *dma = rb_entry(n,
>> +							struct vfio_dma, node);
>> +
>> +				kfree(dma->bitmap);
>> +				dma->bitmap = NULL;
>> +			}
>> +			return -ENOMEM;
>> +		}
>> +
>> +		if (RB_EMPTY_ROOT(&dma->pfn_list))
>> +			continue;
>> +
>> +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
>> +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
>> +							 node);
>> +
>> +			bitmap_set(dma->bitmap,
>> +					(vpfn->iova - dma->iova) / pgsize, 1);
>> +		}
>> +	}
>> +	return 0;
>> +}
>> +
>> +static void vfio_dma_bitmap_free(struct vfio_iommu *iommu)
>> +{
>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>> +
>> +	for (; n; n = rb_next(n)) {
>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>> +
>> +		kfree(dma->bitmap);
>> +		dma->bitmap = NULL;
>> +	}
>> +}
>> +
>>   /*
>>    * Helper Functions for host iova-pfn list
>>    */
>> @@ -567,6 +621,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>   			vfio_unpin_page_external(dma, iova, do_accounting);
>>   			goto pin_unwind;
>>   		}
>> +
>> +		if (iommu->dirty_page_tracking) {
>> +			unsigned long pgshift =
>> +					 __ffs(vfio_pgsize_bitmap(iommu));
>> +
>> +			bitmap_set(dma->bitmap,
>> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
>> +		}
>>   	}
>>   
>>   	ret = i;
>> @@ -801,6 +863,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>>   	vfio_unmap_unpin(iommu, dma, true);
>>   	vfio_unlink_dma(iommu, dma);
>>   	put_task_struct(dma->task);
>> +	kfree(dma->bitmap);
>>   	kfree(dma);
>>   	iommu->dma_avail++;
>>   }
>> @@ -831,6 +894,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>>   	return bitmap;
>>   }
>>   
>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
>> +				  size_t size, uint64_t pgsize,
>> +				  unsigned char __user *bitmap)
> 
> And here, why do callers cast to an unsigned char pointer when we're
> going to cast to a void pointer anyway?  Should be a u64 __user pointer.
> 
>> +{
>> +	struct vfio_dma *dma;
>> +	unsigned long pgshift = __ffs(pgsize);
>> +	unsigned int npages, bitmap_size;
>> +
>> +	dma = vfio_find_dma(iommu, iova, 1);
>> +
>> +	if (!dma)
>> +		return -EINVAL;
>> +
>> +	if (dma->iova != iova || dma->size != size)
>> +		return -EINVAL;
>> +
>> +	npages = dma->size >> pgshift;
>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
>> +
>> +	/* mark all pages dirty if all pages are pinned and mapped. */
>> +	if (dma->iommu_mapped)
>> +		bitmap_set(dma->bitmap, 0, npages);
>> +
>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
>> +		return -EFAULT;
>> +
>> +	return 0;
>> +}
>> +
>> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
>> +{
>> +	uint64_t bsize;
>> +
>> +	if (!npages || !bitmap_size || bitmap_size > UINT_MAX)
> 
> As commented previously, how do we derive this UINT_MAX limitation?
> 

Sorry, I missed that earlier.

 > UINT_MAX seems arbitrary, is this specified in our API?  The size of a
 > vfio_dma is limited to what the user is able to pin, and therefore
 > their locked memory limit, but do we have an explicit limit elsewhere
 > that results in this limit here.  I think a 4GB bitmap would track
 > something like 2^47 bytes of memory, that's pretty excessive, but still
 > an arbitrary limit.

There has to be some upper limit check. In core KVM, in
virt/kvm/kvm_main.c, there is a check on the maximum number of pages:

if (new.npages > KVM_MEM_MAX_NR_PAGES)

Where
/*
  * Some of the bitops functions do not support too long bitmaps.
  * This number must be determined not to exceed such limits.
  */
#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)

Though I don't know which bitops functions do not support long bitmaps.

Could we do something similar to the above, or use the 4GB bitmap limit
you mentioned, that is, U32_MAX instead of UINT_MAX?

>> +		return -EINVAL;
>> +
>> +	bsize = DIRTY_BITMAP_BYTES(npages);
>> +
>> +	if (bitmap_size < bsize)
>> +		return -EINVAL;
>> +
>> +	return 0;
>> +}
>> +
>>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>   			     struct vfio_iommu_type1_dma_unmap *unmap)
>>   {
> 
> We didn't address that vfio_dma_do_map() needs to kvzalloc a bitmap for
> any new vfio_dma created while iommu->dirty_page_tracking = true.
> 

Good point. Adding it.

Thanks,
Kirti

>> @@ -2278,6 +2385,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>>   
>>   		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>>   			-EFAULT : 0;
>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
>> +		struct vfio_iommu_type1_dirty_bitmap dirty;
>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
>> +		int ret = 0;
>> +
>> +		if (!iommu->v2)
>> +			return -EACCES;
>> +
>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
>> +				    flags);
>> +
>> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
>> +			return -EINVAL;
>> +
>> +		/* only one flag should be set at a time */
>> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
>> +			return -EINVAL;
>> +
>> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
>> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
>> +
>> +			mutex_lock(&iommu->lock);
>> +			if (!iommu->dirty_page_tracking) {
>> +				ret = vfio_dma_bitmap_alloc(iommu, pgsize);
>> +				if (!ret)
>> +					iommu->dirty_page_tracking = true;
>> +			}
>> +			mutex_unlock(&iommu->lock);
>> +			return ret;
>> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
>> +			mutex_lock(&iommu->lock);
>> +			if (iommu->dirty_page_tracking) {
>> +				iommu->dirty_page_tracking = false;
>> +				vfio_dma_bitmap_free(iommu);
>> +			}
>> +			mutex_unlock(&iommu->lock);
>> +			return 0;
>> +		} else if (dirty.flags &
>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
>> +			struct vfio_iommu_type1_dirty_bitmap_get range;
>> +			unsigned long pgshift;
>> +			size_t data_size = dirty.argsz - minsz;
>> +			uint64_t iommu_pgsize =
>> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
>> +
>> +			if (!data_size || data_size < sizeof(range))
>> +				return -EINVAL;
>> +
>> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
>> +					   sizeof(range)))
>> +				return -EFAULT;
>> +
>> +			/* allow only min supported pgsize */
>> +			if (range.bitmap.pgsize != iommu_pgsize)
>> +				return -EINVAL;
>> +			if (range.iova & (iommu_pgsize - 1))
>> +				return -EINVAL;
>> +			if (!range.size || range.size & (iommu_pgsize - 1))
>> +				return -EINVAL;
>> +			if (range.iova + range.size < range.iova)
>> +				return -EINVAL;
>> +			if (!access_ok((void __user *)range.bitmap.data,
>> +				       range.bitmap.size))
>> +				return -EINVAL;
>> +
>> +			pgshift = __ffs(range.bitmap.pgsize);
>> +			ret = verify_bitmap_size(range.size >> pgshift,
>> +						 range.bitmap.size);
>> +			if (ret)
>> +				return ret;
>> +
>> +			mutex_lock(&iommu->lock);
>> +			if (iommu->dirty_page_tracking)
>> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
>> +					 range.size, range.bitmap.pgsize,
>> +				    (unsigned char __user *)range.bitmap.data);
>> +			else
>> +				ret = -EINVAL;
>> +			mutex_unlock(&iommu->lock);
>> +
>> +			return ret;
>> +		}
>>   	}
>>   
>>   	return -ENOTTY;
>> @@ -2345,10 +2539,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>>   
>>   	vaddr = dma->vaddr + offset;
>>   
>> -	if (write)
>> +	if (write) {
>>   		*copied = __copy_to_user((void __user *)vaddr, data,
>>   					 count) ? 0 : count;
>> -	else
>> +		if (*copied && iommu->dirty_page_tracking) {
>> +			unsigned long pgshift =
>> +				__ffs(vfio_pgsize_bitmap(iommu));
>> +
>> +			bitmap_set(dma->bitmap, offset >> pgshift,
>> +				   *copied >> pgshift);
>> +		}
>> +	} else
> 
> Great, thanks for adding this!
> 
>>   		*copied = __copy_from_user(data, (void __user *)vaddr,
>>   					   count) ? 0 : count;
>>   	if (kthread)
>
Alex Williamson March 19, 2020, 4:22 p.m. UTC | #9
On Thu, 19 Mar 2020 20:22:41 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 3/19/2020 9:15 AM, Alex Williamson wrote:
> > On Thu, 19 Mar 2020 01:11:11 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> >> - Start dirty pages tracking while migration is active
> >> - Stop dirty pages tracking.
> >> - Get dirty pages bitmap. Its user space application's responsibility to
> >>    copy content of dirty pages from source to destination during migration.
> >>
> >> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> >> structure. Bitmap size is calculated considering smallest supported page
> >> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> >>
> >> Bitmap is populated for already pinned pages when bitmap is allocated for
> >> a vfio_dma with the smallest supported page size. Update bitmap from
> >> pinning functions when tracking is enabled. When user application queries
> >> bitmap, check if requested page size is same as page size used to
> >> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> >> error.
> >>
> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >> ---
> >>   drivers/vfio/vfio_iommu_type1.c | 205 +++++++++++++++++++++++++++++++++++++++-
> >>   1 file changed, 203 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 70aeab921d0f..d6417fb02174 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -71,6 +71,7 @@ struct vfio_iommu {
> >>   	unsigned int		dma_avail;
> >>   	bool			v2;
> >>   	bool			nesting;
> >> +	bool			dirty_page_tracking;
> >>   };
> >>   
> >>   struct vfio_domain {
> >> @@ -91,6 +92,7 @@ struct vfio_dma {
> >>   	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> >>   	struct task_struct	*task;
> >>   	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> >> +	unsigned long		*bitmap;  
> > 
> > We've made the bitmap a width invariant u64 else, should be here as
> > well.
> >   
> >>   };
> >>   
> >>   struct vfio_group {
> >> @@ -125,7 +127,10 @@ struct vfio_regions {
> >>   #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >>   					(!list_empty(&iommu->domain_list))
> >>   
> >> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> >> +
> >>   static int put_pfn(unsigned long pfn, int prot);
> >> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> >>   
> >>   /*
> >>    * This code handles mapping and unmapping of user data buffers
> >> @@ -175,6 +180,55 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> >>   	rb_erase(&old->node, &iommu->dma_list);
> >>   }
> >>   
> >> +static int vfio_dma_bitmap_alloc(struct vfio_iommu *iommu, uint64_t pgsize)
> >> +{
> >> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >> +
> >> +	for (; n; n = rb_next(n)) {
> >> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >> +		struct rb_node *p;
> >> +		unsigned long npages = dma->size / pgsize;
> >> +
> >> +		dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> >> +		if (!dma->bitmap) {
> >> +			struct rb_node *p = rb_prev(n);
> >> +
> >> +			for (; p; p = rb_prev(p)) {
> >> +				struct vfio_dma *dma = rb_entry(n,
> >> +							struct vfio_dma, node);
> >> +
> >> +				kfree(dma->bitmap);
> >> +				dma->bitmap = NULL;
> >> +			}
> >> +			return -ENOMEM;
> >> +		}
> >> +
> >> +		if (RB_EMPTY_ROOT(&dma->pfn_list))
> >> +			continue;
> >> +
> >> +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> >> +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> >> +							 node);
> >> +
> >> +			bitmap_set(dma->bitmap,
> >> +					(vpfn->iova - dma->iova) / pgsize, 1);
> >> +		}
> >> +	}
> >> +	return 0;
> >> +}
> >> +
> >> +static void vfio_dma_bitmap_free(struct vfio_iommu *iommu)
> >> +{
> >> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >> +
> >> +	for (; n; n = rb_next(n)) {
> >> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >> +
> >> +		kfree(dma->bitmap);
> >> +		dma->bitmap = NULL;
> >> +	}
> >> +}
> >> +
> >>   /*
> >>    * Helper Functions for host iova-pfn list
> >>    */
> >> @@ -567,6 +621,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>   			vfio_unpin_page_external(dma, iova, do_accounting);
> >>   			goto pin_unwind;
> >>   		}
> >> +
> >> +		if (iommu->dirty_page_tracking) {
> >> +			unsigned long pgshift =
> >> +					 __ffs(vfio_pgsize_bitmap(iommu));
> >> +
> >> +			bitmap_set(dma->bitmap,
> >> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> >> +		}
> >>   	}
> >>   
> >>   	ret = i;
> >> @@ -801,6 +863,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> >>   	vfio_unmap_unpin(iommu, dma, true);
> >>   	vfio_unlink_dma(iommu, dma);
> >>   	put_task_struct(dma->task);
> >> +	kfree(dma->bitmap);
> >>   	kfree(dma);
> >>   	iommu->dma_avail++;
> >>   }
> >> @@ -831,6 +894,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> >>   	return bitmap;
> >>   }
> >>   
> >> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> >> +				  size_t size, uint64_t pgsize,
> >> +				  unsigned char __user *bitmap)  
> > 
> > And here, why do callers cast to an unsigned char pointer when we're
> > going to cast to a void pointer anyway?  Should be a u64 __user pointer.
> >   
> >> +{
> >> +	struct vfio_dma *dma;
> >> +	unsigned long pgshift = __ffs(pgsize);
> >> +	unsigned int npages, bitmap_size;
> >> +
> >> +	dma = vfio_find_dma(iommu, iova, 1);
> >> +
> >> +	if (!dma)
> >> +		return -EINVAL;
> >> +
> >> +	if (dma->iova != iova || dma->size != size)
> >> +		return -EINVAL;
> >> +
> >> +	npages = dma->size >> pgshift;
> >> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> >> +
> >> +	/* mark all pages dirty if all pages are pinned and mapped. */
> >> +	if (dma->iommu_mapped)
> >> +		bitmap_set(dma->bitmap, 0, npages);
> >> +
> >> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> >> +		return -EFAULT;
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> >> +{
> >> +	uint64_t bsize;
> >> +
> >> +	if (!npages || !bitmap_size || bitmap_size > UINT_MAX)  
> > 
> > As commented previously, how do we derive this UINT_MAX limitation?
> >   
> 
> Sorry, I missed that earlier
> 
>  > UINT_MAX seems arbitrary, is this specified in our API?  The size of a
>  > vfio_dma is limited to what the user is able to pin, and therefore
>  > their locked memory limit, but do we have an explicit limit elsewhere
>  > that results in this limit here.  I think a 4GB bitmap would track
>  > something like 2^47 bytes of memory, that's pretty excessive, but still
>  > an arbitrary limit.  
> 
> There has to be some upper limit check. In core KVM, in
> virt/kvm/kvm_main.c there is max number of pages check:
> 
> if (new.npages > KVM_MEM_MAX_NR_PAGES)
> 
> Where
> /*
>   * Some of the bitops functions do not support too long bitmaps.
>   * This number must be determined not to exceed such limits.
>   */
> #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
> 
> Though I don't know which bitops functions do not support long bitmaps.
> 
> Something similar as above can be done or same as you also mentioned of 
> 4GB bitmap limit? that is U32_MAX instead of UINT_MAX?

Let's see, we use bitmap_set():

void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits)

So we're limited to an unsigned int number of bits, but for an
unaligned, multi-bit operation this will call __bitmap_set():

void __bitmap_set(unsigned long *map, unsigned int start, int len)

So we're down to a signed int number of bits (seems like an API bug in
bitops there), so it makes sense that KVM is testing against INT_MAX
number of pages, i.e. number of bits.  But that still suggests a bitmap
size of UINT_MAX is off by a factor of 16.  So 2^31 bits divided by
2^3 bits/byte yields a maximum bitmap size of 2^28 bytes (i.e. 256MB),
which maps 2^31 * 2^12 = 2^43 bytes (8TB) on a 4K system.

Let's fix the limit check and put a nice comment explaining it.  Thanks,

Alex

> >> +		return -EINVAL;
> >> +
> >> +	bsize = DIRTY_BITMAP_BYTES(npages);
> >> +
> >> +	if (bitmap_size < bsize)
> >> +		return -EINVAL;
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>   			     struct vfio_iommu_type1_dma_unmap *unmap)
> >>   {  
> > 
> > We didn't address that vfio_dma_do_map() needs to kvzalloc a bitmap for
> > any new vfio_dma created while iommu->dirty_page_tracking = true.
> >   
> 
> Good point. Adding it.
> 
> Thanks,
> Kirti
> 
> >> @@ -2278,6 +2385,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >>   
> >>   		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> >>   			-EFAULT : 0;
> >> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> >> +		struct vfio_iommu_type1_dirty_bitmap dirty;
> >> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> >> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> >> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> >> +		int ret = 0;
> >> +
> >> +		if (!iommu->v2)
> >> +			return -EACCES;
> >> +
> >> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> >> +				    flags);
> >> +
> >> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +
> >> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> >> +			return -EINVAL;
> >> +
> >> +		/* only one flag should be set at a time */
> >> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> >> +			return -EINVAL;
> >> +
> >> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> >> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> >> +
> >> +			mutex_lock(&iommu->lock);
> >> +			if (!iommu->dirty_page_tracking) {
> >> +				ret = vfio_dma_bitmap_alloc(iommu, pgsize);
> >> +				if (!ret)
> >> +					iommu->dirty_page_tracking = true;
> >> +			}
> >> +			mutex_unlock(&iommu->lock);
> >> +			return ret;
> >> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> >> +			mutex_lock(&iommu->lock);
> >> +			if (iommu->dirty_page_tracking) {
> >> +				iommu->dirty_page_tracking = false;
> >> +				vfio_dma_bitmap_free(iommu);
> >> +			}
> >> +			mutex_unlock(&iommu->lock);
> >> +			return 0;
> >> +		} else if (dirty.flags &
> >> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> >> +			struct vfio_iommu_type1_dirty_bitmap_get range;
> >> +			unsigned long pgshift;
> >> +			size_t data_size = dirty.argsz - minsz;
> >> +			uint64_t iommu_pgsize =
> >> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> >> +
> >> +			if (!data_size || data_size < sizeof(range))
> >> +				return -EINVAL;
> >> +
> >> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> >> +					   sizeof(range)))
> >> +				return -EFAULT;
> >> +
> >> +			/* allow only min supported pgsize */
> >> +			if (range.bitmap.pgsize != iommu_pgsize)
> >> +				return -EINVAL;
> >> +			if (range.iova & (iommu_pgsize - 1))
> >> +				return -EINVAL;
> >> +			if (!range.size || range.size & (iommu_pgsize - 1))
> >> +				return -EINVAL;
> >> +			if (range.iova + range.size < range.iova)
> >> +				return -EINVAL;
> >> +			if (!access_ok((void __user *)range.bitmap.data,
> >> +				       range.bitmap.size))
> >> +				return -EINVAL;
> >> +
> >> +			pgshift = __ffs(range.bitmap.pgsize);
> >> +			ret = verify_bitmap_size(range.size >> pgshift,
> >> +						 range.bitmap.size);
> >> +			if (ret)
> >> +				return ret;
> >> +
> >> +			mutex_lock(&iommu->lock);
> >> +			if (iommu->dirty_page_tracking)
> >> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> >> +					 range.size, range.bitmap.pgsize,
> >> +				    (unsigned char __user *)range.bitmap.data);
> >> +			else
> >> +				ret = -EINVAL;
> >> +			mutex_unlock(&iommu->lock);
> >> +
> >> +			return ret;
> >> +		}
> >>   	}
> >>   
> >>   	return -ENOTTY;
> >> @@ -2345,10 +2539,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> >>   
> >>   	vaddr = dma->vaddr + offset;
> >>   
> >> -	if (write)
> >> +	if (write) {
> >>   		*copied = __copy_to_user((void __user *)vaddr, data,
> >>   					 count) ? 0 : count;
> >> -	else
> >> +		if (*copied && iommu->dirty_page_tracking) {
> >> +			unsigned long pgshift =
> >> +				__ffs(vfio_pgsize_bitmap(iommu));
> >> +
> >> +			bitmap_set(dma->bitmap, offset >> pgshift,
> >> +				   *copied >> pgshift);
> >> +		}
> >> +	} else  
> > 
> > Great, thanks for adding this!
> >   
> >>   		*copied = __copy_from_user(data, (void __user *)vaddr,
> >>   					   count) ? 0 : count;
> >>   	if (kthread)  
> >   
>
Kirti Wankhede March 19, 2020, 4:57 p.m. UTC | #10
On 3/19/2020 6:36 PM, Alex Williamson wrote:
> On Thu, 19 Mar 2020 02:15:34 -0400
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
>> On Thu, Mar 19, 2020 at 12:40:53PM +0800, Alex Williamson wrote:
>>> On Thu, 19 Mar 2020 00:15:33 -0400
>>> Yan Zhao <yan.y.zhao@intel.com> wrote:
>>>    
>>>> On Thu, Mar 19, 2020 at 12:01:00PM +0800, Alex Williamson wrote:
>>>>> On Wed, 18 Mar 2020 23:06:39 -0400
>>>>> Yan Zhao <yan.y.zhao@intel.com> wrote:
>>>>>      
>>>>>> On Thu, Mar 19, 2020 at 03:41:11AM +0800, Kirti Wankhede wrote:
>>>>>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
>>>>>>> - Start dirty pages tracking while migration is active
>>>>>>> - Stop dirty pages tracking.
>>>>>>> - Get dirty pages bitmap. Its user space application's responsibility to
>>>>>>>    copy content of dirty pages from source to destination during migration.
>>>>>>>
>>>>>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
>>>>>>> structure. Bitmap size is calculated considering smallest supported page
>>>>>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
>>>>>>>
>>>>>>> Bitmap is populated for already pinned pages when bitmap is allocated for
>>>>>>> a vfio_dma with the smallest supported page size. Update bitmap from
>>>>>>> pinning functions when tracking is enabled. When user application queries
>>>>>>> bitmap, check if requested page size is same as page size used to
>>>>>>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
>>>>>>> error.
>>>>>>>
>>>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>>>>>>> ---
>>>>>>>   drivers/vfio/vfio_iommu_type1.c | 205 +++++++++++++++++++++++++++++++++++++++-
>>>>>>>   1 file changed, 203 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>>>>>> index 70aeab921d0f..d6417fb02174 100644
>>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>>>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
>>>>>>>   	unsigned int		dma_avail;
>>>>>>>   	bool			v2;
>>>>>>>   	bool			nesting;
>>>>>>> +	bool			dirty_page_tracking;
>>>>>>>   };
>>>>>>>   
>>>>>>>   struct vfio_domain {
>>>>>>> @@ -91,6 +92,7 @@ struct vfio_dma {
>>>>>>>   	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
>>>>>>>   	struct task_struct	*task;
>>>>>>>   	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
>>>>>>> +	unsigned long		*bitmap;
>>>>>>>   };
>>>>>>>   
>>>>>>>   struct vfio_group {
>>>>>>> @@ -125,7 +127,10 @@ struct vfio_regions {
>>>>>>>   #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>>>>>>>   					(!list_empty(&iommu->domain_list))
>>>>>>>   
>>>>>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
>>>>>>> +
>>>>>>>   static int put_pfn(unsigned long pfn, int prot);
>>>>>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
>>>>>>>   
>>>>>>>   /*
>>>>>>>    * This code handles mapping and unmapping of user data buffers
>>>>>>> @@ -175,6 +180,55 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>>>>>>>   	rb_erase(&old->node, &iommu->dma_list);
>>>>>>>   }
>>>>>>>   
>>>>>>> +static int vfio_dma_bitmap_alloc(struct vfio_iommu *iommu, uint64_t pgsize)
>>>>>>> +{
>>>>>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>>>>>>> +
>>>>>>> +	for (; n; n = rb_next(n)) {
>>>>>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>>>>>>> +		struct rb_node *p;
>>>>>>> +		unsigned long npages = dma->size / pgsize;
>>>>>>> +
>>>>>>> +		dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
>>>>>>> +		if (!dma->bitmap) {
>>>>>>> +			struct rb_node *p = rb_prev(n);
>>>>>>> +
>>>>>>> +			for (; p; p = rb_prev(p)) {
>>>>>>> +				struct vfio_dma *dma = rb_entry(n,
>>>>>>> +							struct vfio_dma, node);
>>>>>>> +
>>>>>>> +				kfree(dma->bitmap);
>>>>>>> +				dma->bitmap = NULL;
>>>>>>> +			}
>>>>>>> +			return -ENOMEM;
>>>>>>> +		}
>>>>>>> +
>>>>>>> +		if (RB_EMPTY_ROOT(&dma->pfn_list))
>>>>>>> +			continue;
>>>>>>> +
>>>>>>> +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
>>>>>>> +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
>>>>>>> +							 node);
>>>>>>> +
>>>>>>> +			bitmap_set(dma->bitmap,
>>>>>>> +					(vpfn->iova - dma->iova) / pgsize, 1);
>>>>>>> +		}
>>>>>>> +	}
>>>>>>> +	return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void vfio_dma_bitmap_free(struct vfio_iommu *iommu)
>>>>>>> +{
>>>>>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>>>>>>> +
>>>>>>> +	for (; n; n = rb_next(n)) {
>>>>>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>>>>>>> +
>>>>>>> +		kfree(dma->bitmap);
>>>>>>> +		dma->bitmap = NULL;
>>>>>>> +	}
>>>>>>> +}
>>>>>>> +
>>>>>>>   /*
>>>>>>>    * Helper Functions for host iova-pfn list
>>>>>>>    */
>>>>>>> @@ -567,6 +621,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>>>>>>   			vfio_unpin_page_external(dma, iova, do_accounting);
>>>>>>>   			goto pin_unwind;
>>>>>>>   		}
>>>>>>> +
>>>>>>> +		if (iommu->dirty_page_tracking) {
>>>>>>> +			unsigned long pgshift =
>>>>>>> +					 __ffs(vfio_pgsize_bitmap(iommu));
>>>>>>> +
>>>>>>> +			bitmap_set(dma->bitmap,
>>>>>>> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
>>>>>>> +		}
>>>>>>>   	}
>>>>>>>   
>>>>>>>   	ret = i;
>>>>>>> @@ -801,6 +863,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>>>>>>>   	vfio_unmap_unpin(iommu, dma, true);
>>>>>>>   	vfio_unlink_dma(iommu, dma);
>>>>>>>   	put_task_struct(dma->task);
>>>>>>> +	kfree(dma->bitmap);
>>>>>>>   	kfree(dma);
>>>>>>>   	iommu->dma_avail++;
>>>>>>>   }
>>>>>>> @@ -831,6 +894,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>>>>>>>   	return bitmap;
>>>>>>>   }
>>>>>>>   
>>>>>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
>>>>>>> +				  size_t size, uint64_t pgsize,
>>>>>>> +				  unsigned char __user *bitmap)
>>>>>>> +{
>>>>>>> +	struct vfio_dma *dma;
>>>>>>> +	unsigned long pgshift = __ffs(pgsize);
>>>>>>> +	unsigned int npages, bitmap_size;
>>>>>>> +
>>>>>>> +	dma = vfio_find_dma(iommu, iova, 1);
>>>>>>> +
>>>>>>> +	if (!dma)
>>>>>>> +		return -EINVAL;
>>>>>>> +
>>>>>>> +	if (dma->iova != iova || dma->size != size)
>>>>>>> +		return -EINVAL;
>>>>>>> +
>>>>>> looks this size is passed from user. how can it ensure size always
>>>>>> equals to dma->size ?
>>>>>>
>>>>>> shouldn't we iterate dma tree to look for dirty for whole range if a
>>>>>> single dma cannot meet them all?
>>>>>
>>>>> Please see the discussion on v12[1], the problem is with the alignment
>>>>> of DMA mapped regions versus the bitmap.  A DMA mapping only requires
>>>>> page alignment, so for example imagine a user requests the bitmap from
>>>>> page zero to 4GB, but we have a DMA mapping starting at 4KB.  We can't
>>>>> efficiently copy the bitmap tracked by the vfio_dma structure to the
>>>>> user buffer when it's shifted by 1 bit.  Adjacent mappings can also
>>>>> make for a very complicated implementation.  In the discussion linked
>>>>> we decided to compromise on a more simple implementation that requires
>>>>> the user to ask for a bitmap which exactly matches a single DMA
>>>>> mapping, which Kirti indicates is what we require to support QEMU.
>>>>> Later in the series, the unmap operation also makes this requirement
>>>>> when used with the flags to retrieve the dirty bitmap.  Thanks,
>>>>>     
>>>>
>>>> so, what about for vIOMMU enabling case?
>>>> if IOVAs are mapped per page, then there's a log_sync in qemu,
>>>> it's supposed for range from 0-U64MAX, qemu has to find out which
>>>> ones are mapped and cut them into pages before calling this IOCTL?
>>>> And what if those IOVAs are mapped for len more than one page?
>>>
>>> Good question.  Kirti?
>>>

In log_sync with vIOMMU, loop over the range as follows:

- find the iotlb entry for iova and get iova_xlat
- size = iotlb.addr_mask + 1; this is the same calculation used when
mappings are created from vfio_iommu_map_notify()
- use <iova_xlat, size> for the VFIO_IOMMU_DIRTY_PAGES ioctl
- increment iova: iova += size
- repeat the above steps until the end of the range.

>>>>> [1] https://lore.kernel.org/kvm/20200218215330.5bc8fc6a@w520.home/
>>>>>       
>>>>>>> +	npages = dma->size >> pgshift;
>>>>>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
>>>>>>> +
>>>>>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
>>>>>>> +	if (dma->iommu_mapped)
>>>>>>> +		bitmap_set(dma->bitmap, 0, npages);
>>>>>>> +
>>>>>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
>>>>>>> +		return -EFAULT;
>>>>>>> +
>> Here, dma->bitmap needs to be cleared. right?
> 
> Ah, I missed re-checking this in my review.  v13 did clear it, but I
> noted that we need to re-populate any currently pinned pages.  This
> neither clears nor repopulates.  That's wrong.  Thanks,
> 

Why re-populate when there will be no change, since
vfio_iova_dirty_bitmap() is called with iommu->lock held? If a pin
request arrives while vfio_iova_dirty_bitmap() is still running, it will
wait until iommu->lock is released. The bitmap will be populated when the
page is pinned.

Thanks,
Kirti

> Alex
>   
>>>>>>> +	return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
>>>>>>> +{
>>>>>>> +	uint64_t bsize;
>>>>>>> +
>>>>>>> +	if (!npages || !bitmap_size || bitmap_size > UINT_MAX)
>>>>>>> +		return -EINVAL;
>>>>>>> +
>>>>>>> +	bsize = DIRTY_BITMAP_BYTES(npages);
>>>>>>> +
>>>>>>> +	if (bitmap_size < bsize)
>>>>>>> +		return -EINVAL;
>>>>>>> +
>>>>>>> +	return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>>>>>>   			     struct vfio_iommu_type1_dma_unmap *unmap)
>>>>>>>   {
>>>>>>> @@ -2278,6 +2385,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>>>>>>>   
>>>>>>>   		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>>>>>>>   			-EFAULT : 0;
>>>>>>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
>>>>>>> +		struct vfio_iommu_type1_dirty_bitmap dirty;
>>>>>>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
>>>>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
>>>>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
>>>>>>> +		int ret = 0;
>>>>>>> +
>>>>>>> +		if (!iommu->v2)
>>>>>>> +			return -EACCES;
>>>>>>> +
>>>>>>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
>>>>>>> +				    flags);
>>>>>>> +
>>>>>>> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
>>>>>>> +			return -EFAULT;
>>>>>>> +
>>>>>>> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
>>>>>>> +			return -EINVAL;
>>>>>>> +
>>>>>>> +		/* only one flag should be set at a time */
>>>>>>> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
>>>>>>> +			return -EINVAL;
>>>>>>> +
>>>>>>> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
>>>>>>> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
>>>>>>> +
>>>>>>> +			mutex_lock(&iommu->lock);
>>>>>>> +			if (!iommu->dirty_page_tracking) {
>>>>>>> +				ret = vfio_dma_bitmap_alloc(iommu, pgsize);
>>>>>>> +				if (!ret)
>>>>>>> +					iommu->dirty_page_tracking = true;
>>>>>>> +			}
>>>>>>> +			mutex_unlock(&iommu->lock);
>>>>>>> +			return ret;
>>>>>>> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
>>>>>>> +			mutex_lock(&iommu->lock);
>>>>>>> +			if (iommu->dirty_page_tracking) {
>>>>>>> +				iommu->dirty_page_tracking = false;
>>>>>>> +				vfio_dma_bitmap_free(iommu);
>>>>>>> +			}
>>>>>>> +			mutex_unlock(&iommu->lock);
>>>>>>> +			return 0;
>>>>>>> +		} else if (dirty.flags &
>>>>>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
>>>>>>> +			struct vfio_iommu_type1_dirty_bitmap_get range;
>>>>>>> +			unsigned long pgshift;
>>>>>>> +			size_t data_size = dirty.argsz - minsz;
>>>>>>> +			uint64_t iommu_pgsize =
>>>>>>> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
>>>>>>> +
>>>>>>> +			if (!data_size || data_size < sizeof(range))
>>>>>>> +				return -EINVAL;
>>>>>>> +
>>>>>>> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
>>>>>>> +					   sizeof(range)))
>>>>>>> +				return -EFAULT;
>>>>>>> +
>>>>>>> +			/* allow only min supported pgsize */
>>>>>>> +			if (range.bitmap.pgsize != iommu_pgsize)
>>>>>>> +				return -EINVAL;
>>>>>>> +			if (range.iova & (iommu_pgsize - 1))
>>>>>>> +				return -EINVAL;
>>>>>>> +			if (!range.size || range.size & (iommu_pgsize - 1))
>>>>>>> +				return -EINVAL;
>>>>>>> +			if (range.iova + range.size < range.iova)
>>>>>>> +				return -EINVAL;
>>>>>>> +			if (!access_ok((void __user *)range.bitmap.data,
>>>>>>> +				       range.bitmap.size))
>>>>>>> +				return -EINVAL;
>>>>>>> +
>>>>>>> +			pgshift = __ffs(range.bitmap.pgsize);
>>>>>>> +			ret = verify_bitmap_size(range.size >> pgshift,
>>>>>>> +						 range.bitmap.size);
>>>>>>> +			if (ret)
>>>>>>> +				return ret;
>>>>>>> +
>>>>>>> +			mutex_lock(&iommu->lock);
>>>>>>> +			if (iommu->dirty_page_tracking)
>>>>>>> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
>>>>>>> +					 range.size, range.bitmap.pgsize,
>>>>>>> +				    (unsigned char __user *)range.bitmap.data);
>>>>>>> +			else
>>>>>>> +				ret = -EINVAL;
>>>>>>> +			mutex_unlock(&iommu->lock);
>>>>>>> +
>>>>>>> +			return ret;
>>>>>>> +		}
>>>>>>>   	}
>>>>>>>   
>>>>>>>   	return -ENOTTY;
>>>>>>> @@ -2345,10 +2539,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
>>>>>>>   
>>>>>>>   	vaddr = dma->vaddr + offset;
>>>>>>>   
>>>>>>> -	if (write)
>>>>>>> +	if (write) {
>>>>>>>   		*copied = __copy_to_user((void __user *)vaddr, data,
>>>>>>>   					 count) ? 0 : count;
>>>>>>> -	else
>>>>>>> +		if (*copied && iommu->dirty_page_tracking) {
>>>>>>> +			unsigned long pgshift =
>>>>>>> +				__ffs(vfio_pgsize_bitmap(iommu));
>>>>>>> +
>>>>>>> +			bitmap_set(dma->bitmap, offset >> pgshift,
>>>>>>> +				   *copied >> pgshift);
>>>>>>> +		}
>>>>>>> +	} else
>>>>>>>   		*copied = __copy_from_user(data, (void __user *)vaddr,
>>>>>>>   					   count) ? 0 : count;
>>>>>>>   	if (kthread)
>>>>>>> -- 
>>>>>>> 2.7.0
>>>>>>>        
>>>>>>      
>>>>>      
>>>>    
>>>    
>>
>
Kirti Wankhede March 19, 2020, 6:57 p.m. UTC | #11
On 3/19/2020 9:15 AM, Alex Williamson wrote:
> On Thu, 19 Mar 2020 01:11:11 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
>> - Start dirty pages tracking while migration is active
>> - Stop dirty pages tracking.
>> - Get dirty pages bitmap. Its user space application's responsibility to
>>    copy content of dirty pages from source to destination during migration.
>>
>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
>> structure. Bitmap size is calculated considering smallest supported page
>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
>>
>> Bitmap is populated for already pinned pages when bitmap is allocated for
>> a vfio_dma with the smallest supported page size. Update bitmap from
>> pinning functions when tracking is enabled. When user application queries
>> bitmap, check if requested page size is same as page size used to
>> populated bitmap. If it is equal, copy bitmap, but if not equal, return
>> error.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> ---
>>   drivers/vfio/vfio_iommu_type1.c | 205 +++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 203 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 70aeab921d0f..d6417fb02174 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -71,6 +71,7 @@ struct vfio_iommu {
>>   	unsigned int		dma_avail;
>>   	bool			v2;
>>   	bool			nesting;
>> +	bool			dirty_page_tracking;
>>   };
>>   
>>   struct vfio_domain {
>> @@ -91,6 +92,7 @@ struct vfio_dma {
>>   	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
>>   	struct task_struct	*task;
>>   	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
>> +	unsigned long		*bitmap;
> 
> We've made the bitmap a width invariant u64 else, should be here as
> well.
> 

Changing to u64 causes the compile-time warnings shown below, so I'm
keeping 'unsigned long *'.

drivers/vfio/vfio_iommu_type1.c: In function ‘vfio_dma_bitmap_alloc_all’:
drivers/vfio/vfio_iommu_type1.c:232:8: warning: passing argument 1 of 
‘bitmap_set’ from incompatible pointer type [enabled by default]
         (vpfn->iova - dma->iova) / pgsize, 1);
         ^
In file included from ./include/linux/cpumask.h:12:0,
                  from ./arch/x86/include/asm/cpumask.h:5,
                  from ./arch/x86/include/asm/msr.h:11,
                  from ./arch/x86/include/asm/processor.h:22,
                  from ./arch/x86/include/asm/cpufeature.h:5,
                  from ./arch/x86/include/asm/thread_info.h:53,
                  from ./include/linux/thread_info.h:38,
                  from ./arch/x86/include/asm/preempt.h:7,
                  from ./include/linux/preempt.h:78,
                  from ./include/linux/spinlock.h:51,
                  from ./include/linux/seqlock.h:36,
                  from ./include/linux/time.h:6,
                  from ./include/linux/compat.h:10,
                  from drivers/vfio/vfio_iommu_type1.c:24:
./include/linux/bitmap.h:405:29: note: expected ‘long unsigned int *’ 
but argument is of type ‘u64 *’
  static __always_inline void bitmap_set(unsigned long *map, unsigned 
int start,

Thanks,
Kirti
Kirti Wankhede March 19, 2020, 8:25 p.m. UTC | #12
On 3/19/2020 9:52 PM, Alex Williamson wrote:
> On Thu, 19 Mar 2020 20:22:41 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 3/19/2020 9:15 AM, Alex Williamson wrote:
>>> On Thu, 19 Mar 2020 01:11:11 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>    

<snip>

>>>> +
>>>> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
>>>> +{
>>>> +	uint64_t bsize;
>>>> +
>>>> +	if (!npages || !bitmap_size || bitmap_size > UINT_MAX)
>>>
>>> As commented previously, how do we derive this UINT_MAX limitation?
>>>    
>>
>> Sorry, I missed that earlier
>>
>>   > UINT_MAX seems arbitrary, is this specified in our API?  The size of a
>>   > vfio_dma is limited to what the user is able to pin, and therefore
>>   > their locked memory limit, but do we have an explicit limit elsewhere
>>   > that results in this limit here.  I think a 4GB bitmap would track
>>   > something like 2^47 bytes of memory, that's pretty excessive, but still
>>   > an arbitrary limit.
>>
>> There has to be some upper limit check. In core KVM, in
>> virt/kvm/kvm_main.c there is max number of pages check:
>>
>> if (new.npages > KVM_MEM_MAX_NR_PAGES)
>>
>> Where
>> /*
>>    * Some of the bitops functions do not support too long bitmaps.
>>    * This number must be determined not to exceed such limits.
>>    */
>> #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
>>
>> Though I don't know which bitops functions do not support long bitmaps.
>>
>> Something similar as above can be done or same as you also mentioned of
>> 4GB bitmap limit? that is U32_MAX instead of UINT_MAX?
> 
> Let's see, we use bitmap_set():
> 
> void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits)
> 
> So we're limited to an unsigned int number of bits, but for an
> unaligned, multi-bit operation this will call __bitmap_set():
> 
> void __bitmap_set(unsigned long *map, unsigned int start, int len)
> 
> So we're down to a signed int number of bits (seems like an API bug in
> bitops there), so it makes sense that KVM is testing against MAX_INT
> number of pages, ie. number of bits.  But that still suggests a bitmap
> size of MAX_UINT is off by a factor of 16.  So we can have 2^31 bits
> divided by 2^3 bits/byte yields a maximum bitmap size of 2^28 (ie.
> 256MB), which maps 2^31 * 2^12 = 2^43 (8TB) on a 4K system.
> 
> Let's fix the limit check and put a nice comment explaining it.  Thanks,
> 

Agreed. Adding DIRTY_BITMAP_SIZE_MAX macro and comment as below.

/*
 * The number-of-bits argument to bitmap_set() is an unsigned int, which is
 * further cast to a signed int for unaligned multi-bit operations in
 * __bitmap_set().
 * The maximum supported bitmap size is therefore 2^31 bits divided by
 * 2^3 bits/byte, i.e. 2^28 bytes (256 MB), which maps 2^31 * 2^12 = 2^43
 * bytes (8 TB) on a 4K page system.
 */
#define DIRTY_BITMAP_PAGES_MAX  ((1UL << 31) - 1)
#define DIRTY_BITMAP_SIZE_MAX 	\
			DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)


Thanks,
Kirti
Alex Williamson March 19, 2020, 8:54 p.m. UTC | #13
On Fri, 20 Mar 2020 01:55:10 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 3/19/2020 9:52 PM, Alex Williamson wrote:
> > On Thu, 19 Mar 2020 20:22:41 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 3/19/2020 9:15 AM, Alex Williamson wrote:  
> >>> On Thu, 19 Mar 2020 01:11:11 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>      
> 
> <snip>
> 
> >>>> +
> >>>> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> >>>> +{
> >>>> +	uint64_t bsize;
> >>>> +
> >>>> +	if (!npages || !bitmap_size || bitmap_size > UINT_MAX)  
> >>>
> >>> As commented previously, how do we derive this UINT_MAX limitation?
> >>>      
> >>
> >> Sorry, I missed that earlier
> >>  
> >>   > UINT_MAX seems arbitrary, is this specified in our API?  The size of a
> >>   > vfio_dma is limited to what the user is able to pin, and therefore
> >>   > their locked memory limit, but do we have an explicit limit elsewhere
> >>   > that results in this limit here.  I think a 4GB bitmap would track
> >>   > something like 2^47 bytes of memory, that's pretty excessive, but still
> >>   > an arbitrary limit.  
> >>
> >> There has to be some upper limit check. In core KVM, in
> >> virt/kvm/kvm_main.c there is max number of pages check:
> >>
> >> if (new.npages > KVM_MEM_MAX_NR_PAGES)
> >>
> >> Where
> >> /*
> >>    * Some of the bitops functions do not support too long bitmaps.
> >>    * This number must be determined not to exceed such limits.
> >>    */
> >> #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
> >>
> >> Though I don't know which bitops functions do not support long bitmaps.
> >>
> >> Something similar as above can be done or same as you also mentioned of
> >> 4GB bitmap limit? that is U32_MAX instead of UINT_MAX?  
> > 
> > Let's see, we use bitmap_set():
> > 
> > void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits)
> > 
> > So we're limited to an unsigned int number of bits, but for an
> > unaligned, multi-bit operation this will call __bitmap_set():
> > 
> > void __bitmap_set(unsigned long *map, unsigned int start, int len)
> > 
> > So we're down to a signed int number of bits (seems like an API bug in
> > bitops there), so it makes sense that KVM is testing against MAX_INT
> > number of pages, ie. number of bits.  But that still suggests a bitmap
> > size of MAX_UINT is off by a factor of 16.  So we can have 2^31 bits
> > divided by 2^3 bits/byte yields a maximum bitmap size of 2^28 (ie.
> > 256MB), which maps 2^31 * 2^12 = 2^43 (8TB) on a 4K system.
> > 
> > Let's fix the limit check and put a nice comment explaining it.  Thanks,
> >   
> 
> Agreed. Adding DIRTY_BITMAP_SIZE_MAX macro and comment as below.
> 
> /*
>   * Input argument of number of bits to bitmap_set() is unsigned 
> integer, which
>   * further casts to signed integer for unaligned multi-bit operation,
>   * __bitmap_set().
>   * Then maximum bitmap size supported is 2^31 bits divided by 2^3 
> bits/byte,
>   * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
>   * system.
>   */
> #define DIRTY_BITMAP_PAGES_MAX  ((1UL << 31) - 1)

nit, can we just use INT_MAX here?

> #define DIRTY_BITMAP_SIZE_MAX 	\
> 			DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> 
> 
> Thanks,
> Kirti
>
Yan Zhao March 20, 2020, 12:51 a.m. UTC | #14
On Fri, Mar 20, 2020 at 12:57:30AM +0800, Kirti Wankhede wrote:
> 
> 
> On 3/19/2020 6:36 PM, Alex Williamson wrote:
> > On Thu, 19 Mar 2020 02:15:34 -0400
> > Yan Zhao <yan.y.zhao@intel.com> wrote:
> > 
> >> On Thu, Mar 19, 2020 at 12:40:53PM +0800, Alex Williamson wrote:
> >>> On Thu, 19 Mar 2020 00:15:33 -0400
> >>> Yan Zhao <yan.y.zhao@intel.com> wrote:
> >>>    
> >>>> On Thu, Mar 19, 2020 at 12:01:00PM +0800, Alex Williamson wrote:
> >>>>> On Wed, 18 Mar 2020 23:06:39 -0400
> >>>>> Yan Zhao <yan.y.zhao@intel.com> wrote:
> >>>>>      
> >>>>>> On Thu, Mar 19, 2020 at 03:41:11AM +0800, Kirti Wankhede wrote:
> >>>>>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> >>>>>>> - Start dirty pages tracking while migration is active
> >>>>>>> - Stop dirty pages tracking.
> >>>>>>> - Get dirty pages bitmap. It's user space application's responsibility to
> >>>>>>>    copy content of dirty pages from source to destination during migration.
> >>>>>>>
> >>>>>>> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> >>>>>>> structure. Bitmap size is calculated considering smallest supported page
> >>>>>>> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> >>>>>>>
> >>>>>>> Bitmap is populated for already pinned pages when bitmap is allocated for
> >>>>>>> a vfio_dma with the smallest supported page size. Update bitmap from
> >>>>>>> pinning functions when tracking is enabled. When user application queries
> >>>>>>> bitmap, check if requested page size is same as page size used to
> >>>>>>> populate bitmap. If it is equal, copy bitmap, but if not equal, return
> >>>>>>> error.
> >>>>>>>
> >>>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >>>>>>> ---
> >>>>>>>   drivers/vfio/vfio_iommu_type1.c | 205 +++++++++++++++++++++++++++++++++++++++-
> >>>>>>>   1 file changed, 203 insertions(+), 2 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >>>>>>> index 70aeab921d0f..d6417fb02174 100644
> >>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>>>>>> @@ -71,6 +71,7 @@ struct vfio_iommu {
> >>>>>>>   	unsigned int		dma_avail;
> >>>>>>>   	bool			v2;
> >>>>>>>   	bool			nesting;
> >>>>>>> +	bool			dirty_page_tracking;
> >>>>>>>   };
> >>>>>>>   
> >>>>>>>   struct vfio_domain {
> >>>>>>> @@ -91,6 +92,7 @@ struct vfio_dma {
> >>>>>>>   	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
> >>>>>>>   	struct task_struct	*task;
> >>>>>>>   	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
> >>>>>>> +	unsigned long		*bitmap;
> >>>>>>>   };
> >>>>>>>   
> >>>>>>>   struct vfio_group {
> >>>>>>> @@ -125,7 +127,10 @@ struct vfio_regions {
> >>>>>>>   #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >>>>>>>   					(!list_empty(&iommu->domain_list))
> >>>>>>>   
> >>>>>>> +#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> >>>>>>> +
> >>>>>>>   static int put_pfn(unsigned long pfn, int prot);
> >>>>>>> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> >>>>>>>   
> >>>>>>>   /*
> >>>>>>>    * This code handles mapping and unmapping of user data buffers
> >>>>>>> @@ -175,6 +180,55 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> >>>>>>>   	rb_erase(&old->node, &iommu->dma_list);
> >>>>>>>   }
> >>>>>>>   
> >>>>>>> +static int vfio_dma_bitmap_alloc(struct vfio_iommu *iommu, uint64_t pgsize)
> >>>>>>> +{
> >>>>>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >>>>>>> +
> >>>>>>> +	for (; n; n = rb_next(n)) {
> >>>>>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >>>>>>> +		struct rb_node *p;
> >>>>>>> +		unsigned long npages = dma->size / pgsize;
> >>>>>>> +
> >>>>>>> +		dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> >>>>>>> +		if (!dma->bitmap) {
> >>>>>>> +			struct rb_node *p = rb_prev(n);
> >>>>>>> +
> >>>>>>> +			for (; p; p = rb_prev(p)) {
> >>>>>>> +				struct vfio_dma *dma = rb_entry(n,
> >>>>>>> +							struct vfio_dma, node);
> >>>>>>> +
> >>>>>>> +				kfree(dma->bitmap);
> >>>>>>> +				dma->bitmap = NULL;
> >>>>>>> +			}
> >>>>>>> +			return -ENOMEM;
> >>>>>>> +		}
> >>>>>>> +
> >>>>>>> +		if (RB_EMPTY_ROOT(&dma->pfn_list))
> >>>>>>> +			continue;
> >>>>>>> +
> >>>>>>> +		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> >>>>>>> +			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> >>>>>>> +							 node);
> >>>>>>> +
> >>>>>>> +			bitmap_set(dma->bitmap,
> >>>>>>> +					(vpfn->iova - dma->iova) / pgsize, 1);
> >>>>>>> +		}
> >>>>>>> +	}
> >>>>>>> +	return 0;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static void vfio_dma_bitmap_free(struct vfio_iommu *iommu)
> >>>>>>> +{
> >>>>>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >>>>>>> +
> >>>>>>> +	for (; n; n = rb_next(n)) {
> >>>>>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >>>>>>> +
> >>>>>>> +		kfree(dma->bitmap);
> >>>>>>> +		dma->bitmap = NULL;
> >>>>>>> +	}
> >>>>>>> +}
> >>>>>>> +
> >>>>>>>   /*
> >>>>>>>    * Helper Functions for host iova-pfn list
> >>>>>>>    */
> >>>>>>> @@ -567,6 +621,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>>>>>>   			vfio_unpin_page_external(dma, iova, do_accounting);
> >>>>>>>   			goto pin_unwind;
> >>>>>>>   		}
> >>>>>>> +
> >>>>>>> +		if (iommu->dirty_page_tracking) {
> >>>>>>> +			unsigned long pgshift =
> >>>>>>> +					 __ffs(vfio_pgsize_bitmap(iommu));
> >>>>>>> +
> >>>>>>> +			bitmap_set(dma->bitmap,
> >>>>>>> +				   (vpfn->iova - dma->iova) >> pgshift, 1);
> >>>>>>> +		}
> >>>>>>>   	}
> >>>>>>>   
> >>>>>>>   	ret = i;
> >>>>>>> @@ -801,6 +863,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> >>>>>>>   	vfio_unmap_unpin(iommu, dma, true);
> >>>>>>>   	vfio_unlink_dma(iommu, dma);
> >>>>>>>   	put_task_struct(dma->task);
> >>>>>>> +	kfree(dma->bitmap);
> >>>>>>>   	kfree(dma);
> >>>>>>>   	iommu->dma_avail++;
> >>>>>>>   }
> >>>>>>> @@ -831,6 +894,50 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> >>>>>>>   	return bitmap;
> >>>>>>>   }
> >>>>>>>   
> >>>>>>> +static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> >>>>>>> +				  size_t size, uint64_t pgsize,
> >>>>>>> +				  unsigned char __user *bitmap)
> >>>>>>> +{
> >>>>>>> +	struct vfio_dma *dma;
> >>>>>>> +	unsigned long pgshift = __ffs(pgsize);
> >>>>>>> +	unsigned int npages, bitmap_size;
> >>>>>>> +
> >>>>>>> +	dma = vfio_find_dma(iommu, iova, 1);
> >>>>>>> +
> >>>>>>> +	if (!dma)
> >>>>>>> +		return -EINVAL;
> >>>>>>> +
> >>>>>>> +	if (dma->iova != iova || dma->size != size)
> >>>>>>> +		return -EINVAL;
> >>>>>>> +
> >>>>>> It looks like this size is passed from the user. How can we ensure that
> >>>>>> size always equals dma->size?
> >>>>>>
> >>>>>> Shouldn't we iterate the dma tree to look for dirty pages over the whole
> >>>>>> range if a single dma cannot cover it all?
> >>>>>
> >>>>> Please see the discussion on v12[1], the problem is with the alignment
> >>>>> of DMA mapped regions versus the bitmap.  A DMA mapping only requires
> >>>>> page alignment, so for example imagine a user requests the bitmap from
> >>>>> page zero to 4GB, but we have a DMA mapping starting at 4KB.  We can't
> >>>>> efficiently copy the bitmap tracked by the vfio_dma structure to the
> >>>>> user buffer when it's shifted by 1 bit.  Adjacent mappings can also
> >>>>> make for a very complicated implementation.  In the discussion linked
> >>>>> we decided to compromise on a more simple implementation that requires
> >>>>> the user to ask for a bitmap which exactly matches a single DMA
> >>>>> mapping, which Kirti indicates is what we require to support QEMU.
> >>>>> Later in the series, the unmap operation also makes this requirement
> >>>>> when used with the flags to retrieve the dirty bitmap.  Thanks,
> >>>>>     
> >>>>
> >>>> so, what about for vIOMMU enabling case?
> >>>> if IOVAs are mapped per page, then there's a log_sync in qemu,
> >>>> it's supposed for range from 0-U64MAX, qemu has to find out which
> >>>> ones are mapped and cut them into pages before calling this IOCTL?
> >>>> And what if those IOVAs are mapped for len more than one page?
> >>>
> >>> Good question.  Kirti?
> >>>
> 
> In log_sync with vIOMMU, loop for range such that:
> 
> - find iotlb entry for iova, get iova_xlat
> - size = iotlb.addr_mask + 1; This is the same calculation as when mappings
> are created from vfio_iommu_map_notify()
> - use the <iova_xlat, size> for VFIO_IOMMU_DIRTY_PAGES ioctl
> - increment iova: iova += size
> - iterate above steps till end of range.
>
Ok. It makes sense, though it is not efficient :)
Think about the case where no iotlb entry is found for an iova: the iova
then has to be incremented page by page, right?

> >>>>> [1] https://lore.kernel.org/kvm/20200218215330.5bc8fc6a@w520.home/
> >>>>>       
> >>>>>>> +	npages = dma->size >> pgshift;
> >>>>>>> +	bitmap_size = DIRTY_BITMAP_BYTES(npages);
> >>>>>>> +
> >>>>>>> +	/* mark all pages dirty if all pages are pinned and mapped. */
> >>>>>>> +	if (dma->iommu_mapped)
> >>>>>>> +		bitmap_set(dma->bitmap, 0, npages);
> >>>>>>> +
> >>>>>>> +	if (copy_to_user((void __user *)bitmap, dma->bitmap, bitmap_size))
> >>>>>>> +		return -EFAULT;
> >>>>>>> +
> >> Here, dma->bitmap needs to be cleared. right?
> > 
> > Ah, I missed re-checking this in my review.  v13 did clear it, but I
> > noted that we need to re-populate any currently pinned pages.  This
> > neither clears nor repopulates.  That's wrong.  Thanks,
> > 
> 
> Why re-populate when there will be no change since 
> vfio_iova_dirty_bitmap() is called holding iommu->lock? If there is any 
> pin request while vfio_iova_dirty_bitmap() is still working, it will 
> wait till iommu->lock is released. Bitmap will be populated when page is 
> pinned.
> 
> Thanks,
> Kirti
> 
> > Alex
> >   
> >>>>>>> +	return 0;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> >>>>>>> +{
> >>>>>>> +	uint64_t bsize;
> >>>>>>> +
> >>>>>>> +	if (!npages || !bitmap_size || bitmap_size > UINT_MAX)
> >>>>>>> +		return -EINVAL;
> >>>>>>> +
> >>>>>>> +	bsize = DIRTY_BITMAP_BYTES(npages);
> >>>>>>> +
> >>>>>>> +	if (bitmap_size < bsize)
> >>>>>>> +		return -EINVAL;
> >>>>>>> +
> >>>>>>> +	return 0;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>>>>>>   			     struct vfio_iommu_type1_dma_unmap *unmap)
> >>>>>>>   {
> >>>>>>> @@ -2278,6 +2385,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >>>>>>>   
> >>>>>>>   		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> >>>>>>>   			-EFAULT : 0;
> >>>>>>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> >>>>>>> +		struct vfio_iommu_type1_dirty_bitmap dirty;
> >>>>>>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> >>>>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> >>>>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> >>>>>>> +		int ret = 0;
> >>>>>>> +
> >>>>>>> +		if (!iommu->v2)
> >>>>>>> +			return -EACCES;
> >>>>>>> +
> >>>>>>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> >>>>>>> +				    flags);
> >>>>>>> +
> >>>>>>> +		if (copy_from_user(&dirty, (void __user *)arg, minsz))
> >>>>>>> +			return -EFAULT;
> >>>>>>> +
> >>>>>>> +		if (dirty.argsz < minsz || dirty.flags & ~mask)
> >>>>>>> +			return -EINVAL;
> >>>>>>> +
> >>>>>>> +		/* only one flag should be set at a time */
> >>>>>>> +		if (__ffs(dirty.flags) != __fls(dirty.flags))
> >>>>>>> +			return -EINVAL;
> >>>>>>> +
> >>>>>>> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> >>>>>>> +			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
> >>>>>>> +
> >>>>>>> +			mutex_lock(&iommu->lock);
> >>>>>>> +			if (!iommu->dirty_page_tracking) {
> >>>>>>> +				ret = vfio_dma_bitmap_alloc(iommu, pgsize);
> >>>>>>> +				if (!ret)
> >>>>>>> +					iommu->dirty_page_tracking = true;
> >>>>>>> +			}
> >>>>>>> +			mutex_unlock(&iommu->lock);
> >>>>>>> +			return ret;
> >>>>>>> +		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> >>>>>>> +			mutex_lock(&iommu->lock);
> >>>>>>> +			if (iommu->dirty_page_tracking) {
> >>>>>>> +				iommu->dirty_page_tracking = false;
> >>>>>>> +				vfio_dma_bitmap_free(iommu);
> >>>>>>> +			}
> >>>>>>> +			mutex_unlock(&iommu->lock);
> >>>>>>> +			return 0;
> >>>>>>> +		} else if (dirty.flags &
> >>>>>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> >>>>>>> +			struct vfio_iommu_type1_dirty_bitmap_get range;
> >>>>>>> +			unsigned long pgshift;
> >>>>>>> +			size_t data_size = dirty.argsz - minsz;
> >>>>>>> +			uint64_t iommu_pgsize =
> >>>>>>> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> >>>>>>> +
> >>>>>>> +			if (!data_size || data_size < sizeof(range))
> >>>>>>> +				return -EINVAL;
> >>>>>>> +
> >>>>>>> +			if (copy_from_user(&range, (void __user *)(arg + minsz),
> >>>>>>> +					   sizeof(range)))
> >>>>>>> +				return -EFAULT;
> >>>>>>> +
> >>>>>>> +			/* allow only min supported pgsize */
> >>>>>>> +			if (range.bitmap.pgsize != iommu_pgsize)
> >>>>>>> +				return -EINVAL;
> >>>>>>> +			if (range.iova & (iommu_pgsize - 1))
> >>>>>>> +				return -EINVAL;
> >>>>>>> +			if (!range.size || range.size & (iommu_pgsize - 1))
> >>>>>>> +				return -EINVAL;
> >>>>>>> +			if (range.iova + range.size < range.iova)
> >>>>>>> +				return -EINVAL;
> >>>>>>> +			if (!access_ok((void __user *)range.bitmap.data,
> >>>>>>> +				       range.bitmap.size))
> >>>>>>> +				return -EINVAL;
> >>>>>>> +
> >>>>>>> +			pgshift = __ffs(range.bitmap.pgsize);
> >>>>>>> +			ret = verify_bitmap_size(range.size >> pgshift,
> >>>>>>> +						 range.bitmap.size);
> >>>>>>> +			if (ret)
> >>>>>>> +				return ret;
> >>>>>>> +
> >>>>>>> +			mutex_lock(&iommu->lock);
> >>>>>>> +			if (iommu->dirty_page_tracking)
> >>>>>>> +				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
> >>>>>>> +					 range.size, range.bitmap.pgsize,
> >>>>>>> +				    (unsigned char __user *)range.bitmap.data);
> >>>>>>> +			else
> >>>>>>> +				ret = -EINVAL;
> >>>>>>> +			mutex_unlock(&iommu->lock);
> >>>>>>> +
> >>>>>>> +			return ret;
> >>>>>>> +		}
> >>>>>>>   	}
> >>>>>>>   
> >>>>>>>   	return -ENOTTY;
> >>>>>>> @@ -2345,10 +2539,17 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
> >>>>>>>   
> >>>>>>>   	vaddr = dma->vaddr + offset;
> >>>>>>>   
> >>>>>>> -	if (write)
> >>>>>>> +	if (write) {
> >>>>>>>   		*copied = __copy_to_user((void __user *)vaddr, data,
> >>>>>>>   					 count) ? 0 : count;
> >>>>>>> -	else
> >>>>>>> +		if (*copied && iommu->dirty_page_tracking) {
> >>>>>>> +			unsigned long pgshift =
> >>>>>>> +				__ffs(vfio_pgsize_bitmap(iommu));
> >>>>>>> +
> >>>>>>> +			bitmap_set(dma->bitmap, offset >> pgshift,
> >>>>>>> +				   *copied >> pgshift);
> >>>>>>> +		}
> >>>>>>> +	} else
> >>>>>>>   		*copied = __copy_from_user(data, (void __user *)vaddr,
> >>>>>>>   					   count) ? 0 : count;
> >>>>>>>   	if (kthread)
> >>>>>>> -- 
> >>>>>>> 2.7.0
> >>>>>>>        
> >>>>>>      
> >>>>>      
> >>>>    
> >>>    
> >>
> >
diff mbox series

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 70aeab921d0f..d6417fb02174 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -71,6 +71,7 @@  struct vfio_iommu {
 	unsigned int		dma_avail;
 	bool			v2;
 	bool			nesting;
+	bool			dirty_page_tracking;
 };
 
 struct vfio_domain {
@@ -91,6 +92,7 @@  struct vfio_dma {
 	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
 	struct task_struct	*task;
 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
+	unsigned long		*bitmap;
 };
 
 struct vfio_group {
@@ -125,7 +127,10 @@  struct vfio_regions {
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
 					(!list_empty(&iommu->domain_list))
 
+#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
+
 static int put_pfn(unsigned long pfn, int prot);
+static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
 
 /*
  * This code handles mapping and unmapping of user data buffers
@@ -175,6 +180,55 @@  static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 	rb_erase(&old->node, &iommu->dma_list);
 }
 
+/* Allocate a dirty bitmap for every vfio_dma and mark already-pinned pages. */
+static int vfio_dma_bitmap_alloc(struct vfio_iommu *iommu, uint64_t pgsize)
+{
+	struct rb_node *n = rb_first(&iommu->dma_list);
+
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+		struct rb_node *p;
+		unsigned long npages = dma->size / pgsize;
+
+		dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+		if (!dma->bitmap) {
+			/* Unwind: release the bitmaps of all earlier nodes. */
+			for (p = rb_prev(n); p; p = rb_prev(p)) {
+				struct vfio_dma *prev = rb_entry(p,
+							struct vfio_dma, node);
+
+				/* kvzalloc() memory must go to kvfree(). */
+				kvfree(prev->bitmap);
+				prev->bitmap = NULL;
+			}
+			return -ENOMEM;
+		}
+
+		/* Empty pfn_list: rb_first() is NULL, so the loop is a no-op. */
+		for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
+							 node);
+
+			/* Set the bit for each entry in the pinned-pfn list. */
+			bitmap_set(dma->bitmap,
+					(vpfn->iova - dma->iova) / pgsize, 1);
+		}
+	}
+	return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_iommu *iommu)
+{
+	struct rb_node *n = rb_first(&iommu->dma_list);
+
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+		kvfree(dma->bitmap);	/* pairs with kvzalloc() at allocation */
+		dma->bitmap = NULL;	/* vfio_remove_dma() frees it again */
+	}
+}
+
 /*
  * Helper Functions for host iova-pfn list
  */
@@ -567,6 +621,14 @@  static int vfio_iommu_type1_pin_pages(void *iommu_data,
 			vfio_unpin_page_external(dma, iova, do_accounting);
 			goto pin_unwind;
 		}
+
+		if (iommu->dirty_page_tracking) {
+			unsigned long pgshift =
+					 __ffs(vfio_pgsize_bitmap(iommu));
+
+			bitmap_set(dma->bitmap,
+				   (vpfn->iova - dma->iova) >> pgshift, 1);
+		}
 	}
 
 	ret = i;
@@ -801,6 +863,7 @@  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 	vfio_unmap_unpin(iommu, dma, true);
 	vfio_unlink_dma(iommu, dma);
 	put_task_struct(dma->task);
+	kfree(dma->bitmap);
 	kfree(dma);
 	iommu->dma_avail++;
 }
@@ -831,6 +894,50 @@  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 	return bitmap;
 }
 
+/*
+ * Copy the dirty bitmap of the vfio_dma that exactly matches [iova, size)
+ * to user space.  Returns 0, -EINVAL (no exact match) or -EFAULT.
+ */
+static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
+				  size_t size, uint64_t pgsize,
+				  unsigned char __user *bitmap)
+{
+	struct vfio_dma *dma = vfio_find_dma(iommu, iova, 1);
+	unsigned long shift = __ffs(pgsize);
+	unsigned int nr_pages, nr_bytes;
+
+	/* The request must cover one whole mapping, no more and no less. */
+	if (!dma || dma->iova != iova || dma->size != size)
+		return -EINVAL;
+
+	nr_pages = dma->size >> shift;
+	nr_bytes = DIRTY_BITMAP_BYTES(nr_pages);
+
+	/* An IOMMU-mapped range is fully pinned: report every page dirty. */
+	if (dma->iommu_mapped)
+		bitmap_set(dma->bitmap, 0, nr_pages);
+
+	if (copy_to_user((void __user *)bitmap, dma->bitmap, nr_bytes))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* __bitmap_set() takes a signed int bit count, so cap both at INT_MAX bits. */
+static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
+{
+	const uint64_t max_bytes = DIRTY_BITMAP_BYTES((uint64_t)INT_MAX);
+
+	if (!npages || npages > INT_MAX ||
+	    !bitmap_size || bitmap_size > max_bytes)
+		return -EINVAL;
+	/* The user buffer must hold one bit per page, rounded up to u64. */
+	if (bitmap_size < DIRTY_BITMAP_BYTES(npages))
+		return -EINVAL;
+
+	return 0;
+}
+
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			     struct vfio_iommu_type1_dma_unmap *unmap)
 {
@@ -2278,6 +2385,93 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 
 		return copy_to_user((void __user *)arg, &unmap, minsz) ?
 			-EFAULT : 0;
+	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
+		struct vfio_iommu_type1_dirty_bitmap dirty;
+		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
+				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
+				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
+		int ret = 0;
+
+		if (!iommu->v2)
+			return -EACCES;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
+				    flags);
+
+		if (copy_from_user(&dirty, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (dirty.argsz < minsz || dirty.flags & ~mask)
+			return -EINVAL;
+
+		/* only one flag should be set at a time */
+		if (__ffs(dirty.flags) != __fls(dirty.flags))
+			return -EINVAL;
+
+		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
+			uint64_t pgsize = 1 << __ffs(vfio_pgsize_bitmap(iommu));
+
+			mutex_lock(&iommu->lock);
+			if (!iommu->dirty_page_tracking) {
+				ret = vfio_dma_bitmap_alloc(iommu, pgsize);
+				if (!ret)
+					iommu->dirty_page_tracking = true;
+			}
+			mutex_unlock(&iommu->lock);
+			return ret;
+		} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
+			mutex_lock(&iommu->lock);
+			if (iommu->dirty_page_tracking) {
+				iommu->dirty_page_tracking = false;
+				vfio_dma_bitmap_free(iommu);
+			}
+			mutex_unlock(&iommu->lock);
+			return 0;
+		} else if (dirty.flags &
+				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
+			struct vfio_iommu_type1_dirty_bitmap_get range;
+			unsigned long pgshift;
+			size_t data_size = dirty.argsz - minsz;
+			uint64_t iommu_pgsize =
+					 1 << __ffs(vfio_pgsize_bitmap(iommu));
+
+			if (!data_size || data_size < sizeof(range))
+				return -EINVAL;
+
+			if (copy_from_user(&range, (void __user *)(arg + minsz),
+					   sizeof(range)))
+				return -EFAULT;
+
+			/* allow only min supported pgsize */
+			if (range.bitmap.pgsize != iommu_pgsize)
+				return -EINVAL;
+			if (range.iova & (iommu_pgsize - 1))
+				return -EINVAL;
+			if (!range.size || range.size & (iommu_pgsize - 1))
+				return -EINVAL;
+			if (range.iova + range.size < range.iova)
+				return -EINVAL;
+			if (!access_ok((void __user *)range.bitmap.data,
+				       range.bitmap.size))
+				return -EINVAL;
+
+			pgshift = __ffs(range.bitmap.pgsize);
+			ret = verify_bitmap_size(range.size >> pgshift,
+						 range.bitmap.size);
+			if (ret)
+				return ret;
+
+			mutex_lock(&iommu->lock);
+			if (iommu->dirty_page_tracking)
+				ret = vfio_iova_dirty_bitmap(iommu, range.iova,
+					 range.size, range.bitmap.pgsize,
+				    (unsigned char __user *)range.bitmap.data);
+			else
+				ret = -EINVAL;
+			mutex_unlock(&iommu->lock);
+
+			return ret;
+		}
 	}
 
 	return -ENOTTY;
@@ -2345,10 +2539,17 @@  static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
 
 	vaddr = dma->vaddr + offset;
 
-	if (write)
+	if (write) {
 		*copied = __copy_to_user((void __user *)vaddr, data,
 					 count) ? 0 : count;
-	else
+		if (*copied && iommu->dirty_page_tracking) {
+			unsigned long pgshift =
+				__ffs(vfio_pgsize_bitmap(iommu));
+
+			bitmap_set(dma->bitmap, offset >> pgshift,
+				   *copied >> pgshift);
+		}
+	} else
 		*copied = __copy_from_user(data, (void __user *)vaddr,
 					   count) ? 0 : count;
 	if (kthread)