diff mbox series

[v14,Kernel,5/7] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

Message ID 1584560474-19946-6-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived
Headers show
Series KABIs to support migration for VFIO devices | expand

Commit Message

Kirti Wankhede March 18, 2020, 7:41 p.m. UTC
DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and device is still
running. For example, in pre-copy phase while guest driver could access
those pages, host device or vendor driver can dirty these mapped pages.
Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get bitmap during unmap, user should set flag
VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP, bitmap memory should be allocated and
zeroed by user space application. Bitmap size and page size should be set
by user application.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
---
 drivers/vfio/vfio_iommu_type1.c | 55 ++++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/vfio.h       | 11 +++++++++
 2 files changed, 62 insertions(+), 4 deletions(-)

Comments

Alex Williamson March 19, 2020, 3:45 a.m. UTC | #1
On Thu, 19 Mar 2020 01:11:12 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> DMA mapped pages, including those pinned by mdev vendor drivers, might
> get unpinned and unmapped while migration is active and device is still
> running. For example, in pre-copy phase while guest driver could access
> those pages, host device or vendor driver can dirty these mapped pages.
> Such pages should be marked dirty so as to maintain memory consistency
> for a user making use of dirty page tracking.
> 
> To get bitmap during unmap, user should set flag
> VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP, bitmap memory should be allocated and
> zeroed by user space application. Bitmap size and page size should be set
> by user application.

Looks good, but as mentioned we no longer require the user to zero the
bitmap.  It's mentioned in the commit log above and in the uapi
comment.  Thanks,

Alex

> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 55 ++++++++++++++++++++++++++++++++++++++---
>  include/uapi/linux/vfio.h       | 11 +++++++++
>  2 files changed, 62 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index d6417fb02174..aa1ac30f7854 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -939,7 +939,8 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
>  }
>  
>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> -			     struct vfio_iommu_type1_dma_unmap *unmap)
> +			     struct vfio_iommu_type1_dma_unmap *unmap,
> +			     struct vfio_bitmap *bitmap)
>  {
>  	uint64_t mask;
>  	struct vfio_dma *dma, *dma_last = NULL;
> @@ -990,6 +991,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  	 * will be returned if these conditions are not met.  The v2 interface
>  	 * will only return success and a size of zero if there were no
>  	 * mappings within the range.
> +	 *
> +	 * When VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag is set, unmap request
> +	 * must be for single mapping. Multiple mappings with this flag set is
> +	 * not supported.
>  	 */
>  	if (iommu->v2) {
>  		dma = vfio_find_dma(iommu, unmap->iova, 1);
> @@ -997,6 +1002,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  			ret = -EINVAL;
>  			goto unlock;
>  		}
> +
> +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> +		    (dma->iova != unmap->iova || dma->size != unmap->size)) {
> +			ret = -EINVAL;
> +			goto unlock;
> +		}
> +
>  		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
>  		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
>  			ret = -EINVAL;
> @@ -1014,6 +1026,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  		if (dma->task->mm != current->mm)
>  			break;
>  
> +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> +		     iommu->dirty_page_tracking)
> +			vfio_iova_dirty_bitmap(iommu, dma->iova, dma->size,
> +					bitmap->pgsize,
> +					(unsigned char __user *) bitmap->data);
> +
>  		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
>  			struct vfio_iommu_type1_dma_unmap nb_unmap;
>  
> @@ -2369,17 +2387,46 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>  	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
>  		struct vfio_iommu_type1_dma_unmap unmap;
> -		long ret;
> +		struct vfio_bitmap bitmap = { 0 };
> +		int ret;
>  
>  		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
>  
>  		if (copy_from_user(&unmap, (void __user *)arg, minsz))
>  			return -EFAULT;
>  
> -		if (unmap.argsz < minsz || unmap.flags)
> +		if (unmap.argsz < minsz ||
> +		    unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
>  			return -EINVAL;
>  
> -		ret = vfio_dma_do_unmap(iommu, &unmap);
> +		if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
> +			unsigned long pgshift;
> +			uint64_t iommu_pgsize =
> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			if (unmap.argsz < (minsz + sizeof(bitmap)))
> +				return -EINVAL;
> +
> +			if (copy_from_user(&bitmap,
> +					   (void __user *)(arg + minsz),
> +					   sizeof(bitmap)))
> +				return -EFAULT;
> +
> +			/* allow only min supported pgsize */
> +			if (bitmap.pgsize != iommu_pgsize)
> +				return -EINVAL;
> +			if (!access_ok((void __user *)bitmap.data, bitmap.size))
> +				return -EINVAL;
> +
> +			pgshift = __ffs(bitmap.pgsize);
> +			ret = verify_bitmap_size(unmap.size >> pgshift,
> +						 bitmap.size);
> +			if (ret)
> +				return ret;
> +
> +		}
> +
> +		ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
>  		if (ret)
>  			return ret;
>  
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 043e9eafb255..a704e5380f04 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1010,12 +1010,23 @@ struct vfio_bitmap {
>   * field.  No guarantee is made to the user that arbitrary unmaps of iova
>   * or size different from those used in the original mapping call will
>   * succeed.
> + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get dirty bitmap
> + * before unmapping IO virtual addresses. When this flag is set, user must
> + * provide data[] as structure vfio_bitmap. User must allocate memory to get
> + * bitmap, clear the bitmap memory by setting zero and must set size of
> + * allocated memory in vfio_bitmap.size field. One bit in bitmap
> + * represents per page, page of user provided page size in 'pgsize',
> + * consecutively starting from iova offset. Bit set indicates page at that
> + * offset from iova is dirty. Bitmap of pages in the range of unmapped size is
> + * returned in vfio_bitmap.data
>   */
>  struct vfio_iommu_type1_dma_unmap {
>  	__u32	argsz;
>  	__u32	flags;
> +#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
>  	__u64	iova;				/* IO virtual address */
>  	__u64	size;				/* Size of mapping (bytes) */
> +	__u8    data[];
>  };
>  
>  #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
Yan Zhao March 20, 2020, 8:35 a.m. UTC | #2
On Thu, Mar 19, 2020 at 03:41:12AM +0800, Kirti Wankhede wrote:
> DMA mapped pages, including those pinned by mdev vendor drivers, might
> get unpinned and unmapped while migration is active and device is still
> running. For example, in pre-copy phase while guest driver could access
> those pages, host device or vendor driver can dirty these mapped pages.
> Such pages should be marked dirty so as to maintain memory consistency
> for a user making use of dirty page tracking.
> 
> To get bitmap during unmap, user should set flag
> VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP, bitmap memory should be allocated and
> zeroed by user space application. Bitmap size and page size should be set
> by user application.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 55 ++++++++++++++++++++++++++++++++++++++---
>  include/uapi/linux/vfio.h       | 11 +++++++++
>  2 files changed, 62 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index d6417fb02174..aa1ac30f7854 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -939,7 +939,8 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
>  }
>  
>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> -			     struct vfio_iommu_type1_dma_unmap *unmap)
> +			     struct vfio_iommu_type1_dma_unmap *unmap,
> +			     struct vfio_bitmap *bitmap)
>  {
>  	uint64_t mask;
>  	struct vfio_dma *dma, *dma_last = NULL;
> @@ -990,6 +991,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  	 * will be returned if these conditions are not met.  The v2 interface
>  	 * will only return success and a size of zero if there were no
>  	 * mappings within the range.
> +	 *
> +	 * When VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag is set, unmap request
> +	 * must be for single mapping. Multiple mappings with this flag set is
> +	 * not supported.
>  	 */
>  	if (iommu->v2) {
>  		dma = vfio_find_dma(iommu, unmap->iova, 1);
> @@ -997,6 +1002,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  			ret = -EINVAL;
>  			goto unlock;
>  		}
> +
> +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> +		    (dma->iova != unmap->iova || dma->size != unmap->size)) {
dma is probably NULL here!

And this restriction on UNMAP would make some UNMAP operations of vIOMMU
fail.

For example, the following condition does happen in practice:
an UNMAP ioctl arrives for the IOVA range starting at 0xff800000, of size 0x200000.
However, IOVAs in this range are mapped page by page, i.e., dma->size is 0x1000.

Previously, this UNMAP ioctl could unmap the whole range successfully.

Thanks
Yan

> +			ret = -EINVAL;
> +			goto unlock;
> +		}
> +
>  		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
>  		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
>  			ret = -EINVAL;
> @@ -1014,6 +1026,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  		if (dma->task->mm != current->mm)
>  			break;
>  
> +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> +		     iommu->dirty_page_tracking)
> +			vfio_iova_dirty_bitmap(iommu, dma->iova, dma->size,
> +					bitmap->pgsize,
> +					(unsigned char __user *) bitmap->data);
> +
>  		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
>  			struct vfio_iommu_type1_dma_unmap nb_unmap;
>  
> @@ -2369,17 +2387,46 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>  	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
>  		struct vfio_iommu_type1_dma_unmap unmap;
> -		long ret;
> +		struct vfio_bitmap bitmap = { 0 };
> +		int ret;
>  
>  		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
>  
>  		if (copy_from_user(&unmap, (void __user *)arg, minsz))
>  			return -EFAULT;
>  
> -		if (unmap.argsz < minsz || unmap.flags)
> +		if (unmap.argsz < minsz ||
> +		    unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
>  			return -EINVAL;
>  
> -		ret = vfio_dma_do_unmap(iommu, &unmap);
> +		if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
> +			unsigned long pgshift;
> +			uint64_t iommu_pgsize =
> +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			if (unmap.argsz < (minsz + sizeof(bitmap)))
> +				return -EINVAL;
> +
> +			if (copy_from_user(&bitmap,
> +					   (void __user *)(arg + minsz),
> +					   sizeof(bitmap)))
> +				return -EFAULT;
> +
> +			/* allow only min supported pgsize */
> +			if (bitmap.pgsize != iommu_pgsize)
> +				return -EINVAL;
> +			if (!access_ok((void __user *)bitmap.data, bitmap.size))
> +				return -EINVAL;
> +
> +			pgshift = __ffs(bitmap.pgsize);
> +			ret = verify_bitmap_size(unmap.size >> pgshift,
> +						 bitmap.size);
> +			if (ret)
> +				return ret;
> +
> +		}
> +
> +		ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
>  		if (ret)
>  			return ret;
>  
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 043e9eafb255..a704e5380f04 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1010,12 +1010,23 @@ struct vfio_bitmap {
>   * field.  No guarantee is made to the user that arbitrary unmaps of iova
>   * or size different from those used in the original mapping call will
>   * succeed.
> + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get dirty bitmap
> + * before unmapping IO virtual addresses. When this flag is set, user must
> + * provide data[] as structure vfio_bitmap. User must allocate memory to get
> + * bitmap, clear the bitmap memory by setting zero and must set size of
> + * allocated memory in vfio_bitmap.size field. One bit in bitmap
> + * represents per page, page of user provided page size in 'pgsize',
> + * consecutively starting from iova offset. Bit set indicates page at that
> + * offset from iova is dirty. Bitmap of pages in the range of unmapped size is
> + * returned in vfio_bitmap.data
>   */
>  struct vfio_iommu_type1_dma_unmap {
>  	__u32	argsz;
>  	__u32	flags;
> +#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
>  	__u64	iova;				/* IO virtual address */
>  	__u64	size;				/* Size of mapping (bytes) */
> +	__u8    data[];
>  };
>  
>  #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> -- 
> 2.7.0
>
Alex Williamson March 20, 2020, 3:40 p.m. UTC | #3
On Fri, 20 Mar 2020 04:35:29 -0400
Yan Zhao <yan.y.zhao@intel.com> wrote:

> On Thu, Mar 19, 2020 at 03:41:12AM +0800, Kirti Wankhede wrote:
> > DMA mapped pages, including those pinned by mdev vendor drivers, might
> > get unpinned and unmapped while migration is active and device is still
> > running. For example, in pre-copy phase while guest driver could access
> > those pages, host device or vendor driver can dirty these mapped pages.
> > Such pages should be marked dirty so as to maintain memory consistency
> > for a user making use of dirty page tracking.
> > 
> > To get bitmap during unmap, user should set flag
> > VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP, bitmap memory should be allocated and
> > zeroed by user space application. Bitmap size and page size should be set
> > by user application.
> > 
> > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > Reviewed-by: Neo Jia <cjia@nvidia.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 55 ++++++++++++++++++++++++++++++++++++++---
> >  include/uapi/linux/vfio.h       | 11 +++++++++
> >  2 files changed, 62 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index d6417fb02174..aa1ac30f7854 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -939,7 +939,8 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> >  }
> >  
> >  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > -			     struct vfio_iommu_type1_dma_unmap *unmap)
> > +			     struct vfio_iommu_type1_dma_unmap *unmap,
> > +			     struct vfio_bitmap *bitmap)
> >  {
> >  	uint64_t mask;
> >  	struct vfio_dma *dma, *dma_last = NULL;
> > @@ -990,6 +991,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >  	 * will be returned if these conditions are not met.  The v2 interface
> >  	 * will only return success and a size of zero if there were no
> >  	 * mappings within the range.
> > +	 *
> > +	 * When VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag is set, unmap request
> > +	 * must be for single mapping. Multiple mappings with this flag set is
> > +	 * not supported.
> >  	 */
> >  	if (iommu->v2) {
> >  		dma = vfio_find_dma(iommu, unmap->iova, 1);
> > @@ -997,6 +1002,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >  			ret = -EINVAL;
> >  			goto unlock;
> >  		}
> > +
> > +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> > +		    (dma->iova != unmap->iova || dma->size != unmap->size)) {  
> dma is probably NULL here!

Yep, I didn't look closely enough there.  This is situated right
between the check to make sure we're not bisecting a mapping at the
start of the unmap and the check to make sure we're not bisecting a
mapping at the end of the unmap.  There's no guarantee that we have a
valid pointer here.  The test should be in the while() loop below this
code.

> And this restriction on UNMAP would make some UNMAP operations of vIOMMU
> fail.
> 
> e.g. below condition indeed happens in reality.
> an UNMAP ioctl comes for IOVA range from 0xff800000, of size 0x200000
> However, IOVAs in this range are mapped page by page.i.e., dma->size is 0x1000.
> 
> Previous, this UNMAP ioctl could unmap successfully as a whole.

What triggers this in the guest?  Note that it's only when using the
GET_DIRTY_BITMAP flag that this is restricted.  Does the event you're
referring to potentially occur under normal circumstances in that mode?
Thanks,

Alex


> > +			ret = -EINVAL;
> > +			goto unlock;
> > +		}
> > +
> >  		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
> >  		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
> >  			ret = -EINVAL;
> > @@ -1014,6 +1026,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >  		if (dma->task->mm != current->mm)
> >  			break;
> >  
> > +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> > +		     iommu->dirty_page_tracking)
> > +			vfio_iova_dirty_bitmap(iommu, dma->iova, dma->size,
> > +					bitmap->pgsize,
> > +					(unsigned char __user *) bitmap->data);
> > +
> >  		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
> >  			struct vfio_iommu_type1_dma_unmap nb_unmap;
> >  
> > @@ -2369,17 +2387,46 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  
> >  	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
> >  		struct vfio_iommu_type1_dma_unmap unmap;
> > -		long ret;
> > +		struct vfio_bitmap bitmap = { 0 };
> > +		int ret;
> >  
> >  		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
> >  
> >  		if (copy_from_user(&unmap, (void __user *)arg, minsz))
> >  			return -EFAULT;
> >  
> > -		if (unmap.argsz < minsz || unmap.flags)
> > +		if (unmap.argsz < minsz ||
> > +		    unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
> >  			return -EINVAL;
> >  
> > -		ret = vfio_dma_do_unmap(iommu, &unmap);
> > +		if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
> > +			unsigned long pgshift;
> > +			uint64_t iommu_pgsize =
> > +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > +
> > +			if (unmap.argsz < (minsz + sizeof(bitmap)))
> > +				return -EINVAL;
> > +
> > +			if (copy_from_user(&bitmap,
> > +					   (void __user *)(arg + minsz),
> > +					   sizeof(bitmap)))
> > +				return -EFAULT;
> > +
> > +			/* allow only min supported pgsize */
> > +			if (bitmap.pgsize != iommu_pgsize)
> > +				return -EINVAL;
> > +			if (!access_ok((void __user *)bitmap.data, bitmap.size))
> > +				return -EINVAL;
> > +
> > +			pgshift = __ffs(bitmap.pgsize);
> > +			ret = verify_bitmap_size(unmap.size >> pgshift,
> > +						 bitmap.size);
> > +			if (ret)
> > +				return ret;
> > +
> > +		}
> > +
> > +		ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
> >  		if (ret)
> >  			return ret;
> >  
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 043e9eafb255..a704e5380f04 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -1010,12 +1010,23 @@ struct vfio_bitmap {
> >   * field.  No guarantee is made to the user that arbitrary unmaps of iova
> >   * or size different from those used in the original mapping call will
> >   * succeed.
> > + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get dirty bitmap
> > + * before unmapping IO virtual addresses. When this flag is set, user must
> > + * provide data[] as structure vfio_bitmap. User must allocate memory to get
> > + * bitmap, clear the bitmap memory by setting zero and must set size of
> > + * allocated memory in vfio_bitmap.size field. One bit in bitmap
> > + * represents per page, page of user provided page size in 'pgsize',
> > + * consecutively starting from iova offset. Bit set indicates page at that
> > + * offset from iova is dirty. Bitmap of pages in the range of unmapped size is
> > + * returned in vfio_bitmap.data
> >   */
> >  struct vfio_iommu_type1_dma_unmap {
> >  	__u32	argsz;
> >  	__u32	flags;
> > +#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
> >  	__u64	iova;				/* IO virtual address */
> >  	__u64	size;				/* Size of mapping (bytes) */
> > +	__u8    data[];
> >  };
> >  
> >  #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> > -- 
> > 2.7.0
> >   
>
Alex Williamson March 20, 2020, 3:47 p.m. UTC | #4
On Fri, 20 Mar 2020 09:40:39 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Fri, 20 Mar 2020 04:35:29 -0400
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> > On Thu, Mar 19, 2020 at 03:41:12AM +0800, Kirti Wankhede wrote:  
> > > DMA mapped pages, including those pinned by mdev vendor drivers, might
> > > get unpinned and unmapped while migration is active and device is still
> > > running. For example, in pre-copy phase while guest driver could access
> > > those pages, host device or vendor driver can dirty these mapped pages.
> > > Such pages should be marked dirty so as to maintain memory consistency
> > > for a user making use of dirty page tracking.
> > > 
> > > To get bitmap during unmap, user should set flag
> > > VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP, bitmap memory should be allocated and
> > > zeroed by user space application. Bitmap size and page size should be set
> > > by user application.
> > > 
> > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > ---
> > >  drivers/vfio/vfio_iommu_type1.c | 55 ++++++++++++++++++++++++++++++++++++++---
> > >  include/uapi/linux/vfio.h       | 11 +++++++++
> > >  2 files changed, 62 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > index d6417fb02174..aa1ac30f7854 100644
> > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > @@ -939,7 +939,8 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> > >  }
> > >  
> > >  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > > -			     struct vfio_iommu_type1_dma_unmap *unmap)
> > > +			     struct vfio_iommu_type1_dma_unmap *unmap,
> > > +			     struct vfio_bitmap *bitmap)
> > >  {
> > >  	uint64_t mask;
> > >  	struct vfio_dma *dma, *dma_last = NULL;
> > > @@ -990,6 +991,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > >  	 * will be returned if these conditions are not met.  The v2 interface
> > >  	 * will only return success and a size of zero if there were no
> > >  	 * mappings within the range.
> > > +	 *
> > > +	 * When VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag is set, unmap request
> > > +	 * must be for single mapping. Multiple mappings with this flag set is
> > > +	 * not supported.
> > >  	 */
> > >  	if (iommu->v2) {
> > >  		dma = vfio_find_dma(iommu, unmap->iova, 1);
> > > @@ -997,6 +1002,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > >  			ret = -EINVAL;
> > >  			goto unlock;
> > >  		}
> > > +
> > > +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> > > +		    (dma->iova != unmap->iova || dma->size != unmap->size)) {    
> > dma is probably NULL here!  
> 
> Yep, I didn't look closely enough there.  This is situated right
> between the check to make sure we're not bisecting a mapping at the
> start of the unmap and the check to make sure we're not bisecting a
> mapping at the end of the unmap.  There's no guarantee that we have a
> valid pointer here.  The test should be in the while() loop below this
> code.

Actually the test could remain here, we can exit here if we can't find
a dma at the start of the unmap range with the GET_DIRTY_BITMAP flag,
but we absolutely cannot deref dma without testing it.

> > And this restriction on UNMAP would make some UNMAP operations of vIOMMU
> > fail.
> > 
> > e.g. below condition indeed happens in reality.
> > an UNMAP ioctl comes for IOVA range from 0xff800000, of size 0x200000
> > However, IOVAs in this range are mapped page by page.i.e., dma->size is 0x1000.
> > 
> > Previous, this UNMAP ioctl could unmap successfully as a whole.  
> 
> What triggers this in the guest?  Note that it's only when using the
> GET_DIRTY_BITMAP flag that this is restricted.  Does the event you're
> referring to potentially occur under normal circumstances in that mode?
> Thanks,
> 
> Alex
> 
> 
> > > +			ret = -EINVAL;
> > > +			goto unlock;
> > > +		}
> > > +
> > >  		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
> > >  		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
> > >  			ret = -EINVAL;
> > > @@ -1014,6 +1026,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > >  		if (dma->task->mm != current->mm)
> > >  			break;
> > >  
> > > +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> > > +		     iommu->dirty_page_tracking)
> > > +			vfio_iova_dirty_bitmap(iommu, dma->iova, dma->size,
> > > +					bitmap->pgsize,
> > > +					(unsigned char __user *) bitmap->data);
> > > +
> > >  		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
> > >  			struct vfio_iommu_type1_dma_unmap nb_unmap;
> > >  
> > > @@ -2369,17 +2387,46 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> > >  
> > >  	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
> > >  		struct vfio_iommu_type1_dma_unmap unmap;
> > > -		long ret;
> > > +		struct vfio_bitmap bitmap = { 0 };
> > > +		int ret;
> > >  
> > >  		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
> > >  
> > >  		if (copy_from_user(&unmap, (void __user *)arg, minsz))
> > >  			return -EFAULT;
> > >  
> > > -		if (unmap.argsz < minsz || unmap.flags)
> > > +		if (unmap.argsz < minsz ||
> > > +		    unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
> > >  			return -EINVAL;
> > >  
> > > -		ret = vfio_dma_do_unmap(iommu, &unmap);
> > > +		if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
> > > +			unsigned long pgshift;
> > > +			uint64_t iommu_pgsize =
> > > +					 1 << __ffs(vfio_pgsize_bitmap(iommu));
> > > +
> > > +			if (unmap.argsz < (minsz + sizeof(bitmap)))
> > > +				return -EINVAL;
> > > +
> > > +			if (copy_from_user(&bitmap,
> > > +					   (void __user *)(arg + minsz),
> > > +					   sizeof(bitmap)))
> > > +				return -EFAULT;
> > > +
> > > +			/* allow only min supported pgsize */
> > > +			if (bitmap.pgsize != iommu_pgsize)
> > > +				return -EINVAL;
> > > +			if (!access_ok((void __user *)bitmap.data, bitmap.size))
> > > +				return -EINVAL;
> > > +
> > > +			pgshift = __ffs(bitmap.pgsize);
> > > +			ret = verify_bitmap_size(unmap.size >> pgshift,
> > > +						 bitmap.size);
> > > +			if (ret)
> > > +				return ret;
> > > +
> > > +		}
> > > +
> > > +		ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
> > >  		if (ret)
> > >  			return ret;
> > >  
> > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > index 043e9eafb255..a704e5380f04 100644
> > > --- a/include/uapi/linux/vfio.h
> > > +++ b/include/uapi/linux/vfio.h
> > > @@ -1010,12 +1010,23 @@ struct vfio_bitmap {
> > >   * field.  No guarantee is made to the user that arbitrary unmaps of iova
> > >   * or size different from those used in the original mapping call will
> > >   * succeed.
> > > + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get dirty bitmap
> > > + * before unmapping IO virtual addresses. When this flag is set, user must
> > > + * provide data[] as structure vfio_bitmap. User must allocate memory to get
> > > + * bitmap, clear the bitmap memory by setting zero and must set size of
> > > + * allocated memory in vfio_bitmap.size field. One bit in bitmap
> > > + * represents per page, page of user provided page size in 'pgsize',
> > > + * consecutively starting from iova offset. Bit set indicates page at that
> > > + * offset from iova is dirty. Bitmap of pages in the range of unmapped size is
> > > + * returned in vfio_bitmap.data
> > >   */
> > >  struct vfio_iommu_type1_dma_unmap {
> > >  	__u32	argsz;
> > >  	__u32	flags;
> > > +#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
> > >  	__u64	iova;				/* IO virtual address */
> > >  	__u64	size;				/* Size of mapping (bytes) */
> > > +	__u8    data[];
> > >  };
> > >  
> > >  #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> > > -- 
> > > 2.7.0
> > >     
> >   
>
Kirti Wankhede March 20, 2020, 7:14 p.m. UTC | #5
On 3/20/2020 9:17 PM, Alex Williamson wrote:
> On Fri, 20 Mar 2020 09:40:39 -0600
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
>> On Fri, 20 Mar 2020 04:35:29 -0400
>> Yan Zhao <yan.y.zhao@intel.com> wrote:
>>
>>> On Thu, Mar 19, 2020 at 03:41:12AM +0800, Kirti Wankhede wrote:
>>>> DMA mapped pages, including those pinned by mdev vendor drivers, might
>>>> get unpinned and unmapped while migration is active and device is still
>>>> running. For example, in pre-copy phase while guest driver could access
>>>> those pages, host device or vendor driver can dirty these mapped pages.
>>>> Such pages should be marked dirty so as to maintain memory consistency
>>>> for a user making use of dirty page tracking.
>>>>
>>>> To get bitmap during unmap, user should set flag
>>>> VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP, bitmap memory should be allocated and
>>>> zeroed by user space application. Bitmap size and page size should be set
>>>> by user application.
>>>>
>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>>>> ---
>>>>   drivers/vfio/vfio_iommu_type1.c | 55 ++++++++++++++++++++++++++++++++++++++---
>>>>   include/uapi/linux/vfio.h       | 11 +++++++++
>>>>   2 files changed, 62 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>>> index d6417fb02174..aa1ac30f7854 100644
>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>> @@ -939,7 +939,8 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
>>>>   }
>>>>   
>>>>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>>> -			     struct vfio_iommu_type1_dma_unmap *unmap)
>>>> +			     struct vfio_iommu_type1_dma_unmap *unmap,
>>>> +			     struct vfio_bitmap *bitmap)
>>>>   {
>>>>   	uint64_t mask;
>>>>   	struct vfio_dma *dma, *dma_last = NULL;
>>>> @@ -990,6 +991,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>>>   	 * will be returned if these conditions are not met.  The v2 interface
>>>>   	 * will only return success and a size of zero if there were no
>>>>   	 * mappings within the range.
>>>> +	 *
>>>> +	 * When VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag is set, unmap request
>>>> +	 * must be for single mapping. Multiple mappings with this flag set is
>>>> +	 * not supported.
>>>>   	 */
>>>>   	if (iommu->v2) {
>>>>   		dma = vfio_find_dma(iommu, unmap->iova, 1);
>>>> @@ -997,6 +1002,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>>>   			ret = -EINVAL;
>>>>   			goto unlock;
>>>>   		}
>>>> +
>>>> +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
>>>> +		    (dma->iova != unmap->iova || dma->size != unmap->size)) {
>>> dma is probably NULL here!
>>
>> Yep, I didn't look closely enough there.  This is situated right
>> between the check to make sure we're not bisecting a mapping at the
>> start of the unmap and the check to make sure we're not bisecting a
>> mapping at the end of the unmap.  There's no guarantee that we have a
>> valid pointer here.  The test should be in the while() loop below this
>> code.
> 
> Actually the test could remain here, we can exit here if we can't find
> a dma at the start of the unmap range with the GET_DIRTY_BITMAP flag,
> but we absolutely cannot deref dma without testing it.
> 

In the check just above the newly added one, if dma is NULL then it's an
error condition, because unmap requests must fully cover previous mappings, right?

>>> And this restriction on UNMAP would make some UNMAP operations of vIOMMU
>>> fail.
>>>
>>> e.g. below condition indeed happens in reality.
>>> an UNMAP ioctl comes for IOVA range from 0xff800000, of size 0x200000
>>> However, IOVAs in this range are mapped page by page.i.e., dma->size is 0x1000.
>>>
>>> Previous, this UNMAP ioctl could unmap successfully as a whole.
>>
>> What triggers this in the guest?  Note that it's only when using the
>> GET_DIRTY_BITMAP flag that this is restricted.  Does the event you're
>> referring to potentially occur under normal circumstances in that mode?
>> Thanks,
>>

Such an unmap would call back vfio_iommu_map_notify() in QEMU. In
vfio_iommu_map_notify(), unmap is called on the same range <iova,
iotlb->addr_mask + 1> that was used for map. Secondly, unmap with a bitmap
will be called only when the device state has the _SAVING flag set.

Thanks,
Kirti
Alex Williamson March 20, 2020, 7:28 p.m. UTC | #6
On Sat, 21 Mar 2020 00:44:32 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 3/20/2020 9:17 PM, Alex Williamson wrote:
> > On Fri, 20 Mar 2020 09:40:39 -0600
> > Alex Williamson <alex.williamson@redhat.com> wrote:
> >   
> >> On Fri, 20 Mar 2020 04:35:29 -0400
> >> Yan Zhao <yan.y.zhao@intel.com> wrote:
> >>  
> >>> On Thu, Mar 19, 2020 at 03:41:12AM +0800, Kirti Wankhede wrote:  
> >>>> DMA mapped pages, including those pinned by mdev vendor drivers, might
> >>>> get unpinned and unmapped while migration is active and device is still
> >>>> running. For example, in pre-copy phase while guest driver could access
> >>>> those pages, host device or vendor driver can dirty these mapped pages.
> >>>> Such pages should be marked dirty so as to maintain memory consistency
> >>>> for a user making use of dirty page tracking.
> >>>>
> >>>> To get bitmap during unmap, user should set flag
> >>>> VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP, bitmap memory should be allocated and
> >>>> zeroed by user space application. Bitmap size and page size should be set
> >>>> by user application.
> >>>>
> >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >>>> ---
> >>>>   drivers/vfio/vfio_iommu_type1.c | 55 ++++++++++++++++++++++++++++++++++++++---
> >>>>   include/uapi/linux/vfio.h       | 11 +++++++++
> >>>>   2 files changed, 62 insertions(+), 4 deletions(-)
> >>>>
> >>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >>>> index d6417fb02174..aa1ac30f7854 100644
> >>>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>>> @@ -939,7 +939,8 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> >>>>   }
> >>>>   
> >>>>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>>> -			     struct vfio_iommu_type1_dma_unmap *unmap)
> >>>> +			     struct vfio_iommu_type1_dma_unmap *unmap,
> >>>> +			     struct vfio_bitmap *bitmap)
> >>>>   {
> >>>>   	uint64_t mask;
> >>>>   	struct vfio_dma *dma, *dma_last = NULL;
> >>>> @@ -990,6 +991,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>>>   	 * will be returned if these conditions are not met.  The v2 interface
> >>>>   	 * will only return success and a size of zero if there were no
> >>>>   	 * mappings within the range.
> >>>> +	 *
> >>>> +	 * When VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag is set, unmap request
> >>>> +	 * must be for single mapping. Multiple mappings with this flag set is
> >>>> +	 * not supported.
> >>>>   	 */
> >>>>   	if (iommu->v2) {
> >>>>   		dma = vfio_find_dma(iommu, unmap->iova, 1);
> >>>> @@ -997,6 +1002,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>>>   			ret = -EINVAL;
> >>>>   			goto unlock;
> >>>>   		}
> >>>> +
> >>>> +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> >>>> +		    (dma->iova != unmap->iova || dma->size != unmap->size)) {  
> >>> dma is probably NULL here!  
> >>
> >> Yep, I didn't look closely enough there.  This is situated right
> >> between the check to make sure we're not bisecting a mapping at the
> >> start of the unmap and the check to make sure we're not bisecting a
> >> mapping at the end of the unmap.  There's no guarantee that we have a
> >> valid pointer here.  The test should be in the while() loop below this
> >> code.  
> > 
> > Actually the test could remain here, we can exit here if we can't find
> > a dma at the start of the unmap range with the GET_DIRTY_BITMAP flag,
> > but we absolutely cannot deref dma without testing it.
> >   
> 
> In the check above newly added check, if dma is NULL then its an error 
> condition, because Unmap requests must fully cover previous mappings, right?

Yes, but we'll do a null pointer deref before we return error.
 
> >>> And this restriction on UNMAP would make some UNMAP operations of vIOMMU
> >>> fail.
> >>>
> >>> e.g. below condition indeed happens in reality.
> >>> an UNMAP ioctl comes for IOVA range from 0xff800000, of size 0x200000
> >>> However, IOVAs in this range are mapped page by page.i.e., dma->size is 0x1000.
> >>>
> >>> Previous, this UNMAP ioctl could unmap successfully as a whole.  
> >>
> >> What triggers this in the guest?  Note that it's only when using the
> >> GET_DIRTY_BITMAP flag that this is restricted.  Does the event you're
> >> referring to potentially occur under normal circumstances in that mode?
> >> Thanks,
> >>  
> 
> Such unmap would callback vfio_iommu_map_notify() in QEMU. In 
> vfio_iommu_map_notify(), unmap is called on same range <iova, 
> iotlb->addr_mask + 1> which was used for map. Secondly unmap with bitmap 
> will be called only when device state has _SAVING flag set.

It might be helpful for Yan, and everyone else, to see the latest QEMU
patch series.  Thanks,

Alex
Yan Zhao March 23, 2020, 1:10 a.m. UTC | #7
On Sat, Mar 21, 2020 at 03:28:21AM +0800, Alex Williamson wrote:
> On Sat, 21 Mar 2020 00:44:32 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > On 3/20/2020 9:17 PM, Alex Williamson wrote:
> > > On Fri, 20 Mar 2020 09:40:39 -0600
> > > Alex Williamson <alex.williamson@redhat.com> wrote:
> > >   
> > >> On Fri, 20 Mar 2020 04:35:29 -0400
> > >> Yan Zhao <yan.y.zhao@intel.com> wrote:
> > >>  
> > >>> On Thu, Mar 19, 2020 at 03:41:12AM +0800, Kirti Wankhede wrote:  
> > >>>> DMA mapped pages, including those pinned by mdev vendor drivers, might
> > >>>> get unpinned and unmapped while migration is active and device is still
> > >>>> running. For example, in pre-copy phase while guest driver could access
> > >>>> those pages, host device or vendor driver can dirty these mapped pages.
> > >>>> Such pages should be marked dirty so as to maintain memory consistency
> > >>>> for a user making use of dirty page tracking.
> > >>>>
> > >>>> To get bitmap during unmap, user should set flag
> > >>>> VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP, bitmap memory should be allocated and
> > >>>> zeroed by user space application. Bitmap size and page size should be set
> > >>>> by user application.
> > >>>>
> > >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > >>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > >>>> ---
> > >>>>   drivers/vfio/vfio_iommu_type1.c | 55 ++++++++++++++++++++++++++++++++++++++---
> > >>>>   include/uapi/linux/vfio.h       | 11 +++++++++
> > >>>>   2 files changed, 62 insertions(+), 4 deletions(-)
> > >>>>
> > >>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > >>>> index d6417fb02174..aa1ac30f7854 100644
> > >>>> --- a/drivers/vfio/vfio_iommu_type1.c
> > >>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > >>>> @@ -939,7 +939,8 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> > >>>>   }
> > >>>>   
> > >>>>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > >>>> -			     struct vfio_iommu_type1_dma_unmap *unmap)
> > >>>> +			     struct vfio_iommu_type1_dma_unmap *unmap,
> > >>>> +			     struct vfio_bitmap *bitmap)
> > >>>>   {
> > >>>>   	uint64_t mask;
> > >>>>   	struct vfio_dma *dma, *dma_last = NULL;
> > >>>> @@ -990,6 +991,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > >>>>   	 * will be returned if these conditions are not met.  The v2 interface
> > >>>>   	 * will only return success and a size of zero if there were no
> > >>>>   	 * mappings within the range.
> > >>>> +	 *
> > >>>> +	 * When VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag is set, unmap request
> > >>>> +	 * must be for single mapping. Multiple mappings with this flag set is
> > >>>> +	 * not supported.
> > >>>>   	 */
> > >>>>   	if (iommu->v2) {
> > >>>>   		dma = vfio_find_dma(iommu, unmap->iova, 1);
> > >>>> @@ -997,6 +1002,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > >>>>   			ret = -EINVAL;
> > >>>>   			goto unlock;
> > >>>>   		}
> > >>>> +
> > >>>> +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> > >>>> +		    (dma->iova != unmap->iova || dma->size != unmap->size)) {  
> > >>> dma is probably NULL here!  
> > >>
> > >> Yep, I didn't look closely enough there.  This is situated right
> > >> between the check to make sure we're not bisecting a mapping at the
> > >> start of the unmap and the check to make sure we're not bisecting a
> > >> mapping at the end of the unmap.  There's no guarantee that we have a
> > >> valid pointer here.  The test should be in the while() loop below this
> > >> code.  
> > > 
> > > Actually the test could remain here, we can exit here if we can't find
> > > a dma at the start of the unmap range with the GET_DIRTY_BITMAP flag,
> > > but we absolutely cannot deref dma without testing it.
> > >   
> > 
> > In the check above newly added check, if dma is NULL then its an error 
> > condition, because Unmap requests must fully cover previous mappings, right?
> 
> Yes, but we'll do a null pointer deref before we return error.
>  
> > >>> And this restriction on UNMAP would make some UNMAP operations of vIOMMU
> > >>> fail.
> > >>>
> > >>> e.g. below condition indeed happens in reality.
> > >>> an UNMAP ioctl comes for IOVA range from 0xff800000, of size 0x200000
> > >>> However, IOVAs in this range are mapped page by page.i.e., dma->size is 0x1000.
> > >>>
> > >>> Previous, this UNMAP ioctl could unmap successfully as a whole.  
> > >>
> > >> What triggers this in the guest?  Note that it's only when using the
> > >> GET_DIRTY_BITMAP flag that this is restricted.  Does the event you're
> > >> referring to potentially occur under normal circumstances in that mode?
> > >> Thanks,
> > >>  

It happens in vIOMMU domain-level invalidation of the IOTLB
(domain-selective invalidation, see vtd_iotlb_domain_invalidate() in QEMU).
It is common in VT-d lazy mode, and does NOT happen just once at boot time.
Rather than invalidating page by page, it batches the page invalidations.
So, when this invalidation takes place, even the higher-level page tables
are already invalid, and therefore it has to invalidate a bigger combined range.
That's why we see IOVAs that are mapped in 4K pages but unmapped in 2M
pages.

I think those UNMAPs should also have the GET_DIRTY_BITMAP flag on, right?
> > 
> > Such unmap would callback vfio_iommu_map_notify() in QEMU. In 
> > vfio_iommu_map_notify(), unmap is called on same range <iova, 
> > iotlb->addr_mask + 1> which was used for map. Secondly unmap with bitmap 
> > will be called only when device state has _SAVING flag set.
> 
In this case, iotlb->addr_mask in unmap is 0x200000 - 1,
which is different from the 0x1000 - 1 used for map.
> It might be helpful for Yan, and everyone else, to see the latest QEMU
> patch series.  Thanks,
>
Yes, please. I'm also curious about the log_sync part for vIOMMU, given that most IOVAs in
the address space are unmapped and therefore no IOTLB entries can be found.

Thanks
Yan
diff mbox series

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index d6417fb02174..aa1ac30f7854 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -939,7 +939,8 @@  static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
-			     struct vfio_iommu_type1_dma_unmap *unmap)
+			     struct vfio_iommu_type1_dma_unmap *unmap,
+			     struct vfio_bitmap *bitmap)
 {
 	uint64_t mask;
 	struct vfio_dma *dma, *dma_last = NULL;
@@ -990,6 +991,10 @@  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 	 * will be returned if these conditions are not met.  The v2 interface
 	 * will only return success and a size of zero if there were no
 	 * mappings within the range.
+	 *
+	 * When VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag is set, unmap request
+	 * must be for single mapping. Multiple mappings with this flag set is
+	 * not supported.
 	 */
 	if (iommu->v2) {
 		dma = vfio_find_dma(iommu, unmap->iova, 1);
@@ -997,6 +1002,13 @@  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			ret = -EINVAL;
 			goto unlock;
 		}
+
+		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+		    (dma->iova != unmap->iova || dma->size != unmap->size)) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+
 		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
 		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
 			ret = -EINVAL;
@@ -1014,6 +1026,12 @@  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 		if (dma->task->mm != current->mm)
 			break;
 
+		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+		     iommu->dirty_page_tracking)
+			vfio_iova_dirty_bitmap(iommu, dma->iova, dma->size,
+					bitmap->pgsize,
+					(unsigned char __user *) bitmap->data);
+
 		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
 			struct vfio_iommu_type1_dma_unmap nb_unmap;
 
@@ -2369,17 +2387,46 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 
 	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
 		struct vfio_iommu_type1_dma_unmap unmap;
-		long ret;
+		struct vfio_bitmap bitmap = { 0 };
+		int ret;
 
 		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 
 		if (copy_from_user(&unmap, (void __user *)arg, minsz))
 			return -EFAULT;
 
-		if (unmap.argsz < minsz || unmap.flags)
+		if (unmap.argsz < minsz ||
+		    unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
 			return -EINVAL;
 
-		ret = vfio_dma_do_unmap(iommu, &unmap);
+		if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+			unsigned long pgshift;
+			uint64_t iommu_pgsize =
+					 1 << __ffs(vfio_pgsize_bitmap(iommu));
+
+			if (unmap.argsz < (minsz + sizeof(bitmap)))
+				return -EINVAL;
+
+			if (copy_from_user(&bitmap,
+					   (void __user *)(arg + minsz),
+					   sizeof(bitmap)))
+				return -EFAULT;
+
+			/* allow only min supported pgsize */
+			if (bitmap.pgsize != iommu_pgsize)
+				return -EINVAL;
+			if (!access_ok((void __user *)bitmap.data, bitmap.size))
+				return -EINVAL;
+
+			pgshift = __ffs(bitmap.pgsize);
+			ret = verify_bitmap_size(unmap.size >> pgshift,
+						 bitmap.size);
+			if (ret)
+				return ret;
+
+		}
+
+		ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
 		if (ret)
 			return ret;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 043e9eafb255..a704e5380f04 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1010,12 +1010,23 @@  struct vfio_bitmap {
  * field.  No guarantee is made to the user that arbitrary unmaps of iova
  * or size different from those used in the original mapping call will
  * succeed.
+ * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap
+ * before unmapping IO virtual addresses. When this flag is set, the user must
+ * provide data[] as a struct vfio_bitmap. The user must allocate memory for
+ * the bitmap, zero it, and set the size of the allocated memory in the
+ * vfio_bitmap.size field. Each bit in the bitmap represents one page of the
+ * user-provided page size 'pgsize', consecutively starting from the iova
+ * offset. A set bit indicates that the page at that offset from iova is dirty.
+ * The bitmap of pages in the range of the unmapped size is returned in
+ * vfio_bitmap.data.
  */
 struct vfio_iommu_type1_dma_unmap {
 	__u32	argsz;
 	__u32	flags;
+#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */
+	__u8    data[];
 };
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)