[Kernel,v18,5/7] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

Message ID 1588607939-26441-6-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived
Series KABIs to support migration for VFIO devices

Commit Message

Kirti Wankhede May 4, 2020, 3:58 p.m. UTC
DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and the device is
still running. For example, in the pre-copy phase, while the guest
driver can still access those pages, the host device or vendor driver
may dirty these mapped pages. Such pages should be marked dirty so as
to maintain memory consistency for a user making use of dirty page
tracking.

To get the bitmap during unmap, the user should allocate memory for the
bitmap, set the size of the allocated memory, set the page size to be
considered for the bitmap, and set the
VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag.
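
A minimal userspace sketch of that flow (assumptions: a 4K minimum IOMMU
page size, a vfio.h with this patch applied, and an already-configured
container fd; the helper name and error handling are illustrative, not
part of the series):

#include <linux/vfio.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int unmap_and_get_dirty(int container, uint64_t iova, uint64_t size,
			       uint64_t pgsize, uint64_t **dirty_out)
{
	struct vfio_iommu_type1_dma_unmap *unmap;
	struct vfio_bitmap *bmap;
	/* one bit per page, rounded up to a multiple of 8 bytes */
	uint64_t bitmap_bytes = (((size / pgsize) + 63) / 64) * 8;
	int ret;

	unmap = calloc(1, sizeof(*unmap) + sizeof(*bmap));
	if (!unmap)
		return -1;

	unmap->argsz = sizeof(*unmap) + sizeof(*bmap);
	unmap->flags = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
	unmap->iova = iova;
	unmap->size = size;

	/* struct vfio_bitmap rides in data[] right after the unmap header */
	bmap = (struct vfio_bitmap *)unmap->data;
	bmap->pgsize = pgsize;		/* must be the minimum IOMMU page size */
	bmap->size = bitmap_bytes;	/* bytes allocated for the bitmap */
	bmap->data = calloc(1, bitmap_bytes);
	if (!bmap->data) {
		free(unmap);
		return -1;
	}

	ret = ioctl(container, VFIO_IOMMU_UNMAP_DMA, unmap);
	if (ret)
		free(bmap->data);
	else
		*dirty_out = (uint64_t *)bmap->data;	/* caller frees */
	free(unmap);
	return ret;
}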

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
---
 drivers/vfio/vfio_iommu_type1.c | 84 +++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/vfio.h       | 10 +++++
 2 files changed, 90 insertions(+), 4 deletions(-)

Comments

Alex Williamson May 6, 2020, 10:25 p.m. UTC | #1
On Mon, 4 May 2020 21:28:57 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> DMA mapped pages, including those pinned by mdev vendor drivers, might
> get unpinned and unmapped while migration is active and the device is
> still running. For example, in the pre-copy phase, while the guest
> driver can still access those pages, the host device or vendor driver
> may dirty these mapped pages. Such pages should be marked dirty so as
> to maintain memory consistency for a user making use of dirty page
> tracking.
> 
> To get the bitmap during unmap, the user should allocate memory for the
> bitmap, set the size of the allocated memory, set the page size to be
> considered for the bitmap, and set the
> VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 84 +++++++++++++++++++++++++++++++++++++++--
>  include/uapi/linux/vfio.h       | 10 +++++
>  2 files changed, 90 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 01dcb417836f..8b27faf1ec38 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -983,12 +983,14 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
>  }
>  
>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> -			     struct vfio_iommu_type1_dma_unmap *unmap)
> +			     struct vfio_iommu_type1_dma_unmap *unmap,
> +			     struct vfio_bitmap *bitmap)
>  {
>  	uint64_t mask;
>  	struct vfio_dma *dma, *dma_last = NULL;
>  	size_t unmapped = 0;
>  	int ret = 0, retries = 0;
> +	unsigned long *final_bitmap = NULL, *temp_bitmap = NULL;
>  
>  	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
>  
> @@ -1041,6 +1043,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  			ret = -EINVAL;
>  			goto unlock;
>  		}
> +
>  		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
>  		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
>  			ret = -EINVAL;
> @@ -1048,6 +1051,22 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  		}
>  	}
>  
> +	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> +	     iommu->dirty_page_tracking) {

Why do we even accept VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP when not
dirty page tracking rather than returning -EINVAL?  It would simplify
things here to reject it at the ioctl and silently ignoring a flag is
rarely if ever the right approach.
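
A sketch of that suggested rejection at the top of the ioctl handler (an
illustration of the review comment, not code from the series; whether
iommu->dirty_page_tracking may be tested here without taking iommu->lock
is an assumption):

	if (unmap.argsz < minsz ||
	    unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
		return -EINVAL;

	/* reject, rather than silently ignore, the flag when not tracking */
	if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
	    !iommu->dirty_page_tracking)
		return -EINVAL;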

> +		final_bitmap = kvzalloc(bitmap->size, GFP_KERNEL);
> +		if (!final_bitmap) {
> +			ret = -ENOMEM;
> +			goto unlock;
> +		}
> +
> +		temp_bitmap = kvzalloc(bitmap->size, GFP_KERNEL);
> +		if (!temp_bitmap) {
> +			ret = -ENOMEM;
> +			kfree(final_bitmap);
> +			goto unlock;
> +		}

YIKES!  So the user can instantly trigger the kernel to internally
allocate 2 x 256MB, regardless of how much they can actually map.
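
(Back-of-envelope, assuming 4K pages: 256MB of bitmap is 2^31 bits,
covering 2^31 * 4KB = 8TB of IOVA space; two such buffers make the 512MB
figure that comes up later in this thread.)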

> +	}
> +
>  	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
>  		if (!iommu->v2 && unmap->iova > dma->iova)
>  			break;
> @@ -1058,6 +1077,24 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  		if (dma->task->mm != current->mm)
>  			break;
>  
> +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> +		     iommu->dirty_page_tracking) {
> +			unsigned long pgshift = __ffs(bitmap->pgsize);
> +			unsigned int npages = dma->size >> pgshift;
> +			unsigned int shift;
> +
> +			vfio_iova_dirty_bitmap(iommu, dma->iova, dma->size,
> +					bitmap->pgsize, (u64 *)temp_bitmap);

vfio_iova_dirty_bitmap() takes a __user bitmap, we're doing
copy_to_user() on a kernel allocated buffer???

> +
> +			shift = (dma->iova - unmap->iova) >> pgshift;
> +			if (shift)
> +				bitmap_shift_left(temp_bitmap, temp_bitmap,
> +						  shift, npages);
> +			bitmap_or(final_bitmap, final_bitmap, temp_bitmap,
> +				  shift + npages);
> +			memset(temp_bitmap, 0, bitmap->size);
> +		}

It seems like if the per vfio_dma dirty bitmap was oversized by a long
that we could shift it in place, then we'd only need one working bitmap
buffer and we could size that to fit the vfio_dma (or the largest
vfio_dma if we don't want to free and re-alloc for each vfio_dma).
We'd need to do more copy_to/from_user()s, but we'd also avoid copying
between sparse mappings (user zero'd bitmap required) and we'd have a
far more reasonable memory usage.  Thanks,

Alex
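
A minimal sketch of that over-allocation (editor's illustration; the
helper name and placement are hypothetical, while DIRTY_BITMAP_BYTES()
is the sizing macro this series already uses):

static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
{
	u64 npages = dma->size / pgsize;

	/*
	 * One extra long beyond DIRTY_BITMAP_BYTES(npages) so that a later
	 * bitmap_shift_left() of up to BITS_PER_LONG - 1 bits can be done
	 * in place without writing past the end of the allocation.
	 */
	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) +
			       sizeof(unsigned long), GFP_KERNEL);

	return dma->bitmap ? 0 : -ENOMEM;
}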

> +
>  		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
>  			struct vfio_iommu_type1_dma_unmap nb_unmap;
>  
> @@ -1088,6 +1125,16 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  	}
>  
>  unlock:
> +	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> +	     iommu->dirty_page_tracking && final_bitmap) {
> +		if (copy_to_user((void __user *)bitmap->data, final_bitmap,
> +				 bitmap->size))
> +			ret = -EFAULT;
> +
> +		kfree(final_bitmap);
> +		kfree(temp_bitmap);
> +	}
> +
>  	mutex_unlock(&iommu->lock);
>  
>  	/* Report how much was unmapped */
> @@ -2419,17 +2466,46 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>  	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
>  		struct vfio_iommu_type1_dma_unmap unmap;
> -		long ret;
> +		struct vfio_bitmap bitmap = { 0 };
> +		int ret;
>  
>  		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
>  
>  		if (copy_from_user(&unmap, (void __user *)arg, minsz))
>  			return -EFAULT;
>  
> -		if (unmap.argsz < minsz || unmap.flags)
> +		if (unmap.argsz < minsz ||
> +		    unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
>  			return -EINVAL;
>  
> -		ret = vfio_dma_do_unmap(iommu, &unmap);
> +		if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
> +			unsigned long pgshift;
> +			size_t iommu_pgsize =
> +				(size_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
> +
> +			if (unmap.argsz < (minsz + sizeof(bitmap)))
> +				return -EINVAL;
> +
> +			if (copy_from_user(&bitmap,
> +					   (void __user *)(arg + minsz),
> +					   sizeof(bitmap)))
> +				return -EFAULT;
> +
> +			/* allow only min supported pgsize */
> +			if (bitmap.pgsize != iommu_pgsize)
> +				return -EINVAL;
> +			if (!access_ok((void __user *)bitmap.data, bitmap.size))
> +				return -EINVAL;
> +
> +			pgshift = __ffs(bitmap.pgsize);
> +			ret = verify_bitmap_size(unmap.size >> pgshift,
> +						 bitmap.size);
> +			if (ret)
> +				return ret;
> +
> +		}
> +
> +		ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
>  		if (ret)
>  			return ret;
>  
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 5f359c63f5ef..e3cbf8b78623 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1048,12 +1048,22 @@ struct vfio_bitmap {
>   * field.  No guarantee is made to the user that arbitrary unmaps of iova
>   * or size different from those used in the original mapping call will
>   * succeed.
> + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get dirty bitmap
> + * before unmapping IO virtual addresses. When this flag is set, user must
> + * provide data[] as structure vfio_bitmap. User must allocate memory to get
> + * bitmap and must set size of allocated memory in vfio_bitmap.size field.
> + * A bit in bitmap represents one page of user provided page size in 'pgsize',
> + * consecutively starting from iova offset. Bit set indicates page at that
> + * offset from iova is dirty. Bitmap of pages in the range of unmapped size is
> + * returned in vfio_bitmap.data
>   */
>  struct vfio_iommu_type1_dma_unmap {
>  	__u32	argsz;
>  	__u32	flags;
> +#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
>  	__u64	iova;				/* IO virtual address */
>  	__u64	size;				/* Size of mapping (bytes) */
> +	__u8    data[];
>  };
>  
>  #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
Kirti Wankhede May 12, 2020, 8:30 p.m. UTC | #2
On 5/7/2020 3:55 AM, Alex Williamson wrote:
> On Mon, 4 May 2020 21:28:57 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> DMA mapped pages, including those pinned by mdev vendor drivers, might
>> get unpinned and unmapped while migration is active and the device is
>> still running. For example, in the pre-copy phase, while the guest
>> driver can still access those pages, the host device or vendor driver
>> may dirty these mapped pages. Such pages should be marked dirty so as
>> to maintain memory consistency for a user making use of dirty page
>> tracking.
>>
>> To get the bitmap during unmap, the user should allocate memory for the
>> bitmap, set the size of the allocated memory, set the page size to be
>> considered for the bitmap, and set the
>> VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> ---
>>   drivers/vfio/vfio_iommu_type1.c | 84 +++++++++++++++++++++++++++++++++++++++--
>>   include/uapi/linux/vfio.h       | 10 +++++
>>   2 files changed, 90 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 01dcb417836f..8b27faf1ec38 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -983,12 +983,14 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
>>   }
>>   
>>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>> -			     struct vfio_iommu_type1_dma_unmap *unmap)
>> +			     struct vfio_iommu_type1_dma_unmap *unmap,
>> +			     struct vfio_bitmap *bitmap)
>>   {
>>   	uint64_t mask;
>>   	struct vfio_dma *dma, *dma_last = NULL;
>>   	size_t unmapped = 0;
>>   	int ret = 0, retries = 0;
>> +	unsigned long *final_bitmap = NULL, *temp_bitmap = NULL;
>>   
>>   	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
>>   
>> @@ -1041,6 +1043,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>   			ret = -EINVAL;
>>   			goto unlock;
>>   		}
>> +
>>   		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
>>   		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
>>   			ret = -EINVAL;
>> @@ -1048,6 +1051,22 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>   		}
>>   	}
>>   
>> +	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
>> +	     iommu->dirty_page_tracking) {
> 
> Why do we even accept VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP when not
> dirty page tracking rather than returning -EINVAL?  It would simplify
> things here to reject it at the ioctl and silently ignoring a flag is
> rarely if ever the right approach.
> 
>> +		final_bitmap = kvzalloc(bitmap->size, GFP_KERNEL);
>> +		if (!final_bitmap) {
>> +			ret = -ENOMEM;
>> +			goto unlock;
>> +		}
>> +
>> +		temp_bitmap = kvzalloc(bitmap->size, GFP_KERNEL);
>> +		if (!temp_bitmap) {
>> +			ret = -ENOMEM;
>> +			kfree(final_bitmap);
>> +			goto unlock;
>> +		}
> 
> YIKES!  So the user can instantly trigger the kernel to internally
> allocate 2 x 256MB, regardless of how much they can actually map.
> 

That is the worst-case scenario. I don't think it will ever be hit in 
practice. More comments below regarding this.

>> +	}
>> +
>>   	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
>>   		if (!iommu->v2 && unmap->iova > dma->iova)
>>   			break;
>> @@ -1058,6 +1077,24 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>   		if (dma->task->mm != current->mm)
>>   			break;
>>   
>> +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
>> +		     iommu->dirty_page_tracking) {
>> +			unsigned long pgshift = __ffs(bitmap->pgsize);
>> +			unsigned int npages = dma->size >> pgshift;
>> +			unsigned int shift;
>> +
>> +			vfio_iova_dirty_bitmap(iommu, dma->iova, dma->size,
>> +					bitmap->pgsize, (u64 *)temp_bitmap);
> 
> vfio_iova_dirty_bitmap() takes a __user bitmap, we're doing
> copy_to_user() on a kernel allocated buffer???
> 

Actually, there is no need to call vfio_iova_dirty_bitmap(); the dma 
pointer is known here, and since it is getting unmapped, there is no 
need to repopulate the bitmap. Removing vfio_iova_dirty_bitmap() and 
changing the code as below:

if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
	unsigned long pgshift = __ffs(bitmap->pgsize);
	unsigned int npages = dma->size >> pgshift;
	unsigned int bitmap_size = DIRTY_BITMAP_BYTES(npages);
	unsigned int shift = (dma->iova - unmap->iova) >> pgshift;

	/* mark all pages dirty if all pages are pinned and mapped */
	if (dma->iommu_mapped)
		bitmap_set(temp_bitmap, 0, npages);
	else
		memcpy(temp_bitmap, dma->bitmap, bitmap_size);

	if (shift)
		bitmap_shift_left(temp_bitmap, temp_bitmap, shift, npages);

	bitmap_or(final_bitmap, final_bitmap, temp_bitmap, shift + npages);
	memset(temp_bitmap, 0, bitmap->size);
}

>> +
>> +			shift = (dma->iova - unmap->iova) >> pgshift;
>> +			if (shift)
>> +				bitmap_shift_left(temp_bitmap, temp_bitmap,
>> +						  shift, npages);
>> +			bitmap_or(final_bitmap, final_bitmap, temp_bitmap,
>> +				  shift + npages);
>> +			memset(temp_bitmap, 0, bitmap->size);
>> +		}
> 
> It seems like if the per vfio_dma dirty bitmap was oversized by a long
> that we could shift it in place, then we'd only need one working bitmap
> buffer and we could size that to fit the vfio_dma (or the largest
> vfio_dma if we don't want to free and re-alloc for each vfio_dma).
> We'd need to do more copy_to/from_user()s, but we'd also avoid copying
> between sparse mappings (user zero'd bitmap required) and we'd have a
> far more reasonable memory usage.  Thanks,
>

I thought about it, but couldn't find a way to optimize this down to one
bitmap buffer. This case will only be hit during migration with vIOMMU
enabled. Can we keep these two bitmap buffers for now and optimize it later?

Thanks,
Kirti
Alex Williamson May 12, 2020, 9:21 p.m. UTC | #3
On Wed, 13 May 2020 02:00:54 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 5/7/2020 3:55 AM, Alex Williamson wrote:
> > On Mon, 4 May 2020 21:28:57 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> DMA mapped pages, including those pinned by mdev vendor drivers, might
> >> get unpinned and unmapped while migration is active and the device is
> >> still running. For example, in the pre-copy phase, while the guest
> >> driver can still access those pages, the host device or vendor driver
> >> may dirty these mapped pages. Such pages should be marked dirty so as
> >> to maintain memory consistency for a user making use of dirty page
> >> tracking.
> >>
> >> To get the bitmap during unmap, the user should allocate memory for the
> >> bitmap, set the size of the allocated memory, set the page size to be
> >> considered for the bitmap, and set the
> >> VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag.
> >>
> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >> ---
> >>   drivers/vfio/vfio_iommu_type1.c | 84 +++++++++++++++++++++++++++++++++++++++--
> >>   include/uapi/linux/vfio.h       | 10 +++++
> >>   2 files changed, 90 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 01dcb417836f..8b27faf1ec38 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -983,12 +983,14 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
> >>   }
> >>   
> >>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >> -			     struct vfio_iommu_type1_dma_unmap *unmap)
> >> +			     struct vfio_iommu_type1_dma_unmap *unmap,
> >> +			     struct vfio_bitmap *bitmap)
> >>   {
> >>   	uint64_t mask;
> >>   	struct vfio_dma *dma, *dma_last = NULL;
> >>   	size_t unmapped = 0;
> >>   	int ret = 0, retries = 0;
> >> +	unsigned long *final_bitmap = NULL, *temp_bitmap = NULL;
> >>   
> >>   	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> >>   
> >> @@ -1041,6 +1043,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>   			ret = -EINVAL;
> >>   			goto unlock;
> >>   		}
> >> +
> >>   		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
> >>   		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
> >>   			ret = -EINVAL;
> >> @@ -1048,6 +1051,22 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>   		}
> >>   	}
> >>   
> >> +	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> >> +	     iommu->dirty_page_tracking) {  
> > 
> > Why do we even accept VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP when not
> > dirty page tracking rather than returning -EINVAL?  It would simplify
> > things here to reject it at the ioctl and silently ignoring a flag is
> > rarely if ever the right approach.
> >   
> >> +		final_bitmap = kvzalloc(bitmap->size, GFP_KERNEL);
> >> +		if (!final_bitmap) {
> >> +			ret = -ENOMEM;
> >> +			goto unlock;
> >> +		}
> >> +
> >> +		temp_bitmap = kvzalloc(bitmap->size, GFP_KERNEL);
> >> +		if (!temp_bitmap) {
> >> +			ret = -ENOMEM;
> >> +			kfree(final_bitmap);
> >> +			goto unlock;
> >> +		}  
> > 
> > YIKES!  So the user can instantly trigger the kernel to internally
> > allocate 2 x 256MB, regardless of how much they can actually map.
> >   
> 
> That is the worst-case scenario. I don't think it will ever be hit in 
> practice. More comments below regarding this.

If a user has the ability to lock 8TB of memory, then yeah, triggering
the kernel to allocate 512MB might not be a big deal.  But in this case
a malicious user can trigger the kernel to allocate 512MB of memory
regardless of their locked memory limits.  I think that's unacceptable.
 
> >> +	}
> >> +
> >>   	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
> >>   		if (!iommu->v2 && unmap->iova > dma->iova)
> >>   			break;
> >> @@ -1058,6 +1077,24 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>   		if (dma->task->mm != current->mm)
> >>   			break;
> >>   
> >> +		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> >> +		     iommu->dirty_page_tracking) {
> >> +			unsigned long pgshift = __ffs(bitmap->pgsize);
> >> +			unsigned int npages = dma->size >> pgshift;
> >> +			unsigned int shift;
> >> +
> >> +			vfio_iova_dirty_bitmap(iommu, dma->iova, dma->size,
> >> +					bitmap->pgsize, (u64 *)temp_bitmap);  
> > 
> > vfio_iova_dirty_bitmap() takes a __user bitmap, we're doing
> > copy_to_user() on a kernel allocated buffer???
> >   
> 
> Actually, there is no need to call vfio_iova_dirty_bitmap(); the dma 
> pointer is known here, and since it is getting unmapped, there is no 
> need to repopulate the bitmap. Removing vfio_iova_dirty_bitmap() and 
> changing the code as below:
> 
> if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
> 	unsigned long pgshift = __ffs(bitmap->pgsize);
> 	unsigned int npages = dma->size >> pgshift;
> 	unsigned int bitmap_size = DIRTY_BITMAP_BYTES(npages);
> 	unsigned int shift = (dma->iova - unmap->iova) >> pgshift;
> 
> 	/* mark all pages dirty if all pages are pinned and mapped */
> 	if (dma->iommu_mapped)
> 		bitmap_set(temp_bitmap, 0, npages);
> 	else
> 		memcpy(temp_bitmap, dma->bitmap, bitmap_size);
> 
> 	if (shift)
> 		bitmap_shift_left(temp_bitmap, temp_bitmap, shift, npages);
> 
> 	bitmap_or(final_bitmap, final_bitmap, temp_bitmap, shift + npages);
> 	memset(temp_bitmap, 0, bitmap->size);
> }

Solves that problem, but I think also illustrates that if we could
shift dma->bitmap in place we could avoid a memcpy and a memset.

> >> +
> >> +			shift = (dma->iova - unmap->iova) >> pgshift;
> >> +			if (shift)
> >> +				bitmap_shift_left(temp_bitmap, temp_bitmap,
> >> +						  shift, npages);
> >> +			bitmap_or(final_bitmap, final_bitmap, temp_bitmap,
> >> +				  shift + npages);
> >> +			memset(temp_bitmap, 0, bitmap->size);
> >> +		}  
> > 
> > It seems like if the per vfio_dma dirty bitmap was oversized by a long
> > that we could shift it in place, then we'd only need one working bitmap
> > buffer and we could size that to fit the vfio_dma (or the largest
> > vfio_dma if we don't want to free and re-alloc for each vfio_dma).
> > We'd need to do more copy_to/from_user()s, but we'd also avoid copying
> > between sparse mappings (user zero'd bitmap required) and we'd have a
> > far more reasonable memory usage.  Thanks,
> >  
> 
> I thought about it, but couldn't find a way to optimize this down to one
> bitmap buffer. This case will only be hit during migration with vIOMMU
> enabled. Can we keep these two bitmap buffers for now and optimize it later?

I don't see how we can limit it to that use case, or why that use case
makes it any less important to avoid exploitable inefficiencies like
this.  We're also potentially changing the uapi from not requiring a
user zero'd buffer to requiring a user zero'd buffer, which means we'd
need to support a backwards compatible uapi, exposing a really
inefficient means for the kernel to zero user memory.

Can you identify what doesn't work about the above proposal?  TBH, once
we shift the dma->bitmap in place, we could make it so that we can
directly copy_to_user and we'd only need to fixup any unaligned
overlaps between mappings, which we could do with just a few bytes
rather than some large final_bitmap buffer.  Thanks,

Alex
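
For reference, a sketch of where this review direction leads (editor's
illustration only, assuming the per-vfio_dma bitmap is over-allocated by
one long as sketched earlier and already populated for this dma, on a
64-bit build where BITS_PER_LONG matches the __u64 stride of the user
buffer; the leftover read-modify-write handles the unaligned overlap
between mappings that Alex mentions):

	unsigned long pgshift = __ffs(bitmap->pgsize);
	unsigned long nbits = dma->size >> pgshift;
	unsigned long bit_offset = (dma->iova - unmap->iova) >> pgshift;
	unsigned long copy_offset = bit_offset / BITS_PER_LONG;
	unsigned long shift = bit_offset % BITS_PER_LONG;
	unsigned long leftover;

	if (shift) {
		/* shift in place; the extra long absorbs the spill */
		bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
				  nbits + shift);

		/* merge user bits already present in the partial long */
		if (copy_from_user(&leftover,
				   (void __user *)(bitmap->data + copy_offset),
				   sizeof(leftover)))
			return -EFAULT;
		bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
	}

	/* copy straight to the user buffer; no final_bitmap needed */
	if (copy_to_user((void __user *)(bitmap->data + copy_offset),
			 dma->bitmap, DIRTY_BITMAP_BYTES(nbits + shift)))
		return -EFAULT;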

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 01dcb417836f..8b27faf1ec38 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -983,12 +983,14 @@  static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
-			     struct vfio_iommu_type1_dma_unmap *unmap)
+			     struct vfio_iommu_type1_dma_unmap *unmap,
+			     struct vfio_bitmap *bitmap)
 {
 	uint64_t mask;
 	struct vfio_dma *dma, *dma_last = NULL;
 	size_t unmapped = 0;
 	int ret = 0, retries = 0;
+	unsigned long *final_bitmap = NULL, *temp_bitmap = NULL;
 
 	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
 
@@ -1041,6 +1043,7 @@  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			ret = -EINVAL;
 			goto unlock;
 		}
+
 		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
 		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
 			ret = -EINVAL;
@@ -1048,6 +1051,22 @@  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 		}
 	}
 
+	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+	     iommu->dirty_page_tracking) {
+		final_bitmap = kvzalloc(bitmap->size, GFP_KERNEL);
+		if (!final_bitmap) {
+			ret = -ENOMEM;
+			goto unlock;
+		}
+
+		temp_bitmap = kvzalloc(bitmap->size, GFP_KERNEL);
+		if (!temp_bitmap) {
+			ret = -ENOMEM;
+			kfree(final_bitmap);
+			goto unlock;
+		}
+	}
+
 	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
 		if (!iommu->v2 && unmap->iova > dma->iova)
 			break;
@@ -1058,6 +1077,24 @@  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 		if (dma->task->mm != current->mm)
 			break;
 
+		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+		     iommu->dirty_page_tracking) {
+			unsigned long pgshift = __ffs(bitmap->pgsize);
+			unsigned int npages = dma->size >> pgshift;
+			unsigned int shift;
+
+			vfio_iova_dirty_bitmap(iommu, dma->iova, dma->size,
+					bitmap->pgsize, (u64 *)temp_bitmap);
+
+			shift = (dma->iova - unmap->iova) >> pgshift;
+			if (shift)
+				bitmap_shift_left(temp_bitmap, temp_bitmap,
+						  shift, npages);
+			bitmap_or(final_bitmap, final_bitmap, temp_bitmap,
+				  shift + npages);
+			memset(temp_bitmap, 0, bitmap->size);
+		}
+
 		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
 			struct vfio_iommu_type1_dma_unmap nb_unmap;
 
@@ -1088,6 +1125,16 @@  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 	}
 
 unlock:
+	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+	     iommu->dirty_page_tracking && final_bitmap) {
+		if (copy_to_user((void __user *)bitmap->data, final_bitmap,
+				 bitmap->size))
+			ret = -EFAULT;
+
+		kfree(final_bitmap);
+		kfree(temp_bitmap);
+	}
+
 	mutex_unlock(&iommu->lock);
 
 	/* Report how much was unmapped */
@@ -2419,17 +2466,46 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 
 	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
 		struct vfio_iommu_type1_dma_unmap unmap;
-		long ret;
+		struct vfio_bitmap bitmap = { 0 };
+		int ret;
 
 		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 
 		if (copy_from_user(&unmap, (void __user *)arg, minsz))
 			return -EFAULT;
 
-		if (unmap.argsz < minsz || unmap.flags)
+		if (unmap.argsz < minsz ||
+		    unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
 			return -EINVAL;
 
-		ret = vfio_dma_do_unmap(iommu, &unmap);
+		if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+			unsigned long pgshift;
+			size_t iommu_pgsize =
+				(size_t)1 << __ffs(vfio_pgsize_bitmap(iommu));
+
+			if (unmap.argsz < (minsz + sizeof(bitmap)))
+				return -EINVAL;
+
+			if (copy_from_user(&bitmap,
+					   (void __user *)(arg + minsz),
+					   sizeof(bitmap)))
+				return -EFAULT;
+
+			/* allow only min supported pgsize */
+			if (bitmap.pgsize != iommu_pgsize)
+				return -EINVAL;
+			if (!access_ok((void __user *)bitmap.data, bitmap.size))
+				return -EINVAL;
+
+			pgshift = __ffs(bitmap.pgsize);
+			ret = verify_bitmap_size(unmap.size >> pgshift,
+						 bitmap.size);
+			if (ret)
+				return ret;
+
+		}
+
+		ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
 		if (ret)
 			return ret;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5f359c63f5ef..e3cbf8b78623 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1048,12 +1048,22 @@  struct vfio_bitmap {
  * field.  No guarantee is made to the user that arbitrary unmaps of iova
  * or size different from those used in the original mapping call will
  * succeed.
+ * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get dirty bitmap
+ * before unmapping IO virtual addresses. When this flag is set, user must
+ * provide data[] as structure vfio_bitmap. User must allocate memory to get
+ * bitmap and must set size of allocated memory in vfio_bitmap.size field.
+ * A bit in bitmap represents one page of user provided page size in 'pgsize',
+ * consecutively starting from iova offset. Bit set indicates page at that
+ * offset from iova is dirty. Bitmap of pages in the range of unmapped size is
+ * returned in vfio_bitmap.data
  */
 struct vfio_iommu_type1_dma_unmap {
 	__u32	argsz;
 	__u32	flags;
+#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */
+	__u8    data[];
 };
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)