diff mbox series

[v10,Kernel,4/5] vfio iommu: Implementation of ioctl to for dirty pages tracking.

Message ID 1576527700-21805-5-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived
Headers show
Series KABIs to support migration for VFIO devices | expand

Commit Message

Kirti Wankhede Dec. 16, 2019, 8:21 p.m. UTC
VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start unpinned pages dirty pages tracking while migration is active and
  device is running, i.e. during pre-copy phase.
- Stop unpinned pages dirty pages tracking. This is required to stop
  unpinned dirty pages tracking if migration failed or cancelled during
  pre-copy phase. Unpinned pages tracking is clear.
- Get dirty pages bitmap. Stop unpinned dirty pages tracking and clear
  unpinned pages information on bitmap read. This ioctl returns bitmap of
  dirty pages, its user space application responsibility to copy content
  of dirty pages from source to destination during migration.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
---
 drivers/vfio/vfio_iommu_type1.c | 210 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 203 insertions(+), 7 deletions(-)

Comments

Yan Zhao Dec. 17, 2019, 5:15 a.m. UTC | #1
On Tue, Dec 17, 2019 at 04:21:39AM +0800, Kirti Wankhede wrote:
> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> - Start unpinned pages dirty pages tracking while migration is active and
>   device is running, i.e. during pre-copy phase.
> - Stop unpinned pages dirty pages tracking. This is required to stop
>   unpinned dirty pages tracking if migration failed or cancelled during
>   pre-copy phase. Unpinned pages tracking is clear.
> - Get dirty pages bitmap. Stop unpinned dirty pages tracking and clear
>   unpinned pages information on bitmap read. This ioctl returns bitmap of
>   dirty pages, its user space application responsibility to copy content
>   of dirty pages from source to destination during migration.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 210 ++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 203 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 3f6b04f2334f..264449654d3f 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -70,6 +70,7 @@ struct vfio_iommu {
>  	unsigned int		dma_avail;
>  	bool			v2;
>  	bool			nesting;
> +	bool			dirty_page_tracking;
>  };
>  
>  struct vfio_domain {
> @@ -112,6 +113,7 @@ struct vfio_pfn {
>  	dma_addr_t		iova;		/* Device address */
>  	unsigned long		pfn;		/* Host pfn */
>  	atomic_t		ref_count;
> +	bool			unpinned;
>  };
>  
>  struct vfio_regions {
> @@ -244,6 +246,32 @@ static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
>  	kfree(vpfn);
>  }
>  
> +static void vfio_remove_unpinned_from_pfn_list(struct vfio_dma *dma, bool warn)
> +{
> +	struct rb_node *n = rb_first(&dma->pfn_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_pfn *vpfn = rb_entry(n, struct vfio_pfn, node);
> +
> +		if (warn)
> +			WARN_ON_ONCE(vpfn->unpinned);
> +
> +		if (vpfn->unpinned)
> +			vfio_remove_from_pfn_list(dma, vpfn);
> +	}
> +}
> +
> +static void vfio_remove_unpinned_from_dma_list(struct vfio_iommu *iommu)
> +{
> +	struct rb_node *n = rb_first(&iommu->dma_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> +
> +		vfio_remove_unpinned_from_pfn_list(dma, false);
> +	}
> +}
> +
>  static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
>  					       unsigned long iova)
>  {
> @@ -254,13 +282,17 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
>  	return vpfn;
>  }
>  
> -static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
> +static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn,
> +				  bool dirty_tracking)
>  {
>  	int ret = 0;
>  
>  	if (atomic_dec_and_test(&vpfn->ref_count)) {
>  		ret = put_pfn(vpfn->pfn, dma->prot);
if physical page here is put, it may cause problem when pin this iova
next time:
vfio_iommu_type1_pin_pages {
    ...
    vpfn = vfio_iova_get_vfio_pfn(dma, iova);
    if (vpfn) {
        phys_pfn[i] = vpfn->pfn;
        continue;
    }
    ...
}

> -		vfio_remove_from_pfn_list(dma, vpfn);
> +		if (dirty_tracking)
> +			vpfn->unpinned = true;
> +		else
> +			vfio_remove_from_pfn_list(dma, vpfn);
so the unpinned pages before dirty page tracking is not treated as
dirty?

>  	}
>  	return ret;
>  }
> @@ -504,7 +536,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
>  }
>  
>  static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> -				    bool do_accounting)
> +				    bool do_accounting, bool dirty_tracking)
>  {
>  	int unlocked;
>  	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
> @@ -512,7 +544,10 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
>  	if (!vpfn)
>  		return 0;
>  
> -	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
> +	if (vpfn->unpinned)
> +		return 0;
> +
> +	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn, dirty_tracking);
>  
>  	if (do_accounting)
>  		vfio_lock_acct(dma, -unlocked, true);
> @@ -583,7 +618,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  
>  		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
>  		if (ret) {
> -			vfio_unpin_page_external(dma, iova, do_accounting);
> +			vfio_unpin_page_external(dma, iova, do_accounting,
> +						 false);
>  			goto pin_unwind;
>  		}
>  	}
> @@ -598,7 +634,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  
>  		iova = user_pfn[j] << PAGE_SHIFT;
>  		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
> -		vfio_unpin_page_external(dma, iova, do_accounting);
> +		vfio_unpin_page_external(dma, iova, do_accounting, false);
>  		phys_pfn[j] = 0;
>  	}
>  pin_done:
> @@ -632,7 +668,8 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
>  		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
>  		if (!dma)
>  			goto unpin_exit;
> -		vfio_unpin_page_external(dma, iova, do_accounting);
> +		vfio_unpin_page_external(dma, iova, do_accounting,
> +					 iommu->dirty_page_tracking);
>  	}
>  
>  unpin_exit:
> @@ -850,6 +887,88 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>  	return bitmap;
>  }
>  
> +/*
> + * start_iova is the reference from where bitmaping started. This is called
> + * from DMA_UNMAP where start_iova can be different than iova
> + */
> +
> +static void vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> +				  size_t size, uint64_t pgsize,
> +				  dma_addr_t start_iova, unsigned long *bitmap)
> +{
> +	struct vfio_dma *dma;
> +	dma_addr_t i = iova;
> +	unsigned long pgshift = __ffs(pgsize);
> +
> +	while ((dma = vfio_find_dma(iommu, i, pgsize))) {
> +		/* mark all pages dirty if all pages are pinned and mapped. */
> +		if (dma->iommu_mapped) {
This prevents pass-through devices from calling vfio_pin_pages to do
fine grained log dirty.
> +			dma_addr_t iova_limit;
> +
> +			iova_limit = (dma->iova + dma->size) < (iova + size) ?
> +				     (dma->iova + dma->size) : (iova + size);
> +
> +			for (; i < iova_limit; i += pgsize) {
> +				unsigned int start;
> +
> +				start = (i - start_iova) >> pgshift;
> +
> +				__bitmap_set(bitmap, start, 1);
> +			}
> +			if (i >= iova + size)
> +				return;
> +		} else {
> +			struct rb_node *n = rb_first(&dma->pfn_list);
> +			bool found = false;
> +
> +			for (; n; n = rb_next(n)) {
> +				struct vfio_pfn *vpfn = rb_entry(n,
> +							struct vfio_pfn, node);
> +				if (vpfn->iova >= i) {
> +					found = true;
> +					break;
> +				}
> +			}
> +
> +			if (!found) {
> +				i += dma->size;
> +				continue;
> +			}
> +
> +			for (; n; n = rb_next(n)) {
> +				unsigned int start;
> +				struct vfio_pfn *vpfn = rb_entry(n,
> +							struct vfio_pfn, node);
> +
> +				if (vpfn->iova >= iova + size)
> +					return;
> +
> +				start = (vpfn->iova - start_iova) >> pgshift;
> +
> +				__bitmap_set(bitmap, start, 1);
> +
> +				i = vpfn->iova + pgsize;
> +			}
> +		}
> +		vfio_remove_unpinned_from_pfn_list(dma, false);
> +	}
> +}
> +
> +static long verify_bitmap_size(unsigned long npages, unsigned long bitmap_size)
> +{
> +	long bsize;
> +
> +	if (!bitmap_size || bitmap_size > SIZE_MAX)
> +		return -EINVAL;
> +
> +	bsize = ALIGN(npages, BITS_PER_LONG) / sizeof(unsigned long);
> +
> +	if (bitmap_size < bsize)
> +		return -EINVAL;
> +
> +	return bsize;
> +}
> +
>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  			     struct vfio_iommu_type1_dma_unmap *unmap)
>  {
> @@ -2298,6 +2417,83 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>  			-EFAULT : 0;
> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> +		struct vfio_iommu_type1_dirty_bitmap range;
> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> +		int ret;
> +
> +		if (!iommu->v2)
> +			return -EACCES;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> +				    bitmap);
> +
> +		if (copy_from_user(&range, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (range.argsz < minsz || range.flags & ~mask)
> +			return -EINVAL;
> +
> +		if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> +			iommu->dirty_page_tracking = true;
> +			return 0;
> +		} else if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> +			iommu->dirty_page_tracking = false;
> +
> +			mutex_lock(&iommu->lock);
> +			vfio_remove_unpinned_from_dma_list(iommu);
> +			mutex_unlock(&iommu->lock);
> +			return 0;
> +
> +		} else if (range.flags &
> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> +			uint64_t iommu_pgmask;
> +			unsigned long pgshift = __ffs(range.pgsize);
> +			unsigned long *bitmap;
> +			long bsize;
> +
> +			iommu_pgmask =
> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> +
> +			if (((range.pgsize - 1) & iommu_pgmask) !=
> +			    (range.pgsize - 1))
> +				return -EINVAL;
> +
> +			if (range.iova & iommu_pgmask)
> +				return -EINVAL;
> +			if (!range.size || range.size > SIZE_MAX)
> +				return -EINVAL;
> +			if (range.iova + range.size < range.iova)
> +				return -EINVAL;
> +
> +			bsize = verify_bitmap_size(range.size >> pgshift,
> +						   range.bitmap_size);
> +			if (bsize)
> +				return ret;
> +
> +			bitmap = kmalloc(bsize, GFP_KERNEL);
> +			if (!bitmap)
> +				return -ENOMEM;
> +
> +			ret = copy_from_user(bitmap,
> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
> +			if (ret)
> +				goto bitmap_exit;
> +
> +			iommu->dirty_page_tracking = false;
why iommu->dirty_page_tracking is false here?
suppose this ioctl can be called several times.

Thanks
Yan
> +			mutex_lock(&iommu->lock);
> +			vfio_iova_dirty_bitmap(iommu, range.iova, range.size,
> +					     range.pgsize, range.iova, bitmap);
> +			mutex_unlock(&iommu->lock);
> +
> +			ret = copy_to_user((void __user *)range.bitmap, bitmap,
> +					   range.bitmap_size) ? -EFAULT : 0;
> +bitmap_exit:
> +			kfree(bitmap);
> +			return ret;
> +		}
>  	}
>  
>  	return -ENOTTY;
> -- 
> 2.7.0
>
Kirti Wankhede Dec. 17, 2019, 9:24 a.m. UTC | #2
On 12/17/2019 10:45 AM, Yan Zhao wrote:
> On Tue, Dec 17, 2019 at 04:21:39AM +0800, Kirti Wankhede wrote:
>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
>> - Start unpinned pages dirty pages tracking while migration is active and
>>    device is running, i.e. during pre-copy phase.
>> - Stop unpinned pages dirty pages tracking. This is required to stop
>>    unpinned dirty pages tracking if migration failed or cancelled during
>>    pre-copy phase. Unpinned pages tracking is clear.
>> - Get dirty pages bitmap. Stop unpinned dirty pages tracking and clear
>>    unpinned pages information on bitmap read. This ioctl returns bitmap of
>>    dirty pages, its user space application responsibility to copy content
>>    of dirty pages from source to destination during migration.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> ---
>>   drivers/vfio/vfio_iommu_type1.c | 210 ++++++++++++++++++++++++++++++++++++++--
>>   1 file changed, 203 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 3f6b04f2334f..264449654d3f 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -70,6 +70,7 @@ struct vfio_iommu {
>>   	unsigned int		dma_avail;
>>   	bool			v2;
>>   	bool			nesting;
>> +	bool			dirty_page_tracking;
>>   };
>>   
>>   struct vfio_domain {
>> @@ -112,6 +113,7 @@ struct vfio_pfn {
>>   	dma_addr_t		iova;		/* Device address */
>>   	unsigned long		pfn;		/* Host pfn */
>>   	atomic_t		ref_count;
>> +	bool			unpinned;
>>   };
>>   
>>   struct vfio_regions {
>> @@ -244,6 +246,32 @@ static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
>>   	kfree(vpfn);
>>   }
>>   
>> +static void vfio_remove_unpinned_from_pfn_list(struct vfio_dma *dma, bool warn)
>> +{
>> +	struct rb_node *n = rb_first(&dma->pfn_list);
>> +
>> +	for (; n; n = rb_next(n)) {
>> +		struct vfio_pfn *vpfn = rb_entry(n, struct vfio_pfn, node);
>> +
>> +		if (warn)
>> +			WARN_ON_ONCE(vpfn->unpinned);
>> +
>> +		if (vpfn->unpinned)
>> +			vfio_remove_from_pfn_list(dma, vpfn);
>> +	}
>> +}
>> +
>> +static void vfio_remove_unpinned_from_dma_list(struct vfio_iommu *iommu)
>> +{
>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>> +
>> +	for (; n; n = rb_next(n)) {
>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>> +
>> +		vfio_remove_unpinned_from_pfn_list(dma, false);
>> +	}
>> +}
>> +
>>   static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
>>   					       unsigned long iova)
>>   {
>> @@ -254,13 +282,17 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
>>   	return vpfn;
>>   }
>>   
>> -static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
>> +static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn,
>> +				  bool dirty_tracking)
>>   {
>>   	int ret = 0;
>>   
>>   	if (atomic_dec_and_test(&vpfn->ref_count)) {
>>   		ret = put_pfn(vpfn->pfn, dma->prot);
> if physical page here is put, it may cause problem when pin this iova
> next time:
> vfio_iommu_type1_pin_pages {
>      ...
>      vpfn = vfio_iova_get_vfio_pfn(dma, iova);
>      if (vpfn) {
>          phys_pfn[i] = vpfn->pfn;
>          continue;
>      }
>      ...
> }
> 

Good point. Fixing it as:

                 vpfn = vfio_iova_get_vfio_pfn(dma, iova);
                 if (vpfn) {
-                       phys_pfn[i] = vpfn->pfn;
-                       continue;
+                       if (vpfn->unpinned)
+                               vfio_remove_from_pfn_list(dma, vpfn);
+                       else {
+                               phys_pfn[i] = vpfn->pfn;
+                               continue;
+                       }
                 }



>> -		vfio_remove_from_pfn_list(dma, vpfn);
>> +		if (dirty_tracking)
>> +			vpfn->unpinned = true;
>> +		else
>> +			vfio_remove_from_pfn_list(dma, vpfn);
> so the unpinned pages before dirty page tracking is not treated as
> dirty?
> 

Yes. That's we agreed on previous version:
https://www.mail-archive.com/qemu-devel@nongnu.org/msg663157.html

>>   	}
>>   	return ret;
>>   }
>> @@ -504,7 +536,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
>>   }
>>   
>>   static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
>> -				    bool do_accounting)
>> +				    bool do_accounting, bool dirty_tracking)
>>   {
>>   	int unlocked;
>>   	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
>> @@ -512,7 +544,10 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
>>   	if (!vpfn)
>>   		return 0;
>>   
>> -	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
>> +	if (vpfn->unpinned)
>> +		return 0;
>> +
>> +	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn, dirty_tracking);
>>   
>>   	if (do_accounting)
>>   		vfio_lock_acct(dma, -unlocked, true);
>> @@ -583,7 +618,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>   
>>   		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
>>   		if (ret) {
>> -			vfio_unpin_page_external(dma, iova, do_accounting);
>> +			vfio_unpin_page_external(dma, iova, do_accounting,
>> +						 false);
>>   			goto pin_unwind;
>>   		}
>>   	}
>> @@ -598,7 +634,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>   
>>   		iova = user_pfn[j] << PAGE_SHIFT;
>>   		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
>> -		vfio_unpin_page_external(dma, iova, do_accounting);
>> +		vfio_unpin_page_external(dma, iova, do_accounting, false);
>>   		phys_pfn[j] = 0;
>>   	}
>>   pin_done:
>> @@ -632,7 +668,8 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
>>   		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
>>   		if (!dma)
>>   			goto unpin_exit;
>> -		vfio_unpin_page_external(dma, iova, do_accounting);
>> +		vfio_unpin_page_external(dma, iova, do_accounting,
>> +					 iommu->dirty_page_tracking);
>>   	}
>>   
>>   unpin_exit:
>> @@ -850,6 +887,88 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>>   	return bitmap;
>>   }
>>   
>> +/*
>> + * start_iova is the reference from where bitmaping started. This is called
>> + * from DMA_UNMAP where start_iova can be different than iova
>> + */
>> +
>> +static void vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
>> +				  size_t size, uint64_t pgsize,
>> +				  dma_addr_t start_iova, unsigned long *bitmap)
>> +{
>> +	struct vfio_dma *dma;
>> +	dma_addr_t i = iova;
>> +	unsigned long pgshift = __ffs(pgsize);
>> +
>> +	while ((dma = vfio_find_dma(iommu, i, pgsize))) {
>> +		/* mark all pages dirty if all pages are pinned and mapped. */
>> +		if (dma->iommu_mapped) {
> This prevents pass-through devices from calling vfio_pin_pages to do
> fine grained log dirty.

Yes, I mentioned that in yet TODO item in cover letter:

"If IOMMU capable device is present in the container, then all pages are
marked dirty. Need to think smart way to know if IOMMU capable device's
driver is smart to report pages to be marked dirty by pinning those 
pages externally."


>> +			dma_addr_t iova_limit;
>> +
>> +			iova_limit = (dma->iova + dma->size) < (iova + size) ?
>> +				     (dma->iova + dma->size) : (iova + size);
>> +
>> +			for (; i < iova_limit; i += pgsize) {
>> +				unsigned int start;
>> +
>> +				start = (i - start_iova) >> pgshift;
>> +
>> +				__bitmap_set(bitmap, start, 1);
>> +			}
>> +			if (i >= iova + size)
>> +				return;
>> +		} else {
>> +			struct rb_node *n = rb_first(&dma->pfn_list);
>> +			bool found = false;
>> +
>> +			for (; n; n = rb_next(n)) {
>> +				struct vfio_pfn *vpfn = rb_entry(n,
>> +							struct vfio_pfn, node);
>> +				if (vpfn->iova >= i) {
>> +					found = true;
>> +					break;
>> +				}
>> +			}
>> +
>> +			if (!found) {
>> +				i += dma->size;
>> +				continue;
>> +			}
>> +
>> +			for (; n; n = rb_next(n)) {
>> +				unsigned int start;
>> +				struct vfio_pfn *vpfn = rb_entry(n,
>> +							struct vfio_pfn, node);
>> +
>> +				if (vpfn->iova >= iova + size)
>> +					return;
>> +
>> +				start = (vpfn->iova - start_iova) >> pgshift;
>> +
>> +				__bitmap_set(bitmap, start, 1);
>> +
>> +				i = vpfn->iova + pgsize;
>> +			}
>> +		}
>> +		vfio_remove_unpinned_from_pfn_list(dma, false);
>> +	}
>> +}
>> +
>> +static long verify_bitmap_size(unsigned long npages, unsigned long bitmap_size)
>> +{
>> +	long bsize;
>> +
>> +	if (!bitmap_size || bitmap_size > SIZE_MAX)
>> +		return -EINVAL;
>> +
>> +	bsize = ALIGN(npages, BITS_PER_LONG) / sizeof(unsigned long);
>> +
>> +	if (bitmap_size < bsize)
>> +		return -EINVAL;
>> +
>> +	return bsize;
>> +}
>> +
>>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>   			     struct vfio_iommu_type1_dma_unmap *unmap)
>>   {
>> @@ -2298,6 +2417,83 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>>   
>>   		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>>   			-EFAULT : 0;
>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
>> +		struct vfio_iommu_type1_dirty_bitmap range;
>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
>> +		int ret;
>> +
>> +		if (!iommu->v2)
>> +			return -EACCES;
>> +
>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
>> +				    bitmap);
>> +
>> +		if (copy_from_user(&range, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (range.argsz < minsz || range.flags & ~mask)
>> +			return -EINVAL;
>> +
>> +		if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
>> +			iommu->dirty_page_tracking = true;
>> +			return 0;
>> +		} else if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
>> +			iommu->dirty_page_tracking = false;
>> +
>> +			mutex_lock(&iommu->lock);
>> +			vfio_remove_unpinned_from_dma_list(iommu);
>> +			mutex_unlock(&iommu->lock);
>> +			return 0;
>> +
>> +		} else if (range.flags &
>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
>> +			uint64_t iommu_pgmask;
>> +			unsigned long pgshift = __ffs(range.pgsize);
>> +			unsigned long *bitmap;
>> +			long bsize;
>> +
>> +			iommu_pgmask =
>> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
>> +
>> +			if (((range.pgsize - 1) & iommu_pgmask) !=
>> +			    (range.pgsize - 1))
>> +				return -EINVAL;
>> +
>> +			if (range.iova & iommu_pgmask)
>> +				return -EINVAL;
>> +			if (!range.size || range.size > SIZE_MAX)
>> +				return -EINVAL;
>> +			if (range.iova + range.size < range.iova)
>> +				return -EINVAL;
>> +
>> +			bsize = verify_bitmap_size(range.size >> pgshift,
>> +						   range.bitmap_size);
>> +			if (bsize)
>> +				return ret;
>> +
>> +			bitmap = kmalloc(bsize, GFP_KERNEL);
>> +			if (!bitmap)
>> +				return -ENOMEM;
>> +
>> +			ret = copy_from_user(bitmap,
>> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
>> +			if (ret)
>> +				goto bitmap_exit;
>> +
>> +			iommu->dirty_page_tracking = false;
> why iommu->dirty_page_tracking is false here?
> suppose this ioctl can be called several times.
> 

This ioctl can be called several times, but once this ioctl is called 
that means vCPUs are stopped and VFIO devices are stopped (i.e. in 
stop-and-copy phase) and dirty pages bitmap are being queried by user.

Thanks,
Kirti


> Thanks
> Yan
>> +			mutex_lock(&iommu->lock);
>> +			vfio_iova_dirty_bitmap(iommu, range.iova, range.size,
>> +					     range.pgsize, range.iova, bitmap);
>> +			mutex_unlock(&iommu->lock);
>> +
>> +			ret = copy_to_user((void __user *)range.bitmap, bitmap,
>> +					   range.bitmap_size) ? -EFAULT : 0;
>> +bitmap_exit:
>> +			kfree(bitmap);
>> +			return ret;
>> +		}
>>   	}
>>   
>>   	return -ENOTTY;
>> -- 
>> 2.7.0
>>
Yan Zhao Dec. 17, 2019, 9:51 a.m. UTC | #3
On Tue, Dec 17, 2019 at 05:24:14PM +0800, Kirti Wankhede wrote:
> 
> 
> On 12/17/2019 10:45 AM, Yan Zhao wrote:
> > On Tue, Dec 17, 2019 at 04:21:39AM +0800, Kirti Wankhede wrote:
> >> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> >> - Start unpinned pages dirty pages tracking while migration is active and
> >>    device is running, i.e. during pre-copy phase.
> >> - Stop unpinned pages dirty pages tracking. This is required to stop
> >>    unpinned dirty pages tracking if migration failed or cancelled during
> >>    pre-copy phase. Unpinned pages tracking is clear.
> >> - Get dirty pages bitmap. Stop unpinned dirty pages tracking and clear
> >>    unpinned pages information on bitmap read. This ioctl returns bitmap of
> >>    dirty pages, its user space application responsibility to copy content
> >>    of dirty pages from source to destination during migration.
> >>
> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >> ---
> >>   drivers/vfio/vfio_iommu_type1.c | 210 ++++++++++++++++++++++++++++++++++++++--
> >>   1 file changed, 203 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 3f6b04f2334f..264449654d3f 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -70,6 +70,7 @@ struct vfio_iommu {
> >>   	unsigned int		dma_avail;
> >>   	bool			v2;
> >>   	bool			nesting;
> >> +	bool			dirty_page_tracking;
> >>   };
> >>   
> >>   struct vfio_domain {
> >> @@ -112,6 +113,7 @@ struct vfio_pfn {
> >>   	dma_addr_t		iova;		/* Device address */
> >>   	unsigned long		pfn;		/* Host pfn */
> >>   	atomic_t		ref_count;
> >> +	bool			unpinned;
> >>   };
> >>   
> >>   struct vfio_regions {
> >> @@ -244,6 +246,32 @@ static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
> >>   	kfree(vpfn);
> >>   }
> >>   
> >> +static void vfio_remove_unpinned_from_pfn_list(struct vfio_dma *dma, bool warn)
> >> +{
> >> +	struct rb_node *n = rb_first(&dma->pfn_list);
> >> +
> >> +	for (; n; n = rb_next(n)) {
> >> +		struct vfio_pfn *vpfn = rb_entry(n, struct vfio_pfn, node);
> >> +
> >> +		if (warn)
> >> +			WARN_ON_ONCE(vpfn->unpinned);
> >> +
> >> +		if (vpfn->unpinned)
> >> +			vfio_remove_from_pfn_list(dma, vpfn);
> >> +	}
> >> +}
> >> +
> >> +static void vfio_remove_unpinned_from_dma_list(struct vfio_iommu *iommu)
> >> +{
> >> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >> +
> >> +	for (; n; n = rb_next(n)) {
> >> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >> +
> >> +		vfio_remove_unpinned_from_pfn_list(dma, false);
> >> +	}
> >> +}
> >> +
> >>   static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
> >>   					       unsigned long iova)
> >>   {
> >> @@ -254,13 +282,17 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
> >>   	return vpfn;
> >>   }
> >>   
> >> -static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
> >> +static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn,
> >> +				  bool dirty_tracking)
> >>   {
> >>   	int ret = 0;
> >>   
> >>   	if (atomic_dec_and_test(&vpfn->ref_count)) {
> >>   		ret = put_pfn(vpfn->pfn, dma->prot);
> > if physical page here is put, it may cause problem when pin this iova
> > next time:
> > vfio_iommu_type1_pin_pages {
> >      ...
> >      vpfn = vfio_iova_get_vfio_pfn(dma, iova);
> >      if (vpfn) {
> >          phys_pfn[i] = vpfn->pfn;
> >          continue;
> >      }
> >      ...
> > }
> > 
> 
> Good point. Fixing it as:
> 
>                  vpfn = vfio_iova_get_vfio_pfn(dma, iova);
>                  if (vpfn) {
> -                       phys_pfn[i] = vpfn->pfn;
> -                       continue;
> +                       if (vpfn->unpinned)
> +                               vfio_remove_from_pfn_list(dma, vpfn);
what about updating vpfn instead?

> +                       else {
> +                               phys_pfn[i] = vpfn->pfn;
> +                               continue;
> +                       }
>                  }
> 
> 
> 
> >> -		vfio_remove_from_pfn_list(dma, vpfn);
> >> +		if (dirty_tracking)
> >> +			vpfn->unpinned = true;
> >> +		else
> >> +			vfio_remove_from_pfn_list(dma, vpfn);
> > so the unpinned pages before dirty page tracking is not treated as
> > dirty?
> > 
> 
> Yes. That's we agreed on previous version:
> https://www.mail-archive.com/qemu-devel@nongnu.org/msg663157.html
> 
> >>   	}
> >>   	return ret;
> >>   }
> >> @@ -504,7 +536,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> >>   }
> >>   
> >>   static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> >> -				    bool do_accounting)
> >> +				    bool do_accounting, bool dirty_tracking)
> >>   {
> >>   	int unlocked;
> >>   	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
> >> @@ -512,7 +544,10 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> >>   	if (!vpfn)
> >>   		return 0;
> >>   
> >> -	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
> >> +	if (vpfn->unpinned)
> >> +		return 0;
> >> +
> >> +	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn, dirty_tracking);
> >>   
> >>   	if (do_accounting)
> >>   		vfio_lock_acct(dma, -unlocked, true);
> >> @@ -583,7 +618,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>   
> >>   		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> >>   		if (ret) {
> >> -			vfio_unpin_page_external(dma, iova, do_accounting);
> >> +			vfio_unpin_page_external(dma, iova, do_accounting,
> >> +						 false);
> >>   			goto pin_unwind;
> >>   		}
> >>   	}
> >> @@ -598,7 +634,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>   
> >>   		iova = user_pfn[j] << PAGE_SHIFT;
> >>   		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
> >> -		vfio_unpin_page_external(dma, iova, do_accounting);
> >> +		vfio_unpin_page_external(dma, iova, do_accounting, false);
> >>   		phys_pfn[j] = 0;
> >>   	}
> >>   pin_done:
> >> @@ -632,7 +668,8 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
> >>   		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
> >>   		if (!dma)
> >>   			goto unpin_exit;
> >> -		vfio_unpin_page_external(dma, iova, do_accounting);
> >> +		vfio_unpin_page_external(dma, iova, do_accounting,
> >> +					 iommu->dirty_page_tracking);
> >>   	}
> >>   
> >>   unpin_exit:
> >> @@ -850,6 +887,88 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> >>   	return bitmap;
> >>   }
> >>   
> >> +/*
> >> + * start_iova is the reference from where bitmaping started. This is called
> >> + * from DMA_UNMAP where start_iova can be different than iova
> >> + */
> >> +
> >> +static void vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> >> +				  size_t size, uint64_t pgsize,
> >> +				  dma_addr_t start_iova, unsigned long *bitmap)
> >> +{
> >> +	struct vfio_dma *dma;
> >> +	dma_addr_t i = iova;
> >> +	unsigned long pgshift = __ffs(pgsize);
> >> +
> >> +	while ((dma = vfio_find_dma(iommu, i, pgsize))) {
> >> +		/* mark all pages dirty if all pages are pinned and mapped. */
> >> +		if (dma->iommu_mapped) {
> > This prevents pass-through devices from calling vfio_pin_pages to do
> > fine grained log dirty.
> 
> Yes, I mentioned that in yet TODO item in cover letter:
> 
> "If IOMMU capable device is present in the container, then all pages are
> marked dirty. Need to think smart way to know if IOMMU capable device's
> driver is smart to report pages to be marked dirty by pinning those 
> pages externally."
>
why not just check first if any vpfn present for IOMMU capable devices?

> 
> >> +			dma_addr_t iova_limit;
> >> +
> >> +			iova_limit = (dma->iova + dma->size) < (iova + size) ?
> >> +				     (dma->iova + dma->size) : (iova + size);
> >> +
> >> +			for (; i < iova_limit; i += pgsize) {
> >> +				unsigned int start;
> >> +
> >> +				start = (i - start_iova) >> pgshift;
> >> +
> >> +				__bitmap_set(bitmap, start, 1);
> >> +			}
> >> +			if (i >= iova + size)
> >> +				return;
> >> +		} else {
> >> +			struct rb_node *n = rb_first(&dma->pfn_list);
> >> +			bool found = false;
> >> +
> >> +			for (; n; n = rb_next(n)) {
> >> +				struct vfio_pfn *vpfn = rb_entry(n,
> >> +							struct vfio_pfn, node);
> >> +				if (vpfn->iova >= i) {
> >> +					found = true;
> >> +					break;
> >> +				}
> >> +			}
> >> +
> >> +			if (!found) {
> >> +				i += dma->size;
> >> +				continue;
> >> +			}
> >> +
> >> +			for (; n; n = rb_next(n)) {
> >> +				unsigned int start;
> >> +				struct vfio_pfn *vpfn = rb_entry(n,
> >> +							struct vfio_pfn, node);
> >> +
> >> +				if (vpfn->iova >= iova + size)
> >> +					return;
> >> +
> >> +				start = (vpfn->iova - start_iova) >> pgshift;
> >> +
> >> +				__bitmap_set(bitmap, start, 1);
> >> +
> >> +				i = vpfn->iova + pgsize;
> >> +			}
> >> +		}
> >> +		vfio_remove_unpinned_from_pfn_list(dma, false);
> >> +	}
> >> +}
> >> +
> >> +static long verify_bitmap_size(unsigned long npages, unsigned long bitmap_size)
> >> +{
> >> +	long bsize;
> >> +
> >> +	if (!bitmap_size || bitmap_size > SIZE_MAX)
> >> +		return -EINVAL;
> >> +
> >> +	bsize = ALIGN(npages, BITS_PER_LONG) / sizeof(unsigned long);
> >> +
> >> +	if (bitmap_size < bsize)
> >> +		return -EINVAL;
> >> +
> >> +	return bsize;
> >> +}
> >> +
> >>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>   			     struct vfio_iommu_type1_dma_unmap *unmap)
> >>   {
> >> @@ -2298,6 +2417,83 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >>   
> >>   		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> >>   			-EFAULT : 0;
> >> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> >> +		struct vfio_iommu_type1_dirty_bitmap range;
> >> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> >> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> >> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> >> +		int ret;
> >> +
> >> +		if (!iommu->v2)
> >> +			return -EACCES;
> >> +
> >> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> >> +				    bitmap);
> >> +
> >> +		if (copy_from_user(&range, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +
> >> +		if (range.argsz < minsz || range.flags & ~mask)
> >> +			return -EINVAL;
> >> +
> >> +		if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> >> +			iommu->dirty_page_tracking = true;
> >> +			return 0;
> >> +		} else if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> >> +			iommu->dirty_page_tracking = false;
> >> +
> >> +			mutex_lock(&iommu->lock);
> >> +			vfio_remove_unpinned_from_dma_list(iommu);
> >> +			mutex_unlock(&iommu->lock);
> >> +			return 0;
> >> +
> >> +		} else if (range.flags &
> >> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> >> +			uint64_t iommu_pgmask;
> >> +			unsigned long pgshift = __ffs(range.pgsize);
> >> +			unsigned long *bitmap;
> >> +			long bsize;
> >> +
> >> +			iommu_pgmask =
> >> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> >> +
> >> +			if (((range.pgsize - 1) & iommu_pgmask) !=
> >> +			    (range.pgsize - 1))
> >> +				return -EINVAL;
> >> +
> >> +			if (range.iova & iommu_pgmask)
> >> +				return -EINVAL;
> >> +			if (!range.size || range.size > SIZE_MAX)
> >> +				return -EINVAL;
> >> +			if (range.iova + range.size < range.iova)
> >> +				return -EINVAL;
> >> +
> >> +			bsize = verify_bitmap_size(range.size >> pgshift,
> >> +						   range.bitmap_size);
> >> +			if (bsize)
> >> +				return ret;
> >> +
> >> +			bitmap = kmalloc(bsize, GFP_KERNEL);
> >> +			if (!bitmap)
> >> +				return -ENOMEM;
> >> +
> >> +			ret = copy_from_user(bitmap,
> >> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
> >> +			if (ret)
> >> +				goto bitmap_exit;
> >> +
> >> +			iommu->dirty_page_tracking = false;
> > why iommu->dirty_page_tracking is false here?
> > suppose this ioctl can be called several times.
> > 
> 
> This ioctl can be called several times, but once this ioctl is called 
> that means vCPUs are stopped and VFIO devices are stopped (i.e. in 
> stop-and-copy phase) and dirty pages bitmap are being queried by user.
> 
can't agree that VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP can only be
called in stop-and-copy phase.
As stated in last version, this will cause QEMU to get a wrong expectation
of VM downtime and this is also the reason for previously pinned pages
before log_sync cannot be treated as dirty. If this get bitmap ioctl can
be called early in save_setup phase, then it's no problem even all ram
is dirty.

Thanks
Yan
> 
> 
> > Thanks
> > Yan
> >> +			mutex_lock(&iommu->lock);
> >> +			vfio_iova_dirty_bitmap(iommu, range.iova, range.size,
> >> +					     range.pgsize, range.iova, bitmap);
> >> +			mutex_unlock(&iommu->lock);
> >> +
> >> +			ret = copy_to_user((void __user *)range.bitmap, bitmap,
> >> +					   range.bitmap_size) ? -EFAULT : 0;
> >> +bitmap_exit:
> >> +			kfree(bitmap);
> >> +			return ret;
> >> +		}
> >>   	}
> >>   
> >>   	return -ENOTTY;
> >> -- 
> >> 2.7.0
> >>
Kirti Wankhede Dec. 17, 2019, 11:47 a.m. UTC | #4
On 12/17/2019 3:21 PM, Yan Zhao wrote:
> On Tue, Dec 17, 2019 at 05:24:14PM +0800, Kirti Wankhede wrote:
>>
>>
>> On 12/17/2019 10:45 AM, Yan Zhao wrote:
>>> On Tue, Dec 17, 2019 at 04:21:39AM +0800, Kirti Wankhede wrote:
>>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
>>>> - Start unpinned pages dirty pages tracking while migration is active and
>>>>     device is running, i.e. during pre-copy phase.
>>>> - Stop unpinned pages dirty pages tracking. This is required to stop
>>>>     unpinned dirty pages tracking if migration failed or cancelled during
>>>>     pre-copy phase. Unpinned pages tracking is clear.
>>>> - Get dirty pages bitmap. Stop unpinned dirty pages tracking and clear
>>>>     unpinned pages information on bitmap read. This ioctl returns bitmap of
>>>>     dirty pages, its user space application responsibility to copy content
>>>>     of dirty pages from source to destination during migration.
>>>>
>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>>>> ---
>>>>    drivers/vfio/vfio_iommu_type1.c | 210 ++++++++++++++++++++++++++++++++++++++--
>>>>    1 file changed, 203 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>>> index 3f6b04f2334f..264449654d3f 100644
>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>> @@ -70,6 +70,7 @@ struct vfio_iommu {
>>>>    	unsigned int		dma_avail;
>>>>    	bool			v2;
>>>>    	bool			nesting;
>>>> +	bool			dirty_page_tracking;
>>>>    };
>>>>    
>>>>    struct vfio_domain {
>>>> @@ -112,6 +113,7 @@ struct vfio_pfn {
>>>>    	dma_addr_t		iova;		/* Device address */
>>>>    	unsigned long		pfn;		/* Host pfn */
>>>>    	atomic_t		ref_count;
>>>> +	bool			unpinned;
>>>>    };
>>>>    
>>>>    struct vfio_regions {
>>>> @@ -244,6 +246,32 @@ static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
>>>>    	kfree(vpfn);
>>>>    }
>>>>    
>>>> +static void vfio_remove_unpinned_from_pfn_list(struct vfio_dma *dma, bool warn)
>>>> +{
>>>> +	struct rb_node *n = rb_first(&dma->pfn_list);
>>>> +
>>>> +	for (; n; n = rb_next(n)) {
>>>> +		struct vfio_pfn *vpfn = rb_entry(n, struct vfio_pfn, node);
>>>> +
>>>> +		if (warn)
>>>> +			WARN_ON_ONCE(vpfn->unpinned);
>>>> +
>>>> +		if (vpfn->unpinned)
>>>> +			vfio_remove_from_pfn_list(dma, vpfn);
>>>> +	}
>>>> +}
>>>> +
>>>> +static void vfio_remove_unpinned_from_dma_list(struct vfio_iommu *iommu)
>>>> +{
>>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>>>> +
>>>> +	for (; n; n = rb_next(n)) {
>>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
>>>> +
>>>> +		vfio_remove_unpinned_from_pfn_list(dma, false);
>>>> +	}
>>>> +}
>>>> +
>>>>    static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
>>>>    					       unsigned long iova)
>>>>    {
>>>> @@ -254,13 +282,17 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
>>>>    	return vpfn;
>>>>    }
>>>>    
>>>> -static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
>>>> +static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn,
>>>> +				  bool dirty_tracking)
>>>>    {
>>>>    	int ret = 0;
>>>>    
>>>>    	if (atomic_dec_and_test(&vpfn->ref_count)) {
>>>>    		ret = put_pfn(vpfn->pfn, dma->prot);
>>> if physical page here is put, it may cause problem when pin this iova
>>> next time:
>>> vfio_iommu_type1_pin_pages {
>>>       ...
>>>       vpfn = vfio_iova_get_vfio_pfn(dma, iova);
>>>       if (vpfn) {
>>>           phys_pfn[i] = vpfn->pfn;
>>>           continue;
>>>       }
>>>       ...
>>> }
>>>
>>
>> Good point. Fixing it as:
>>
>>                   vpfn = vfio_iova_get_vfio_pfn(dma, iova);
>>                   if (vpfn) {
>> -                       phys_pfn[i] = vpfn->pfn;
>> -                       continue;
>> +                       if (vpfn->unpinned)
>> +                               vfio_remove_from_pfn_list(dma, vpfn);
> what about updating vpfn instead?
> 

vfio_pin_page_external() takes care of verification checks and mem lock 
accounting. I prefer to free existing and add new node with existing 
functions.

>> +                       else {
>> +                               phys_pfn[i] = vpfn->pfn;
>> +                               continue;
>> +                       }
>>                   }
>>
>>
>>
>>>> -		vfio_remove_from_pfn_list(dma, vpfn);
>>>> +		if (dirty_tracking)
>>>> +			vpfn->unpinned = true;
>>>> +		else
>>>> +			vfio_remove_from_pfn_list(dma, vpfn);
>>> so the unpinned pages before dirty page tracking is not treated as
>>> dirty?
>>>
>>
>> Yes. That's we agreed on previous version:
>> https://www.mail-archive.com/qemu-devel@nongnu.org/msg663157.html
>>
>>>>    	}
>>>>    	return ret;
>>>>    }
>>>> @@ -504,7 +536,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
>>>>    }
>>>>    
>>>>    static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
>>>> -				    bool do_accounting)
>>>> +				    bool do_accounting, bool dirty_tracking)
>>>>    {
>>>>    	int unlocked;
>>>>    	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
>>>> @@ -512,7 +544,10 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
>>>>    	if (!vpfn)
>>>>    		return 0;
>>>>    
>>>> -	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
>>>> +	if (vpfn->unpinned)
>>>> +		return 0;
>>>> +
>>>> +	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn, dirty_tracking);
>>>>    
>>>>    	if (do_accounting)
>>>>    		vfio_lock_acct(dma, -unlocked, true);
>>>> @@ -583,7 +618,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>>>    
>>>>    		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
>>>>    		if (ret) {
>>>> -			vfio_unpin_page_external(dma, iova, do_accounting);
>>>> +			vfio_unpin_page_external(dma, iova, do_accounting,
>>>> +						 false);
>>>>    			goto pin_unwind;
>>>>    		}
>>>>    	}
>>>> @@ -598,7 +634,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>>>    
>>>>    		iova = user_pfn[j] << PAGE_SHIFT;
>>>>    		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
>>>> -		vfio_unpin_page_external(dma, iova, do_accounting);
>>>> +		vfio_unpin_page_external(dma, iova, do_accounting, false);
>>>>    		phys_pfn[j] = 0;
>>>>    	}
>>>>    pin_done:
>>>> @@ -632,7 +668,8 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
>>>>    		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
>>>>    		if (!dma)
>>>>    			goto unpin_exit;
>>>> -		vfio_unpin_page_external(dma, iova, do_accounting);
>>>> +		vfio_unpin_page_external(dma, iova, do_accounting,
>>>> +					 iommu->dirty_page_tracking);
>>>>    	}
>>>>    
>>>>    unpin_exit:
>>>> @@ -850,6 +887,88 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>>>>    	return bitmap;
>>>>    }
>>>>    
>>>> +/*
>>>> + * start_iova is the reference from where bitmaping started. This is called
>>>> + * from DMA_UNMAP where start_iova can be different than iova
>>>> + */
>>>> +
>>>> +static void vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
>>>> +				  size_t size, uint64_t pgsize,
>>>> +				  dma_addr_t start_iova, unsigned long *bitmap)
>>>> +{
>>>> +	struct vfio_dma *dma;
>>>> +	dma_addr_t i = iova;
>>>> +	unsigned long pgshift = __ffs(pgsize);
>>>> +
>>>> +	while ((dma = vfio_find_dma(iommu, i, pgsize))) {
>>>> +		/* mark all pages dirty if all pages are pinned and mapped. */
>>>> +		if (dma->iommu_mapped) {
>>> This prevents pass-through devices from calling vfio_pin_pages to do
>>> fine grained log dirty.
>>
>> Yes, I mentioned that in yet TODO item in cover letter:
>>
>> "If IOMMU capable device is present in the container, then all pages are
>> marked dirty. Need to think smart way to know if IOMMU capable device's
>> driver is smart to report pages to be marked dirty by pinning those
>> pages externally."
>>
> why not just check first if any vpfn present for IOMMU capable devices?
> 

vfio_pin_pages(dev, ...) calls driver->ops->pin_pages(iommu, ...)

In vfio_iommu_type1 module, vfio_iommu_type1_pin_pages() doesn't know 
the device. vpfn are tracked against container->iommu, not against 
device. Need to think of smart way to know if devices in container are 
all smart which report pages dirty ny pinning those pages manually.


>>
>>>> +			dma_addr_t iova_limit;
>>>> +
>>>> +			iova_limit = (dma->iova + dma->size) < (iova + size) ?
>>>> +				     (dma->iova + dma->size) : (iova + size);
>>>> +
>>>> +			for (; i < iova_limit; i += pgsize) {
>>>> +				unsigned int start;
>>>> +
>>>> +				start = (i - start_iova) >> pgshift;
>>>> +
>>>> +				__bitmap_set(bitmap, start, 1);
>>>> +			}
>>>> +			if (i >= iova + size)
>>>> +				return;
>>>> +		} else {
>>>> +			struct rb_node *n = rb_first(&dma->pfn_list);
>>>> +			bool found = false;
>>>> +
>>>> +			for (; n; n = rb_next(n)) {
>>>> +				struct vfio_pfn *vpfn = rb_entry(n,
>>>> +							struct vfio_pfn, node);
>>>> +				if (vpfn->iova >= i) {
>>>> +					found = true;
>>>> +					break;
>>>> +				}
>>>> +			}
>>>> +
>>>> +			if (!found) {
>>>> +				i += dma->size;
>>>> +				continue;
>>>> +			}
>>>> +
>>>> +			for (; n; n = rb_next(n)) {
>>>> +				unsigned int start;
>>>> +				struct vfio_pfn *vpfn = rb_entry(n,
>>>> +							struct vfio_pfn, node);
>>>> +
>>>> +				if (vpfn->iova >= iova + size)
>>>> +					return;
>>>> +
>>>> +				start = (vpfn->iova - start_iova) >> pgshift;
>>>> +
>>>> +				__bitmap_set(bitmap, start, 1);
>>>> +
>>>> +				i = vpfn->iova + pgsize;
>>>> +			}
>>>> +		}
>>>> +		vfio_remove_unpinned_from_pfn_list(dma, false);
>>>> +	}
>>>> +}
>>>> +
>>>> +static long verify_bitmap_size(unsigned long npages, unsigned long bitmap_size)
>>>> +{
>>>> +	long bsize;
>>>> +
>>>> +	if (!bitmap_size || bitmap_size > SIZE_MAX)
>>>> +		return -EINVAL;
>>>> +
>>>> +	bsize = ALIGN(npages, BITS_PER_LONG) / sizeof(unsigned long);
>>>> +
>>>> +	if (bitmap_size < bsize)
>>>> +		return -EINVAL;
>>>> +
>>>> +	return bsize;
>>>> +}
>>>> +
>>>>    static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>>>    			     struct vfio_iommu_type1_dma_unmap *unmap)
>>>>    {
>>>> @@ -2298,6 +2417,83 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>>>>    
>>>>    		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>>>>    			-EFAULT : 0;
>>>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
>>>> +		struct vfio_iommu_type1_dirty_bitmap range;
>>>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
>>>> +		int ret;
>>>> +
>>>> +		if (!iommu->v2)
>>>> +			return -EACCES;
>>>> +
>>>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
>>>> +				    bitmap);
>>>> +
>>>> +		if (copy_from_user(&range, (void __user *)arg, minsz))
>>>> +			return -EFAULT;
>>>> +
>>>> +		if (range.argsz < minsz || range.flags & ~mask)
>>>> +			return -EINVAL;
>>>> +
>>>> +		if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
>>>> +			iommu->dirty_page_tracking = true;
>>>> +			return 0;
>>>> +		} else if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
>>>> +			iommu->dirty_page_tracking = false;
>>>> +
>>>> +			mutex_lock(&iommu->lock);
>>>> +			vfio_remove_unpinned_from_dma_list(iommu);
>>>> +			mutex_unlock(&iommu->lock);
>>>> +			return 0;
>>>> +
>>>> +		} else if (range.flags &
>>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
>>>> +			uint64_t iommu_pgmask;
>>>> +			unsigned long pgshift = __ffs(range.pgsize);
>>>> +			unsigned long *bitmap;
>>>> +			long bsize;
>>>> +
>>>> +			iommu_pgmask =
>>>> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
>>>> +
>>>> +			if (((range.pgsize - 1) & iommu_pgmask) !=
>>>> +			    (range.pgsize - 1))
>>>> +				return -EINVAL;
>>>> +
>>>> +			if (range.iova & iommu_pgmask)
>>>> +				return -EINVAL;
>>>> +			if (!range.size || range.size > SIZE_MAX)
>>>> +				return -EINVAL;
>>>> +			if (range.iova + range.size < range.iova)
>>>> +				return -EINVAL;
>>>> +
>>>> +			bsize = verify_bitmap_size(range.size >> pgshift,
>>>> +						   range.bitmap_size);
>>>> +			if (bsize)
>>>> +				return ret;
>>>> +
>>>> +			bitmap = kmalloc(bsize, GFP_KERNEL);
>>>> +			if (!bitmap)
>>>> +				return -ENOMEM;
>>>> +
>>>> +			ret = copy_from_user(bitmap,
>>>> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
>>>> +			if (ret)
>>>> +				goto bitmap_exit;
>>>> +
>>>> +			iommu->dirty_page_tracking = false;
>>> why iommu->dirty_page_tracking is false here?
>>> suppose this ioctl can be called several times.
>>>
>>
>> This ioctl can be called several times, but once this ioctl is called
>> that means vCPUs are stopped and VFIO devices are stopped (i.e. in
>> stop-and-copy phase) and dirty pages bitmap are being queried by user.
>>
> can't agree that VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP can only be
> called in stop-and-copy phase.
> As stated in last version, this will cause QEMU to get a wrong expectation
> of VM downtime and this is also the reason for previously pinned pages
> before log_sync cannot be treated as dirty. If this get bitmap ioctl can
> be called early in save_setup phase, then it's no problem even all ram
> is dirty.
> 

Device can also write to pages which are pinned, and then there is no 
way to know pages dirtied by device during pre-copy phase.
If user ask dirty bitmap in per-copy phase, even then user will have to 
query dirty bitmap in stop-and-copy phase where this will be superset 
including all pages reported during pre-copy. Then instead of copying 
all pages twice, its better to do it once during stop-and-copy phase.

Thanks,
Kirti


> Thanks
> Yan
>>
>>
>>> Thanks
>>> Yan
>>>> +			mutex_lock(&iommu->lock);
>>>> +			vfio_iova_dirty_bitmap(iommu, range.iova, range.size,
>>>> +					     range.pgsize, range.iova, bitmap);
>>>> +			mutex_unlock(&iommu->lock);
>>>> +
>>>> +			ret = copy_to_user((void __user *)range.bitmap, bitmap,
>>>> +					   range.bitmap_size) ? -EFAULT : 0;
>>>> +bitmap_exit:
>>>> +			kfree(bitmap);
>>>> +			return ret;
>>>> +		}
>>>>    	}
>>>>    
>>>>    	return -ENOTTY;
>>>> -- 
>>>> 2.7.0
>>>>
Yan Zhao Dec. 18, 2019, 1:04 a.m. UTC | #5
On Tue, Dec 17, 2019 at 07:47:05PM +0800, Kirti Wankhede wrote:
> 
> 
> On 12/17/2019 3:21 PM, Yan Zhao wrote:
> > On Tue, Dec 17, 2019 at 05:24:14PM +0800, Kirti Wankhede wrote:
> >>
> >>
> >> On 12/17/2019 10:45 AM, Yan Zhao wrote:
> >>> On Tue, Dec 17, 2019 at 04:21:39AM +0800, Kirti Wankhede wrote:
> >>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> >>>> - Start unpinned pages dirty pages tracking while migration is active and
> >>>>     device is running, i.e. during pre-copy phase.
> >>>> - Stop unpinned pages dirty pages tracking. This is required to stop
> >>>>     unpinned dirty pages tracking if migration failed or cancelled during
> >>>>     pre-copy phase. Unpinned pages tracking is clear.
> >>>> - Get dirty pages bitmap. Stop unpinned dirty pages tracking and clear
> >>>>     unpinned pages information on bitmap read. This ioctl returns bitmap of
> >>>>     dirty pages, its user space application responsibility to copy content
> >>>>     of dirty pages from source to destination during migration.
> >>>>
> >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >>>> ---
> >>>>    drivers/vfio/vfio_iommu_type1.c | 210 ++++++++++++++++++++++++++++++++++++++--
> >>>>    1 file changed, 203 insertions(+), 7 deletions(-)
> >>>>
> >>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >>>> index 3f6b04f2334f..264449654d3f 100644
> >>>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>>> @@ -70,6 +70,7 @@ struct vfio_iommu {
> >>>>    	unsigned int		dma_avail;
> >>>>    	bool			v2;
> >>>>    	bool			nesting;
> >>>> +	bool			dirty_page_tracking;
> >>>>    };
> >>>>    
> >>>>    struct vfio_domain {
> >>>> @@ -112,6 +113,7 @@ struct vfio_pfn {
> >>>>    	dma_addr_t		iova;		/* Device address */
> >>>>    	unsigned long		pfn;		/* Host pfn */
> >>>>    	atomic_t		ref_count;
> >>>> +	bool			unpinned;
> >>>>    };
> >>>>    
> >>>>    struct vfio_regions {
> >>>> @@ -244,6 +246,32 @@ static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
> >>>>    	kfree(vpfn);
> >>>>    }
> >>>>    
> >>>> +static void vfio_remove_unpinned_from_pfn_list(struct vfio_dma *dma, bool warn)
> >>>> +{
> >>>> +	struct rb_node *n = rb_first(&dma->pfn_list);
> >>>> +
> >>>> +	for (; n; n = rb_next(n)) {
> >>>> +		struct vfio_pfn *vpfn = rb_entry(n, struct vfio_pfn, node);
> >>>> +
> >>>> +		if (warn)
> >>>> +			WARN_ON_ONCE(vpfn->unpinned);
> >>>> +
> >>>> +		if (vpfn->unpinned)
> >>>> +			vfio_remove_from_pfn_list(dma, vpfn);
> >>>> +	}
> >>>> +}
> >>>> +
> >>>> +static void vfio_remove_unpinned_from_dma_list(struct vfio_iommu *iommu)
> >>>> +{
> >>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >>>> +
> >>>> +	for (; n; n = rb_next(n)) {
> >>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >>>> +
> >>>> +		vfio_remove_unpinned_from_pfn_list(dma, false);
> >>>> +	}
> >>>> +}
> >>>> +
> >>>>    static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
> >>>>    					       unsigned long iova)
> >>>>    {
> >>>> @@ -254,13 +282,17 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
> >>>>    	return vpfn;
> >>>>    }
> >>>>    
> >>>> -static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
> >>>> +static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn,
> >>>> +				  bool dirty_tracking)
> >>>>    {
> >>>>    	int ret = 0;
> >>>>    
> >>>>    	if (atomic_dec_and_test(&vpfn->ref_count)) {
> >>>>    		ret = put_pfn(vpfn->pfn, dma->prot);
> >>> if physical page here is put, it may cause problem when pin this iova
> >>> next time:
> >>> vfio_iommu_type1_pin_pages {
> >>>       ...
> >>>       vpfn = vfio_iova_get_vfio_pfn(dma, iova);
> >>>       if (vpfn) {
> >>>           phys_pfn[i] = vpfn->pfn;
> >>>           continue;
> >>>       }
> >>>       ...
> >>> }
> >>>
> >>
> >> Good point. Fixing it as:
> >>
> >>                   vpfn = vfio_iova_get_vfio_pfn(dma, iova);
> >>                   if (vpfn) {
> >> -                       phys_pfn[i] = vpfn->pfn;
> >> -                       continue;
> >> +                       if (vpfn->unpinned)
> >> +                               vfio_remove_from_pfn_list(dma, vpfn);
> > what about updating vpfn instead?
> > 
> 
> vfio_pin_page_external() takes care of verification checks and mem lock 
> accounting. I prefer to free existing and add new node with existing 
> functions.
> 
> >> +                       else {
> >> +                               phys_pfn[i] = vpfn->pfn;
> >> +                               continue;
> >> +                       }
> >>                   }
> >>
> >>
> >>
> >>>> -		vfio_remove_from_pfn_list(dma, vpfn);
> >>>> +		if (dirty_tracking)
> >>>> +			vpfn->unpinned = true;
> >>>> +		else
> >>>> +			vfio_remove_from_pfn_list(dma, vpfn);
> >>> so the unpinned pages before dirty page tracking is not treated as
> >>> dirty?
> >>>
> >>
> >> Yes. That's we agreed on previous version:
> >> https://www.mail-archive.com/qemu-devel@nongnu.org/msg663157.html
> >>
> >>>>    	}
> >>>>    	return ret;
> >>>>    }
> >>>> @@ -504,7 +536,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> >>>>    }
> >>>>    
> >>>>    static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> >>>> -				    bool do_accounting)
> >>>> +				    bool do_accounting, bool dirty_tracking)
> >>>>    {
> >>>>    	int unlocked;
> >>>>    	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
> >>>> @@ -512,7 +544,10 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> >>>>    	if (!vpfn)
> >>>>    		return 0;
> >>>>    
> >>>> -	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
> >>>> +	if (vpfn->unpinned)
> >>>> +		return 0;
> >>>> +
> >>>> +	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn, dirty_tracking);
> >>>>    
> >>>>    	if (do_accounting)
> >>>>    		vfio_lock_acct(dma, -unlocked, true);
> >>>> @@ -583,7 +618,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>>>    
> >>>>    		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> >>>>    		if (ret) {
> >>>> -			vfio_unpin_page_external(dma, iova, do_accounting);
> >>>> +			vfio_unpin_page_external(dma, iova, do_accounting,
> >>>> +						 false);
> >>>>    			goto pin_unwind;
> >>>>    		}
> >>>>    	}
> >>>> @@ -598,7 +634,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>>>    
> >>>>    		iova = user_pfn[j] << PAGE_SHIFT;
> >>>>    		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
> >>>> -		vfio_unpin_page_external(dma, iova, do_accounting);
> >>>> +		vfio_unpin_page_external(dma, iova, do_accounting, false);
> >>>>    		phys_pfn[j] = 0;
> >>>>    	}
> >>>>    pin_done:
> >>>> @@ -632,7 +668,8 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
> >>>>    		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
> >>>>    		if (!dma)
> >>>>    			goto unpin_exit;
> >>>> -		vfio_unpin_page_external(dma, iova, do_accounting);
> >>>> +		vfio_unpin_page_external(dma, iova, do_accounting,
> >>>> +					 iommu->dirty_page_tracking);
> >>>>    	}
> >>>>    
> >>>>    unpin_exit:
> >>>> @@ -850,6 +887,88 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> >>>>    	return bitmap;
> >>>>    }
> >>>>    
> >>>> +/*
> >>>> + * start_iova is the reference from where bitmaping started. This is called
> >>>> + * from DMA_UNMAP where start_iova can be different than iova
> >>>> + */
> >>>> +
> >>>> +static void vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> >>>> +				  size_t size, uint64_t pgsize,
> >>>> +				  dma_addr_t start_iova, unsigned long *bitmap)
> >>>> +{
> >>>> +	struct vfio_dma *dma;
> >>>> +	dma_addr_t i = iova;
> >>>> +	unsigned long pgshift = __ffs(pgsize);
> >>>> +
> >>>> +	while ((dma = vfio_find_dma(iommu, i, pgsize))) {
> >>>> +		/* mark all pages dirty if all pages are pinned and mapped. */
> >>>> +		if (dma->iommu_mapped) {
> >>> This prevents pass-through devices from calling vfio_pin_pages to do
> >>> fine grained log dirty.
> >>
> >> Yes, I mentioned that in yet TODO item in cover letter:
> >>
> >> "If IOMMU capable device is present in the container, then all pages are
> >> marked dirty. Need to think smart way to know if IOMMU capable device's
> >> driver is smart to report pages to be marked dirty by pinning those
> >> pages externally."
> >>
> > why not just check first if any vpfn present for IOMMU capable devices?
> > 
> 
> vfio_pin_pages(dev, ...) calls driver->ops->pin_pages(iommu, ...)
> 
> In vfio_iommu_type1 module, vfio_iommu_type1_pin_pages() doesn't know 
> the device. vpfn are tracked against container->iommu, not against 
> device. Need to think of smart way to know if devices in container are 
> all smart which report pages dirty ny pinning those pages manually.
>
I believe in such case, the mdev on top of device is in the same iommu
group (i.e. 1:1 mdev on top of device).
device vendor driver calls vfio_pin_pages to notify vfio which pages are dirty. 
> 
> >>
> >>>> +			dma_addr_t iova_limit;
> >>>> +
> >>>> +			iova_limit = (dma->iova + dma->size) < (iova + size) ?
> >>>> +				     (dma->iova + dma->size) : (iova + size);
> >>>> +
> >>>> +			for (; i < iova_limit; i += pgsize) {
> >>>> +				unsigned int start;
> >>>> +
> >>>> +				start = (i - start_iova) >> pgshift;
> >>>> +
> >>>> +				__bitmap_set(bitmap, start, 1);
> >>>> +			}
> >>>> +			if (i >= iova + size)
> >>>> +				return;
> >>>> +		} else {
> >>>> +			struct rb_node *n = rb_first(&dma->pfn_list);
> >>>> +			bool found = false;
> >>>> +
> >>>> +			for (; n; n = rb_next(n)) {
> >>>> +				struct vfio_pfn *vpfn = rb_entry(n,
> >>>> +							struct vfio_pfn, node);
> >>>> +				if (vpfn->iova >= i) {
> >>>> +					found = true;
> >>>> +					break;
> >>>> +				}
> >>>> +			}
> >>>> +
> >>>> +			if (!found) {
> >>>> +				i += dma->size;
> >>>> +				continue;
> >>>> +			}
> >>>> +
> >>>> +			for (; n; n = rb_next(n)) {
> >>>> +				unsigned int start;
> >>>> +				struct vfio_pfn *vpfn = rb_entry(n,
> >>>> +							struct vfio_pfn, node);
> >>>> +
> >>>> +				if (vpfn->iova >= iova + size)
> >>>> +					return;
> >>>> +
> >>>> +				start = (vpfn->iova - start_iova) >> pgshift;
> >>>> +
> >>>> +				__bitmap_set(bitmap, start, 1);
> >>>> +
> >>>> +				i = vpfn->iova + pgsize;
> >>>> +			}
> >>>> +		}
> >>>> +		vfio_remove_unpinned_from_pfn_list(dma, false);
> >>>> +	}
> >>>> +}
> >>>> +
> >>>> +static long verify_bitmap_size(unsigned long npages, unsigned long bitmap_size)
> >>>> +{
> >>>> +	long bsize;
> >>>> +
> >>>> +	if (!bitmap_size || bitmap_size > SIZE_MAX)
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	bsize = ALIGN(npages, BITS_PER_LONG) / sizeof(unsigned long);
> >>>> +
> >>>> +	if (bitmap_size < bsize)
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	return bsize;
> >>>> +}
> >>>> +
> >>>>    static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>>>    			     struct vfio_iommu_type1_dma_unmap *unmap)
> >>>>    {
> >>>> @@ -2298,6 +2417,83 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >>>>    
> >>>>    		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> >>>>    			-EFAULT : 0;
> >>>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> >>>> +		struct vfio_iommu_type1_dirty_bitmap range;
> >>>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> >>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> >>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> >>>> +		int ret;
> >>>> +
> >>>> +		if (!iommu->v2)
> >>>> +			return -EACCES;
> >>>> +
> >>>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> >>>> +				    bitmap);
> >>>> +
> >>>> +		if (copy_from_user(&range, (void __user *)arg, minsz))
> >>>> +			return -EFAULT;
> >>>> +
> >>>> +		if (range.argsz < minsz || range.flags & ~mask)
> >>>> +			return -EINVAL;
> >>>> +
> >>>> +		if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> >>>> +			iommu->dirty_page_tracking = true;
> >>>> +			return 0;
> >>>> +		} else if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> >>>> +			iommu->dirty_page_tracking = false;
> >>>> +
> >>>> +			mutex_lock(&iommu->lock);
> >>>> +			vfio_remove_unpinned_from_dma_list(iommu);
> >>>> +			mutex_unlock(&iommu->lock);
> >>>> +			return 0;
> >>>> +
> >>>> +		} else if (range.flags &
> >>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> >>>> +			uint64_t iommu_pgmask;
> >>>> +			unsigned long pgshift = __ffs(range.pgsize);
> >>>> +			unsigned long *bitmap;
> >>>> +			long bsize;
> >>>> +
> >>>> +			iommu_pgmask =
> >>>> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> >>>> +
> >>>> +			if (((range.pgsize - 1) & iommu_pgmask) !=
> >>>> +			    (range.pgsize - 1))
> >>>> +				return -EINVAL;
> >>>> +
> >>>> +			if (range.iova & iommu_pgmask)
> >>>> +				return -EINVAL;
> >>>> +			if (!range.size || range.size > SIZE_MAX)
> >>>> +				return -EINVAL;
> >>>> +			if (range.iova + range.size < range.iova)
> >>>> +				return -EINVAL;
> >>>> +
> >>>> +			bsize = verify_bitmap_size(range.size >> pgshift,
> >>>> +						   range.bitmap_size);
> >>>> +			if (bsize)
> >>>> +				return ret;
> >>>> +
> >>>> +			bitmap = kmalloc(bsize, GFP_KERNEL);
> >>>> +			if (!bitmap)
> >>>> +				return -ENOMEM;
> >>>> +
> >>>> +			ret = copy_from_user(bitmap,
> >>>> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
> >>>> +			if (ret)
> >>>> +				goto bitmap_exit;
> >>>> +
> >>>> +			iommu->dirty_page_tracking = false;
> >>> why iommu->dirty_page_tracking is false here?
> >>> suppose this ioctl can be called several times.
> >>>
> >>
> >> This ioctl can be called several times, but once this ioctl is called
> >> that means vCPUs are stopped and VFIO devices are stopped (i.e. in
> >> stop-and-copy phase) and dirty pages bitmap are being queried by user.
> >>
> > can't agree that VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP can only be
> > called in stop-and-copy phase.
> > As stated in last version, this will cause QEMU to get a wrong expectation
> > of VM downtime and this is also the reason for previously pinned pages
> > before log_sync cannot be treated as dirty. If this get bitmap ioctl can
> > be called early in save_setup phase, then it's no problem even all ram
> > is dirty.
> > 
> 
> Device can also write to pages which are pinned, and then there is no 
> way to know pages dirtied by device during pre-copy phase.
> If user ask dirty bitmap in per-copy phase, even then user will have to 
> query dirty bitmap in stop-and-copy phase where this will be superset 
> including all pages reported during pre-copy. Then instead of copying 
> all pages twice, its better to do it once during stop-and-copy phase.
>
I think the flow should be like this:
1. save_setup --> GET_BITMAP ioctl --> return bitmap for currently + previously
pinned pages and clean all previously pinned pages

2. save_pending --> GET_BITMAP ioctl  --> return bitmap of (currently
pinned pages + previously pinned pages since last clean) and clean all
previously pinned pages

3. save_complete_precopy --> GET_BITMAP ioctl --> return bitmap of (currently
pinned pages + previously pinned pages since last clean) and clean all
previously pinned pages


Copying pinned pages multiple times is unavoidable because those pinned pages
are always treated as dirty. That's per vendor's implementation.
But if the pinned pages are not reported as dirty before stop-and-copy phase,
QEMU would think dirty pages has converged
and enter blackout phase, making downtime_limit severely incorrect.

Thanks
Yan

> >>>> +			mutex_lock(&iommu->lock);
> >>>> +			vfio_iova_dirty_bitmap(iommu, range.iova, range.size,
> >>>> +					     range.pgsize, range.iova, bitmap);
> >>>> +			mutex_unlock(&iommu->lock);
> >>>> +
> >>>> +			ret = copy_to_user((void __user *)range.bitmap, bitmap,
> >>>> +					   range.bitmap_size) ? -EFAULT : 0;
> >>>> +bitmap_exit:
> >>>> +			kfree(bitmap);
> >>>> +			return ret;
> >>>> +		}
> >>>>    	}
> >>>>    
> >>>>    	return -ENOTTY;
> >>>> -- 
> >>>> 2.7.0
> >>>>
Dr. David Alan Gilbert Dec. 18, 2019, 8:05 p.m. UTC | #6
* Yan Zhao (yan.y.zhao@intel.com) wrote:
> On Tue, Dec 17, 2019 at 07:47:05PM +0800, Kirti Wankhede wrote:
> > 
> > 
> > On 12/17/2019 3:21 PM, Yan Zhao wrote:
> > > On Tue, Dec 17, 2019 at 05:24:14PM +0800, Kirti Wankhede wrote:
> > >>
> > >>
> > >> On 12/17/2019 10:45 AM, Yan Zhao wrote:
> > >>> On Tue, Dec 17, 2019 at 04:21:39AM +0800, Kirti Wankhede wrote:
> > >>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > >>>> - Start unpinned pages dirty pages tracking while migration is active and
> > >>>>     device is running, i.e. during pre-copy phase.
> > >>>> - Stop unpinned pages dirty pages tracking. This is required to stop
> > >>>>     unpinned dirty pages tracking if migration failed or cancelled during
> > >>>>     pre-copy phase. Unpinned pages tracking is clear.
> > >>>> - Get dirty pages bitmap. Stop unpinned dirty pages tracking and clear
> > >>>>     unpinned pages information on bitmap read. This ioctl returns bitmap of
> > >>>>     dirty pages, its user space application responsibility to copy content
> > >>>>     of dirty pages from source to destination during migration.
> > >>>>
> > >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > >>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > >>>> ---
> > >>>>    drivers/vfio/vfio_iommu_type1.c | 210 ++++++++++++++++++++++++++++++++++++++--
> > >>>>    1 file changed, 203 insertions(+), 7 deletions(-)
> > >>>>
> > >>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > >>>> index 3f6b04f2334f..264449654d3f 100644
> > >>>> --- a/drivers/vfio/vfio_iommu_type1.c
> > >>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > >>>> @@ -70,6 +70,7 @@ struct vfio_iommu {
> > >>>>    	unsigned int		dma_avail;
> > >>>>    	bool			v2;
> > >>>>    	bool			nesting;
> > >>>> +	bool			dirty_page_tracking;
> > >>>>    };
> > >>>>    
> > >>>>    struct vfio_domain {
> > >>>> @@ -112,6 +113,7 @@ struct vfio_pfn {
> > >>>>    	dma_addr_t		iova;		/* Device address */
> > >>>>    	unsigned long		pfn;		/* Host pfn */
> > >>>>    	atomic_t		ref_count;
> > >>>> +	bool			unpinned;
> > >>>>    };
> > >>>>    
> > >>>>    struct vfio_regions {
> > >>>> @@ -244,6 +246,32 @@ static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
> > >>>>    	kfree(vpfn);
> > >>>>    }
> > >>>>    
> > >>>> +static void vfio_remove_unpinned_from_pfn_list(struct vfio_dma *dma, bool warn)
> > >>>> +{
> > >>>> +	struct rb_node *n = rb_first(&dma->pfn_list);
> > >>>> +
> > >>>> +	for (; n; n = rb_next(n)) {
> > >>>> +		struct vfio_pfn *vpfn = rb_entry(n, struct vfio_pfn, node);
> > >>>> +
> > >>>> +		if (warn)
> > >>>> +			WARN_ON_ONCE(vpfn->unpinned);
> > >>>> +
> > >>>> +		if (vpfn->unpinned)
> > >>>> +			vfio_remove_from_pfn_list(dma, vpfn);
> > >>>> +	}
> > >>>> +}
> > >>>> +
> > >>>> +static void vfio_remove_unpinned_from_dma_list(struct vfio_iommu *iommu)
> > >>>> +{
> > >>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
> > >>>> +
> > >>>> +	for (; n; n = rb_next(n)) {
> > >>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > >>>> +
> > >>>> +		vfio_remove_unpinned_from_pfn_list(dma, false);
> > >>>> +	}
> > >>>> +}
> > >>>> +
> > >>>>    static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
> > >>>>    					       unsigned long iova)
> > >>>>    {
> > >>>> @@ -254,13 +282,17 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
> > >>>>    	return vpfn;
> > >>>>    }
> > >>>>    
> > >>>> -static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
> > >>>> +static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn,
> > >>>> +				  bool dirty_tracking)
> > >>>>    {
> > >>>>    	int ret = 0;
> > >>>>    
> > >>>>    	if (atomic_dec_and_test(&vpfn->ref_count)) {
> > >>>>    		ret = put_pfn(vpfn->pfn, dma->prot);
> > >>> if physical page here is put, it may cause problem when pin this iova
> > >>> next time:
> > >>> vfio_iommu_type1_pin_pages {
> > >>>       ...
> > >>>       vpfn = vfio_iova_get_vfio_pfn(dma, iova);
> > >>>       if (vpfn) {
> > >>>           phys_pfn[i] = vpfn->pfn;
> > >>>           continue;
> > >>>       }
> > >>>       ...
> > >>> }
> > >>>
> > >>
> > >> Good point. Fixing it as:
> > >>
> > >>                   vpfn = vfio_iova_get_vfio_pfn(dma, iova);
> > >>                   if (vpfn) {
> > >> -                       phys_pfn[i] = vpfn->pfn;
> > >> -                       continue;
> > >> +                       if (vpfn->unpinned)
> > >> +                               vfio_remove_from_pfn_list(dma, vpfn);
> > > what about updating vpfn instead?
> > > 
> > 
> > vfio_pin_page_external() takes care of verification checks and mem lock 
> > accounting. I prefer to free existing and add new node with existing 
> > functions.
> > 
> > >> +                       else {
> > >> +                               phys_pfn[i] = vpfn->pfn;
> > >> +                               continue;
> > >> +                       }
> > >>                   }
> > >>
> > >>
> > >>
> > >>>> -		vfio_remove_from_pfn_list(dma, vpfn);
> > >>>> +		if (dirty_tracking)
> > >>>> +			vpfn->unpinned = true;
> > >>>> +		else
> > >>>> +			vfio_remove_from_pfn_list(dma, vpfn);
> > >>> so the unpinned pages before dirty page tracking is not treated as
> > >>> dirty?
> > >>>
> > >>
> > >> Yes. That's we agreed on previous version:
> > >> https://www.mail-archive.com/qemu-devel@nongnu.org/msg663157.html
> > >>
> > >>>>    	}
> > >>>>    	return ret;
> > >>>>    }
> > >>>> @@ -504,7 +536,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> > >>>>    }
> > >>>>    
> > >>>>    static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> > >>>> -				    bool do_accounting)
> > >>>> +				    bool do_accounting, bool dirty_tracking)
> > >>>>    {
> > >>>>    	int unlocked;
> > >>>>    	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
> > >>>> @@ -512,7 +544,10 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> > >>>>    	if (!vpfn)
> > >>>>    		return 0;
> > >>>>    
> > >>>> -	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
> > >>>> +	if (vpfn->unpinned)
> > >>>> +		return 0;
> > >>>> +
> > >>>> +	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn, dirty_tracking);
> > >>>>    
> > >>>>    	if (do_accounting)
> > >>>>    		vfio_lock_acct(dma, -unlocked, true);
> > >>>> @@ -583,7 +618,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> > >>>>    
> > >>>>    		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> > >>>>    		if (ret) {
> > >>>> -			vfio_unpin_page_external(dma, iova, do_accounting);
> > >>>> +			vfio_unpin_page_external(dma, iova, do_accounting,
> > >>>> +						 false);
> > >>>>    			goto pin_unwind;
> > >>>>    		}
> > >>>>    	}
> > >>>> @@ -598,7 +634,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> > >>>>    
> > >>>>    		iova = user_pfn[j] << PAGE_SHIFT;
> > >>>>    		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
> > >>>> -		vfio_unpin_page_external(dma, iova, do_accounting);
> > >>>> +		vfio_unpin_page_external(dma, iova, do_accounting, false);
> > >>>>    		phys_pfn[j] = 0;
> > >>>>    	}
> > >>>>    pin_done:
> > >>>> @@ -632,7 +668,8 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
> > >>>>    		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
> > >>>>    		if (!dma)
> > >>>>    			goto unpin_exit;
> > >>>> -		vfio_unpin_page_external(dma, iova, do_accounting);
> > >>>> +		vfio_unpin_page_external(dma, iova, do_accounting,
> > >>>> +					 iommu->dirty_page_tracking);
> > >>>>    	}
> > >>>>    
> > >>>>    unpin_exit:
> > >>>> @@ -850,6 +887,88 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> > >>>>    	return bitmap;
> > >>>>    }
> > >>>>    
> > >>>> +/*
> > >>>> + * start_iova is the reference from where bitmaping started. This is called
> > >>>> + * from DMA_UNMAP where start_iova can be different than iova
> > >>>> + */
> > >>>> +
> > >>>> +static void vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > >>>> +				  size_t size, uint64_t pgsize,
> > >>>> +				  dma_addr_t start_iova, unsigned long *bitmap)
> > >>>> +{
> > >>>> +	struct vfio_dma *dma;
> > >>>> +	dma_addr_t i = iova;
> > >>>> +	unsigned long pgshift = __ffs(pgsize);
> > >>>> +
> > >>>> +	while ((dma = vfio_find_dma(iommu, i, pgsize))) {
> > >>>> +		/* mark all pages dirty if all pages are pinned and mapped. */
> > >>>> +		if (dma->iommu_mapped) {
> > >>> This prevents pass-through devices from calling vfio_pin_pages to do
> > >>> fine grained log dirty.
> > >>
> > >> Yes, I mentioned that in yet TODO item in cover letter:
> > >>
> > >> "If IOMMU capable device is present in the container, then all pages are
> > >> marked dirty. Need to think smart way to know if IOMMU capable device's
> > >> driver is smart to report pages to be marked dirty by pinning those
> > >> pages externally."
> > >>
> > > why not just check first if any vpfn present for IOMMU capable devices?
> > > 
> > 
> > vfio_pin_pages(dev, ...) calls driver->ops->pin_pages(iommu, ...)
> > 
> > In vfio_iommu_type1 module, vfio_iommu_type1_pin_pages() doesn't know 
> > the device. vpfn are tracked against container->iommu, not against 
> > device. Need to think of smart way to know if devices in container are 
> > all smart which report pages dirty ny pinning those pages manually.
> >
> I believe in such case, the mdev on top of device is in the same iommu
> group (i.e. 1:1 mdev on top of device).
> device vendor driver calls vfio_pin_pages to notify vfio which pages are dirty. 
> > 
> > >>
> > >>>> +			dma_addr_t iova_limit;
> > >>>> +
> > >>>> +			iova_limit = (dma->iova + dma->size) < (iova + size) ?
> > >>>> +				     (dma->iova + dma->size) : (iova + size);
> > >>>> +
> > >>>> +			for (; i < iova_limit; i += pgsize) {
> > >>>> +				unsigned int start;
> > >>>> +
> > >>>> +				start = (i - start_iova) >> pgshift;
> > >>>> +
> > >>>> +				__bitmap_set(bitmap, start, 1);
> > >>>> +			}
> > >>>> +			if (i >= iova + size)
> > >>>> +				return;
> > >>>> +		} else {
> > >>>> +			struct rb_node *n = rb_first(&dma->pfn_list);
> > >>>> +			bool found = false;
> > >>>> +
> > >>>> +			for (; n; n = rb_next(n)) {
> > >>>> +				struct vfio_pfn *vpfn = rb_entry(n,
> > >>>> +							struct vfio_pfn, node);
> > >>>> +				if (vpfn->iova >= i) {
> > >>>> +					found = true;
> > >>>> +					break;
> > >>>> +				}
> > >>>> +			}
> > >>>> +
> > >>>> +			if (!found) {
> > >>>> +				i += dma->size;
> > >>>> +				continue;
> > >>>> +			}
> > >>>> +
> > >>>> +			for (; n; n = rb_next(n)) {
> > >>>> +				unsigned int start;
> > >>>> +				struct vfio_pfn *vpfn = rb_entry(n,
> > >>>> +							struct vfio_pfn, node);
> > >>>> +
> > >>>> +				if (vpfn->iova >= iova + size)
> > >>>> +					return;
> > >>>> +
> > >>>> +				start = (vpfn->iova - start_iova) >> pgshift;
> > >>>> +
> > >>>> +				__bitmap_set(bitmap, start, 1);
> > >>>> +
> > >>>> +				i = vpfn->iova + pgsize;
> > >>>> +			}
> > >>>> +		}
> > >>>> +		vfio_remove_unpinned_from_pfn_list(dma, false);
> > >>>> +	}
> > >>>> +}
> > >>>> +
> > >>>> +static long verify_bitmap_size(unsigned long npages, unsigned long bitmap_size)
> > >>>> +{
> > >>>> +	long bsize;
> > >>>> +
> > >>>> +	if (!bitmap_size || bitmap_size > SIZE_MAX)
> > >>>> +		return -EINVAL;
> > >>>> +
> > >>>> +	bsize = ALIGN(npages, BITS_PER_LONG) / sizeof(unsigned long);
> > >>>> +
> > >>>> +	if (bitmap_size < bsize)
> > >>>> +		return -EINVAL;
> > >>>> +
> > >>>> +	return bsize;
> > >>>> +}
> > >>>> +
> > >>>>    static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > >>>>    			     struct vfio_iommu_type1_dma_unmap *unmap)
> > >>>>    {
> > >>>> @@ -2298,6 +2417,83 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> > >>>>    
> > >>>>    		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> > >>>>    			-EFAULT : 0;
> > >>>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> > >>>> +		struct vfio_iommu_type1_dirty_bitmap range;
> > >>>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> > >>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> > >>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> > >>>> +		int ret;
> > >>>> +
> > >>>> +		if (!iommu->v2)
> > >>>> +			return -EACCES;
> > >>>> +
> > >>>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> > >>>> +				    bitmap);
> > >>>> +
> > >>>> +		if (copy_from_user(&range, (void __user *)arg, minsz))
> > >>>> +			return -EFAULT;
> > >>>> +
> > >>>> +		if (range.argsz < minsz || range.flags & ~mask)
> > >>>> +			return -EINVAL;
> > >>>> +
> > >>>> +		if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> > >>>> +			iommu->dirty_page_tracking = true;
> > >>>> +			return 0;
> > >>>> +		} else if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> > >>>> +			iommu->dirty_page_tracking = false;
> > >>>> +
> > >>>> +			mutex_lock(&iommu->lock);
> > >>>> +			vfio_remove_unpinned_from_dma_list(iommu);
> > >>>> +			mutex_unlock(&iommu->lock);
> > >>>> +			return 0;
> > >>>> +
> > >>>> +		} else if (range.flags &
> > >>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> > >>>> +			uint64_t iommu_pgmask;
> > >>>> +			unsigned long pgshift = __ffs(range.pgsize);
> > >>>> +			unsigned long *bitmap;
> > >>>> +			long bsize;
> > >>>> +
> > >>>> +			iommu_pgmask =
> > >>>> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> > >>>> +
> > >>>> +			if (((range.pgsize - 1) & iommu_pgmask) !=
> > >>>> +			    (range.pgsize - 1))
> > >>>> +				return -EINVAL;
> > >>>> +
> > >>>> +			if (range.iova & iommu_pgmask)
> > >>>> +				return -EINVAL;
> > >>>> +			if (!range.size || range.size > SIZE_MAX)
> > >>>> +				return -EINVAL;
> > >>>> +			if (range.iova + range.size < range.iova)
> > >>>> +				return -EINVAL;
> > >>>> +
> > >>>> +			bsize = verify_bitmap_size(range.size >> pgshift,
> > >>>> +						   range.bitmap_size);
> > >>>> +			if (bsize)
> > >>>> +				return ret;
> > >>>> +
> > >>>> +			bitmap = kmalloc(bsize, GFP_KERNEL);
> > >>>> +			if (!bitmap)
> > >>>> +				return -ENOMEM;
> > >>>> +
> > >>>> +			ret = copy_from_user(bitmap,
> > >>>> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
> > >>>> +			if (ret)
> > >>>> +				goto bitmap_exit;
> > >>>> +
> > >>>> +			iommu->dirty_page_tracking = false;
> > >>> why iommu->dirty_page_tracking is false here?
> > >>> suppose this ioctl can be called several times.
> > >>>
> > >>
> > >> This ioctl can be called several times, but once this ioctl is called
> > >> that means vCPUs are stopped and VFIO devices are stopped (i.e. in
> > >> stop-and-copy phase) and dirty pages bitmap are being queried by user.
> > >>
> > > can't agree that VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP can only be
> > > called in stop-and-copy phase.
> > > As stated in last version, this will cause QEMU to get a wrong expectation
> > > of VM downtime and this is also the reason for previously pinned pages
> > > before log_sync cannot be treated as dirty. If this get bitmap ioctl can
> > > be called early in save_setup phase, then it's no problem even all ram
> > > is dirty.
> > > 
> > 
> > Device can also write to pages which are pinned, and then there is no 
> > way to know pages dirtied by device during pre-copy phase.
> > If user ask dirty bitmap in per-copy phase, even then user will have to 
> > query dirty bitmap in stop-and-copy phase where this will be superset 
> > including all pages reported during pre-copy. Then instead of copying 
> > all pages twice, its better to do it once during stop-and-copy phase.
> >
> I think the flow should be like this:
> 1. save_setup --> GET_BITMAP ioctl --> return bitmap for currently + previously
> pinned pages and clean all previously pinned pages
> 
> 2. save_pending --> GET_BITMAP ioctl  --> return bitmap of (currently
> pinned pages + previously pinned pages since last clean) and clean all
> previously pinned pages
> 
> 3. save_complete_precopy --> GET_BITMAP ioctl --> return bitmap of (currently
> pinned pages + previously pinned pages since last clean) and clean all
> previously pinned pages
> 
> 
> Copying pinned pages multiple times is unavoidable because those pinned pages
> are always treated as dirty. That's per vendor's implementation.
> But if the pinned pages are not reported as dirty before stop-and-copy phase,
> QEMU would think dirty pages has converged
> and enter blackout phase, making downtime_limit severely incorrect.

I'm not sure it's any worse.
I *think* we do a last sync after we've decided to go to stop-and-copy;
wont that then mark all those pages as dirty again, so it'll have the
same behaviour?
Anyway, it seems wrong to repeatedly send pages that you know are
pointless - but that probably means we need a way to mark those somehow
to avoid it.

Dave

> Thanks
> Yan
> 
> > >>>> +			mutex_lock(&iommu->lock);
> > >>>> +			vfio_iova_dirty_bitmap(iommu, range.iova, range.size,
> > >>>> +					     range.pgsize, range.iova, bitmap);
> > >>>> +			mutex_unlock(&iommu->lock);
> > >>>> +
> > >>>> +			ret = copy_to_user((void __user *)range.bitmap, bitmap,
> > >>>> +					   range.bitmap_size) ? -EFAULT : 0;
> > >>>> +bitmap_exit:
> > >>>> +			kfree(bitmap);
> > >>>> +			return ret;
> > >>>> +		}
> > >>>>    	}
> > >>>>    
> > >>>>    	return -ENOTTY;
> > >>>> -- 
> > >>>> 2.7.0
> > >>>>
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Alex Williamson Dec. 18, 2019, 9:39 p.m. UTC | #7
On Tue, 17 Dec 2019 14:54:14 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 12/17/2019 10:45 AM, Yan Zhao wrote:
> > On Tue, Dec 17, 2019 at 04:21:39AM +0800, Kirti Wankhede wrote:  
> >> +		} else if (range.flags &
> >> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> >> +			uint64_t iommu_pgmask;
> >> +			unsigned long pgshift = __ffs(range.pgsize);
> >> +			unsigned long *bitmap;
> >> +			long bsize;
> >> +
> >> +			iommu_pgmask =
> >> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> >> +
> >> +			if (((range.pgsize - 1) & iommu_pgmask) !=
> >> +			    (range.pgsize - 1))
> >> +				return -EINVAL;
> >> +
> >> +			if (range.iova & iommu_pgmask)
> >> +				return -EINVAL;
> >> +			if (!range.size || range.size > SIZE_MAX)
> >> +				return -EINVAL;
> >> +			if (range.iova + range.size < range.iova)
> >> +				return -EINVAL;
> >> +
> >> +			bsize = verify_bitmap_size(range.size >> pgshift,
> >> +						   range.bitmap_size);
> >> +			if (bsize)
> >> +				return ret;
> >> +
> >> +			bitmap = kmalloc(bsize, GFP_KERNEL);
> >> +			if (!bitmap)
> >> +				return -ENOMEM;
> >> +
> >> +			ret = copy_from_user(bitmap,
> >> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
> >> +			if (ret)
> >> +				goto bitmap_exit;
> >> +
> >> +			iommu->dirty_page_tracking = false;  
> > why iommu->dirty_page_tracking is false here?
> > suppose this ioctl can be called several times.
> >   
> 
> This ioctl can be called several times, but once this ioctl is called 
> that means vCPUs are stopped and VFIO devices are stopped (i.e. in 
> stop-and-copy phase) and dirty pages bitmap are being queried by user.

Do not assume how userspace works or its intent.  If dirty tracking is
on, it should remain on until the user turns it off.  We cannot assume
userspace uses a one-shot approach.  Thanks,

Alex
Yan Zhao Dec. 19, 2019, 12:57 a.m. UTC | #8
On Thu, Dec 19, 2019 at 04:05:52AM +0800, Dr. David Alan Gilbert wrote:
> * Yan Zhao (yan.y.zhao@intel.com) wrote:
> > On Tue, Dec 17, 2019 at 07:47:05PM +0800, Kirti Wankhede wrote:
> > > 
> > > 
> > > On 12/17/2019 3:21 PM, Yan Zhao wrote:
> > > > On Tue, Dec 17, 2019 at 05:24:14PM +0800, Kirti Wankhede wrote:
> > > >>
> > > >>
> > > >> On 12/17/2019 10:45 AM, Yan Zhao wrote:
> > > >>> On Tue, Dec 17, 2019 at 04:21:39AM +0800, Kirti Wankhede wrote:
> > > >>>> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> > > >>>> - Start unpinned pages dirty pages tracking while migration is active and
> > > >>>>     device is running, i.e. during pre-copy phase.
> > > >>>> - Stop unpinned pages dirty pages tracking. This is required to stop
> > > >>>>     unpinned dirty pages tracking if migration failed or cancelled during
> > > >>>>     pre-copy phase. Unpinned pages tracking is clear.
> > > >>>> - Get dirty pages bitmap. Stop unpinned dirty pages tracking and clear
> > > >>>>     unpinned pages information on bitmap read. This ioctl returns bitmap of
> > > >>>>     dirty pages, its user space application responsibility to copy content
> > > >>>>     of dirty pages from source to destination during migration.
> > > >>>>
> > > >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > >>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > >>>> ---
> > > >>>>    drivers/vfio/vfio_iommu_type1.c | 210 ++++++++++++++++++++++++++++++++++++++--
> > > >>>>    1 file changed, 203 insertions(+), 7 deletions(-)
> > > >>>>
> > > >>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > >>>> index 3f6b04f2334f..264449654d3f 100644
> > > >>>> --- a/drivers/vfio/vfio_iommu_type1.c
> > > >>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > > >>>> @@ -70,6 +70,7 @@ struct vfio_iommu {
> > > >>>>    	unsigned int		dma_avail;
> > > >>>>    	bool			v2;
> > > >>>>    	bool			nesting;
> > > >>>> +	bool			dirty_page_tracking;
> > > >>>>    };
> > > >>>>    
> > > >>>>    struct vfio_domain {
> > > >>>> @@ -112,6 +113,7 @@ struct vfio_pfn {
> > > >>>>    	dma_addr_t		iova;		/* Device address */
> > > >>>>    	unsigned long		pfn;		/* Host pfn */
> > > >>>>    	atomic_t		ref_count;
> > > >>>> +	bool			unpinned;
> > > >>>>    };
> > > >>>>    
> > > >>>>    struct vfio_regions {
> > > >>>> @@ -244,6 +246,32 @@ static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
> > > >>>>    	kfree(vpfn);
> > > >>>>    }
> > > >>>>    
> > > >>>> +static void vfio_remove_unpinned_from_pfn_list(struct vfio_dma *dma, bool warn)
> > > >>>> +{
> > > >>>> +	struct rb_node *n = rb_first(&dma->pfn_list);
> > > >>>> +
> > > >>>> +	for (; n; n = rb_next(n)) {
> > > >>>> +		struct vfio_pfn *vpfn = rb_entry(n, struct vfio_pfn, node);
> > > >>>> +
> > > >>>> +		if (warn)
> > > >>>> +			WARN_ON_ONCE(vpfn->unpinned);
> > > >>>> +
> > > >>>> +		if (vpfn->unpinned)
> > > >>>> +			vfio_remove_from_pfn_list(dma, vpfn);
> > > >>>> +	}
> > > >>>> +}
> > > >>>> +
> > > >>>> +static void vfio_remove_unpinned_from_dma_list(struct vfio_iommu *iommu)
> > > >>>> +{
> > > >>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
> > > >>>> +
> > > >>>> +	for (; n; n = rb_next(n)) {
> > > >>>> +		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> > > >>>> +
> > > >>>> +		vfio_remove_unpinned_from_pfn_list(dma, false);
> > > >>>> +	}
> > > >>>> +}
> > > >>>> +
> > > >>>>    static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
> > > >>>>    					       unsigned long iova)
> > > >>>>    {
> > > >>>> @@ -254,13 +282,17 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
> > > >>>>    	return vpfn;
> > > >>>>    }
> > > >>>>    
> > > >>>> -static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
> > > >>>> +static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn,
> > > >>>> +				  bool dirty_tracking)
> > > >>>>    {
> > > >>>>    	int ret = 0;
> > > >>>>    
> > > >>>>    	if (atomic_dec_and_test(&vpfn->ref_count)) {
> > > >>>>    		ret = put_pfn(vpfn->pfn, dma->prot);
> > > >>> if physical page here is put, it may cause problem when pin this iova
> > > >>> next time:
> > > >>> vfio_iommu_type1_pin_pages {
> > > >>>       ...
> > > >>>       vpfn = vfio_iova_get_vfio_pfn(dma, iova);
> > > >>>       if (vpfn) {
> > > >>>           phys_pfn[i] = vpfn->pfn;
> > > >>>           continue;
> > > >>>       }
> > > >>>       ...
> > > >>> }
> > > >>>
> > > >>
> > > >> Good point. Fixing it as:
> > > >>
> > > >>                   vpfn = vfio_iova_get_vfio_pfn(dma, iova);
> > > >>                   if (vpfn) {
> > > >> -                       phys_pfn[i] = vpfn->pfn;
> > > >> -                       continue;
> > > >> +                       if (vpfn->unpinned)
> > > >> +                               vfio_remove_from_pfn_list(dma, vpfn);
> > > > what about updating vpfn instead?
> > > > 
> > > 
> > > vfio_pin_page_external() takes care of verification checks and mem lock 
> > > accounting. I prefer to free existing and add new node with existing 
> > > functions.
> > > 
> > > >> +                       else {
> > > >> +                               phys_pfn[i] = vpfn->pfn;
> > > >> +                               continue;
> > > >> +                       }
> > > >>                   }
> > > >>
> > > >>
> > > >>
> > > >>>> -		vfio_remove_from_pfn_list(dma, vpfn);
> > > >>>> +		if (dirty_tracking)
> > > >>>> +			vpfn->unpinned = true;
> > > >>>> +		else
> > > >>>> +			vfio_remove_from_pfn_list(dma, vpfn);
> > > >>> so the unpinned pages before dirty page tracking is not treated as
> > > >>> dirty?
> > > >>>
> > > >>
> > > >> Yes. That's we agreed on previous version:
> > > >> https://www.mail-archive.com/qemu-devel@nongnu.org/msg663157.html
> > > >>
> > > >>>>    	}
> > > >>>>    	return ret;
> > > >>>>    }
> > > >>>> @@ -504,7 +536,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> > > >>>>    }
> > > >>>>    
> > > >>>>    static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> > > >>>> -				    bool do_accounting)
> > > >>>> +				    bool do_accounting, bool dirty_tracking)
> > > >>>>    {
> > > >>>>    	int unlocked;
> > > >>>>    	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
> > > >>>> @@ -512,7 +544,10 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> > > >>>>    	if (!vpfn)
> > > >>>>    		return 0;
> > > >>>>    
> > > >>>> -	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
> > > >>>> +	if (vpfn->unpinned)
> > > >>>> +		return 0;
> > > >>>> +
> > > >>>> +	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn, dirty_tracking);
> > > >>>>    
> > > >>>>    	if (do_accounting)
> > > >>>>    		vfio_lock_acct(dma, -unlocked, true);
> > > >>>> @@ -583,7 +618,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> > > >>>>    
> > > >>>>    		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> > > >>>>    		if (ret) {
> > > >>>> -			vfio_unpin_page_external(dma, iova, do_accounting);
> > > >>>> +			vfio_unpin_page_external(dma, iova, do_accounting,
> > > >>>> +						 false);
> > > >>>>    			goto pin_unwind;
> > > >>>>    		}
> > > >>>>    	}
> > > >>>> @@ -598,7 +634,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> > > >>>>    
> > > >>>>    		iova = user_pfn[j] << PAGE_SHIFT;
> > > >>>>    		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
> > > >>>> -		vfio_unpin_page_external(dma, iova, do_accounting);
> > > >>>> +		vfio_unpin_page_external(dma, iova, do_accounting, false);
> > > >>>>    		phys_pfn[j] = 0;
> > > >>>>    	}
> > > >>>>    pin_done:
> > > >>>> @@ -632,7 +668,8 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
> > > >>>>    		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
> > > >>>>    		if (!dma)
> > > >>>>    			goto unpin_exit;
> > > >>>> -		vfio_unpin_page_external(dma, iova, do_accounting);
> > > >>>> +		vfio_unpin_page_external(dma, iova, do_accounting,
> > > >>>> +					 iommu->dirty_page_tracking);
> > > >>>>    	}
> > > >>>>    
> > > >>>>    unpin_exit:
> > > >>>> @@ -850,6 +887,88 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
> > > >>>>    	return bitmap;
> > > >>>>    }
> > > >>>>    
> > > >>>> +/*
> > > >>>> + * start_iova is the reference from where bitmaping started. This is called
> > > >>>> + * from DMA_UNMAP where start_iova can be different than iova
> > > >>>> + */
> > > >>>> +
> > > >>>> +static void vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> > > >>>> +				  size_t size, uint64_t pgsize,
> > > >>>> +				  dma_addr_t start_iova, unsigned long *bitmap)
> > > >>>> +{
> > > >>>> +	struct vfio_dma *dma;
> > > >>>> +	dma_addr_t i = iova;
> > > >>>> +	unsigned long pgshift = __ffs(pgsize);
> > > >>>> +
> > > >>>> +	while ((dma = vfio_find_dma(iommu, i, pgsize))) {
> > > >>>> +		/* mark all pages dirty if all pages are pinned and mapped. */
> > > >>>> +		if (dma->iommu_mapped) {
> > > >>> This prevents pass-through devices from calling vfio_pin_pages to do
> > > >>> fine grained log dirty.
> > > >>
> > > >> Yes, I mentioned that in yet TODO item in cover letter:
> > > >>
> > > >> "If IOMMU capable device is present in the container, then all pages are
> > > >> marked dirty. Need to think smart way to know if IOMMU capable device's
> > > >> driver is smart to report pages to be marked dirty by pinning those
> > > >> pages externally."
> > > >>
> > > > why not just check first if any vpfn present for IOMMU capable devices?
> > > > 
> > > 
> > > vfio_pin_pages(dev, ...) calls driver->ops->pin_pages(iommu, ...)
> > > 
> > > In vfio_iommu_type1 module, vfio_iommu_type1_pin_pages() doesn't know 
> > > the device. vpfn are tracked against container->iommu, not against 
> > > device. Need to think of smart way to know if devices in container are 
> > > all smart which report pages dirty ny pinning those pages manually.
> > >
> > I believe in such case, the mdev on top of device is in the same iommu
> > group (i.e. 1:1 mdev on top of device).
> > device vendor driver calls vfio_pin_pages to notify vfio which pages are dirty. 
> > > 
> > > >>
> > > >>>> +			dma_addr_t iova_limit;
> > > >>>> +
> > > >>>> +			iova_limit = (dma->iova + dma->size) < (iova + size) ?
> > > >>>> +				     (dma->iova + dma->size) : (iova + size);
> > > >>>> +
> > > >>>> +			for (; i < iova_limit; i += pgsize) {
> > > >>>> +				unsigned int start;
> > > >>>> +
> > > >>>> +				start = (i - start_iova) >> pgshift;
> > > >>>> +
> > > >>>> +				__bitmap_set(bitmap, start, 1);
> > > >>>> +			}
> > > >>>> +			if (i >= iova + size)
> > > >>>> +				return;
> > > >>>> +		} else {
> > > >>>> +			struct rb_node *n = rb_first(&dma->pfn_list);
> > > >>>> +			bool found = false;
> > > >>>> +
> > > >>>> +			for (; n; n = rb_next(n)) {
> > > >>>> +				struct vfio_pfn *vpfn = rb_entry(n,
> > > >>>> +							struct vfio_pfn, node);
> > > >>>> +				if (vpfn->iova >= i) {
> > > >>>> +					found = true;
> > > >>>> +					break;
> > > >>>> +				}
> > > >>>> +			}
> > > >>>> +
> > > >>>> +			if (!found) {
> > > >>>> +				i += dma->size;
> > > >>>> +				continue;
> > > >>>> +			}
> > > >>>> +
> > > >>>> +			for (; n; n = rb_next(n)) {
> > > >>>> +				unsigned int start;
> > > >>>> +				struct vfio_pfn *vpfn = rb_entry(n,
> > > >>>> +							struct vfio_pfn, node);
> > > >>>> +
> > > >>>> +				if (vpfn->iova >= iova + size)
> > > >>>> +					return;
> > > >>>> +
> > > >>>> +				start = (vpfn->iova - start_iova) >> pgshift;
> > > >>>> +
> > > >>>> +				__bitmap_set(bitmap, start, 1);
> > > >>>> +
> > > >>>> +				i = vpfn->iova + pgsize;
> > > >>>> +			}
> > > >>>> +		}
> > > >>>> +		vfio_remove_unpinned_from_pfn_list(dma, false);
> > > >>>> +	}
> > > >>>> +}
> > > >>>> +
> > > >>>> +static long verify_bitmap_size(unsigned long npages, unsigned long bitmap_size)
> > > >>>> +{
> > > >>>> +	long bsize;
> > > >>>> +
> > > >>>> +	if (!bitmap_size || bitmap_size > SIZE_MAX)
> > > >>>> +		return -EINVAL;
> > > >>>> +
> > > >>>> +	bsize = ALIGN(npages, BITS_PER_LONG) / sizeof(unsigned long);
> > > >>>> +
> > > >>>> +	if (bitmap_size < bsize)
> > > >>>> +		return -EINVAL;
> > > >>>> +
> > > >>>> +	return bsize;
> > > >>>> +}
> > > >>>> +
> > > >>>>    static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > > >>>>    			     struct vfio_iommu_type1_dma_unmap *unmap)
> > > >>>>    {
> > > >>>> @@ -2298,6 +2417,83 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > >>>>    
> > > >>>>    		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> > > >>>>    			-EFAULT : 0;
> > > >>>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> > > >>>> +		struct vfio_iommu_type1_dirty_bitmap range;
> > > >>>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> > > >>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> > > >>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> > > >>>> +		int ret;
> > > >>>> +
> > > >>>> +		if (!iommu->v2)
> > > >>>> +			return -EACCES;
> > > >>>> +
> > > >>>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> > > >>>> +				    bitmap);
> > > >>>> +
> > > >>>> +		if (copy_from_user(&range, (void __user *)arg, minsz))
> > > >>>> +			return -EFAULT;
> > > >>>> +
> > > >>>> +		if (range.argsz < minsz || range.flags & ~mask)
> > > >>>> +			return -EINVAL;
> > > >>>> +
> > > >>>> +		if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> > > >>>> +			iommu->dirty_page_tracking = true;
> > > >>>> +			return 0;
> > > >>>> +		} else if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> > > >>>> +			iommu->dirty_page_tracking = false;
> > > >>>> +
> > > >>>> +			mutex_lock(&iommu->lock);
> > > >>>> +			vfio_remove_unpinned_from_dma_list(iommu);
> > > >>>> +			mutex_unlock(&iommu->lock);
> > > >>>> +			return 0;
> > > >>>> +
> > > >>>> +		} else if (range.flags &
> > > >>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> > > >>>> +			uint64_t iommu_pgmask;
> > > >>>> +			unsigned long pgshift = __ffs(range.pgsize);
> > > >>>> +			unsigned long *bitmap;
> > > >>>> +			long bsize;
> > > >>>> +
> > > >>>> +			iommu_pgmask =
> > > >>>> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> > > >>>> +
> > > >>>> +			if (((range.pgsize - 1) & iommu_pgmask) !=
> > > >>>> +			    (range.pgsize - 1))
> > > >>>> +				return -EINVAL;
> > > >>>> +
> > > >>>> +			if (range.iova & iommu_pgmask)
> > > >>>> +				return -EINVAL;
> > > >>>> +			if (!range.size || range.size > SIZE_MAX)
> > > >>>> +				return -EINVAL;
> > > >>>> +			if (range.iova + range.size < range.iova)
> > > >>>> +				return -EINVAL;
> > > >>>> +
> > > >>>> +			bsize = verify_bitmap_size(range.size >> pgshift,
> > > >>>> +						   range.bitmap_size);
> > > >>>> +			if (bsize)
> > > >>>> +				return ret;
> > > >>>> +
> > > >>>> +			bitmap = kmalloc(bsize, GFP_KERNEL);
> > > >>>> +			if (!bitmap)
> > > >>>> +				return -ENOMEM;
> > > >>>> +
> > > >>>> +			ret = copy_from_user(bitmap,
> > > >>>> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
> > > >>>> +			if (ret)
> > > >>>> +				goto bitmap_exit;
> > > >>>> +
> > > >>>> +			iommu->dirty_page_tracking = false;
> > > >>> why iommu->dirty_page_tracking is false here?
> > > >>> suppose this ioctl can be called several times.
> > > >>>
> > > >>
> > > >> This ioctl can be called several times, but once this ioctl is called
> > > >> that means vCPUs are stopped and VFIO devices are stopped (i.e. in
> > > >> stop-and-copy phase) and dirty pages bitmap are being queried by user.
> > > >>
> > > > can't agree that VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP can only be
> > > > called in stop-and-copy phase.
> > > > As stated in last version, this will cause QEMU to get a wrong expectation
> > > > of VM downtime and this is also the reason for previously pinned pages
> > > > before log_sync cannot be treated as dirty. If this get bitmap ioctl can
> > > > be called early in save_setup phase, then it's no problem even all ram
> > > > is dirty.
> > > > 
> > > 
> > > Device can also write to pages which are pinned, and then there is no 
> > > way to know pages dirtied by device during pre-copy phase.
> > > If user ask dirty bitmap in per-copy phase, even then user will have to 
> > > query dirty bitmap in stop-and-copy phase where this will be superset 
> > > including all pages reported during pre-copy. Then instead of copying 
> > > all pages twice, its better to do it once during stop-and-copy phase.
> > >
> > I think the flow should be like this:
> > 1. save_setup --> GET_BITMAP ioctl --> return bitmap for currently + previously
> > pinned pages and clean all previously pinned pages
> > 
> > 2. save_pending --> GET_BITMAP ioctl  --> return bitmap of (currently
> > pinned pages + previously pinned pages since last clean) and clean all
> > previously pinned pages
> > 
> > 3. save_complete_precopy --> GET_BITMAP ioctl --> return bitmap of (currently
> > pinned pages + previously pinned pages since last clean) and clean all
> > previously pinned pages
> > 
> > 
> > Copying pinned pages multiple times is unavoidable because those pinned pages
> > are always treated as dirty. That's per vendor's implementation.
> > But if the pinned pages are not reported as dirty before stop-and-copy phase,
> > QEMU would think dirty pages has converged
> > and enter blackout phase, making downtime_limit severely incorrect.
> 
> I'm not sure it's any worse.
> I *think* we do a last sync after we've decided to go to stop-and-copy;
> wont that then mark all those pages as dirty again, so it'll have the
> same behaviour?
No. something will be different.
currently, in kirti's implementation, if GET_BITMAP ioctl is called only
once in stop-and-copy phase, then before that phase, QEMU does not know those
pages are dirty. 
If we can report those dirty pages earlier before stop-and-copy phase,
QEMU can at least copy other pages to reduce dirty pages to below threshold.

Take a example, let's assume those vfio dirty pages is 1Gb, and network speed is
also 1Gb. Expected vm downtime is 1s.
If before stop-and-copy phase, dirty pages produced by other pages is
also 1Gb. To meet the expected vm downtime, QEMU should copy pages to
let dirty pages be less than 1Gb, otherwise, it should not complete live
migration.
If vfio does not report this 1Gb dirty pages, QEMU would think there's
only 1Gb and stop the vm. It would then find out there's actually 2Gb and vm
downtime is 2s.
Though the expected vm downtime is always not exactly the same as the
true vm downtime, it should be caused by rapid dirty page rate, which is
not predictable.
Right?

Thanks
Yan



> Anyway, it seems wrong to repeatedly send pages that you know are
> pointless - but that probably means we need a way to mark those somehow
> to avoid it.
> 
> Dave
> 
> > Thanks
> > Yan
> > 
> > > >>>> +			mutex_lock(&iommu->lock);
> > > >>>> +			vfio_iova_dirty_bitmap(iommu, range.iova, range.size,
> > > >>>> +					     range.pgsize, range.iova, bitmap);
> > > >>>> +			mutex_unlock(&iommu->lock);
> > > >>>> +
> > > >>>> +			ret = copy_to_user((void __user *)range.bitmap, bitmap,
> > > >>>> +					   range.bitmap_size) ? -EFAULT : 0;
> > > >>>> +bitmap_exit:
> > > >>>> +			kfree(bitmap);
> > > >>>> +			return ret;
> > > >>>> +		}
> > > >>>>    	}
> > > >>>>    
> > > >>>>    	return -ENOTTY;
> > > >>>> -- 
> > > >>>> 2.7.0
> > > >>>>
> > 
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Kirti Wankhede Dec. 19, 2019, 4:21 p.m. UTC | #9
On 12/19/2019 6:27 AM, Yan Zhao wrote:
> On Thu, Dec 19, 2019 at 04:05:52AM +0800, Dr. David Alan Gilbert wrote:
>> * Yan Zhao (yan.y.zhao@intel.com) wrote:
>>> On Tue, Dec 17, 2019 at 07:47:05PM +0800, Kirti Wankhede wrote:
>>>>
>>>>
>>>> On 12/17/2019 3:21 PM, Yan Zhao wrote:
>>>>> On Tue, Dec 17, 2019 at 05:24:14PM +0800, Kirti Wankhede wrote:
>>>>>>>>     
>>>>>>>>     		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>>>>>>>>     			-EFAULT : 0;
>>>>>>>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
>>>>>>>> +		struct vfio_iommu_type1_dirty_bitmap range;
>>>>>>>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
>>>>>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
>>>>>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
>>>>>>>> +		int ret;
>>>>>>>> +
>>>>>>>> +		if (!iommu->v2)
>>>>>>>> +			return -EACCES;
>>>>>>>> +
>>>>>>>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
>>>>>>>> +				    bitmap);
>>>>>>>> +
>>>>>>>> +		if (copy_from_user(&range, (void __user *)arg, minsz))
>>>>>>>> +			return -EFAULT;
>>>>>>>> +
>>>>>>>> +		if (range.argsz < minsz || range.flags & ~mask)
>>>>>>>> +			return -EINVAL;
>>>>>>>> +
>>>>>>>> +		if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
>>>>>>>> +			iommu->dirty_page_tracking = true;
>>>>>>>> +			return 0;
>>>>>>>> +		} else if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
>>>>>>>> +			iommu->dirty_page_tracking = false;
>>>>>>>> +
>>>>>>>> +			mutex_lock(&iommu->lock);
>>>>>>>> +			vfio_remove_unpinned_from_dma_list(iommu);
>>>>>>>> +			mutex_unlock(&iommu->lock);
>>>>>>>> +			return 0;
>>>>>>>> +
>>>>>>>> +		} else if (range.flags &
>>>>>>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
>>>>>>>> +			uint64_t iommu_pgmask;
>>>>>>>> +			unsigned long pgshift = __ffs(range.pgsize);
>>>>>>>> +			unsigned long *bitmap;
>>>>>>>> +			long bsize;
>>>>>>>> +
>>>>>>>> +			iommu_pgmask =
>>>>>>>> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
>>>>>>>> +
>>>>>>>> +			if (((range.pgsize - 1) & iommu_pgmask) !=
>>>>>>>> +			    (range.pgsize - 1))
>>>>>>>> +				return -EINVAL;
>>>>>>>> +
>>>>>>>> +			if (range.iova & iommu_pgmask)
>>>>>>>> +				return -EINVAL;
>>>>>>>> +			if (!range.size || range.size > SIZE_MAX)
>>>>>>>> +				return -EINVAL;
>>>>>>>> +			if (range.iova + range.size < range.iova)
>>>>>>>> +				return -EINVAL;
>>>>>>>> +
>>>>>>>> +			bsize = verify_bitmap_size(range.size >> pgshift,
>>>>>>>> +						   range.bitmap_size);
>>>>>>>> +			if (bsize)
>>>>>>>> +				return ret;
>>>>>>>> +
>>>>>>>> +			bitmap = kmalloc(bsize, GFP_KERNEL);
>>>>>>>> +			if (!bitmap)
>>>>>>>> +				return -ENOMEM;
>>>>>>>> +
>>>>>>>> +			ret = copy_from_user(bitmap,
>>>>>>>> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
>>>>>>>> +			if (ret)
>>>>>>>> +				goto bitmap_exit;
>>>>>>>> +
>>>>>>>> +			iommu->dirty_page_tracking = false;
>>>>>>> why iommu->dirty_page_tracking is false here?
>>>>>>> suppose this ioctl can be called several times.
>>>>>>>
>>>>>>
>>>>>> This ioctl can be called several times, but once this ioctl is called
>>>>>> that means vCPUs are stopped and VFIO devices are stopped (i.e. in
>>>>>> stop-and-copy phase) and dirty pages bitmap are being queried by user.
>>>>>>
>>>>> can't agree that VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP can only be
>>>>> called in stop-and-copy phase.
>>>>> As stated in last version, this will cause QEMU to get a wrong expectation
>>>>> of VM downtime and this is also the reason for previously pinned pages
>>>>> before log_sync cannot be treated as dirty. If this get bitmap ioctl can
>>>>> be called early in save_setup phase, then it's no problem even all ram
>>>>> is dirty.
>>>>>
>>>>
>>>> Device can also write to pages which are pinned, and then there is no
>>>> way to know pages dirtied by device during pre-copy phase.
>>>> If user ask dirty bitmap in per-copy phase, even then user will have to
>>>> query dirty bitmap in stop-and-copy phase where this will be superset
>>>> including all pages reported during pre-copy. Then instead of copying
>>>> all pages twice, its better to do it once during stop-and-copy phase.
>>>>
>>> I think the flow should be like this:
>>> 1. save_setup --> GET_BITMAP ioctl --> return bitmap for currently + previously
>>> pinned pages and clean all previously pinned pages
>>>
>>> 2. save_pending --> GET_BITMAP ioctl  --> return bitmap of (currently
>>> pinned pages + previously pinned pages since last clean) and clean all
>>> previously pinned pages
>>>
>>> 3. save_complete_precopy --> GET_BITMAP ioctl --> return bitmap of (currently
>>> pinned pages + previously pinned pages since last clean) and clean all
>>> previously pinned pages
>>>
>>>
>>> Copying pinned pages multiple times is unavoidable because those pinned pages
>>> are always treated as dirty. That's per vendor's implementation.
>>> But if the pinned pages are not reported as dirty before stop-and-copy phase,
>>> QEMU would think dirty pages has converged
>>> and enter blackout phase, making downtime_limit severely incorrect.
>>
>> I'm not sure it's any worse.
>> I *think* we do a last sync after we've decided to go to stop-and-copy;
>> wont that then mark all those pages as dirty again, so it'll have the
>> same behaviour?
> No. something will be different.
> currently, in kirti's implementation, if GET_BITMAP ioctl is called only
> once in stop-and-copy phase, then before that phase, QEMU does not know those
> pages are dirty.
> If we can report those dirty pages earlier before stop-and-copy phase,
> QEMU can at least copy other pages to reduce dirty pages to below threshold.
> 
> Take a example, let's assume those vfio dirty pages is 1Gb, and network speed is
> also 1Gb. Expected vm downtime is 1s.
> If before stop-and-copy phase, dirty pages produced by other pages is
> also 1Gb. To meet the expected vm downtime, QEMU should copy pages to
> let dirty pages be less than 1Gb, otherwise, it should not complete live
> migration.
> If vfio does not report this 1Gb dirty pages, QEMU would think there's
> only 1Gb and stop the vm. It would then find out there's actually 2Gb and vm
> downtime is 2s.
> Though the expected vm downtime is always not exactly the same as the
> true vm downtime, it should be caused by rapid dirty page rate, which is
> not predictable.
> Right?
> 

If you report vfio dirty pages 1Gb before stop-and-copy phase (i.e. in 
pre-copy phase), enter into stop-and-copy phase, how will you know which 
and how many pages are dirtied by device from the time when pages copied 
in pre-copy phase to that time where device is stopped? You don't have a 
way to know which pages are dirtied by device. So ideally device can 
write to all pages which are pinned. Then we have to mark all those 
pinned pages dirty in stop-and-copy phase, 1Gb, and copy to destination. 
Now you had copied same pages twice. Shouldn't we try not to copy pages 
twice?

Thanks,
Kirti
Kirti Wankhede Dec. 19, 2019, 6:42 p.m. UTC | #10
On 12/19/2019 3:09 AM, Alex Williamson wrote:
> On Tue, 17 Dec 2019 14:54:14 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 12/17/2019 10:45 AM, Yan Zhao wrote:
>>> On Tue, Dec 17, 2019 at 04:21:39AM +0800, Kirti Wankhede wrote:
>>>> +		} else if (range.flags &
>>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
>>>> +			uint64_t iommu_pgmask;
>>>> +			unsigned long pgshift = __ffs(range.pgsize);
>>>> +			unsigned long *bitmap;
>>>> +			long bsize;
>>>> +
>>>> +			iommu_pgmask =
>>>> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
>>>> +
>>>> +			if (((range.pgsize - 1) & iommu_pgmask) !=
>>>> +			    (range.pgsize - 1))
>>>> +				return -EINVAL;
>>>> +
>>>> +			if (range.iova & iommu_pgmask)
>>>> +				return -EINVAL;
>>>> +			if (!range.size || range.size > SIZE_MAX)
>>>> +				return -EINVAL;
>>>> +			if (range.iova + range.size < range.iova)
>>>> +				return -EINVAL;
>>>> +
>>>> +			bsize = verify_bitmap_size(range.size >> pgshift,
>>>> +						   range.bitmap_size);
>>>> +			if (bsize)
>>>> +				return ret;
>>>> +
>>>> +			bitmap = kmalloc(bsize, GFP_KERNEL);
>>>> +			if (!bitmap)
>>>> +				return -ENOMEM;
>>>> +
>>>> +			ret = copy_from_user(bitmap,
>>>> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
>>>> +			if (ret)
>>>> +				goto bitmap_exit;
>>>> +
>>>> +			iommu->dirty_page_tracking = false;
>>> why iommu->dirty_page_tracking is false here?
>>> suppose this ioctl can be called several times.
>>>    
>>
>> This ioctl can be called several times, but once this ioctl is called
>> that means vCPUs are stopped and VFIO devices are stopped (i.e. in
>> stop-and-copy phase) and dirty pages bitmap are being queried by user.
> 
> Do not assume how userspace works or its intent.  If dirty tracking is
> on, it should remain on until the user turns it off.  We cannot assume
> userspace uses a one-shot approach.  Thanks,
> 

Dirty tracking should be on until user turns it off or user reads 
bitmap, right? This ioctl is used to read bitmap.


Thanks,
Kirti
Alex Williamson Dec. 19, 2019, 6:56 p.m. UTC | #11
On Fri, 20 Dec 2019 00:12:30 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 12/19/2019 3:09 AM, Alex Williamson wrote:
> > On Tue, 17 Dec 2019 14:54:14 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 12/17/2019 10:45 AM, Yan Zhao wrote:  
> >>> On Tue, Dec 17, 2019 at 04:21:39AM +0800, Kirti Wankhede wrote:  
> >>>> +		} else if (range.flags &
> >>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> >>>> +			uint64_t iommu_pgmask;
> >>>> +			unsigned long pgshift = __ffs(range.pgsize);
> >>>> +			unsigned long *bitmap;
> >>>> +			long bsize;
> >>>> +
> >>>> +			iommu_pgmask =
> >>>> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> >>>> +
> >>>> +			if (((range.pgsize - 1) & iommu_pgmask) !=
> >>>> +			    (range.pgsize - 1))
> >>>> +				return -EINVAL;
> >>>> +
> >>>> +			if (range.iova & iommu_pgmask)
> >>>> +				return -EINVAL;
> >>>> +			if (!range.size || range.size > SIZE_MAX)
> >>>> +				return -EINVAL;
> >>>> +			if (range.iova + range.size < range.iova)
> >>>> +				return -EINVAL;
> >>>> +
> >>>> +			bsize = verify_bitmap_size(range.size >> pgshift,
> >>>> +						   range.bitmap_size);
> >>>> +			if (bsize)
> >>>> +				return ret;
> >>>> +
> >>>> +			bitmap = kmalloc(bsize, GFP_KERNEL);
> >>>> +			if (!bitmap)
> >>>> +				return -ENOMEM;
> >>>> +
> >>>> +			ret = copy_from_user(bitmap,
> >>>> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
> >>>> +			if (ret)
> >>>> +				goto bitmap_exit;
> >>>> +
> >>>> +			iommu->dirty_page_tracking = false;  
> >>> why iommu->dirty_page_tracking is false here?
> >>> suppose this ioctl can be called several times.
> >>>      
> >>
> >> This ioctl can be called several times, but once this ioctl is called
> >> that means vCPUs are stopped and VFIO devices are stopped (i.e. in
> >> stop-and-copy phase) and dirty pages bitmap are being queried by user.  
> > 
> > Do not assume how userspace works or its intent.  If dirty tracking is
> > on, it should remain on until the user turns it off.  We cannot assume
> > userspace uses a one-shot approach.  Thanks,
> >   
> 
> Dirty tracking should be on until user turns it off or user reads 
> bitmap, right? This ioctl is used to read bitmap.

No, dirty bitmap tracking is on until the user turns it off, period.
Retrieving the bitmap is probably only looking at a portion of the
container address space at a time, anything else would place
impractical requirements on the user allocated bitmap.  We also need to
support a usage model where the user is making successive calls, where
each should report pages dirtied since the previous call.  If the user
is required to re-enable tracking, there's an irreconcilable gap
between the call to retrieve the dirty bitmap and their opportunity to
re-enable dirty tracking.  It's fundamentally broken to automatically
disable tracking on read.  Thanks,

Alex
Yan Zhao Dec. 20, 2019, 12:58 a.m. UTC | #12
On Fri, Dec 20, 2019 at 12:21:45AM +0800, Kirti Wankhede wrote:
> 
> 
> On 12/19/2019 6:27 AM, Yan Zhao wrote:
> > On Thu, Dec 19, 2019 at 04:05:52AM +0800, Dr. David Alan Gilbert wrote:
> >> * Yan Zhao (yan.y.zhao@intel.com) wrote:
> >>> On Tue, Dec 17, 2019 at 07:47:05PM +0800, Kirti Wankhede wrote:
> >>>>
> >>>>
> >>>> On 12/17/2019 3:21 PM, Yan Zhao wrote:
> >>>>> On Tue, Dec 17, 2019 at 05:24:14PM +0800, Kirti Wankhede wrote:
> >>>>>>>>     
> >>>>>>>>     		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> >>>>>>>>     			-EFAULT : 0;
> >>>>>>>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> >>>>>>>> +		struct vfio_iommu_type1_dirty_bitmap range;
> >>>>>>>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> >>>>>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> >>>>>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> >>>>>>>> +		int ret;
> >>>>>>>> +
> >>>>>>>> +		if (!iommu->v2)
> >>>>>>>> +			return -EACCES;
> >>>>>>>> +
> >>>>>>>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> >>>>>>>> +				    bitmap);
> >>>>>>>> +
> >>>>>>>> +		if (copy_from_user(&range, (void __user *)arg, minsz))
> >>>>>>>> +			return -EFAULT;
> >>>>>>>> +
> >>>>>>>> +		if (range.argsz < minsz || range.flags & ~mask)
> >>>>>>>> +			return -EINVAL;
> >>>>>>>> +
> >>>>>>>> +		if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> >>>>>>>> +			iommu->dirty_page_tracking = true;
> >>>>>>>> +			return 0;
> >>>>>>>> +		} else if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> >>>>>>>> +			iommu->dirty_page_tracking = false;
> >>>>>>>> +
> >>>>>>>> +			mutex_lock(&iommu->lock);
> >>>>>>>> +			vfio_remove_unpinned_from_dma_list(iommu);
> >>>>>>>> +			mutex_unlock(&iommu->lock);
> >>>>>>>> +			return 0;
> >>>>>>>> +
> >>>>>>>> +		} else if (range.flags &
> >>>>>>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> >>>>>>>> +			uint64_t iommu_pgmask;
> >>>>>>>> +			unsigned long pgshift = __ffs(range.pgsize);
> >>>>>>>> +			unsigned long *bitmap;
> >>>>>>>> +			long bsize;
> >>>>>>>> +
> >>>>>>>> +			iommu_pgmask =
> >>>>>>>> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> >>>>>>>> +
> >>>>>>>> +			if (((range.pgsize - 1) & iommu_pgmask) !=
> >>>>>>>> +			    (range.pgsize - 1))
> >>>>>>>> +				return -EINVAL;
> >>>>>>>> +
> >>>>>>>> +			if (range.iova & iommu_pgmask)
> >>>>>>>> +				return -EINVAL;
> >>>>>>>> +			if (!range.size || range.size > SIZE_MAX)
> >>>>>>>> +				return -EINVAL;
> >>>>>>>> +			if (range.iova + range.size < range.iova)
> >>>>>>>> +				return -EINVAL;
> >>>>>>>> +
> >>>>>>>> +			bsize = verify_bitmap_size(range.size >> pgshift,
> >>>>>>>> +						   range.bitmap_size);
> >>>>>>>> +			if (bsize)
> >>>>>>>> +				return ret;
> >>>>>>>> +
> >>>>>>>> +			bitmap = kmalloc(bsize, GFP_KERNEL);
> >>>>>>>> +			if (!bitmap)
> >>>>>>>> +				return -ENOMEM;
> >>>>>>>> +
> >>>>>>>> +			ret = copy_from_user(bitmap,
> >>>>>>>> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
> >>>>>>>> +			if (ret)
> >>>>>>>> +				goto bitmap_exit;
> >>>>>>>> +
> >>>>>>>> +			iommu->dirty_page_tracking = false;
> >>>>>>> why iommu->dirty_page_tracking is false here?
> >>>>>>> suppose this ioctl can be called several times.
> >>>>>>>
> >>>>>>
> >>>>>> This ioctl can be called several times, but once this ioctl is called
> >>>>>> that means vCPUs are stopped and VFIO devices are stopped (i.e. in
> >>>>>> stop-and-copy phase) and dirty pages bitmap are being queried by user.
> >>>>>>
> >>>>> can't agree that VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP can only be
> >>>>> called in stop-and-copy phase.
> >>>>> As stated in last version, this will cause QEMU to get a wrong expectation
> >>>>> of VM downtime and this is also the reason for previously pinned pages
> >>>>> before log_sync cannot be treated as dirty. If this get bitmap ioctl can
> >>>>> be called early in save_setup phase, then it's no problem even all ram
> >>>>> is dirty.
> >>>>>
> >>>>
> >>>> Device can also write to pages which are pinned, and then there is no
> >>>> way to know pages dirtied by device during pre-copy phase.
> >>>> If user ask dirty bitmap in per-copy phase, even then user will have to
> >>>> query dirty bitmap in stop-and-copy phase where this will be superset
> >>>> including all pages reported during pre-copy. Then instead of copying
> >>>> all pages twice, its better to do it once during stop-and-copy phase.
> >>>>
> >>> I think the flow should be like this:
> >>> 1. save_setup --> GET_BITMAP ioctl --> return bitmap for currently + previously
> >>> pinned pages and clean all previously pinned pages
> >>>
> >>> 2. save_pending --> GET_BITMAP ioctl  --> return bitmap of (currently
> >>> pinned pages + previously pinned pages since last clean) and clean all
> >>> previously pinned pages
> >>>
> >>> 3. save_complete_precopy --> GET_BITMAP ioctl --> return bitmap of (currently
> >>> pinned pages + previously pinned pages since last clean) and clean all
> >>> previously pinned pages
> >>>
> >>>
> >>> Copying pinned pages multiple times is unavoidable because those pinned pages
> >>> are always treated as dirty. That's per vendor's implementation.
> >>> But if the pinned pages are not reported as dirty before stop-and-copy phase,
> >>> QEMU would think dirty pages has converged
> >>> and enter blackout phase, making downtime_limit severely incorrect.
> >>
> >> I'm not sure it's any worse.
> >> I *think* we do a last sync after we've decided to go to stop-and-copy;
> >> wont that then mark all those pages as dirty again, so it'll have the
> >> same behaviour?
> > No. something will be different.
> > currently, in kirti's implementation, if GET_BITMAP ioctl is called only
> > once in stop-and-copy phase, then before that phase, QEMU does not know those
> > pages are dirty.
> > If we can report those dirty pages earlier before stop-and-copy phase,
> > QEMU can at least copy other pages to reduce dirty pages to below threshold.
> > 
> > Take a example, let's assume those vfio dirty pages is 1Gb, and network speed is
> > also 1Gb. Expected vm downtime is 1s.
> > If before stop-and-copy phase, dirty pages produced by other pages is
> > also 1Gb. To meet the expected vm downtime, QEMU should copy pages to
> > let dirty pages be less than 1Gb, otherwise, it should not complete live
> > migration.
> > If vfio does not report this 1Gb dirty pages, QEMU would think there's
> > only 1Gb and stop the vm. It would then find out there's actually 2Gb and vm
> > downtime is 2s.
> > Though the expected vm downtime is always not exactly the same as the
> > true vm downtime, it should be caused by rapid dirty page rate, which is
> > not predictable.
> > Right?
> > 
> 
> If you report vfio dirty pages 1Gb before stop-and-copy phase (i.e. in 
> pre-copy phase), enter into stop-and-copy phase, how will you know which 
> and how many pages are dirtied by device from the time when pages copied 
> in pre-copy phase to that time where device is stopped? You don't have a 
> way to know which pages are dirtied by device. So ideally device can 
> write to all pages which are pinned. Then we have to mark all those 
> pinned pages dirty in stop-and-copy phase, 1Gb, and copy to destination. 
> Now you had copied same pages twice. Shouldn't we try not to copy pages 
> twice?
>
For mdevs who reply on treating all pinned pages as dirty pages, as above
mentioned condition, repeated page copying can be avoided by
(1) adding an ioctl to get size of dirty pages and report it to QEMU through
vfio_save_pending() interface
(2) in this GET_BITMAP ioctl, empty bitmap is returned until stop-and-copy phase.

But for devices who know fine-grained dirty pages, (e.g. devices have ditry page
tracking in hardware), GET_BITMAP ioctl has to return incremental dirty
bitmaps in each iteration and step (1) should return 0 to avoid 2*size
of dirty page reported.

Thanks
Yan
Dr. David Alan Gilbert Jan. 3, 2020, 7:44 p.m. UTC | #13
* Yan Zhao (yan.y.zhao@intel.com) wrote:
> On Fri, Dec 20, 2019 at 12:21:45AM +0800, Kirti Wankhede wrote:
> > 
> > 
> > On 12/19/2019 6:27 AM, Yan Zhao wrote:
> > > On Thu, Dec 19, 2019 at 04:05:52AM +0800, Dr. David Alan Gilbert wrote:
> > >> * Yan Zhao (yan.y.zhao@intel.com) wrote:
> > >>> On Tue, Dec 17, 2019 at 07:47:05PM +0800, Kirti Wankhede wrote:
> > >>>>
> > >>>>
> > >>>> On 12/17/2019 3:21 PM, Yan Zhao wrote:
> > >>>>> On Tue, Dec 17, 2019 at 05:24:14PM +0800, Kirti Wankhede wrote:
> > >>>>>>>>     
> > >>>>>>>>     		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> > >>>>>>>>     			-EFAULT : 0;
> > >>>>>>>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> > >>>>>>>> +		struct vfio_iommu_type1_dirty_bitmap range;
> > >>>>>>>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> > >>>>>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> > >>>>>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> > >>>>>>>> +		int ret;
> > >>>>>>>> +
> > >>>>>>>> +		if (!iommu->v2)
> > >>>>>>>> +			return -EACCES;
> > >>>>>>>> +
> > >>>>>>>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> > >>>>>>>> +				    bitmap);
> > >>>>>>>> +
> > >>>>>>>> +		if (copy_from_user(&range, (void __user *)arg, minsz))
> > >>>>>>>> +			return -EFAULT;
> > >>>>>>>> +
> > >>>>>>>> +		if (range.argsz < minsz || range.flags & ~mask)
> > >>>>>>>> +			return -EINVAL;
> > >>>>>>>> +
> > >>>>>>>> +		if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> > >>>>>>>> +			iommu->dirty_page_tracking = true;
> > >>>>>>>> +			return 0;
> > >>>>>>>> +		} else if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> > >>>>>>>> +			iommu->dirty_page_tracking = false;
> > >>>>>>>> +
> > >>>>>>>> +			mutex_lock(&iommu->lock);
> > >>>>>>>> +			vfio_remove_unpinned_from_dma_list(iommu);
> > >>>>>>>> +			mutex_unlock(&iommu->lock);
> > >>>>>>>> +			return 0;
> > >>>>>>>> +
> > >>>>>>>> +		} else if (range.flags &
> > >>>>>>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> > >>>>>>>> +			uint64_t iommu_pgmask;
> > >>>>>>>> +			unsigned long pgshift = __ffs(range.pgsize);
> > >>>>>>>> +			unsigned long *bitmap;
> > >>>>>>>> +			long bsize;
> > >>>>>>>> +
> > >>>>>>>> +			iommu_pgmask =
> > >>>>>>>> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> > >>>>>>>> +
> > >>>>>>>> +			if (((range.pgsize - 1) & iommu_pgmask) !=
> > >>>>>>>> +			    (range.pgsize - 1))
> > >>>>>>>> +				return -EINVAL;
> > >>>>>>>> +
> > >>>>>>>> +			if (range.iova & iommu_pgmask)
> > >>>>>>>> +				return -EINVAL;
> > >>>>>>>> +			if (!range.size || range.size > SIZE_MAX)
> > >>>>>>>> +				return -EINVAL;
> > >>>>>>>> +			if (range.iova + range.size < range.iova)
> > >>>>>>>> +				return -EINVAL;
> > >>>>>>>> +
> > >>>>>>>> +			bsize = verify_bitmap_size(range.size >> pgshift,
> > >>>>>>>> +						   range.bitmap_size);
> > >>>>>>>> +			if (bsize)
> > >>>>>>>> +				return ret;
> > >>>>>>>> +
> > >>>>>>>> +			bitmap = kmalloc(bsize, GFP_KERNEL);
> > >>>>>>>> +			if (!bitmap)
> > >>>>>>>> +				return -ENOMEM;
> > >>>>>>>> +
> > >>>>>>>> +			ret = copy_from_user(bitmap,
> > >>>>>>>> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
> > >>>>>>>> +			if (ret)
> > >>>>>>>> +				goto bitmap_exit;
> > >>>>>>>> +
> > >>>>>>>> +			iommu->dirty_page_tracking = false;
> > >>>>>>> why iommu->dirty_page_tracking is false here?
> > >>>>>>> suppose this ioctl can be called several times.
> > >>>>>>>
> > >>>>>>
> > >>>>>> This ioctl can be called several times, but once this ioctl is called
> > >>>>>> that means vCPUs are stopped and VFIO devices are stopped (i.e. in
> > >>>>>> stop-and-copy phase) and dirty pages bitmap are being queried by user.
> > >>>>>>
> > >>>>> can't agree that VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP can only be
> > >>>>> called in stop-and-copy phase.
> > >>>>> As stated in last version, this will cause QEMU to get a wrong expectation
> > >>>>> of VM downtime and this is also the reason for previously pinned pages
> > >>>>> before log_sync cannot be treated as dirty. If this get bitmap ioctl can
> > >>>>> be called early in save_setup phase, then it's no problem even all ram
> > >>>>> is dirty.
> > >>>>>
> > >>>>
> > >>>> Device can also write to pages which are pinned, and then there is no
> > >>>> way to know pages dirtied by device during pre-copy phase.
> > >>>> If user ask dirty bitmap in per-copy phase, even then user will have to
> > >>>> query dirty bitmap in stop-and-copy phase where this will be superset
> > >>>> including all pages reported during pre-copy. Then instead of copying
> > >>>> all pages twice, its better to do it once during stop-and-copy phase.
> > >>>>
> > >>> I think the flow should be like this:
> > >>> 1. save_setup --> GET_BITMAP ioctl --> return bitmap for currently + previously
> > >>> pinned pages and clean all previously pinned pages
> > >>>
> > >>> 2. save_pending --> GET_BITMAP ioctl  --> return bitmap of (currently
> > >>> pinned pages + previously pinned pages since last clean) and clean all
> > >>> previously pinned pages
> > >>>
> > >>> 3. save_complete_precopy --> GET_BITMAP ioctl --> return bitmap of (currently
> > >>> pinned pages + previously pinned pages since last clean) and clean all
> > >>> previously pinned pages
> > >>>
> > >>>
> > >>> Copying pinned pages multiple times is unavoidable because those pinned pages
> > >>> are always treated as dirty. That's per vendor's implementation.
> > >>> But if the pinned pages are not reported as dirty before stop-and-copy phase,
> > >>> QEMU would think dirty pages has converged
> > >>> and enter blackout phase, making downtime_limit severely incorrect.
> > >>
> > >> I'm not sure it's any worse.
> > >> I *think* we do a last sync after we've decided to go to stop-and-copy;
> > >> wont that then mark all those pages as dirty again, so it'll have the
> > >> same behaviour?
> > > No. something will be different.
> > > currently, in kirti's implementation, if GET_BITMAP ioctl is called only
> > > once in stop-and-copy phase, then before that phase, QEMU does not know those
> > > pages are dirty.
> > > If we can report those dirty pages earlier before stop-and-copy phase,
> > > QEMU can at least copy other pages to reduce dirty pages to below threshold.
> > > 
> > > Take a example, let's assume those vfio dirty pages is 1Gb, and network speed is
> > > also 1Gb. Expected vm downtime is 1s.
> > > If before stop-and-copy phase, dirty pages produced by other pages is
> > > also 1Gb. To meet the expected vm downtime, QEMU should copy pages to
> > > let dirty pages be less than 1Gb, otherwise, it should not complete live
> > > migration.
> > > If vfio does not report this 1Gb dirty pages, QEMU would think there's
> > > only 1Gb and stop the vm. It would then find out there's actually 2Gb and vm
> > > downtime is 2s.
> > > Though the expected vm downtime is always not exactly the same as the
> > > true vm downtime, it should be caused by rapid dirty page rate, which is
> > > not predictable.
> > > Right?
> > > 
> > 
> > If you report vfio dirty pages 1Gb before stop-and-copy phase (i.e. in 
> > pre-copy phase), enter into stop-and-copy phase, how will you know which 
> > and how many pages are dirtied by device from the time when pages copied 
> > in pre-copy phase to that time where device is stopped? You don't have a 
> > way to know which pages are dirtied by device. So ideally device can 
> > write to all pages which are pinned. Then we have to mark all those 
> > pinned pages dirty in stop-and-copy phase, 1Gb, and copy to destination. 
> > Now you had copied same pages twice. Shouldn't we try not to copy pages 
> > twice?
> >
> For mdevs who reply on treating all pinned pages as dirty pages, as above
> mentioned condition, repeated page copying can be avoided by
> (1) adding an ioctl to get size of dirty pages and report it to QEMU through
> vfio_save_pending() interface
> (2) in this GET_BITMAP ioctl, empty bitmap is returned until stop-and-copy phase.
> 
> But for devices who know fine-grained dirty pages, (e.g. devices have ditry page
> tracking in hardware), GET_BITMAP ioctl has to return incremental dirty
> bitmaps in each iteration and step (1) should return 0 to avoid 2*size
> of dirty page reported.

I think you're right that something is needed; but it's starting to get
a bit complex.

As well as the size, you need the addresses to know which areas to avoid
- it's not just a simple size, because I think you only care about areas
that the guest has registered/pinned to the device; so it would have to
be a list somehow.  So then if you got that list you'd add the size
to the amount you knew was pending, and avoid sending that area until
stop-and-copy.

However, if it is only areas that the guest has registered, then what
happens if the guest (de)registers an area during the migration process?
That says the list itself has to be refreshed.  So it's getting messy.

Dave

> Thanks
> Yan
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Yan Zhao Jan. 4, 2020, 3:53 a.m. UTC | #14
On Sat, Jan 04, 2020 at 03:44:34AM +0800, Dr. David Alan Gilbert wrote:
> * Yan Zhao (yan.y.zhao@intel.com) wrote:
> > On Fri, Dec 20, 2019 at 12:21:45AM +0800, Kirti Wankhede wrote:
> > > 
> > > 
> > > On 12/19/2019 6:27 AM, Yan Zhao wrote:
> > > > On Thu, Dec 19, 2019 at 04:05:52AM +0800, Dr. David Alan Gilbert wrote:
> > > >> * Yan Zhao (yan.y.zhao@intel.com) wrote:
> > > >>> On Tue, Dec 17, 2019 at 07:47:05PM +0800, Kirti Wankhede wrote:
> > > >>>>
> > > >>>>
> > > >>>> On 12/17/2019 3:21 PM, Yan Zhao wrote:
> > > >>>>> On Tue, Dec 17, 2019 at 05:24:14PM +0800, Kirti Wankhede wrote:
> > > >>>>>>>>     
> > > >>>>>>>>     		return copy_to_user((void __user *)arg, &unmap, minsz) ?
> > > >>>>>>>>     			-EFAULT : 0;
> > > >>>>>>>> +	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
> > > >>>>>>>> +		struct vfio_iommu_type1_dirty_bitmap range;
> > > >>>>>>>> +		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
> > > >>>>>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
> > > >>>>>>>> +				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> > > >>>>>>>> +		int ret;
> > > >>>>>>>> +
> > > >>>>>>>> +		if (!iommu->v2)
> > > >>>>>>>> +			return -EACCES;
> > > >>>>>>>> +
> > > >>>>>>>> +		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
> > > >>>>>>>> +				    bitmap);
> > > >>>>>>>> +
> > > >>>>>>>> +		if (copy_from_user(&range, (void __user *)arg, minsz))
> > > >>>>>>>> +			return -EFAULT;
> > > >>>>>>>> +
> > > >>>>>>>> +		if (range.argsz < minsz || range.flags & ~mask)
> > > >>>>>>>> +			return -EINVAL;
> > > >>>>>>>> +
> > > >>>>>>>> +		if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
> > > >>>>>>>> +			iommu->dirty_page_tracking = true;
> > > >>>>>>>> +			return 0;
> > > >>>>>>>> +		} else if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
> > > >>>>>>>> +			iommu->dirty_page_tracking = false;
> > > >>>>>>>> +
> > > >>>>>>>> +			mutex_lock(&iommu->lock);
> > > >>>>>>>> +			vfio_remove_unpinned_from_dma_list(iommu);
> > > >>>>>>>> +			mutex_unlock(&iommu->lock);
> > > >>>>>>>> +			return 0;
> > > >>>>>>>> +
> > > >>>>>>>> +		} else if (range.flags &
> > > >>>>>>>> +				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
> > > >>>>>>>> +			uint64_t iommu_pgmask;
> > > >>>>>>>> +			unsigned long pgshift = __ffs(range.pgsize);
> > > >>>>>>>> +			unsigned long *bitmap;
> > > >>>>>>>> +			long bsize;
> > > >>>>>>>> +
> > > >>>>>>>> +			iommu_pgmask =
> > > >>>>>>>> +			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
> > > >>>>>>>> +
> > > >>>>>>>> +			if (((range.pgsize - 1) & iommu_pgmask) !=
> > > >>>>>>>> +			    (range.pgsize - 1))
> > > >>>>>>>> +				return -EINVAL;
> > > >>>>>>>> +
> > > >>>>>>>> +			if (range.iova & iommu_pgmask)
> > > >>>>>>>> +				return -EINVAL;
> > > >>>>>>>> +			if (!range.size || range.size > SIZE_MAX)
> > > >>>>>>>> +				return -EINVAL;
> > > >>>>>>>> +			if (range.iova + range.size < range.iova)
> > > >>>>>>>> +				return -EINVAL;
> > > >>>>>>>> +
> > > >>>>>>>> +			bsize = verify_bitmap_size(range.size >> pgshift,
> > > >>>>>>>> +						   range.bitmap_size);
> > > >>>>>>>> +			if (bsize)
> > > >>>>>>>> +				return ret;
> > > >>>>>>>> +
> > > >>>>>>>> +			bitmap = kmalloc(bsize, GFP_KERNEL);
> > > >>>>>>>> +			if (!bitmap)
> > > >>>>>>>> +				return -ENOMEM;
> > > >>>>>>>> +
> > > >>>>>>>> +			ret = copy_from_user(bitmap,
> > > >>>>>>>> +			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
> > > >>>>>>>> +			if (ret)
> > > >>>>>>>> +				goto bitmap_exit;
> > > >>>>>>>> +
> > > >>>>>>>> +			iommu->dirty_page_tracking = false;
> > > >>>>>>> why iommu->dirty_page_tracking is false here?
> > > >>>>>>> suppose this ioctl can be called several times.
> > > >>>>>>>
> > > >>>>>>
> > > >>>>>> This ioctl can be called several times, but once this ioctl is called
> > > >>>>>> that means vCPUs are stopped and VFIO devices are stopped (i.e. in
> > > >>>>>> stop-and-copy phase) and dirty pages bitmap are being queried by user.
> > > >>>>>>
> > > >>>>> can't agree that VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP can only be
> > > >>>>> called in stop-and-copy phase.
> > > >>>>> As stated in last version, this will cause QEMU to get a wrong expectation
> > > >>>>> of VM downtime and this is also the reason for previously pinned pages
> > > >>>>> before log_sync cannot be treated as dirty. If this get bitmap ioctl can
> > > >>>>> be called early in save_setup phase, then it's no problem even all ram
> > > >>>>> is dirty.
> > > >>>>>
> > > >>>>
> > > >>>> Device can also write to pages which are pinned, and then there is no
> > > >>>> way to know pages dirtied by device during pre-copy phase.
> > > >>>> If user ask dirty bitmap in per-copy phase, even then user will have to
> > > >>>> query dirty bitmap in stop-and-copy phase where this will be superset
> > > >>>> including all pages reported during pre-copy. Then instead of copying
> > > >>>> all pages twice, its better to do it once during stop-and-copy phase.
> > > >>>>
> > > >>> I think the flow should be like this:
> > > >>> 1. save_setup --> GET_BITMAP ioctl --> return bitmap for currently + previously
> > > >>> pinned pages and clean all previously pinned pages
> > > >>>
> > > >>> 2. save_pending --> GET_BITMAP ioctl  --> return bitmap of (currently
> > > >>> pinned pages + previously pinned pages since last clean) and clean all
> > > >>> previously pinned pages
> > > >>>
> > > >>> 3. save_complete_precopy --> GET_BITMAP ioctl --> return bitmap of (currently
> > > >>> pinned pages + previously pinned pages since last clean) and clean all
> > > >>> previously pinned pages
> > > >>>
> > > >>>
> > > >>> Copying pinned pages multiple times is unavoidable because those pinned pages
> > > >>> are always treated as dirty. That's per vendor's implementation.
> > > >>> But if the pinned pages are not reported as dirty before stop-and-copy phase,
> > > >>> QEMU would think dirty pages has converged
> > > >>> and enter blackout phase, making downtime_limit severely incorrect.
> > > >>
> > > >> I'm not sure it's any worse.
> > > >> I *think* we do a last sync after we've decided to go to stop-and-copy;
> > > >> wont that then mark all those pages as dirty again, so it'll have the
> > > >> same behaviour?
> > > > No. something will be different.
> > > > currently, in kirti's implementation, if GET_BITMAP ioctl is called only
> > > > once in stop-and-copy phase, then before that phase, QEMU does not know those
> > > > pages are dirty.
> > > > If we can report those dirty pages earlier before stop-and-copy phase,
> > > > QEMU can at least copy other pages to reduce dirty pages to below threshold.
> > > > 
> > > > Take a example, let's assume those vfio dirty pages is 1Gb, and network speed is
> > > > also 1Gb. Expected vm downtime is 1s.
> > > > If before stop-and-copy phase, dirty pages produced by other pages is
> > > > also 1Gb. To meet the expected vm downtime, QEMU should copy pages to
> > > > let dirty pages be less than 1Gb, otherwise, it should not complete live
> > > > migration.
> > > > If vfio does not report this 1Gb dirty pages, QEMU would think there's
> > > > only 1Gb and stop the vm. It would then find out there's actually 2Gb and vm
> > > > downtime is 2s.
> > > > Though the expected vm downtime is always not exactly the same as the
> > > > true vm downtime, it should be caused by rapid dirty page rate, which is
> > > > not predictable.
> > > > Right?
> > > > 
> > > 
> > > If you report vfio dirty pages 1Gb before stop-and-copy phase (i.e. in 
> > > pre-copy phase), enter into stop-and-copy phase, how will you know which 
> > > and how many pages are dirtied by device from the time when pages copied 
> > > in pre-copy phase to that time where device is stopped? You don't have a 
> > > way to know which pages are dirtied by device. So ideally device can 
> > > write to all pages which are pinned. Then we have to mark all those 
> > > pinned pages dirty in stop-and-copy phase, 1Gb, and copy to destination. 
> > > Now you had copied same pages twice. Shouldn't we try not to copy pages 
> > > twice?
> > >
> > For mdevs who reply on treating all pinned pages as dirty pages, as above
> > mentioned condition, repeated page copying can be avoided by
> > (1) adding an ioctl to get size of dirty pages and report it to QEMU through
> > vfio_save_pending() interface
> > (2) in this GET_BITMAP ioctl, empty bitmap is returned until stop-and-copy phase.
> > 
> > But for devices who know fine-grained dirty pages, (e.g. devices have ditry page
> > tracking in hardware), GET_BITMAP ioctl has to return incremental dirty
> > bitmaps in each iteration and step (1) should return 0 to avoid 2*size
> > of dirty page reported.
> 
> I think you're right that something is needed; but it's starting to get
> a bit complex.
> 
> As well as the size, you need the addresses to know which areas to avoid
> - it's not just a simple size, because I think you only care about areas
> that the guest has registered/pinned to the device; so it would have to
> be a list somehow.  So then if you got that list you'd add the size
> to the amount you knew was pending, and avoid sending that area until
> stop-and-copy.
> 
> However, if it is only areas that the guest has registered, then what
> happens if the guest (de)registers an area during the migration process?
> That says the list itself has to be refreshed.  So it's getting messy.
> 
> Dave
>

hi Dave,
I think this GET_BITMAP interface is only for system memory (not device
internal memory) and Kirti has already maintained a vpfn list to record
dirty pages list. It's constantly refreshed during migration.
a. for devices who do not know fine-grained dirty pages,
they can report dirty page size (in vfio module) and not report dirty bitmap
to qemu ram module until stop-and-copy phase.
As it does not report dirty bitmaps, dirty status of unpinned pages during
migration will not be cleared.
b. for devices who know fine-grained dirty pages,
it can report dirty page size (to qemu ram module, not through vfio module)
by reporting dirty bitmap to qemu ram module.
dirty status of unpinned pages during migraion is cleared after each iteration.

The key is to differentiate whether a device has the ability to track fine-grained
dirty pages.
Though more complex than current implementation, it's right. Agree? :)

Thanks
Yan

> > 
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
diff mbox series

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 3f6b04f2334f..264449654d3f 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -70,6 +70,7 @@  struct vfio_iommu {
 	unsigned int		dma_avail;
 	bool			v2;
 	bool			nesting;
+	bool			dirty_page_tracking;
 };
 
 struct vfio_domain {
@@ -112,6 +113,7 @@  struct vfio_pfn {
 	dma_addr_t		iova;		/* Device address */
 	unsigned long		pfn;		/* Host pfn */
 	atomic_t		ref_count;
+	bool			unpinned;
 };
 
 struct vfio_regions {
@@ -244,6 +246,32 @@  static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
 	kfree(vpfn);
 }
 
+static void vfio_remove_unpinned_from_pfn_list(struct vfio_dma *dma, bool warn)
+{
+	struct rb_node *n = rb_first(&dma->pfn_list);
+
+	for (; n; n = rb_next(n)) {
+		struct vfio_pfn *vpfn = rb_entry(n, struct vfio_pfn, node);
+
+		if (warn)
+			WARN_ON_ONCE(vpfn->unpinned);
+
+		if (vpfn->unpinned)
+			vfio_remove_from_pfn_list(dma, vpfn);
+	}
+}
+
+static void vfio_remove_unpinned_from_dma_list(struct vfio_iommu *iommu)
+{
+	struct rb_node *n = rb_first(&iommu->dma_list);
+
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+		vfio_remove_unpinned_from_pfn_list(dma, false);
+	}
+}
+
 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
 					       unsigned long iova)
 {
@@ -254,13 +282,17 @@  static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
 	return vpfn;
 }
 
-static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
+static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn,
+				  bool dirty_tracking)
 {
 	int ret = 0;
 
 	if (atomic_dec_and_test(&vpfn->ref_count)) {
 		ret = put_pfn(vpfn->pfn, dma->prot);
-		vfio_remove_from_pfn_list(dma, vpfn);
+		if (dirty_tracking)
+			vpfn->unpinned = true;
+		else
+			vfio_remove_from_pfn_list(dma, vpfn);
 	}
 	return ret;
 }
@@ -504,7 +536,7 @@  static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
 }
 
 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
-				    bool do_accounting)
+				    bool do_accounting, bool dirty_tracking)
 {
 	int unlocked;
 	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
@@ -512,7 +544,10 @@  static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
 	if (!vpfn)
 		return 0;
 
-	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
+	if (vpfn->unpinned)
+		return 0;
+
+	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn, dirty_tracking);
 
 	if (do_accounting)
 		vfio_lock_acct(dma, -unlocked, true);
@@ -583,7 +618,8 @@  static int vfio_iommu_type1_pin_pages(void *iommu_data,
 
 		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
 		if (ret) {
-			vfio_unpin_page_external(dma, iova, do_accounting);
+			vfio_unpin_page_external(dma, iova, do_accounting,
+						 false);
 			goto pin_unwind;
 		}
 	}
@@ -598,7 +634,7 @@  static int vfio_iommu_type1_pin_pages(void *iommu_data,
 
 		iova = user_pfn[j] << PAGE_SHIFT;
 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
-		vfio_unpin_page_external(dma, iova, do_accounting);
+		vfio_unpin_page_external(dma, iova, do_accounting, false);
 		phys_pfn[j] = 0;
 	}
 pin_done:
@@ -632,7 +668,8 @@  static int vfio_iommu_type1_unpin_pages(void *iommu_data,
 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 		if (!dma)
 			goto unpin_exit;
-		vfio_unpin_page_external(dma, iova, do_accounting);
+		vfio_unpin_page_external(dma, iova, do_accounting,
+					 iommu->dirty_page_tracking);
 	}
 
 unpin_exit:
@@ -850,6 +887,88 @@  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 	return bitmap;
 }
 
+/*
+ * start_iova is the reference from where bitmaping started. This is called
+ * from DMA_UNMAP where start_iova can be different than iova
+ */
+
+static void vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
+				  size_t size, uint64_t pgsize,
+				  dma_addr_t start_iova, unsigned long *bitmap)
+{
+	struct vfio_dma *dma;
+	dma_addr_t i = iova;
+	unsigned long pgshift = __ffs(pgsize);
+
+	while ((dma = vfio_find_dma(iommu, i, pgsize))) {
+		/* mark all pages dirty if all pages are pinned and mapped. */
+		if (dma->iommu_mapped) {
+			dma_addr_t iova_limit;
+
+			iova_limit = (dma->iova + dma->size) < (iova + size) ?
+				     (dma->iova + dma->size) : (iova + size);
+
+			for (; i < iova_limit; i += pgsize) {
+				unsigned int start;
+
+				start = (i - start_iova) >> pgshift;
+
+				__bitmap_set(bitmap, start, 1);
+			}
+			if (i >= iova + size)
+				return;
+		} else {
+			struct rb_node *n = rb_first(&dma->pfn_list);
+			bool found = false;
+
+			for (; n; n = rb_next(n)) {
+				struct vfio_pfn *vpfn = rb_entry(n,
+							struct vfio_pfn, node);
+				if (vpfn->iova >= i) {
+					found = true;
+					break;
+				}
+			}
+
+			if (!found) {
+				i += dma->size;
+				continue;
+			}
+
+			for (; n; n = rb_next(n)) {
+				unsigned int start;
+				struct vfio_pfn *vpfn = rb_entry(n,
+							struct vfio_pfn, node);
+
+				if (vpfn->iova >= iova + size)
+					return;
+
+				start = (vpfn->iova - start_iova) >> pgshift;
+
+				__bitmap_set(bitmap, start, 1);
+
+				i = vpfn->iova + pgsize;
+			}
+		}
+		vfio_remove_unpinned_from_pfn_list(dma, false);
+	}
+}
+
+static long verify_bitmap_size(unsigned long npages, unsigned long bitmap_size)
+{
+	long bsize;
+
+	if (!bitmap_size || bitmap_size > SIZE_MAX)
+		return -EINVAL;
+
+	bsize = ALIGN(npages, BITS_PER_LONG) / sizeof(unsigned long);
+
+	if (bitmap_size < bsize)
+		return -EINVAL;
+
+	return bsize;
+}
+
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			     struct vfio_iommu_type1_dma_unmap *unmap)
 {
@@ -2298,6 +2417,83 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 
 		return copy_to_user((void __user *)arg, &unmap, minsz) ?
 			-EFAULT : 0;
+	} else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
+		struct vfio_iommu_type1_dirty_bitmap range;
+		uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
+				VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
+				VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
+		int ret;
+
+		if (!iommu->v2)
+			return -EACCES;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
+				    bitmap);
+
+		if (copy_from_user(&range, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (range.argsz < minsz || range.flags & ~mask)
+			return -EINVAL;
+
+		if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
+			iommu->dirty_page_tracking = true;
+			return 0;
+		} else if (range.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
+			iommu->dirty_page_tracking = false;
+
+			mutex_lock(&iommu->lock);
+			vfio_remove_unpinned_from_dma_list(iommu);
+			mutex_unlock(&iommu->lock);
+			return 0;
+
+		} else if (range.flags &
+				 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
+			uint64_t iommu_pgmask;
+			unsigned long pgshift = __ffs(range.pgsize);
+			unsigned long *bitmap;
+			long bsize;
+
+			iommu_pgmask =
+			 ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
+
+			if (((range.pgsize - 1) & iommu_pgmask) !=
+			    (range.pgsize - 1))
+				return -EINVAL;
+
+			if (range.iova & iommu_pgmask)
+				return -EINVAL;
+			if (!range.size || range.size > SIZE_MAX)
+				return -EINVAL;
+			if (range.iova + range.size < range.iova)
+				return -EINVAL;
+
+			bsize = verify_bitmap_size(range.size >> pgshift,
+						   range.bitmap_size);
+			if (bsize)
+				return ret;
+
+			bitmap = kmalloc(bsize, GFP_KERNEL);
+			if (!bitmap)
+				return -ENOMEM;
+
+			ret = copy_from_user(bitmap,
+			     (void __user *)range.bitmap, bsize) ? -EFAULT : 0;
+			if (ret)
+				goto bitmap_exit;
+
+			iommu->dirty_page_tracking = false;
+			mutex_lock(&iommu->lock);
+			vfio_iova_dirty_bitmap(iommu, range.iova, range.size,
+					     range.pgsize, range.iova, bitmap);
+			mutex_unlock(&iommu->lock);
+
+			ret = copy_to_user((void __user *)range.bitmap, bitmap,
+					   range.bitmap_size) ? -EFAULT : 0;
+bitmap_exit:
+			kfree(bitmap);
+			return ret;
+		}
 	}
 
 	return -ENOTTY;