[v11,Kernel,6/6] vfio: Selective dirty page tracking if IOMMU backed device pins pages

Message ID: 1576602651-15430-7-git-send-email-kwankhede@nvidia.com (mailing list archive)
State: New, archived
Series: KABIs to support migration for VFIO devices

Commit Message

Kirti Wankhede Dec. 17, 2019, 5:10 p.m. UTC
Track the dirty pages reporting capability for each vfio_device by setting
a capability flag when vfio_pin_pages() is called for that device.

In the vfio_iommu_type1 module, while creating the dirty pages bitmap, check
whether an IOMMU backed device is present in the container. If an IOMMU
backed device is present, check the dirty pages reporting capability of each
vfio device in the container. If all vfio devices are capable of reporting
dirty pages by pinning pages through the external API, create the bitmap
from pinned pages only. If an IOMMU backed device is present in the
container and any one device is not able to report dirty pages, mark all
pages as dirty.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
---
 drivers/vfio/vfio.c             | 33 +++++++++++++++++++++++++++++++
 drivers/vfio/vfio_iommu_type1.c | 44 +++++++++++++++++++++++++++++++++++++++--
 include/linux/vfio.h            |  3 ++-
 3 files changed, 77 insertions(+), 3 deletions(-)

Comments

Alex Williamson Dec. 18, 2019, 12:12 a.m. UTC | #1
On Tue, 17 Dec 2019 22:40:51 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> Track the dirty pages reporting capability for each vfio_device by setting
> a capability flag when vfio_pin_pages() is called for that device.
> 
> In the vfio_iommu_type1 module, while creating the dirty pages bitmap, check
> whether an IOMMU backed device is present in the container. If an IOMMU
> backed device is present, check the dirty pages reporting capability of each
> vfio device in the container. If all vfio devices are capable of reporting
> dirty pages by pinning pages through the external API, create the bitmap
> from pinned pages only. If an IOMMU backed device is present in the
> container and any one device is not able to report dirty pages, mark all
> pages as dirty.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  drivers/vfio/vfio.c             | 33 +++++++++++++++++++++++++++++++
>  drivers/vfio/vfio_iommu_type1.c | 44 +++++++++++++++++++++++++++++++++++++++--
>  include/linux/vfio.h            |  3 ++-
>  3 files changed, 77 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index c8482624ca34..9d2fbe09768a 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -96,6 +96,8 @@ struct vfio_device {
>  	struct vfio_group		*group;
>  	struct list_head		group_next;
>  	void				*device_data;
> +	/* dirty pages reporting capable */
> +	bool				dirty_pages_cap;
>  };
>  
>  #ifdef CONFIG_VFIO_NOIOMMU
> @@ -1866,6 +1868,29 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
>  }
>  EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
>  
> +int vfio_device_is_dirty_reporting_capable(struct device *dev, bool *cap)
> +{
> +	struct vfio_device *device;
> +	struct vfio_group *group;
> +
> +	if (!dev || !cap)
> +		return -EINVAL;
> +
> +	group = vfio_group_get_from_dev(dev);
> +	if (!group)
> +		return -ENODEV;
> +
> +	device = vfio_group_get_device(group, dev);
> +	if (!device)
> +		return -ENODEV;
> +
> +	*cap = device->dirty_pages_cap;
> +	vfio_device_put(device);
> +	vfio_group_put(group);
> +	return 0;
> +}
> +EXPORT_SYMBOL(vfio_device_is_dirty_reporting_capable);

I'd suggest this just return true/false and any error condition simply
be part of the false case.
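
For instance (untested sketch; note the posted version also leaks the
group reference when vfio_group_get_device() fails):

bool vfio_device_is_dirty_reporting_capable(struct device *dev)
{
        struct vfio_device *device;
        struct vfio_group *group;
        bool cap;

        if (!dev)
                return false;

        group = vfio_group_get_from_dev(dev);
        if (!group)
                return false;

        device = vfio_group_get_device(group, dev);
        if (!device) {
                vfio_group_put(group);  /* missing in the posted version */
                return false;
        }

        cap = device->dirty_pages_cap;
        vfio_device_put(device);
        vfio_group_put(group);
        return cap;
}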

> +
>  /*
>   * Pin a set of guest PFNs and return their associated host PFNs for local
>   * domain only.
> @@ -1907,6 +1932,14 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
>  	else
>  		ret = -ENOTTY;
>  
> +	if (ret > 0) {
> +		struct vfio_device *device = vfio_group_get_device(group, dev);
> +
> +		if (device) {
> +			device->dirty_pages_cap = true;
> +			vfio_device_put(device);
> +		}
> +	}

I think we'd want to trivially rework vfio_pin_pages() to use
vfio_device_get_from_dev() instead of vfio_group_get_from_dev(), then
we have access to the group via device->group.  Then vfio_device_put()
would be common in the return path.
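
Roughly, as an untested sketch reusing the names from the posted patch:

int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
                   int prot, unsigned long *phys_pfn)
{
        struct vfio_device *device;
        struct vfio_container *container;
        struct vfio_iommu_driver *driver;
        int ret;

        if (!dev || !user_pfn || !phys_pfn || !npage)
                return -EINVAL;

        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
                return -E2BIG;

        device = vfio_device_get_from_dev(dev);
        if (!device)
                return -ENODEV;

        ret = vfio_group_add_container_user(device->group);
        if (ret)
                goto err_device_put;

        container = device->group->container;
        driver = container->iommu_driver;
        if (likely(driver && driver->ops->pin_pages))
                ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
                                             npage, prot, phys_pfn);
        else
                ret = -ENOTTY;

        /* no second lookup needed to flag the capability */
        if (ret > 0)
                device->dirty_pages_cap = true;

        vfio_group_try_dissolve_container(device->group);
err_device_put:
        vfio_device_put(device);        /* common in the return path */
        return ret;
}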

>  	vfio_group_try_dissolve_container(group);
>  
>  err_pin_pages:
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 68d8ed3b2665..ef56f31f4e73 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -891,6 +891,39 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
>  	return bitmap;
>  }
>  
> +static int vfio_is_dirty_pages_reporting_capable(struct device *dev, void *data)
> +{
> +	bool new;
> +	int ret;
> +
> +	ret = vfio_device_is_dirty_reporting_capable(dev, &new);
> +	if (ret)
> +		return ret;
> +
> +	*(bool *)data = *(bool *)data && new;
> +
> +	return 0;
> +}
> +
> +static bool vfio_dirty_pages_reporting_capable(struct vfio_iommu *iommu)
> +{
> +	struct vfio_domain *d;
> +	struct vfio_group *g;
> +	bool capable = true;
> +	int ret;
> +
> +	list_for_each_entry(d, &iommu->domain_list, next) {
> +		list_for_each_entry(g, &d->group_list, next) {
> +			ret = iommu_group_for_each_dev(g->iommu_group, &capable,
> +					vfio_is_dirty_pages_reporting_capable);
> +			if (ret)
> +				return false;

This will fail when there are devices within the IOMMU group that are
not represented as vfio_devices.  My original suggestion was:

On Thu, 14 Nov 2019 14:06:25 -0700
Alex Williamson <alex.williamson@redhat.com> wrote:
> I think it does so by pinning pages.  Is it acceptable that if the
> vendor driver pins any pages, then from that point forward we consider
> the IOMMU group dirty page scope to be limited to pinned pages?  There
> are complications around non-singleton IOMMU groups, but I think we're
> already leaning towards that being a non-worthwhile problem to solve.
> So if we require that only singleton IOMMU groups can pin pages and we

We could tag vfio_groups as singleton at vfio_add_group_dev() time with
an iommu_group_for_each_dev() walk so that we can cache the value on
the struct vfio_group.  vfio_group_nb_add_dev() could update this if
the IOMMU group composition changes.  vfio_pin_pages() could return
-EINVAL if (!group->is_singleton).
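
For instance (untested sketch; is_singleton would be a new field on the
vfio.c struct vfio_group):

/* callback for iommu_group_for_each_dev() */
static int vfio_dev_count(struct device *dev, void *data)
{
        (*(int *)data)++;
        return 0;
}

/* in vfio_add_group_dev(), once the device is part of the group */
int count = 0;

iommu_group_for_each_dev(iommu_group, &count, vfio_dev_count);
group->is_singleton = (count == 1);

/* in vfio_pin_pages() */
if (!group->is_singleton)
        return -EINVAL;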

> pass the IOMMU group as a parameter to
> vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a
> flag on its local vfio_group struct to indicate dirty page scope is
> limited to pinned pages.

ie. vfio_iommu_type1_unpin_pages() calls find_iommu_group() on each
domain in domain_list and the external_domain using the struct
iommu_group pointer provided by vfio-core.  We set a new attribute on
the vfio_group to indicate that vfio_group has (at some point) pinned
pages.
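
A rough sketch of that lookup, assuming a new has_pinned_pages attribute
on type1's struct vfio_group:

static struct vfio_group *
vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
                            struct iommu_group *iommu_group)
{
        struct vfio_domain *domain;
        struct vfio_group *group = NULL;

        list_for_each_entry(domain, &iommu->domain_list, next) {
                group = find_iommu_group(domain, iommu_group);
                if (group)
                        return group;
        }

        if (iommu->external_domain)
                group = find_iommu_group(iommu->external_domain, iommu_group);

        return group;
}

/* at pin time, scope this group's dirty tracking to pinned pages */
group = vfio_iommu_find_iommu_group(iommu, iommu_group);
if (group)
        group->has_pinned_pages = true;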

>  We might want to keep a flag on the
> vfio_iommu struct to indicate if all of the vfio_groups for each
> vfio_domain in the vfio_iommu.domain_list dirty page scope limited to
> pinned pages as an optimization to avoid walking lists too often.  Then
> we could test if vfio_iommu.domain_list is not empty and this new flag
> does not limit the dirty page scope, then everything within each
> vfio_dma is considered dirty.

So at the point where we change vfio_group.has_pinned_pages from false
to true, or a group is added or removed, we walk all the groups in the
vfio_iommu and if they all have has_pinned_pages set, we can set a
vfio_iommu.pinned_page_dirty_scope flag to true.  If that flag is
already true on page pinning, we can skip the lookup.
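
Something like the below, where has_pinned_pages and
pinned_page_dirty_scope are the proposed new flags (untested):

static void vfio_update_pinned_page_dirty_scope(struct vfio_iommu *iommu)
{
        struct vfio_domain *domain;
        struct vfio_group *group;

        list_for_each_entry(domain, &iommu->domain_list, next) {
                list_for_each_entry(group, &domain->group_list, next) {
                        if (!group->has_pinned_pages) {
                                iommu->pinned_page_dirty_scope = false;
                                return;
                        }
                }
        }

        /* the external (mdev) domain groups count too */
        if (iommu->external_domain) {
                list_for_each_entry(group,
                                    &iommu->external_domain->group_list, next) {
                        if (!group->has_pinned_pages) {
                                iommu->pinned_page_dirty_scope = false;
                                return;
                        }
                }
        }

        iommu->pinned_page_dirty_scope = true;
}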

I still like this approach better: it doesn't require a callback from
type1 to vfio-core and it doesn't require a heavyweight walk of group
devices and vfio data structures every time we fill a bitmap.
Did you run into issues trying to implement this approach?  Thanks,

Alex

> +		}
> +	}
> +
> +	return capable;
> +}
> +
>  /*
>   * start_iova is the reference from where bitmaping started. This is called
>   * from DMA_UNMAP where start_iova can be different than iova
> @@ -903,10 +936,17 @@ static void vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
>  	struct vfio_dma *dma;
>  	dma_addr_t i = iova;
>  	unsigned long pgshift = __ffs(pgsize);
> +	bool dirty_report_cap = true;
> +
> +	if (IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> +		dirty_report_cap = vfio_dirty_pages_reporting_capable(iommu);
>  
>  	while ((dma = vfio_find_dma(iommu, i, pgsize))) {
> -		/* mark all pages dirty if all pages are pinned and mapped. */
> -		if (dma->iommu_mapped) {
> +		/*
> +		 * mark all pages dirty if any IOMMU capable device is not able
> +		 * to report dirty pages and all pages are pinned and mapped.
> +		 */
> +		if (!dirty_report_cap && dma->iommu_mapped) {
>  			dma_addr_t iova_limit;
>  
>  			iova_limit = (dma->iova + dma->size) < (iova + size) ?
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index e42a711a2800..ed3832ea10a1 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -148,7 +148,8 @@ extern int vfio_info_add_capability(struct vfio_info_cap *caps,
>  extern int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr,
>  					      int num_irqs, int max_irq_type,
>  					      size_t *data_size);
> -
> +extern int vfio_device_is_dirty_reporting_capable(struct device *dev,
> +						  bool *cap);
>  struct pci_dev;
>  #if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH)
>  extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);

Kirti Wankhede Jan. 7, 2020, 8:45 p.m. UTC | #2
On 12/18/2019 5:42 AM, Alex Williamson wrote:
> On Tue, 17 Dec 2019 22:40:51 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 

<snip>

> 
> This will fail when there are devices within the IOMMU group that are
> not represented as vfio_devices.  My original suggestion was:
> 
> On Thu, 14 Nov 2019 14:06:25 -0700
> Alex Williamson <alex.williamson@redhat.com> wrote:
>> I think it does so by pinning pages.  Is it acceptable that if the
>> vendor driver pins any pages, then from that point forward we consider
>> the IOMMU group dirty page scope to be limited to pinned pages?  There
>> are complications around non-singleton IOMMU groups, but I think we're
>> already leaning towards that being a non-worthwhile problem to solve.
>> So if we require that only singleton IOMMU groups can pin pages and we
> 
> We could tag vfio_groups as singleton at vfio_add_group_dev() time with
> an iommu_group_for_each_dev() walk so that we can cache the value on
> the struct vfio_group. 

I don't think iommu_group_for_each_dev() is required. Checking in
vfio_add_group_dev() whether group->device_list has more than one
device should work, right?

         list_for_each_entry(vdev, &group->device_list, group_next) {
                 if (group->is_singleton) {
                         group->is_singleton = false;
                         break;
                 } else {
                         group->is_singleton = true;
                 }
         }
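
A simpler equivalent, using the stock list_is_singular() helper so the
result doesn't depend on the flag's previous value (assuming the group
lock is held):

         group->is_singleton = list_is_singular(&group->device_list);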


> vfio_group_nb_add_dev() could update this if
> the IOMMU group composition changes. 

I don't see vfio_group_nb_add_dev() calling vfio_add_group_dev() (?)
If the is_singleton check is taken care of in vfio_group_nb_add_dev(), which
is the only place where vfio_group is allocated, that should work, I think.


> vfio_pin_pages() could return
> -EINVAL if (!group->is_singleton).
> 
>> pass the IOMMU group as a parameter to
>> vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a
>> flag on its local vfio_group struct to indicate dirty page scope is
>> limited to pinned pages.
> 
> ie. vfio_iommu_type1_unpin_pages() calls find_iommu_group() on each
> domain in domain_list and the external_domain using the struct
> iommu_group pointer provided by vfio-core.  We set a new attribute on
> the vfio_group to indicate that vfio_group has (at some point) pinned
> pages.
> 
>>   We might want to keep a flag on the
>> vfio_iommu struct to indicate if all of the vfio_groups for each
>> vfio_domain in the vfio_iommu.domain_list dirty page scope limited to
>> pinned pages as an optimization to avoid walking lists too often.  Then
>> we could test if vfio_iommu.domain_list is not empty and this new flag
>> does not limit the dirty page scope, then everything within each
>> vfio_dma is considered dirty.
> 
> So at the point where we change vfio_group.has_pinned_pages from false
> to true, or a group is added or removed, we walk all the groups in the
> vfio_iommu and if they all have has_pinned_pages set, we can set a
> vfio_iommu.pinned_page_dirty_scope flag to true.  If that flag is
> already true on page pinning, we can skip the lookup.
> 
> I still like this approach better: it doesn't require a callback from
> type1 to vfio-core and it doesn't require a heavyweight walk of group
> devices and vfio data structures every time we fill a bitmap.
> Did you run into issues trying to implement this approach? 

Thanks for the detailed steps.
This works. Changing this last commit.

Thanks,
Kirti

Alex Williamson Jan. 8, 2020, 12:09 a.m. UTC | #3
On Wed, 8 Jan 2020 02:15:01 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 12/18/2019 5:42 AM, Alex Williamson wrote:
> > On Tue, 17 Dec 2019 22:40:51 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> 
> <snip>
> 
> > 
> > This will fail when there are devices within the IOMMU group that are
> > not represented as vfio_devices.  My original suggestion was:
> > 
> > On Thu, 14 Nov 2019 14:06:25 -0700
> > Alex Williamson <alex.williamson@redhat.com> wrote:  
> >> I think it does so by pinning pages.  Is it acceptable that if the
> >> vendor driver pins any pages, then from that point forward we consider
> >> the IOMMU group dirty page scope to be limited to pinned pages?  There
> >> are complications around non-singleton IOMMU groups, but I think we're
> >> already leaning towards that being a non-worthwhile problem to solve.
> >> So if we require that only singleton IOMMU groups can pin pages and we  
> > 
> > We could tag vfio_groups as singleton at vfio_add_group_dev() time with
> > an iommu_group_for_each_dev() walk so that we can cache the value on
> > the struct vfio_group.   
> 
> I don't think iommu_group_for_each_dev() is required. Checking in
> vfio_add_group_dev() whether group->device_list has more than one
> device should work, right?
> 
>          list_for_each_entry(vdev, &group->device_list, group_next) {
>                  if (group->is_singleton) {
>                          group->is_singleton = false;
>                          break;
>                  } else {
>                          group->is_singleton = true;
>                  }
>          }

Hmm, I think you're taking a different approach to this than I was
thinking.  Re-reading my previous comments, the fact that both vfio.c
and vfio_iommu_type1.c each have their own private struct vfio_group
makes things rather unclear.  I was intending to use the struct
iommu_group as the object vfio.c provides to type1.c to associate the
pinning.  This would require that not only the vfio view of devices in
the group to be singleton, but also the actual iommu group to be
singleton.  Otherwise the set of devices vfio.c has in the group might
only be a subset of the group.  Maybe a merger of the approaches is
easier though.

Tracking whether the vfio.c view of a group is singleton is even easier
than above, we could simply add a device_count field to vfio_group,
increment it in vfio_group_create_device() and decrement it in
vfio_device_release().  vfio_pin_pages() could return error if
device_count is not 1.  We could still add the iommu_group pointer to
the type1 pin_pages callback, but perhaps type1 simply assumes that the
group is singleton when pin pages is called and it's vfio.c's
responsibility to maintain that group as singleton once pages have been
pinned.  vfio.c would therefore also need to set a field on the
vfio_group if pages have been pinned such that vfio_add_group_dev()
could return error if a new device attempts to join the group.  We'd
need to make sure that field is cleared when the group is released from
use and pay attention to races that might occur between adding devices
to a group and pinning pages.
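
As a sketch, with device_count and pinned_pages as the proposed new
vfio_group fields (locking around them elided):

/* vfio_group_create_device() */
group->device_count++;

/* vfio_device_release() */
group->device_count--;

/* vfio_pin_pages(), before calling into the iommu backend */
if (group->device_count != 1) {
        ret = -EINVAL;
        goto err_pin_pages;
}
group->pinned_pages = true;

/* vfio_add_group_dev(), refusing to grow a group that has pinned pages */
if (group->pinned_pages)
        return -EBUSY;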

> > vfio_group_nb_add_dev() could update this if
> > the IOMMU group composition changes.   
> 
> I don't see vfio_group_nb_add_dev() calling vfio_add_group_dev() (?)
> If the is_singleton check is taken care of in vfio_group_nb_add_dev(), which
> is the only place where vfio_group is allocated, that should work, I think.

This was relative to maintaining that the iommu group itself is
singleton, not just the vfio view of the group.  If we use the latter
as our basis, then you're right, we shouldn't need this, but vfio.c would
need to enforce that the group remains singleton if it has pinned
pages.  Does that make sense?  Thanks,

Alex

> > vfio_pin_pages() could return
> > -EINVAL if (!group->is_singleton).
> >   
> >> pass the IOMMU group as a parameter to
> >> vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a
> >> flag on its local vfio_group struct to indicate dirty page scope is
> >> limited to pinned pages.  
> > 
> > ie. vfio_iommu_type1_unpin_pages() calls find_iommu_group() on each
> > domain in domain_list and the external_domain using the struct
> > iommu_group pointer provided by vfio-core.  We set a new attribute on
> > the vfio_group to indicate that vfio_group has (at some point) pinned
> > pages.
> >   
> >>   We might want to keep a flag on the
> >> vfio_iommu struct to indicate if all of the vfio_groups for each
> >> vfio_domain in the vfio_iommu.domain_list dirty page scope limited to
> >> pinned pages as an optimization to avoid walking lists too often.  Then
> >> we could test if vfio_iommu.domain_list is not empty and this new flag
> >> does not limit the dirty page scope, then everything within each
> >> vfio_dma is considered dirty.  
> > 
> > So at the point where we change vfio_group.has_pinned_pages from false
> > to true, or a group is added or removed, we walk all the groups in the
> > vfio_iommu and if they all have has_pinned_pages set, we can set a
> > vfio_iommu.pinned_page_dirty_scope flag to true.  If that flag is
> > already true on page pinning, we can skip the lookup.
> > 
> > I still like this approach better: it doesn't require a callback from
> > type1 to vfio-core and it doesn't require a heavyweight walk of group
> > devices and vfio data structures every time we fill a bitmap.
> > Did you run into issues trying to implement this approach?   
> 
> Thanks for the detailed steps.
> This works. Changing this last commit.
> 
> Thanks,
> Kirti
>

Kirti Wankhede Jan. 8, 2020, 8:52 p.m. UTC | #4
On 1/8/2020 5:39 AM, Alex Williamson wrote:
> On Wed, 8 Jan 2020 02:15:01 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 12/18/2019 5:42 AM, Alex Williamson wrote:
>>> On Tue, 17 Dec 2019 22:40:51 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>    
>>
>> <snip>
>>
>>>
>>> This will fail when there are devices within the IOMMU group that are
>>> not represented as vfio_devices.  My original suggestion was:
>>>
>>> On Thu, 14 Nov 2019 14:06:25 -0700
>>> Alex Williamson <alex.williamson@redhat.com> wrote:
>>>> I think it does so by pinning pages.  Is it acceptable that if the
>>>> vendor driver pins any pages, then from that point forward we consider
>>>> the IOMMU group dirty page scope to be limited to pinned pages?  There
>>>> are complications around non-singleton IOMMU groups, but I think we're
>>>> already leaning towards that being a non-worthwhile problem to solve.
>>>> So if we require that only singleton IOMMU groups can pin pages and we
>>>
>>> We could tag vfio_groups as singleton at vfio_add_group_dev() time with
>>> an iommu_group_for_each_dev() walk so that we can cache the value on
>>> the struct vfio_group.
>>
>> I don't think iommu_group_for_each_dev() is required. Checking in
>> vfio_add_group_dev() whether group->device_list has more than one
>> device should work, right?
>>
>>           list_for_each_entry(vdev, &group->device_list, group_next) {
>>                   if (group->is_singleton) {
>>                           group->is_singleton = false;
>>                           break;
>>                   } else {
>>                           group->is_singleton = true;
>>                   }
>>           }
> 
> Hmm, I think you're taking a different approach to this than I was
> thinking.  Re-reading my previous comments, the fact that both vfio.c
> and vfio_iommu_type1.c each have their own private struct vfio_group
> makes things rather unclear.  I was intending to use the struct
> iommu_group as the object vfio.c provides to type1.c to associate the
> pinning.  This would require that not only the vfio view of devices in
> the group to be singleton, but also the actual iommu group to be
> singleton.  Otherwise the set of devices vfio.c has in the group might
> only be a subset of the group.  Maybe a merger of the approaches is
> easier though.
> 
> Tracking whether the vfio.c view of a group is singleton is even easier
> than above, we could simply add a device_count field to vfio_group,
> increment it in vfio_group_create_device() and decrement it in
> vfio_device_release().  vfio_pin_pages() could return error if
> device_count is not 1.  We could still add the iommu_group pointer to
> the type1 pin_pages callback, but perhaps type1 simply assumes that the
> group is singleton when pin pages is called and it's vfio.c's
> responsibility to maintain that group as singleton once pages have been
> pinned.  vfio.c would therefore also need to set a field on the
> vfio_group if pages have been pinned such that vfio_add_group_dev()
> could return error if a new device attempts to join the group.  We'd
> need to make sure that field is cleared when the group is released from
> use and pay attention to races that might occur between adding devices
> to a group and pinning pages.
> 

Thinking aloud: could adding the singleton check cause issues in the near
future? Maybe in the future, support for p2p and direct RDMA will be added
for mdev devices. In that case the two devices should be in the same
iommu_domain, but in different iommu_groups - is that understanding
correct?

>>> vfio_group_nb_add_dev() could update this if
>>> the IOMMU group composition changes.
>>
>> I don't see vfio_group_nb_add_dev() calling vfio_add_group_dev() (?)
>> If the is_singleton check is taken care of in vfio_group_nb_add_dev(), which
>> is the only place where vfio_group is allocated, that should work, I think.
> 
> This was relative to maintaining that the iommu group itself is
> singleton, not just the vfio view of the group.  If we use the latter
> as our basis, then you're right, we should need this, but vfio.c would
> need to enforce that the group remains singleton if it has pinned
> pages.  Does that make sense?  Thanks,
> 

Which route should be taken - iommu_group view or vfio.c group view?

Thanks,
Kirti

> Alex
> 
>>> vfio_pin_pages() could return
>>> -EINVAL if (!group->is_singleton).
>>>    
>>>> pass the IOMMU group as a parameter to
>>>> vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a
>>>> flag on its local vfio_group struct to indicate dirty page scope is
>>>> limited to pinned pages.
>>>
>>> ie. vfio_iommu_type1_unpin_pages() calls find_iommu_group() on each
>>> domain in domain_list and the external_domain using the struct
>>> iommu_group pointer provided by vfio-core.  We set a new attribute on
>>> the vfio_group to indicate that vfio_group has (at some point) pinned
>>> pages.
>>>    
>>>>    We might want to keep a flag on the
>>>> vfio_iommu struct to indicate if all of the vfio_groups for each
>>>> vfio_domain in the vfio_iommu.domain_list dirty page scope limited to
>>>> pinned pages as an optimization to avoid walking lists too often.  Then
>>>> we could test if vfio_iommu.domain_list is not empty and this new flag
>>>> does not limit the dirty page scope, then everything within each
>>>> vfio_dma is considered dirty.
>>>
>>> So at the point where we change vfio_group.has_pinned_pages from false
>>> to true, or a group is added or removed, we walk all the groups in the
>>> vfio_iommu and if they all have has_pinned_pages set, we can set a
>>> vfio_iommu.pinned_page_dirty_scope flag to true.  If that flag is
>>> already true on page pinning, we can skip the lookup.
>>>
>>> I still like this approach better: it doesn't require a callback from
>>> type1 to vfio-core and it doesn't require a heavyweight walk of group
>>> devices and vfio data structures every time we fill a bitmap.
>>> Did you run into issues trying to implement this approach?
>>
>> Thanks for the detailed steps.
>> This works. Changing this last commit.
>>
>> Thanks,
>> Kirti
>>
>

Alex Williamson Jan. 8, 2020, 10:59 p.m. UTC | #5
On Thu, 9 Jan 2020 02:22:26 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 1/8/2020 5:39 AM, Alex Williamson wrote:
> > On Wed, 8 Jan 2020 02:15:01 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 12/18/2019 5:42 AM, Alex Williamson wrote:  
> >>> On Tue, 17 Dec 2019 22:40:51 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>      
> >>
> >> <snip>
> >>  
> >>>
> >>> This will fail when there are devices within the IOMMU group that are
> >>> not represented as vfio_devices.  My original suggestion was:
> >>>
> >>> On Thu, 14 Nov 2019 14:06:25 -0700
> >>> Alex Williamson <alex.williamson@redhat.com> wrote:  
> >>>> I think it does so by pinning pages.  Is it acceptable that if the
> >>>> vendor driver pins any pages, then from that point forward we consider
> >>>> the IOMMU group dirty page scope to be limited to pinned pages?  There
> >>>> are complications around non-singleton IOMMU groups, but I think we're
> >>>> already leaning towards that being a non-worthwhile problem to solve.
> >>>> So if we require that only singleton IOMMU groups can pin pages and we  
> >>>
> >>> We could tag vfio_groups as singleton at vfio_add_group_dev() time with
> >>> an iommu_group_for_each_dev() walk so that we can cache the value on
> >>> the struct vfio_group.  
> >>
> >> I don't think iommu_group_for_each_dev() is required. Checking in
> >> vfio_add_group_dev() whether group->device_list has more than one
> >> device should work, right?
> >>
> >>           list_for_each_entry(vdev, &group->device_list, group_next) {
> >>                   if (group->is_singleton) {
> >>                           group->is_singleton = false;
> >>                           break;
> >>                   } else {
> >>                           group->is_singleton = true;
> >>                   }
> >>           }  
> > 
> > Hmm, I think you're taking a different approach to this than I was
> > thinking.  Re-reading my previous comments, the fact that both vfio.c
> > and vfio_iommu_type1.c each have their own private struct vfio_group
> > makes things rather unclear.  I was intending to use the struct
> > iommu_group as the object vfio.c provides to type1.c to associate the
> > pinning.  This would require that not only the vfio view of devices in
> > the group to be singleton, but also the actual iommu group to be
> > singleton.  Otherwise the set of devices vfio.c has in the group might
> > only be a subset of the group.  Maybe a merger of the approaches is
> > easier though.
> > 
> > Tracking whether the vfio.c view of a group is singleton is even easier
> > than above, we could simply add a device_count field to vfio_group,
> > increment it in vfio_group_create_device() and decrement it in
> > vfio_device_release().  vfio_pin_pages() could return error if
> > device_count is not 1.  We could still add the iommu_group pointer to
> > the type1 pin_pages callback, but perhaps type1 simply assumes that the
> > group is singleton when pin pages is called and it's vfio.c's
> > responsibility to maintain that group as singleton once pages have been
> > pinned.  vfio.c would therefore also need to set a field on the
> > vfio_group if pages have been pinned such that vfio_add_group_dev()
> > could return error if a new device attempts to join the group.  We'd
> > need to make sure that field is cleared when the group is released from
> > use and pay attention to races that might occur between adding devices
> > to a group and pinning pages.
> >   
> 
> Thinking aloud: could adding the singleton check cause issues in the near
> future? Maybe in the future, support for p2p and direct RDMA will be added
> for mdev devices. In that case the two devices should be in the same
> iommu_domain, but in different iommu_groups - is that understanding
> correct?

The ACS redirection stuff is the only thing that actually changes iommu
grouping relative to p2p/RDMA and that's specifically for untranslated
DMA, aiui.  If we wanted translated p2p DMA to be routed downstream of
the IOMMU we'd need to enable ACS direct translation.  In any case,
those are about shortening the path for p2p between devices.  We
actually don't even need devices to be in the same iommu domain to
allow p2p, we only need the iommu domain for each respective device to
map the mmio spaces of the other device.  I don't think we're doing
anything here that would cause us trouble later in this space, but it's
also just a policy decision, we wouldn't be breaking ABI to change the
implementation later.
 
> >>> vfio_group_nb_add_dev() could update this if
> >>> the IOMMU group composition changes.  
> >>
> >> I don't see vfio_group_nb_add_dev() calling vfio_add_group_dev() (?)
> >> If the is_singleton check is taken care of in vfio_group_nb_add_dev(), which
> >> is the only place where vfio_group is allocated, that should work, I think.
> > 
> > This was relative to maintaining that the iommu group itself is
> > singleton, not just the vfio view of the group.  If we use the latter
> > as our basis, then you're right, we shouldn't need this, but vfio.c would
> > need to enforce that the group remains singleton if it has pinned
> > pages.  Does that make sense?  Thanks,
> >   
> 
> Which route should be taken - iommu_group view or vfio.c group view?

The latter seems easier, more flexible, and lower overhead as far as I
can see.  Thanks,

Alex

Patch

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index c8482624ca34..9d2fbe09768a 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -96,6 +96,8 @@  struct vfio_device {
 	struct vfio_group		*group;
 	struct list_head		group_next;
 	void				*device_data;
+	/* dirty pages reporting capable */
+	bool				dirty_pages_cap;
 };
 
 #ifdef CONFIG_VFIO_NOIOMMU
@@ -1866,6 +1868,29 @@  int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
 }
 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
 
+int vfio_device_is_dirty_reporting_capable(struct device *dev, bool *cap)
+{
+	struct vfio_device *device;
+	struct vfio_group *group;
+
+	if (!dev || !cap)
+		return -EINVAL;
+
+	group = vfio_group_get_from_dev(dev);
+	if (!group)
+		return -ENODEV;
+
+	device = vfio_group_get_device(group, dev);
+	if (!device)
+		return -ENODEV;
+
+	*cap = device->dirty_pages_cap;
+	vfio_device_put(device);
+	vfio_group_put(group);
+	return 0;
+}
+EXPORT_SYMBOL(vfio_device_is_dirty_reporting_capable);
+
 /*
  * Pin a set of guest PFNs and return their associated host PFNs for local
  * domain only.
@@ -1907,6 +1932,14 @@  int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
 	else
 		ret = -ENOTTY;
 
+	if (ret > 0) {
+		struct vfio_device *device = vfio_group_get_device(group, dev);
+
+		if (device) {
+			device->dirty_pages_cap = true;
+			vfio_device_put(device);
+		}
+	}
 	vfio_group_try_dissolve_container(group);
 
 err_pin_pages:
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 68d8ed3b2665..ef56f31f4e73 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -891,6 +891,39 @@  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 	return bitmap;
 }
 
+static int vfio_is_dirty_pages_reporting_capable(struct device *dev, void *data)
+{
+	bool new;
+	int ret;
+
+	ret = vfio_device_is_dirty_reporting_capable(dev, &new);
+	if (ret)
+		return ret;
+
+	*(bool *)data = *(bool *)data && new;
+
+	return 0;
+}
+
+static bool vfio_dirty_pages_reporting_capable(struct vfio_iommu *iommu)
+{
+	struct vfio_domain *d;
+	struct vfio_group *g;
+	bool capable = true;
+	int ret;
+
+	list_for_each_entry(d, &iommu->domain_list, next) {
+		list_for_each_entry(g, &d->group_list, next) {
+			ret = iommu_group_for_each_dev(g->iommu_group, &capable,
+					vfio_is_dirty_pages_reporting_capable);
+			if (ret)
+				return false;
+		}
+	}
+
+	return capable;
+}
+
 /*
  * start_iova is the reference from where bitmaping started. This is called
  * from DMA_UNMAP where start_iova can be different than iova
@@ -903,10 +936,17 @@  static void vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
 	struct vfio_dma *dma;
 	dma_addr_t i = iova;
 	unsigned long pgshift = __ffs(pgsize);
+	bool dirty_report_cap = true;
+
+	if (IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+		dirty_report_cap = vfio_dirty_pages_reporting_capable(iommu);
 
 	while ((dma = vfio_find_dma(iommu, i, pgsize))) {
-		/* mark all pages dirty if all pages are pinned and mapped. */
-		if (dma->iommu_mapped) {
+		/*
+		 * mark all pages dirty if any IOMMU capable device is not able
+		 * to report dirty pages and all pages are pinned and mapped.
+		 */
+		if (!dirty_report_cap && dma->iommu_mapped) {
 			dma_addr_t iova_limit;
 
 			iova_limit = (dma->iova + dma->size) < (iova + size) ?
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index e42a711a2800..ed3832ea10a1 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -148,7 +148,8 @@  extern int vfio_info_add_capability(struct vfio_info_cap *caps,
 extern int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr,
 					      int num_irqs, int max_irq_type,
 					      size_t *data_size);
-
+extern int vfio_device_is_dirty_reporting_capable(struct device *dev,
+						  bool *cap);
 struct pci_dev;
 #if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH)
 extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);