diff mbox series

[V2,vfio,06/11] vfio: Introduce the DMA logging feature support

Message ID 20220714081251.240584-7-yishaih@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Add device DMA logging support for mlx5 driver | expand

Commit Message

Yishai Hadas July 14, 2022, 8:12 a.m. UTC
Introduce the DMA logging feature support in the vfio core layer.

It includes the processing of the device start/stop/report DMA logging
UAPIs and calling the relevant driver 'op' to do the work.

Specifically,
Upon start, the core translates the given input ranges into an interval
tree, checks for unexpected overlapping and non-aligned ranges, and then
passes the translated input to the driver to start tracking the given
ranges.

Upon report, the core translates the given input user space bitmap and
page size into an IOVA kernel bitmap iterator. It then iterates over it
and calls the driver to set the corresponding bits for the dirtied pages
in a specific IOVA range.

Upon stop, the driver is called to stop the previously started tracking.

The next patches from the series will introduce the mlx5 driver
implementation for the logging ops.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
 drivers/vfio/Kconfig             |   1 +
 drivers/vfio/pci/vfio_pci_core.c |   5 +
 drivers/vfio/vfio_main.c         | 161 +++++++++++++++++++++++++++++++
 include/linux/vfio.h             |  21 +++-
 4 files changed, 186 insertions(+), 2 deletions(-)

Comments

Alex Williamson July 18, 2022, 10:30 p.m. UTC | #1
On Thu, 14 Jul 2022 11:12:46 +0300
Yishai Hadas <yishaih@nvidia.com> wrote:

> Introduce the DMA logging feature support in the vfio core layer.
> 
> It includes the processing of the device start/stop/report DMA logging
> UAPIs and calling the relevant driver 'op' to do the work.
> 
> Specifically,
> Upon start, the core translates the given input ranges into an interval
> tree, checks for unexpected overlapping, non aligned ranges and then
> pass the translated input to the driver for start tracking the given
> ranges.
> 
> Upon report, the core translates the given input user space bitmap and
> page size into an IOVA kernel bitmap iterator. Then it iterates it and
> call the driver to set the corresponding bits for the dirtied pages in a
> specific IOVA range.
> 
> Upon stop, the driver is called to stop the previous started tracking.
> 
> The next patches from the series will introduce the mlx5 driver
> implementation for the logging ops.
> 
> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> ---
>  drivers/vfio/Kconfig             |   1 +
>  drivers/vfio/pci/vfio_pci_core.c |   5 +
>  drivers/vfio/vfio_main.c         | 161 +++++++++++++++++++++++++++++++
>  include/linux/vfio.h             |  21 +++-
>  4 files changed, 186 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 6130d00252ed..86c381ceb9a1 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,6 +3,7 @@ menuconfig VFIO
>  	tristate "VFIO Non-Privileged userspace driver framework"
>  	select IOMMU_API
>  	select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64)
> +	select INTERVAL_TREE
>  	help
>  	  VFIO provides a framework for secure userspace device drivers.
>  	  See Documentation/driver-api/vfio.rst for more details.
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 2efa06b1fafa..b6dabf398251 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1862,6 +1862,11 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
>  			return -EINVAL;
>  	}
>  
> +	if (vdev->vdev.log_ops && !(vdev->vdev.log_ops->log_start &&
> +	    vdev->vdev.log_ops->log_stop &&
> +	    vdev->vdev.log_ops->log_read_and_clear))
> +		return -EINVAL;
> +
>  	/*
>  	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
>  	 * by the host or other users.  We cannot capture the VFs if they
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index bd84ca7c5e35..2414d827e3c8 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -32,6 +32,8 @@
>  #include <linux/vfio.h>
>  #include <linux/wait.h>
>  #include <linux/sched/signal.h>
> +#include <linux/interval_tree.h>
> +#include <linux/iova_bitmap.h>
>  #include "vfio.h"
>  
>  #define DRIVER_VERSION	"0.3"
> @@ -1603,6 +1605,153 @@ static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
>  	return 0;
>  }
>  
> +#define LOG_MAX_RANGES 1024
> +
> +static int
> +vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
> +					u32 flags, void __user *arg,
> +					size_t argsz)
> +{
> +	size_t minsz =
> +		offsetofend(struct vfio_device_feature_dma_logging_control,
> +			    ranges);
> +	struct vfio_device_feature_dma_logging_range __user *ranges;
> +	struct vfio_device_feature_dma_logging_control control;
> +	struct vfio_device_feature_dma_logging_range range;
> +	struct rb_root_cached root = RB_ROOT_CACHED;
> +	struct interval_tree_node *nodes;
> +	u32 nnodes;
> +	int i, ret;
> +
> +	if (!device->log_ops)
> +		return -ENOTTY;
> +
> +	ret = vfio_check_feature(flags, argsz,
> +				 VFIO_DEVICE_FEATURE_SET,
> +				 sizeof(control));
> +	if (ret != 1)
> +		return ret;
> +
> +	if (copy_from_user(&control, arg, minsz))
> +		return -EFAULT;
> +
> +	nnodes = control.num_ranges;
> +	if (!nnodes || nnodes > LOG_MAX_RANGES)
> +		return -EINVAL;

The latter looks more like an -E2BIG errno.  This is a hard coded
limit, but what are the heuristics?  Can a user introspect the limit?
Thanks,

Alex

> +
> +	ranges = u64_to_user_ptr(control.ranges);
> +	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
> +			      GFP_KERNEL);
> +	if (!nodes)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < nnodes; i++) {
> +		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
> +			ret = -EFAULT;
> +			goto end;
> +		}
> +		if (!IS_ALIGNED(range.iova, control.page_size) ||
> +		    !IS_ALIGNED(range.length, control.page_size)) {
> +			ret = -EINVAL;
> +			goto end;
> +		}
> +		nodes[i].start = range.iova;
> +		nodes[i].last = range.iova + range.length - 1;
> +		if (interval_tree_iter_first(&root, nodes[i].start,
> +					     nodes[i].last)) {
> +			/* Range overlapping */
> +			ret = -EINVAL;
> +			goto end;
> +		}
> +		interval_tree_insert(nodes + i, &root);
> +	}
> +
> +	ret = device->log_ops->log_start(device, &root, nnodes,
> +					 &control.page_size);
> +	if (ret)
> +		goto end;
> +
> +	if (copy_to_user(arg, &control, sizeof(control))) {
> +		ret = -EFAULT;
> +		device->log_ops->log_stop(device);
> +	}
> +
> +end:
> +	kfree(nodes);
> +	return ret;
> +}
> +
> +static int
> +vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
> +				       u32 flags, void __user *arg,
> +				       size_t argsz)
> +{
> +	int ret;
> +
> +	if (!device->log_ops)
> +		return -ENOTTY;
> +
> +	ret = vfio_check_feature(flags, argsz,
> +				 VFIO_DEVICE_FEATURE_SET, 0);
> +	if (ret != 1)
> +		return ret;
> +
> +	return device->log_ops->log_stop(device);
> +}
> +
> +static int
> +vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
> +					 u32 flags, void __user *arg,
> +					 size_t argsz)
> +{
> +	size_t minsz =
> +		offsetofend(struct vfio_device_feature_dma_logging_report,
> +			    bitmap);
> +	struct vfio_device_feature_dma_logging_report report;
> +	struct iova_bitmap_iter iter;
> +	int ret;
> +
> +	if (!device->log_ops)
> +		return -ENOTTY;
> +
> +	ret = vfio_check_feature(flags, argsz,
> +				 VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(report));
> +	if (ret != 1)
> +		return ret;
> +
> +	if (copy_from_user(&report, arg, minsz))
> +		return -EFAULT;
> +
> +	if (report.page_size < PAGE_SIZE)
> +		return -EINVAL;
> +
> +	iova_bitmap_init(&iter.dirty, report.iova, ilog2(report.page_size));
> +	ret = iova_bitmap_iter_init(&iter, report.iova, report.length,
> +				    u64_to_user_ptr(report.bitmap));
> +	if (ret)
> +		return ret;
> +
> +	for (; !iova_bitmap_iter_done(&iter);
> +	     iova_bitmap_iter_advance(&iter)) {
> +		ret = iova_bitmap_iter_get(&iter);
> +		if (ret)
> +			break;
> +
> +		ret = device->log_ops->log_read_and_clear(device,
> +			iova_bitmap_iova(&iter),
> +			iova_bitmap_length(&iter), &iter.dirty);
> +
> +		iova_bitmap_iter_put(&iter);
> +
> +		if (ret)
> +			break;
> +	}
> +
> +	iova_bitmap_iter_free(&iter);
> +	return ret;
> +}
> +
>  static int vfio_ioctl_device_feature(struct vfio_device *device,
>  				     struct vfio_device_feature __user *arg)
>  {
> @@ -1636,6 +1785,18 @@ static int vfio_ioctl_device_feature(struct vfio_device *device,
>  		return vfio_ioctl_device_feature_mig_device_state(
>  			device, feature.flags, arg->data,
>  			feature.argsz - minsz);
> +	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
> +		return vfio_ioctl_device_feature_logging_start(
> +			device, feature.flags, arg->data,
> +			feature.argsz - minsz);
> +	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
> +		return vfio_ioctl_device_feature_logging_stop(
> +			device, feature.flags, arg->data,
> +			feature.argsz - minsz);
> +	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
> +		return vfio_ioctl_device_feature_logging_report(
> +			device, feature.flags, arg->data,
> +			feature.argsz - minsz);
>  	default:
>  		if (unlikely(!device->ops->device_feature))
>  			return -EINVAL;
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 4d26e149db81..feed84d686ec 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -14,6 +14,7 @@
>  #include <linux/workqueue.h>
>  #include <linux/poll.h>
>  #include <uapi/linux/vfio.h>
> +#include <linux/iova_bitmap.h>
>  
>  struct kvm;
>  
> @@ -33,10 +34,11 @@ struct vfio_device {
>  	struct device *dev;
>  	const struct vfio_device_ops *ops;
>  	/*
> -	 * mig_ops is a static property of the vfio_device which must be set
> -	 * prior to registering the vfio_device.
> +	 * mig_ops/log_ops is a static property of the vfio_device which must
> +	 * be set prior to registering the vfio_device.
>  	 */
>  	const struct vfio_migration_ops *mig_ops;
> +	const struct vfio_log_ops *log_ops;
>  	struct vfio_group *group;
>  	struct vfio_device_set *dev_set;
>  	struct list_head dev_set_list;
> @@ -104,6 +106,21 @@ struct vfio_migration_ops {
>  				   enum vfio_device_mig_state *curr_state);
>  };
>  
> +/**
> + * @log_start: Optional callback to ask the device start DMA logging.
> + * @log_stop: Optional callback to ask the device stop DMA logging.
> + * @log_read_and_clear: Optional callback to ask the device read
> + *         and clear the dirty DMAs in some given range.
> + */
> +struct vfio_log_ops {
> +	int (*log_start)(struct vfio_device *device,
> +		struct rb_root_cached *ranges, u32 nnodes, u64 *page_size);
> +	int (*log_stop)(struct vfio_device *device);
> +	int (*log_read_and_clear)(struct vfio_device *device,
> +		unsigned long iova, unsigned long length,
> +		struct iova_bitmap *dirty);
> +};
> +
>  /**
>   * vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl
>   * @flags: Arg from the device_feature op
Yishai Hadas July 19, 2022, 9:19 a.m. UTC | #2
On 19/07/2022 1:30, Alex Williamson wrote:
> On Thu, 14 Jul 2022 11:12:46 +0300
> Yishai Hadas <yishaih@nvidia.com> wrote:
>
>> Introduce the DMA logging feature support in the vfio core layer.
>>
>> It includes the processing of the device start/stop/report DMA logging
>> UAPIs and calling the relevant driver 'op' to do the work.
>>
>> Specifically,
>> Upon start, the core translates the given input ranges into an interval
>> tree, checks for unexpected overlapping, non aligned ranges and then
>> pass the translated input to the driver for start tracking the given
>> ranges.
>>
>> Upon report, the core translates the given input user space bitmap and
>> page size into an IOVA kernel bitmap iterator. Then it iterates it and
>> call the driver to set the corresponding bits for the dirtied pages in a
>> specific IOVA range.
>>
>> Upon stop, the driver is called to stop the previous started tracking.
>>
>> The next patches from the series will introduce the mlx5 driver
>> implementation for the logging ops.
>>
>> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
>> ---
>>   drivers/vfio/Kconfig             |   1 +
>>   drivers/vfio/pci/vfio_pci_core.c |   5 +
>>   drivers/vfio/vfio_main.c         | 161 +++++++++++++++++++++++++++++++
>>   include/linux/vfio.h             |  21 +++-
>>   4 files changed, 186 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
>> index 6130d00252ed..86c381ceb9a1 100644
>> --- a/drivers/vfio/Kconfig
>> +++ b/drivers/vfio/Kconfig
>> @@ -3,6 +3,7 @@ menuconfig VFIO
>>   	tristate "VFIO Non-Privileged userspace driver framework"
>>   	select IOMMU_API
>>   	select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64)
>> +	select INTERVAL_TREE
>>   	help
>>   	  VFIO provides a framework for secure userspace device drivers.
>>   	  See Documentation/driver-api/vfio.rst for more details.
>> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
>> index 2efa06b1fafa..b6dabf398251 100644
>> --- a/drivers/vfio/pci/vfio_pci_core.c
>> +++ b/drivers/vfio/pci/vfio_pci_core.c
>> @@ -1862,6 +1862,11 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
>>   			return -EINVAL;
>>   	}
>>   
>> +	if (vdev->vdev.log_ops && !(vdev->vdev.log_ops->log_start &&
>> +	    vdev->vdev.log_ops->log_stop &&
>> +	    vdev->vdev.log_ops->log_read_and_clear))
>> +		return -EINVAL;
>> +
>>   	/*
>>   	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
>>   	 * by the host or other users.  We cannot capture the VFs if they
>> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
>> index bd84ca7c5e35..2414d827e3c8 100644
>> --- a/drivers/vfio/vfio_main.c
>> +++ b/drivers/vfio/vfio_main.c
>> @@ -32,6 +32,8 @@
>>   #include <linux/vfio.h>
>>   #include <linux/wait.h>
>>   #include <linux/sched/signal.h>
>> +#include <linux/interval_tree.h>
>> +#include <linux/iova_bitmap.h>
>>   #include "vfio.h"
>>   
>>   #define DRIVER_VERSION	"0.3"
>> @@ -1603,6 +1605,153 @@ static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
>>   	return 0;
>>   }
>>   
>> +#define LOG_MAX_RANGES 1024
>> +
>> +static int
>> +vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
>> +					u32 flags, void __user *arg,
>> +					size_t argsz)
>> +{
>> +	size_t minsz =
>> +		offsetofend(struct vfio_device_feature_dma_logging_control,
>> +			    ranges);
>> +	struct vfio_device_feature_dma_logging_range __user *ranges;
>> +	struct vfio_device_feature_dma_logging_control control;
>> +	struct vfio_device_feature_dma_logging_range range;
>> +	struct rb_root_cached root = RB_ROOT_CACHED;
>> +	struct interval_tree_node *nodes;
>> +	u32 nnodes;
>> +	int i, ret;
>> +
>> +	if (!device->log_ops)
>> +		return -ENOTTY;
>> +
>> +	ret = vfio_check_feature(flags, argsz,
>> +				 VFIO_DEVICE_FEATURE_SET,
>> +				 sizeof(control));
>> +	if (ret != 1)
>> +		return ret;
>> +
>> +	if (copy_from_user(&control, arg, minsz))
>> +		return -EFAULT;
>> +
>> +	nnodes = control.num_ranges;
>> +	if (!nnodes || nnodes > LOG_MAX_RANGES)
>> +		return -EINVAL;
> The latter looks more like an -E2BIG errno.

OK

> This is a hard coded
> limit, but what are the heuristics?  Can a user introspect the limit?
> Thanks,
>
> Alex

This hard-coded value just exists to prevent user space from causing 
excessive kernel memory allocation.

We don't really expect user space to hit this limit; the RAM in QEMU is 
divided today into around ~12 ranges, as we have seen so far in our evaluation.

We may also expect user space to combine contiguous ranges into a single 
range or, in the worst case, even to combine non-contiguous ranges into a 
single range.

We can consider moving this hard-coded value into the UAPI 
header, although I'm not sure that this is really a must.

What do you think ?

Yishai

>
>> +
>> +	ranges = u64_to_user_ptr(control.ranges);
>> +	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
>> +			      GFP_KERNEL);
>> +	if (!nodes)
>> +		return -ENOMEM;
>> +
>> +	for (i = 0; i < nnodes; i++) {
>> +		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
>> +			ret = -EFAULT;
>> +			goto end;
>> +		}
>> +		if (!IS_ALIGNED(range.iova, control.page_size) ||
>> +		    !IS_ALIGNED(range.length, control.page_size)) {
>> +			ret = -EINVAL;
>> +			goto end;
>> +		}
>> +		nodes[i].start = range.iova;
>> +		nodes[i].last = range.iova + range.length - 1;
>> +		if (interval_tree_iter_first(&root, nodes[i].start,
>> +					     nodes[i].last)) {
>> +			/* Range overlapping */
>> +			ret = -EINVAL;
>> +			goto end;
>> +		}
>> +		interval_tree_insert(nodes + i, &root);
>> +	}
>> +
>> +	ret = device->log_ops->log_start(device, &root, nnodes,
>> +					 &control.page_size);
>> +	if (ret)
>> +		goto end;
>> +
>> +	if (copy_to_user(arg, &control, sizeof(control))) {
>> +		ret = -EFAULT;
>> +		device->log_ops->log_stop(device);
>> +	}
>> +
>> +end:
>> +	kfree(nodes);
>> +	return ret;
>> +}
>> +
>> +static int
>> +vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
>> +				       u32 flags, void __user *arg,
>> +				       size_t argsz)
>> +{
>> +	int ret;
>> +
>> +	if (!device->log_ops)
>> +		return -ENOTTY;
>> +
>> +	ret = vfio_check_feature(flags, argsz,
>> +				 VFIO_DEVICE_FEATURE_SET, 0);
>> +	if (ret != 1)
>> +		return ret;
>> +
>> +	return device->log_ops->log_stop(device);
>> +}
>> +
>> +static int
>> +vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
>> +					 u32 flags, void __user *arg,
>> +					 size_t argsz)
>> +{
>> +	size_t minsz =
>> +		offsetofend(struct vfio_device_feature_dma_logging_report,
>> +			    bitmap);
>> +	struct vfio_device_feature_dma_logging_report report;
>> +	struct iova_bitmap_iter iter;
>> +	int ret;
>> +
>> +	if (!device->log_ops)
>> +		return -ENOTTY;
>> +
>> +	ret = vfio_check_feature(flags, argsz,
>> +				 VFIO_DEVICE_FEATURE_GET,
>> +				 sizeof(report));
>> +	if (ret != 1)
>> +		return ret;
>> +
>> +	if (copy_from_user(&report, arg, minsz))
>> +		return -EFAULT;
>> +
>> +	if (report.page_size < PAGE_SIZE)
>> +		return -EINVAL;
>> +
>> +	iova_bitmap_init(&iter.dirty, report.iova, ilog2(report.page_size));
>> +	ret = iova_bitmap_iter_init(&iter, report.iova, report.length,
>> +				    u64_to_user_ptr(report.bitmap));
>> +	if (ret)
>> +		return ret;
>> +
>> +	for (; !iova_bitmap_iter_done(&iter);
>> +	     iova_bitmap_iter_advance(&iter)) {
>> +		ret = iova_bitmap_iter_get(&iter);
>> +		if (ret)
>> +			break;
>> +
>> +		ret = device->log_ops->log_read_and_clear(device,
>> +			iova_bitmap_iova(&iter),
>> +			iova_bitmap_length(&iter), &iter.dirty);
>> +
>> +		iova_bitmap_iter_put(&iter);
>> +
>> +		if (ret)
>> +			break;
>> +	}
>> +
>> +	iova_bitmap_iter_free(&iter);
>> +	return ret;
>> +}
>> +
>>   static int vfio_ioctl_device_feature(struct vfio_device *device,
>>   				     struct vfio_device_feature __user *arg)
>>   {
>> @@ -1636,6 +1785,18 @@ static int vfio_ioctl_device_feature(struct vfio_device *device,
>>   		return vfio_ioctl_device_feature_mig_device_state(
>>   			device, feature.flags, arg->data,
>>   			feature.argsz - minsz);
>> +	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
>> +		return vfio_ioctl_device_feature_logging_start(
>> +			device, feature.flags, arg->data,
>> +			feature.argsz - minsz);
>> +	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
>> +		return vfio_ioctl_device_feature_logging_stop(
>> +			device, feature.flags, arg->data,
>> +			feature.argsz - minsz);
>> +	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
>> +		return vfio_ioctl_device_feature_logging_report(
>> +			device, feature.flags, arg->data,
>> +			feature.argsz - minsz);
>>   	default:
>>   		if (unlikely(!device->ops->device_feature))
>>   			return -EINVAL;
>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>> index 4d26e149db81..feed84d686ec 100644
>> --- a/include/linux/vfio.h
>> +++ b/include/linux/vfio.h
>> @@ -14,6 +14,7 @@
>>   #include <linux/workqueue.h>
>>   #include <linux/poll.h>
>>   #include <uapi/linux/vfio.h>
>> +#include <linux/iova_bitmap.h>
>>   
>>   struct kvm;
>>   
>> @@ -33,10 +34,11 @@ struct vfio_device {
>>   	struct device *dev;
>>   	const struct vfio_device_ops *ops;
>>   	/*
>> -	 * mig_ops is a static property of the vfio_device which must be set
>> -	 * prior to registering the vfio_device.
>> +	 * mig_ops/log_ops is a static property of the vfio_device which must
>> +	 * be set prior to registering the vfio_device.
>>   	 */
>>   	const struct vfio_migration_ops *mig_ops;
>> +	const struct vfio_log_ops *log_ops;
>>   	struct vfio_group *group;
>>   	struct vfio_device_set *dev_set;
>>   	struct list_head dev_set_list;
>> @@ -104,6 +106,21 @@ struct vfio_migration_ops {
>>   				   enum vfio_device_mig_state *curr_state);
>>   };
>>   
>> +/**
>> + * @log_start: Optional callback to ask the device start DMA logging.
>> + * @log_stop: Optional callback to ask the device stop DMA logging.
>> + * @log_read_and_clear: Optional callback to ask the device read
>> + *         and clear the dirty DMAs in some given range.
>> + */
>> +struct vfio_log_ops {
>> +	int (*log_start)(struct vfio_device *device,
>> +		struct rb_root_cached *ranges, u32 nnodes, u64 *page_size);
>> +	int (*log_stop)(struct vfio_device *device);
>> +	int (*log_read_and_clear)(struct vfio_device *device,
>> +		unsigned long iova, unsigned long length,
>> +		struct iova_bitmap *dirty);
>> +};
>> +
>>   /**
>>    * vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl
>>    * @flags: Arg from the device_feature op
Alex Williamson July 19, 2022, 7:25 p.m. UTC | #3
On Tue, 19 Jul 2022 12:19:25 +0300
Yishai Hadas <yishaih@nvidia.com> wrote:

> On 19/07/2022 1:30, Alex Williamson wrote:
> > On Thu, 14 Jul 2022 11:12:46 +0300
> > Yishai Hadas <yishaih@nvidia.com> wrote:
> >  
> >> Introduce the DMA logging feature support in the vfio core layer.
> >>
> >> It includes the processing of the device start/stop/report DMA logging
> >> UAPIs and calling the relevant driver 'op' to do the work.
> >>
> >> Specifically,
> >> Upon start, the core translates the given input ranges into an interval
> >> tree, checks for unexpected overlapping, non aligned ranges and then
> >> pass the translated input to the driver for start tracking the given
> >> ranges.
> >>
> >> Upon report, the core translates the given input user space bitmap and
> >> page size into an IOVA kernel bitmap iterator. Then it iterates it and
> >> call the driver to set the corresponding bits for the dirtied pages in a
> >> specific IOVA range.
> >>
> >> Upon stop, the driver is called to stop the previous started tracking.
> >>
> >> The next patches from the series will introduce the mlx5 driver
> >> implementation for the logging ops.
> >>
> >> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> >> ---
> >>   drivers/vfio/Kconfig             |   1 +
> >>   drivers/vfio/pci/vfio_pci_core.c |   5 +
> >>   drivers/vfio/vfio_main.c         | 161 +++++++++++++++++++++++++++++++
> >>   include/linux/vfio.h             |  21 +++-
> >>   4 files changed, 186 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> >> index 6130d00252ed..86c381ceb9a1 100644
> >> --- a/drivers/vfio/Kconfig
> >> +++ b/drivers/vfio/Kconfig
> >> @@ -3,6 +3,7 @@ menuconfig VFIO
> >>   	tristate "VFIO Non-Privileged userspace driver framework"
> >>   	select IOMMU_API
> >>   	select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64)
> >> +	select INTERVAL_TREE
> >>   	help
> >>   	  VFIO provides a framework for secure userspace device drivers.
> >>   	  See Documentation/driver-api/vfio.rst for more details.
> >> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> >> index 2efa06b1fafa..b6dabf398251 100644
> >> --- a/drivers/vfio/pci/vfio_pci_core.c
> >> +++ b/drivers/vfio/pci/vfio_pci_core.c
> >> @@ -1862,6 +1862,11 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
> >>   			return -EINVAL;
> >>   	}
> >>   
> >> +	if (vdev->vdev.log_ops && !(vdev->vdev.log_ops->log_start &&
> >> +	    vdev->vdev.log_ops->log_stop &&
> >> +	    vdev->vdev.log_ops->log_read_and_clear))
> >> +		return -EINVAL;
> >> +
> >>   	/*
> >>   	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
> >>   	 * by the host or other users.  We cannot capture the VFs if they
> >> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> >> index bd84ca7c5e35..2414d827e3c8 100644
> >> --- a/drivers/vfio/vfio_main.c
> >> +++ b/drivers/vfio/vfio_main.c
> >> @@ -32,6 +32,8 @@
> >>   #include <linux/vfio.h>
> >>   #include <linux/wait.h>
> >>   #include <linux/sched/signal.h>
> >> +#include <linux/interval_tree.h>
> >> +#include <linux/iova_bitmap.h>
> >>   #include "vfio.h"
> >>   
> >>   #define DRIVER_VERSION	"0.3"
> >> @@ -1603,6 +1605,153 @@ static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
> >>   	return 0;
> >>   }
> >>   
> >> +#define LOG_MAX_RANGES 1024
> >> +
> >> +static int
> >> +vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
> >> +					u32 flags, void __user *arg,
> >> +					size_t argsz)
> >> +{
> >> +	size_t minsz =
> >> +		offsetofend(struct vfio_device_feature_dma_logging_control,
> >> +			    ranges);
> >> +	struct vfio_device_feature_dma_logging_range __user *ranges;
> >> +	struct vfio_device_feature_dma_logging_control control;
> >> +	struct vfio_device_feature_dma_logging_range range;
> >> +	struct rb_root_cached root = RB_ROOT_CACHED;
> >> +	struct interval_tree_node *nodes;
> >> +	u32 nnodes;
> >> +	int i, ret;
> >> +
> >> +	if (!device->log_ops)
> >> +		return -ENOTTY;
> >> +
> >> +	ret = vfio_check_feature(flags, argsz,
> >> +				 VFIO_DEVICE_FEATURE_SET,
> >> +				 sizeof(control));
> >> +	if (ret != 1)
> >> +		return ret;
> >> +
> >> +	if (copy_from_user(&control, arg, minsz))
> >> +		return -EFAULT;
> >> +
> >> +	nnodes = control.num_ranges;
> >> +	if (!nnodes || nnodes > LOG_MAX_RANGES)
> >> +		return -EINVAL;  
> > The latter looks more like an -E2BIG errno.  
> 
> OK
> 
> > This is a hard coded
> > limit, but what are the heuristics?  Can a user introspect the limit?
> > Thanks,
> >
> > Alex  
> 
> This hard coded value just comes to prevent user space from exploding 
> kernel memory allocation.

Of course.

> We don't really expect user space to hit this limit, the RAM in QEMU is 
> divided today to around ~12 ranges as we saw so far in our evaluation.

There can be far more for vIOMMU use cases or non-QEMU drivers.

> We may also expect user space to combine contiguous ranges to a single 
> range or in the worst case even to combine non contiguous ranges to a 
> single range.

Why do we expect that from users?
 
> We can consider moving this hard-coded value to be part of the UAPI 
> header, although, not sure that this is really a must.
> 
> What do you think ?

We're looking at a very narrow use case with implicit assumptions about
the behavior of the user driver.  Some of those assumptions need to be
exposed via the uAPI so that userspace can make reasonable choices.
Thanks,

Alex

> >> +
> >> +	ranges = u64_to_user_ptr(control.ranges);
> >> +	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
> >> +			      GFP_KERNEL);
> >> +	if (!nodes)
> >> +		return -ENOMEM;
> >> +
> >> +	for (i = 0; i < nnodes; i++) {
> >> +		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
> >> +			ret = -EFAULT;
> >> +			goto end;
> >> +		}
> >> +		if (!IS_ALIGNED(range.iova, control.page_size) ||
> >> +		    !IS_ALIGNED(range.length, control.page_size)) {
> >> +			ret = -EINVAL;
> >> +			goto end;
> >> +		}
> >> +		nodes[i].start = range.iova;
> >> +		nodes[i].last = range.iova + range.length - 1;
> >> +		if (interval_tree_iter_first(&root, nodes[i].start,
> >> +					     nodes[i].last)) {
> >> +			/* Range overlapping */
> >> +			ret = -EINVAL;
> >> +			goto end;
> >> +		}
> >> +		interval_tree_insert(nodes + i, &root);
> >> +	}
> >> +
> >> +	ret = device->log_ops->log_start(device, &root, nnodes,
> >> +					 &control.page_size);
> >> +	if (ret)
> >> +		goto end;
> >> +
> >> +	if (copy_to_user(arg, &control, sizeof(control))) {
> >> +		ret = -EFAULT;
> >> +		device->log_ops->log_stop(device);
> >> +	}
> >> +
> >> +end:
> >> +	kfree(nodes);
> >> +	return ret;
> >> +}
> >> +
> >> +static int
> >> +vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
> >> +				       u32 flags, void __user *arg,
> >> +				       size_t argsz)
> >> +{
> >> +	int ret;
> >> +
> >> +	if (!device->log_ops)
> >> +		return -ENOTTY;
> >> +
> >> +	ret = vfio_check_feature(flags, argsz,
> >> +				 VFIO_DEVICE_FEATURE_SET, 0);
> >> +	if (ret != 1)
> >> +		return ret;
> >> +
> >> +	return device->log_ops->log_stop(device);
> >> +}
> >> +
> >> +static int
> >> +vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
> >> +					 u32 flags, void __user *arg,
> >> +					 size_t argsz)
> >> +{
> >> +	size_t minsz =
> >> +		offsetofend(struct vfio_device_feature_dma_logging_report,
> >> +			    bitmap);
> >> +	struct vfio_device_feature_dma_logging_report report;
> >> +	struct iova_bitmap_iter iter;
> >> +	int ret;
> >> +
> >> +	if (!device->log_ops)
> >> +		return -ENOTTY;
> >> +
> >> +	ret = vfio_check_feature(flags, argsz,
> >> +				 VFIO_DEVICE_FEATURE_GET,
> >> +				 sizeof(report));
> >> +	if (ret != 1)
> >> +		return ret;
> >> +
> >> +	if (copy_from_user(&report, arg, minsz))
> >> +		return -EFAULT;
> >> +
> >> +	if (report.page_size < PAGE_SIZE)
> >> +		return -EINVAL;
> >> +
> >> +	iova_bitmap_init(&iter.dirty, report.iova, ilog2(report.page_size));
> >> +	ret = iova_bitmap_iter_init(&iter, report.iova, report.length,
> >> +				    u64_to_user_ptr(report.bitmap));
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	for (; !iova_bitmap_iter_done(&iter);
> >> +	     iova_bitmap_iter_advance(&iter)) {
> >> +		ret = iova_bitmap_iter_get(&iter);
> >> +		if (ret)
> >> +			break;
> >> +
> >> +		ret = device->log_ops->log_read_and_clear(device,
> >> +			iova_bitmap_iova(&iter),
> >> +			iova_bitmap_length(&iter), &iter.dirty);
> >> +
> >> +		iova_bitmap_iter_put(&iter);
> >> +
> >> +		if (ret)
> >> +			break;
> >> +	}
> >> +
> >> +	iova_bitmap_iter_free(&iter);
> >> +	return ret;
> >> +}
> >> +
> >>   static int vfio_ioctl_device_feature(struct vfio_device *device,
> >>   				     struct vfio_device_feature __user *arg)
> >>   {
> >> @@ -1636,6 +1785,18 @@ static int vfio_ioctl_device_feature(struct vfio_device *device,
> >>   		return vfio_ioctl_device_feature_mig_device_state(
> >>   			device, feature.flags, arg->data,
> >>   			feature.argsz - minsz);
> >> +	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
> >> +		return vfio_ioctl_device_feature_logging_start(
> >> +			device, feature.flags, arg->data,
> >> +			feature.argsz - minsz);
> >> +	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
> >> +		return vfio_ioctl_device_feature_logging_stop(
> >> +			device, feature.flags, arg->data,
> >> +			feature.argsz - minsz);
> >> +	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
> >> +		return vfio_ioctl_device_feature_logging_report(
> >> +			device, feature.flags, arg->data,
> >> +			feature.argsz - minsz);
> >>   	default:
> >>   		if (unlikely(!device->ops->device_feature))
> >>   			return -EINVAL;
> >> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> >> index 4d26e149db81..feed84d686ec 100644
> >> --- a/include/linux/vfio.h
> >> +++ b/include/linux/vfio.h
> >> @@ -14,6 +14,7 @@
> >>   #include <linux/workqueue.h>
> >>   #include <linux/poll.h>
> >>   #include <uapi/linux/vfio.h>
> >> +#include <linux/iova_bitmap.h>
> >>   
> >>   struct kvm;
> >>   
> >> @@ -33,10 +34,11 @@ struct vfio_device {
> >>   	struct device *dev;
> >>   	const struct vfio_device_ops *ops;
> >>   	/*
> >> -	 * mig_ops is a static property of the vfio_device which must be set
> >> -	 * prior to registering the vfio_device.
> >> +	 * mig_ops/log_ops is a static property of the vfio_device which must
> >> +	 * be set prior to registering the vfio_device.
> >>   	 */
> >>   	const struct vfio_migration_ops *mig_ops;
> >> +	const struct vfio_log_ops *log_ops;
> >>   	struct vfio_group *group;
> >>   	struct vfio_device_set *dev_set;
> >>   	struct list_head dev_set_list;
> >> @@ -104,6 +106,21 @@ struct vfio_migration_ops {
> >>   				   enum vfio_device_mig_state *curr_state);
> >>   };
> >>   
> >> +/**
> >> + * @log_start: Optional callback to ask the device start DMA logging.
> >> + * @log_stop: Optional callback to ask the device stop DMA logging.
> >> + * @log_read_and_clear: Optional callback to ask the device read
> >> + *         and clear the dirty DMAs in some given range.
> >> + */
> >> +struct vfio_log_ops {
> >> +	int (*log_start)(struct vfio_device *device,
> >> +		struct rb_root_cached *ranges, u32 nnodes, u64 *page_size);
> >> +	int (*log_stop)(struct vfio_device *device);
> >> +	int (*log_read_and_clear)(struct vfio_device *device,
> >> +		unsigned long iova, unsigned long length,
> >> +		struct iova_bitmap *dirty);
> >> +};
> >> +
> >>   /**
> >>    * vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl
> >>    * @flags: Arg from the device_feature op  
> 
>
Jason Gunthorpe July 19, 2022, 8:08 p.m. UTC | #4
On Tue, Jul 19, 2022 at 01:25:14PM -0600, Alex Williamson wrote:

> > We don't really expect user space to hit this limit, the RAM in QEMU is 
> > divided today to around ~12 ranges as we saw so far in our evaluation.
> 
> There can be far more for vIOMMU use cases or non-QEMU drivers.

Not really, it isn't dynamic so vIOMMU has to decide what it wants to
track up front. It would never make sense to track based on what is
currently mapped. So it will be some small list, probably a big linear
chunk of the IOVA space.

No idea why non-qemu cases would need to be so different?

> We're looking at a very narrow use case with implicit assumptions about
> the behavior of the user driver.  Some of those assumptions need to be
> exposed via the uAPI so that userspace can make reasonable choices.

I think we need to see a clear use case for more than a few 10's of
ranges before we complicate things. I don't see one. If one does crop
up someday it is easy to add a new query, or some other behavior.

Remember, the devices can't handle huge numbers of ranges anyhow, so
any userspace should not be designed to have more than a few tens in
the first place.

Jason
Tian, Kevin July 21, 2022, 8:54 a.m. UTC | #5
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, July 20, 2022 4:08 AM
> 
> On Tue, Jul 19, 2022 at 01:25:14PM -0600, Alex Williamson wrote:
> 
> > > We don't really expect user space to hit this limit, the RAM in QEMU is
> > > divided today to around ~12 ranges as we saw so far in our evaluation.
> >
> > There can be far more for vIOMMU use cases or non-QEMU drivers.
> 
> Not really, it isn't dynamic so vIOMMU has to decide what it wants to
> track up front. It would never make sense to track based on what is
> currently mapped. So it will be some small list, probably a big linear
> chunk of the IOVA space.

How would vIOMMU make such a decision when the address space
is managed by the guest? It is dynamic and could be sparse. I'm
curious about any example a vIOMMU can use to generate such a small
list. Would it be a single range based on the aperture reported from the
kernel?
Jason Gunthorpe July 21, 2022, 11:50 a.m. UTC | #6
On Thu, Jul 21, 2022 at 08:54:39AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Wednesday, July 20, 2022 4:08 AM
> > 
> > On Tue, Jul 19, 2022 at 01:25:14PM -0600, Alex Williamson wrote:
> > 
> > > > We don't really expect user space to hit this limit, the RAM in QEMU is
> > > > divided today to around ~12 ranges as we saw so far in our evaluation.
> > >
> > > There can be far more for vIOMMU use cases or non-QEMU drivers.
> > 
> > Not really, it isn't dynamic so vIOMMU has to decide what it wants to
> > track up front. It would never make sense to track based on what is
> > currently mapped. So it will be some small list, probably a big linear
> > chunk of the IOVA space.
> 
> How would vIOMMU make such a decision when the address space
> is managed by the guest? It is dynamic and could be sparse. I'm
> curious about any example a vIOMMU can use to generate such a small
> list. Would it be a single range based on the aperture reported from the
> kernel?

Yes. qemu has to select a static aperture at start.

 The entire aperture is best, if that fails

 A smaller aperture and hope the guest doesn't use the whole space, if
 that fails,

 The entire guest physical map and hope the guest is in PT mode

All of these options are small lists.

Any vIOMMU maps that are created outside of what was asked to be
tracked have to be made permanently dirty by qemu.

Jason
Tian, Kevin July 25, 2022, 7:38 a.m. UTC | #7
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, July 21, 2022 7:51 PM
> 
> On Thu, Jul 21, 2022 at 08:54:39AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Wednesday, July 20, 2022 4:08 AM
> > >
> > > On Tue, Jul 19, 2022 at 01:25:14PM -0600, Alex Williamson wrote:
> > >
> > > > > We don't really expect user space to hit this limit, the RAM in QEMU is
> > > > > divided today to around ~12 ranges as we saw so far in our evaluation.
> > > >
> > > > There can be far more for vIOMMU use cases or non-QEMU drivers.
> > >
> > > Not really, it isn't dynamic so vIOMMU has to decide what it wants to
> > > track up front. It would never make sense to track based on what is
> > > currently mapped. So it will be some small list, probably a big linear
> > > chunk of the IOVA space.
> >
> > How would vIOMMU make such a decision when the address space
> > is managed by the guest? It is dynamic and could be sparse. I'm
> > curious about any example a vIOMMU can use to generate such a small
> > list. Would it be a single range based on the aperture reported from the
> > kernel?
> 
> Yes. qemu has to select a static aperture at start.
> 
>  The entire aperture is best, if that fails
> 
>  A smaller aperture and hope the guest doesn't use the whole space, if
>  that fails,
> 
>  The entire guest physical map and hope the guest is in PT mode

That sounds a bit hacky... does it instead suggest that an interface
for reporting the supported ranges on a tracker could be helpful once
trying the entire aperture fails?

> 
> All of these options are small lists.
> 
> Any vIOMMU maps that are created outside of what was asked to be
> tracked have to be made permanently dirty by qemu.
> 
> Jason
Jason Gunthorpe July 25, 2022, 2:37 p.m. UTC | #8
On Mon, Jul 25, 2022 at 07:38:52AM +0000, Tian, Kevin wrote:

> > Yes. qemu has to select a static aperture at start.
> > 
> >  The entire aperture is best, if that fails
> > 
> >  A smaller aperture and hope the guest doesn't use the whole space, if
> >  that fails,
> > 
> >  The entire guest physical map and hope the guest is in PT mode
> 
> That sounds a bit hacky... does it instead suggest that an interface
> for reporting the supported ranges on a tracker could be helpful once
> trying the entire aperture fails?

It is the "try and fail" approach. It gives the driver the most
flexibility in processing the ranges to try and make them work. If we
attempt to describe all the device constraints that might exist we
will be here forever.

Eg the driver might be able to do the entire aperture, but it has to
use 2M pages or something.

Jason
Tian, Kevin July 26, 2022, 7:34 a.m. UTC | #9
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Monday, July 25, 2022 10:37 PM
> 
> On Mon, Jul 25, 2022 at 07:38:52AM +0000, Tian, Kevin wrote:
> 
> > > Yes. qemu has to select a static aperture at start.
> > >
> > >  The entire aperture is best, if that fails
> > >
> > >  A smaller aperture and hope the guest doesn't use the whole space, if
> > >  that fails,
> > >
> > >  The entire guest physical map and hope the guest is in PT mode
> >
> > That sounds a bit hacky... does it instead suggest that an interface
> > for reporting the supported ranges on a tracker could be helpful once
> > trying the entire aperture fails?
> 
> It is the "try and fail" approach. It gives the driver the most
> flexibility in processing the ranges to try and make them work. If we
> attempt to describe all the device constraints that might exist we
> will be here forever.

Usually the caller of a 'try and fail' interface knows exactly what is to
be tried and then calls the interface to see whether the callee can
meet its requirement.

Now the above turns out to be a 'guess and fail' approach in which
the caller doesn't know exactly what should be tried. In this case
even if the attempt succeeds it's a question how helpful it is.

But I can see why a reporting mechanism doesn't fit well with
your example below. In the worst case probably the user has to
decide between using vIOMMU vs. vfio DMA logging if a simple
policy of using the entire aperture doesn't work...

> 
> Eg the driver might be able to do the entire aperture, but it has to
> use 2M pages or something.
>
Jason Gunthorpe July 26, 2022, 3:12 p.m. UTC | #10
On Tue, Jul 26, 2022 at 07:34:55AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Monday, July 25, 2022 10:37 PM
> > 
> > On Mon, Jul 25, 2022 at 07:38:52AM +0000, Tian, Kevin wrote:
> > 
> > > > Yes. qemu has to select a static aperture at start.
> > > >
> > > >  The entire aperture is best, if that fails
> > > >
> > > >  A smaller aperture and hope the guest doesn't use the whole space, if
> > > >  that fails,
> > > >
> > > >  The entire guest physical map and hope the guest is in PT mode
> > >
> > > That sounds a bit hacky... does it instead suggest that an interface
> > > for reporting the supported ranges on a tracker could be helpful once
> > > trying the entire aperture fails?
> > 
> > It is the "try and fail" approach. It gives the driver the most
> > flexibility in processing the ranges to try and make them work. If we
> > attempt to describe all the device constraints that might exist we
> > will be here forever.
> 
> Usually the caller of a 'try and fail' interface knows exactly what is to
> be tried and then calls the interface to see whether the callee can
> meet its requirement.

Which is exactly this case.

qemu has one thing to try that meets its full requirement - the entire
vIOMMU aperture.

The other two are possible options based on assumptions of how the
guest VM is operating that might work - but this guessing is entirely
between qemu and the VM, not something the kernel can help with.

So, from the kernel perspective qemu will try three things in order of
preference and the first to work will be the right one. Making the
kernel API more complicated is not going to help qemu guess what the
guest is doing any better.

In any case this is vIOMMU mode so if the VM establishes mappings
outside the tracked IOVA then qemu is aware of it and qemu can
perma-dirty those pages as part of its migration logic. It is not
broken, it just might not meet the SLA.

> But I can see why a reporting mechanism doesn't fit well with
> your example below. In the worst case probably the user has to
> decide between using vIOMMU vs. vfio DMA logging if a simple
> policy of using the entire aperture doesn't work...

Well, yes, this is exactly the situation unfortunately. Without
special HW support vIOMMU is not going to work perfectly, but there
are reasonable use cases where vIOMMU is on but the guest is in PT
mode that could work, or where the IOVA aperture is limited, or
so on..

Jason
diff mbox series

Patch

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 6130d00252ed..86c381ceb9a1 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,6 +3,7 @@  menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	select IOMMU_API
 	select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64)
+	select INTERVAL_TREE
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/driver-api/vfio.rst for more details.
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 2efa06b1fafa..b6dabf398251 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1862,6 +1862,11 @@  int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
 			return -EINVAL;
 	}
 
+	if (vdev->vdev.log_ops && !(vdev->vdev.log_ops->log_start &&
+	    vdev->vdev.log_ops->log_stop &&
+	    vdev->vdev.log_ops->log_read_and_clear))
+		return -EINVAL;
+
 	/*
 	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
 	 * by the host or other users.  We cannot capture the VFs if they
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index bd84ca7c5e35..2414d827e3c8 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -32,6 +32,8 @@ 
 #include <linux/vfio.h>
 #include <linux/wait.h>
 #include <linux/sched/signal.h>
+#include <linux/interval_tree.h>
+#include <linux/iova_bitmap.h>
 #include "vfio.h"
 
 #define DRIVER_VERSION	"0.3"
@@ -1603,6 +1605,153 @@  static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
 	return 0;
 }
 
+#define LOG_MAX_RANGES 1024
+
+static int
+vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
+					u32 flags, void __user *arg,
+					size_t argsz)
+{
+	size_t minsz =
+		offsetofend(struct vfio_device_feature_dma_logging_control,
+			    ranges);
+	struct vfio_device_feature_dma_logging_range __user *ranges;
+	struct vfio_device_feature_dma_logging_control control;
+	struct vfio_device_feature_dma_logging_range range;
+	struct rb_root_cached root = RB_ROOT_CACHED;
+	struct interval_tree_node *nodes;
+	u32 nnodes;
+	int i, ret;
+
+	if (!device->log_ops)
+		return -ENOTTY;
+
+	ret = vfio_check_feature(flags, argsz,
+				 VFIO_DEVICE_FEATURE_SET,
+				 sizeof(control));
+	if (ret != 1)
+		return ret;
+
+	if (copy_from_user(&control, arg, minsz))
+		return -EFAULT;
+
+	nnodes = control.num_ranges;
+	if (!nnodes || nnodes > LOG_MAX_RANGES)
+		return -EINVAL;
+
+	ranges = u64_to_user_ptr(control.ranges);
+	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
+			      GFP_KERNEL);
+	if (!nodes)
+		return -ENOMEM;
+
+	for (i = 0; i < nnodes; i++) {
+		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
+			ret = -EFAULT;
+			goto end;
+		}
+		if (!IS_ALIGNED(range.iova, control.page_size) ||
+		    !IS_ALIGNED(range.length, control.page_size)) {
+			ret = -EINVAL;
+			goto end;
+		}
+		nodes[i].start = range.iova;
+		nodes[i].last = range.iova + range.length - 1;
+		if (interval_tree_iter_first(&root, nodes[i].start,
+					     nodes[i].last)) {
+			/* Range overlapping */
+			ret = -EINVAL;
+			goto end;
+		}
+		interval_tree_insert(nodes + i, &root);
+	}
+
+	ret = device->log_ops->log_start(device, &root, nnodes,
+					 &control.page_size);
+	if (ret)
+		goto end;
+
+	if (copy_to_user(arg, &control, sizeof(control))) {
+		ret = -EFAULT;
+		device->log_ops->log_stop(device);
+	}
+
+end:
+	kfree(nodes);
+	return ret;
+}
+
+static int
+vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
+				       u32 flags, void __user *arg,
+				       size_t argsz)
+{
+	int ret;
+
+	if (!device->log_ops)
+		return -ENOTTY;
+
+	ret = vfio_check_feature(flags, argsz,
+				 VFIO_DEVICE_FEATURE_SET, 0);
+	if (ret != 1)
+		return ret;
+
+	return device->log_ops->log_stop(device);
+}
+
+static int
+vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
+					 u32 flags, void __user *arg,
+					 size_t argsz)
+{
+	size_t minsz =
+		offsetofend(struct vfio_device_feature_dma_logging_report,
+			    bitmap);
+	struct vfio_device_feature_dma_logging_report report;
+	struct iova_bitmap_iter iter;
+	int ret;
+
+	if (!device->log_ops)
+		return -ENOTTY;
+
+	ret = vfio_check_feature(flags, argsz,
+				 VFIO_DEVICE_FEATURE_GET,
+				 sizeof(report));
+	if (ret != 1)
+		return ret;
+
+	if (copy_from_user(&report, arg, minsz))
+		return -EFAULT;
+
+	if (report.page_size < PAGE_SIZE)
+		return -EINVAL;
+
+	iova_bitmap_init(&iter.dirty, report.iova, ilog2(report.page_size));
+	ret = iova_bitmap_iter_init(&iter, report.iova, report.length,
+				    u64_to_user_ptr(report.bitmap));
+	if (ret)
+		return ret;
+
+	for (; !iova_bitmap_iter_done(&iter);
+	     iova_bitmap_iter_advance(&iter)) {
+		ret = iova_bitmap_iter_get(&iter);
+		if (ret)
+			break;
+
+		ret = device->log_ops->log_read_and_clear(device,
+			iova_bitmap_iova(&iter),
+			iova_bitmap_length(&iter), &iter.dirty);
+
+		iova_bitmap_iter_put(&iter);
+
+		if (ret)
+			break;
+	}
+
+	iova_bitmap_iter_free(&iter);
+	return ret;
+}
+
 static int vfio_ioctl_device_feature(struct vfio_device *device,
 				     struct vfio_device_feature __user *arg)
 {
@@ -1636,6 +1785,18 @@  static int vfio_ioctl_device_feature(struct vfio_device *device,
 		return vfio_ioctl_device_feature_mig_device_state(
 			device, feature.flags, arg->data,
 			feature.argsz - minsz);
+	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
+		return vfio_ioctl_device_feature_logging_start(
+			device, feature.flags, arg->data,
+			feature.argsz - minsz);
+	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
+		return vfio_ioctl_device_feature_logging_stop(
+			device, feature.flags, arg->data,
+			feature.argsz - minsz);
+	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
+		return vfio_ioctl_device_feature_logging_report(
+			device, feature.flags, arg->data,
+			feature.argsz - minsz);
 	default:
 		if (unlikely(!device->ops->device_feature))
 			return -EINVAL;
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 4d26e149db81..feed84d686ec 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -14,6 +14,7 @@ 
 #include <linux/workqueue.h>
 #include <linux/poll.h>
 #include <uapi/linux/vfio.h>
+#include <linux/iova_bitmap.h>
 
 struct kvm;
 
@@ -33,10 +34,11 @@  struct vfio_device {
 	struct device *dev;
 	const struct vfio_device_ops *ops;
 	/*
-	 * mig_ops is a static property of the vfio_device which must be set
-	 * prior to registering the vfio_device.
+	 * mig_ops/log_ops is a static property of the vfio_device which must
+	 * be set prior to registering the vfio_device.
 	 */
 	const struct vfio_migration_ops *mig_ops;
+	const struct vfio_log_ops *log_ops;
 	struct vfio_group *group;
 	struct vfio_device_set *dev_set;
 	struct list_head dev_set_list;
@@ -104,6 +106,21 @@  struct vfio_migration_ops {
 				   enum vfio_device_mig_state *curr_state);
 };
 
+/**
+ * @log_start: Optional callback to ask the device start DMA logging.
+ * @log_stop: Optional callback to ask the device stop DMA logging.
+ * @log_read_and_clear: Optional callback to ask the device read
+ *         and clear the dirty DMAs in some given range.
+ */
+struct vfio_log_ops {
+	int (*log_start)(struct vfio_device *device,
+		struct rb_root_cached *ranges, u32 nnodes, u64 *page_size);
+	int (*log_stop)(struct vfio_device *device);
+	int (*log_read_and_clear)(struct vfio_device *device,
+		unsigned long iova, unsigned long length,
+		struct iova_bitmap *dirty);
+};
+
 /**
  * vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl
  * @flags: Arg from the device_feature op