diff mbox

[RFC] vfio/type1: Add IOVA_RANGE capability support

Message ID 20171206160736.77704-1-shameerali.kolothum.thodi@huawei.com (mailing list archive)
State New, archived
Headers show

Commit Message

Shameerali Kolothum Thodi Dec. 6, 2017, 4:07 p.m. UTC
This patch allows the user-space to retrieve the supported
IOVA range(s), excluding any reserved regions. The implementation
is based on capability chains, added to the VFIO_IOMMU_GET_INFO ioctl.

This is following the discussions here[1] and is based on the RFC patch[2].

ToDo:
 - This currently derives the default supported iova range from the first
   iommu domain. This needs to be changed to go through the domain_list
   instead.
 - Sync with Pierre's patch[3].

1.https://lists.gnu.org/archive/html/qemu-devel/2017-11/msg03651.html
2.https://lists.linuxfoundation.org/pipermail/iommu/2016-November/019002.html
3.https://patchwork.kernel.org/patch/10084655/

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 172 +++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/vfio.h       |  13 +++
 2 files changed, 184 insertions(+), 1 deletion(-)

Comments

Shameerali Kolothum Thodi Dec. 6, 2017, 4:15 p.m. UTC | #1
Hi Pierre,

> -----Original Message-----
> From: Shameerali Kolothum Thodi
> Sent: Wednesday, December 06, 2017 4:08 PM
> To: alex.williamson@redhat.com; eric.auger@redhat.com;
> pmorel@linux.vnet.ibm.com
> Cc: kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
> <linuxarm@huawei.com>; Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>
> Subject: [RFC] vfio/type1: Add IOVA_RANGE capability support
> 
> This patch allows the user-space to retrieve the supported
> IOVA range(s), excluding any reserved regions. The implementation
> is based on capability chains, added to the VFIO_IOMMU_GET_INFO ioctl.
> 
> This is following the discussions here[1] and is based on the RFC patch[2].
> 
> ToDo:
>  - This currently derives the default supported iova range from the first
>    iommu domain. This needs to be changed to go through the domain_list
>    instead.
>  - Sync with Pierre's patch[3].

Thanks to Eric[1], I came to know that you have posted a patch to retrieve the
iommu aperture info. This RFC does a similar thing but tries to take care of
any reserved regions and adds the info to the capability chain. 

Please take a look and if there is a possibility to sync up your next revision
and this, please let me know.

Thanks,
Shameer

1. https://patchwork.kernel.org/patch/10056967/

> 1.https://lists.gnu.org/archive/html/qemu-devel/2017-11/msg03651.html
> 2.https://lists.linuxfoundation.org/pipermail/iommu/2016-
> November/019002.html
> 3.https://patchwork.kernel.org/patch/10084655/
> 
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 172
> +++++++++++++++++++++++++++++++++++++++-
>  include/uapi/linux/vfio.h       |  13 +++
>  2 files changed, 184 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index e30e29a..72ca78a 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -28,6 +28,7 @@
>  #include <linux/device.h>
>  #include <linux/fs.h>
>  #include <linux/iommu.h>
> +#include <linux/list_sort.h>
>  #include <linux/module.h>
>  #include <linux/mm.h>
>  #include <linux/rbtree.h>
> @@ -92,6 +93,12 @@ struct vfio_group {
>  	struct list_head	next;
>  };
> 
> +struct vfio_iommu_iova {
> +	struct list_head	list;
> +	phys_addr_t		start;
> +	phys_addr_t		end;
> +};
> +
>  /*
>   * Guest RAM pinning working set or DMA target
>   */
> @@ -1537,6 +1544,144 @@ static int vfio_domains_have_iommu_cache(struct
> vfio_iommu *iommu)
>  	return ret;
>  }
> 
> +static int vfio_add_iova_cap(struct vfio_info_cap *caps, u64 start, u64 end)
> +{
> +	struct vfio_iommu_type1_info_cap_iova_range *cap;
> +	struct vfio_info_cap_header *header;
> +
> +	header = vfio_info_cap_add(caps, sizeof(*cap),
> +			VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
> +	if (IS_ERR(header))
> +		return PTR_ERR(header);
> +
> +	cap = container_of(header,
> +			   struct vfio_iommu_type1_info_cap_iova_range,
> +			   header);
> +
> +	cap->start = start;
> +	cap->end = end;
> +
> +	return 0;
> +}
> +
> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> +				struct list_head *head)
> +{
> +	struct vfio_iommu_iova *region;
> +
> +	region = kzalloc(sizeof(*region), GFP_KERNEL);
> +	if (!region)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&region->list);
> +	region->start = start;
> +	region->end = end;
> +
> +	list_add_tail(&region->list, head);
> +	return 0;
> +}
> +
> +/*
> + * Check and update iova region list in case a reserved region
> + * overlaps the iommu iova range.
> + */
> +static int vfio_update_iommu_iova_range(phys_addr_t start, phys_addr_t end,
> +					struct list_head *iova)
> +{
> +	struct vfio_iommu_iova *node;
> +	phys_addr_t a, b;
> +	int ret = 0;
> +
> +	if (list_empty(iova))
> +		return -ENODEV;
> +
> +	node = list_last_entry(iova, struct vfio_iommu_iova, list);
> +	a = node->start;
> +	b = node->end;
> +
> +	/* No overlap */
> +	if ((start > b) || (end < a))
> +		return 0;
> +
> +	if (start > a)
> +		ret = vfio_insert_iova(a, start - 1, &node->list);
> +	if (ret)
> +		goto done;
> +	if (end < b)
> +		ret = vfio_insert_iova(end + 1, b, &node->list);
> +
> +done:
> +	list_del(&node->list);
> +	kfree(node);
> +
> +	return ret;
> +}
> +
> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
> +{
> +	struct iommu_resv_region *ra, *rb;
> +
> +	ra = container_of(a, struct iommu_resv_region, list);
> +	rb = container_of(b, struct iommu_resv_region, list);
> +
> +	if (ra->start < rb->start)
> +		return -1;
> +	if (ra->start > rb->start)
> +		return 1;
> +	return 0;
> +}
> +
> +static int vfio_build_iommu_iova_caps(struct vfio_iommu *iommu,
> +				struct vfio_info_cap *caps)
> +{
> +	struct iommu_resv_region *resv, *resv_next;
> +	struct vfio_iommu_iova *iova, *iova_next;
> +	struct list_head group_resv_regions, vfio_iova_regions;
> +	struct vfio_domain *domain;
> +	struct vfio_group *g;
> +	phys_addr_t start, end;
> +	int ret = 0;
> +
> +	domain = list_first_entry(&iommu->domain_list,
> +				  struct vfio_domain, next);
> +	/* Get the default iova range supported */
> +	start = domain->domain->geometry.aperture_start;
> +	end = domain->domain->geometry.aperture_end;
> +	INIT_LIST_HEAD(&vfio_iova_regions);
> +	vfio_insert_iova(start, end, &vfio_iova_regions);
> +
> +	/* Get reserved regions if any */
> +	INIT_LIST_HEAD(&group_resv_regions);
> +	list_for_each_entry(g, &domain->group_list, next)
> +		iommu_get_group_resv_regions(g->iommu_group,
> +						&group_resv_regions);
> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
> +
> +	/* Update iova range excluding reserved regions */
> +	list_for_each_entry(resv, &group_resv_regions, list) {
> +		ret = vfio_update_iommu_iova_range(resv->start,
> +				resv->start + resv->length - 1,
> +				&vfio_iova_regions);
> +		if (ret)
> +			goto done;
> +	}
> +
> +	list_for_each_entry(iova, &vfio_iova_regions, list) {
> +		ret = vfio_add_iova_cap(caps, iova->start, iova->end);
> +		if (ret)
> +			goto done;
> +	}
> +
> +done:
> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> +		kfree(resv);
> +
> +	list_for_each_entry_safe(iova, iova_next, &vfio_iova_regions, list)
> +		kfree(iova);
> +
> +	return ret;
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  				   unsigned int cmd, unsigned long arg)
>  {
> @@ -1558,8 +1703,10 @@ static long vfio_iommu_type1_ioctl(void
> *iommu_data,
>  		}
>  	} else if (cmd == VFIO_IOMMU_GET_INFO) {
>  		struct vfio_iommu_type1_info info;
> +		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
> +		int ret;
> 
> -		minsz = offsetofend(struct vfio_iommu_type1_info,
> iova_pgsizes);
> +		minsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
> 
>  		if (copy_from_user(&info, (void __user *)arg, minsz))
>  			return -EFAULT;
> @@ -1571,6 +1718,29 @@ static long vfio_iommu_type1_ioctl(void
> *iommu_data,
> 
>  		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
> 
> +		ret = vfio_build_iommu_iova_caps(iommu, &caps);
> +		if (ret)
> +			return ret;
> +
> +		if (caps.size) {
> +			info.flags |= VFIO_IOMMU_INFO_CAPS;
> +			if (info.argsz < sizeof(info) + caps.size) {
> +				info.argsz = sizeof(info) + caps.size;
> +				info.cap_offset = 0;
> +			} else {
> +				vfio_info_cap_shift(&caps, sizeof(info));
> +				if (copy_to_user((void __user *)arg +
> +						sizeof(info), caps.buf,
> +						caps.size)) {
> +					kfree(caps.buf);
> +					return -EFAULT;
> +				}
> +				info.cap_offset = sizeof(info);
> +			}
> +
> +			kfree(caps.buf);
> +		}
> +
>  		return copy_to_user((void __user *)arg, &info, minsz) ?
>  			-EFAULT : 0;
> 
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index e3301db..c4e338b 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -517,7 +517,20 @@ struct vfio_iommu_type1_info {
>  	__u32	argsz;
>  	__u32	flags;
>  #define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
> +#define VFIO_IOMMU_INFO_CAPS	(1 << 1)	/* Info supports caps */
>  	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
> +	__u32   cap_offset;	/* Offset within info struct of first cap */
> +	__u32	__resv;
> +};
> +
> +/*
> + * The IOVA_RANGE capability allows to report the IOVA range(s),
> + */
> +#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE  1
> +struct vfio_iommu_type1_info_cap_iova_range {
> +	struct vfio_info_cap_header header;
> +	__u64 start;
> +	__u64 end;
>  };
> 
>  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> --
> 1.9.1
>
Pierre Morel Dec. 7, 2017, 12:55 p.m. UTC | #2
On 06/12/2017 17:15, Shameerali Kolothum Thodi wrote:
> Hi Pierre,
> 
>> -----Original Message-----
>> From: Shameerali Kolothum Thodi
>> Sent: Wednesday, December 06, 2017 4:08 PM
>> To: alex.williamson@redhat.com; eric.auger@redhat.com;
>> pmorel@linux.vnet.ibm.com
>> Cc: kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
>> <linuxarm@huawei.com>; Shameerali Kolothum Thodi
>> <shameerali.kolothum.thodi@huawei.com>
>> Subject: [RFC] vfio/type1: Add IOVA_RANGE capability support
>>
>> This patch allows the user-space to retrieve the supported
>> IOVA range(s), excluding any reserved regions. The implementation
>> is based on capability chains, added to the VFIO_IOMMU_GET_INFO ioctl.
>>
>> This is following the discussions here[1] and is based on the RFC patch[2].
>>
>> ToDo:
>>   - This currently derives the default supported iova range from the first
>>     iommu domain. This needs to be changed to go through the domain_list
>>     instead.
>>   - Sync with Pierre's patch[3].
> 
> Thanks to Eric[1], came to know that you have posted a patch to retrieve the
> iommu aperture info. This RFC does a similar thing but try to take care of
> any reserved regions and adds to the capability chain.
> 
> Please take a look and if there is a possibility to sync up your next revision
> and this, please let me know.

Hi Shameer,

Indeed it is close to what I was developing.

I have a single concern, the aperture is strongly hardware related while 
reserved areas are much more flexible.

As a consequence, I think it would be better if they were handled in 
separate capabilities and let the user space decide if it wants to know 
about one or the other.

For my immediate needs, this patch would be OK since we (s390x) do not 
use reserved regions.

@Alex: what do you prefer?
If we need two capabilities, I will send the patch series I made on the 
aperture capability for VFIO IOMMU.
If not I will use Shameer's patch.

Thanks,

Best regards,

Pierre

> 
> Thanks,
> Shameer
> 
> 1. https://patchwork.kernel.org/patch/10056967/
> 
>> 1.https://lists.gnu.org/archive/html/qemu-devel/2017-11/msg03651.html
>> 2.https://lists.linuxfoundation.org/pipermail/iommu/2016-
>> November/019002.html
>> 3.https://patchwork.kernel.org/patch/10084655/
>>
>> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
>> ---
>>   drivers/vfio/vfio_iommu_type1.c | 172
>> +++++++++++++++++++++++++++++++++++++++-
>>   include/uapi/linux/vfio.h       |  13 +++
>>   2 files changed, 184 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index e30e29a..72ca78a 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -28,6 +28,7 @@
>>   #include <linux/device.h>
>>   #include <linux/fs.h>
>>   #include <linux/iommu.h>
>> +#include <linux/list_sort.h>
>>   #include <linux/module.h>
>>   #include <linux/mm.h>
>>   #include <linux/rbtree.h>
>> @@ -92,6 +93,12 @@ struct vfio_group {
>>   	struct list_head	next;
>>   };
>>
>> +struct vfio_iommu_iova {
>> +	struct list_head	list;
>> +	phys_addr_t		start;
>> +	phys_addr_t		end;
>> +};
>> +
>>   /*
>>    * Guest RAM pinning working set or DMA target
>>    */
>> @@ -1537,6 +1544,144 @@ static int vfio_domains_have_iommu_cache(struct
>> vfio_iommu *iommu)
>>   	return ret;
>>   }
>>
>> +static int vfio_add_iova_cap(struct vfio_info_cap *caps, u64 start, u64 end)
>> +{
>> +	struct vfio_iommu_type1_info_cap_iova_range *cap;
>> +	struct vfio_info_cap_header *header;
>> +
>> +	header = vfio_info_cap_add(caps, sizeof(*cap),
>> +			VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
>> +	if (IS_ERR(header))
>> +		return PTR_ERR(header);
>> +
>> +	cap = container_of(header,
>> +			   struct vfio_iommu_type1_info_cap_iova_range,
>> +			   header);
>> +
>> +	cap->start = start;
>> +	cap->end = end;
>> +
>> +	return 0;
>> +}
>> +
>> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
>> +				struct list_head *head)
>> +{
>> +	struct vfio_iommu_iova *region;
>> +
>> +	region = kzalloc(sizeof(*region), GFP_KERNEL);
>> +	if (!region)
>> +		return -ENOMEM;
>> +
>> +	INIT_LIST_HEAD(&region->list);
>> +	region->start = start;
>> +	region->end = end;
>> +
>> +	list_add_tail(&region->list, head);
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Check and update iova region list in case a reserved region
>> + * overlaps the iommu iova range.
>> + */
>> +static int vfio_update_iommu_iova_range(phys_addr_t start, phys_addr_t end,
>> +					struct list_head *iova)
>> +{
>> +	struct vfio_iommu_iova *node;
>> +	phys_addr_t a, b;
>> +	int ret = 0;
>> +
>> +	if (list_empty(iova))
>> +		return -ENODEV;
>> +
>> +	node = list_last_entry(iova, struct vfio_iommu_iova, list);
>> +	a = node->start;
>> +	b = node->end;
>> +
>> +	/* No overlap */
>> +	if ((start > b) || (end < a))
>> +		return 0;
>> +
>> +	if (start > a)
>> +		ret = vfio_insert_iova(a, start - 1, &node->list);
>> +	if (ret)
>> +		goto done;
>> +	if (end < b)
>> +		ret = vfio_insert_iova(end + 1, b, &node->list);
>> +
>> +done:
>> +	list_del(&node->list);
>> +	kfree(node);
>> +
>> +	return ret;
>> +}
>> +
>> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
>> +{
>> +	struct iommu_resv_region *ra, *rb;
>> +
>> +	ra = container_of(a, struct iommu_resv_region, list);
>> +	rb = container_of(b, struct iommu_resv_region, list);
>> +
>> +	if (ra->start < rb->start)
>> +		return -1;
>> +	if (ra->start > rb->start)
>> +		return 1;
>> +	return 0;
>> +}
>> +
>> +static int vfio_build_iommu_iova_caps(struct vfio_iommu *iommu,
>> +				struct vfio_info_cap *caps)
>> +{
>> +	struct iommu_resv_region *resv, *resv_next;
>> +	struct vfio_iommu_iova *iova, *iova_next;
>> +	struct list_head group_resv_regions, vfio_iova_regions;
>> +	struct vfio_domain *domain;
>> +	struct vfio_group *g;
>> +	phys_addr_t start, end;
>> +	int ret = 0;
>> +
>> +	domain = list_first_entry(&iommu->domain_list,
>> +				  struct vfio_domain, next);
>> +	/* Get the default iova range supported */
>> +	start = domain->domain->geometry.aperture_start;
>> +	end = domain->domain->geometry.aperture_end;
>> +	INIT_LIST_HEAD(&vfio_iova_regions);
>> +	vfio_insert_iova(start, end, &vfio_iova_regions);
>> +
>> +	/* Get reserved regions if any */
>> +	INIT_LIST_HEAD(&group_resv_regions);
>> +	list_for_each_entry(g, &domain->group_list, next)
>> +		iommu_get_group_resv_regions(g->iommu_group,
>> +						&group_resv_regions);
>> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
>> +
>> +	/* Update iova range excluding reserved regions */
>> +	list_for_each_entry(resv, &group_resv_regions, list) {
>> +		ret = vfio_update_iommu_iova_range(resv->start,
>> +				resv->start + resv->length - 1,
>> +				&vfio_iova_regions);
>> +		if (ret)
>> +			goto done;
>> +	}
>> +
>> +	list_for_each_entry(iova, &vfio_iova_regions, list) {
>> +		ret = vfio_add_iova_cap(caps, iova->start, iova->end);
>> +		if (ret)
>> +			goto done;
>> +	}
>> +
>> +done:
>> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
>> +		kfree(resv);
>> +
>> +	list_for_each_entry_safe(iova, iova_next, &vfio_iova_regions, list)
>> +		kfree(iova);
>> +
>> +	return ret;
>> +}
>> +
>>   static long vfio_iommu_type1_ioctl(void *iommu_data,
>>   				   unsigned int cmd, unsigned long arg)
>>   {
>> @@ -1558,8 +1703,10 @@ static long vfio_iommu_type1_ioctl(void
>> *iommu_data,
>>   		}
>>   	} else if (cmd == VFIO_IOMMU_GET_INFO) {
>>   		struct vfio_iommu_type1_info info;
>> +		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
>> +		int ret;
>>
>> -		minsz = offsetofend(struct vfio_iommu_type1_info,
>> iova_pgsizes);
>> +		minsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
>>
>>   		if (copy_from_user(&info, (void __user *)arg, minsz))
>>   			return -EFAULT;
>> @@ -1571,6 +1718,29 @@ static long vfio_iommu_type1_ioctl(void
>> *iommu_data,
>>
>>   		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
>>
>> +		ret = vfio_build_iommu_iova_caps(iommu, &caps);
>> +		if (ret)
>> +			return ret;
>> +
>> +		if (caps.size) {
>> +			info.flags |= VFIO_IOMMU_INFO_CAPS;
>> +			if (info.argsz < sizeof(info) + caps.size) {
>> +				info.argsz = sizeof(info) + caps.size;
>> +				info.cap_offset = 0;
>> +			} else {
>> +				vfio_info_cap_shift(&caps, sizeof(info));
>> +				if (copy_to_user((void __user *)arg +
>> +						sizeof(info), caps.buf,
>> +						caps.size)) {
>> +					kfree(caps.buf);
>> +					return -EFAULT;
>> +				}
>> +				info.cap_offset = sizeof(info);
>> +			}
>> +
>> +			kfree(caps.buf);
>> +		}
>> +
>>   		return copy_to_user((void __user *)arg, &info, minsz) ?
>>   			-EFAULT : 0;
>>
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index e3301db..c4e338b 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -517,7 +517,20 @@ struct vfio_iommu_type1_info {
>>   	__u32	argsz;
>>   	__u32	flags;
>>   #define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
>> +#define VFIO_IOMMU_INFO_CAPS	(1 << 1)	/* Info supports caps */
>>   	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
>> +	__u32   cap_offset;	/* Offset within info struct of first cap */
>> +	__u32	__resv;
>> +};
>> +
>> +/*
>> + * The IOVA_RANGE capability allows to report the IOVA range(s),
>> + */
>> +#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE  1
>> +struct vfio_iommu_type1_info_cap_iova_range {
>> +	struct vfio_info_cap_header header;
>> +	__u64 start;
>> +	__u64 end;
>>   };
>>
>>   #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>> --
>> 1.9.1
>>
>
Alex Williamson Dec. 7, 2017, 4:08 p.m. UTC | #3
On Wed, 6 Dec 2017 16:07:36 +0000
Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:

> This patch allows the user-space to retrieve the supported
> IOVA range(s), excluding any reserved regions. The implementation
> is based on capability chains, added to the VFIO_IOMMU_GET_INFO ioctl.
> 
> This is following the discussions here[1] and is based on the RFC patch[2].
> 
> ToDo:
>  - This currently derives the default supported iova range from the first
>    iommu domain. This needs to be changed to go through the domain_list
>    instead.
>  - Sync with Pierre's patch[3].
> 
> 1.https://lists.gnu.org/archive/html/qemu-devel/2017-11/msg03651.html
> 2.https://lists.linuxfoundation.org/pipermail/iommu/2016-November/019002.html
> 3.https://patchwork.kernel.org/patch/10084655/
> 
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 172 +++++++++++++++++++++++++++++++++++++++-
>  include/uapi/linux/vfio.h       |  13 +++
>  2 files changed, 184 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index e30e29a..72ca78a 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -28,6 +28,7 @@
>  #include <linux/device.h>
>  #include <linux/fs.h>
>  #include <linux/iommu.h>
> +#include <linux/list_sort.h>
>  #include <linux/module.h>
>  #include <linux/mm.h>
>  #include <linux/rbtree.h>
> @@ -92,6 +93,12 @@ struct vfio_group {
>  	struct list_head	next;
>  };
>  
> +struct vfio_iommu_iova {
> +	struct list_head	list;
> +	phys_addr_t		start;
> +	phys_addr_t		end;
> +};
> +
>  /*
>   * Guest RAM pinning working set or DMA target
>   */
> @@ -1537,6 +1544,144 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
>  	return ret;
>  }
>  
> +static int vfio_add_iova_cap(struct vfio_info_cap *caps, u64 start, u64 end)
> +{
> +	struct vfio_iommu_type1_info_cap_iova_range *cap;
> +	struct vfio_info_cap_header *header;
> +
> +	header = vfio_info_cap_add(caps, sizeof(*cap),
> +			VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
> +	if (IS_ERR(header))
> +		return PTR_ERR(header);
> +
> +	cap = container_of(header,
> +			   struct vfio_iommu_type1_info_cap_iova_range,
> +			   header);
> +
> +	cap->start = start;
> +	cap->end = end;
> +
> +	return 0;
> +}
> +
> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> +				struct list_head *head)
> +{
> +	struct vfio_iommu_iova *region;
> +
> +	region = kzalloc(sizeof(*region), GFP_KERNEL);

You're initializing every field, so a zero'd allocation is not
necessary here.

> +	if (!region)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&region->list);
> +	region->start = start;
> +	region->end = end;
> +
> +	list_add_tail(&region->list, head);
> +	return 0;
> +}
> +
> +/*
> + * Check and update iova region list in case a reserved region
> + * overlaps the iommu iova range.
> + */
> +static int vfio_update_iommu_iova_range(phys_addr_t start, phys_addr_t end,
> +					struct list_head *iova)
> +{
> +	struct vfio_iommu_iova *node;
> +	phys_addr_t a, b;
> +	int ret = 0;
> +
> +	if (list_empty(iova))
> +		return -ENODEV;
> +
> +	node = list_last_entry(iova, struct vfio_iommu_iova, list);
> +	a = node->start;
> +	b = node->end;
> +
> +	/* No overlap */
> +	if ((start > b) || (end < a))
> +		return 0;
> +
> +	if (start > a)
> +		ret = vfio_insert_iova(a, start - 1, &node->list);
> +	if (ret)
> +		goto done;
> +	if (end < b)
> +		ret = vfio_insert_iova(end + 1, b, &node->list);
> +
> +done:
> +	list_del(&node->list);
> +	kfree(node);
> +
> +	return ret;
> +}
> +
> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
> +{
> +	struct iommu_resv_region *ra, *rb;
> +
> +	ra = container_of(a, struct iommu_resv_region, list);
> +	rb = container_of(b, struct iommu_resv_region, list);
> +
> +	if (ra->start < rb->start)
> +		return -1;
> +	if (ra->start > rb->start)
> +		return 1;
> +	return 0;
> +}
> +
> +static int vfio_build_iommu_iova_caps(struct vfio_iommu *iommu,
> +				struct vfio_info_cap *caps)
> +{
> +	struct iommu_resv_region *resv, *resv_next;
> +	struct vfio_iommu_iova *iova, *iova_next;
> +	struct list_head group_resv_regions, vfio_iova_regions;
> +	struct vfio_domain *domain;
> +	struct vfio_group *g;
> +	phys_addr_t start, end;
> +	int ret = 0;
> +
> +	domain = list_first_entry(&iommu->domain_list,
> +				  struct vfio_domain, next);

How do you resolve that we can have multiple domains in a container and
each my provide different apertures?  Eric noted that the attach group
function attempts to do compatibility checks, so we need to figure out
how we determine IOVA apertures are compatible.  The most obvious
answer seems to be that we should look through the dma_list on the
vfio_iommu and determine if there are existing mappings that are
incompatible with the new domain.  That also suggests that we should
maintain and update a list of valid iova ranges as we go such that we
can reject mappings outside of those valid ranges.

> +	/* Get the default iova range supported */
> +	start = domain->domain->geometry.aperture_start;
> +	end = domain->domain->geometry.aperture_end;

There's an IOMMU API for this, iommu_domain_get_attr() with
DOMAIN_ATTR_GEOMETRY.

> +	INIT_LIST_HEAD(&vfio_iova_regions);
> +	vfio_insert_iova(start, end, &vfio_iova_regions);
> +
> +	/* Get reserved regions if any */
> +	INIT_LIST_HEAD(&group_resv_regions);
> +	list_for_each_entry(g, &domain->group_list, next)
> +		iommu_get_group_resv_regions(g->iommu_group,
> +						&group_resv_regions);

Reserved ranges also need to be accounted for as groups are added to a
domain.  Again, if dma list includes any mappings overlapping reserved
ranges (ie. holes in the iova space), the group attach should fail.

> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
> +
> +	/* Update iova range excluding reserved regions */
> +	list_for_each_entry(resv, &group_resv_regions, list) {
> +		ret = vfio_update_iommu_iova_range(resv->start,
> +				resv->start + resv->length - 1,
> +				&vfio_iova_regions);
> +		if (ret)
> +			goto done;
> +	}
> +
> +	list_for_each_entry(iova, &vfio_iova_regions, list) {
> +		ret = vfio_add_iova_cap(caps, iova->start, iova->end);

It seems like a fair bit of overhead and nuisance for the user to have
each iova range added as a separate capability.  I think I'd prefer to
see the capability size be based on a number of entries field.

> +		if (ret)
> +			goto done;
> +	}
> +
> +done:
> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> +		kfree(resv);
> +
> +	list_for_each_entry_safe(iova, iova_next, &vfio_iova_regions, list)
> +		kfree(iova);
> +
> +	return ret;
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  				   unsigned int cmd, unsigned long arg)
>  {
> @@ -1558,8 +1703,10 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  		}
>  	} else if (cmd == VFIO_IOMMU_GET_INFO) {
>  		struct vfio_iommu_type1_info info;
> +		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
> +		int ret;
>  
> -		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
> +		minsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);

This is incompatible, it will break existing userspace.  See
include/uapi/linux/vfio.h:

/*
 * Callers of INFO ioctls passing insufficiently sized buffers will see
 * the capability chain flag bit set, a zero value for the first capability
 * offset (if available within the provided argsz), and argsz will be
 * updated to report the necessary buffer size.  For compatibility, the
 * INFO ioctl will not report error in this case, but the capability chain
 * will not be available.
 */

>  
>  		if (copy_from_user(&info, (void __user *)arg, minsz))
>  			return -EFAULT;
> @@ -1571,6 +1718,29 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>  		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
>  
> +		ret = vfio_build_iommu_iova_caps(iommu, &caps);
> +		if (ret)
> +			return ret;
> +
> +		if (caps.size) {
> +			info.flags |= VFIO_IOMMU_INFO_CAPS;
> +			if (info.argsz < sizeof(info) + caps.size) {
> +				info.argsz = sizeof(info) + caps.size;
> +				info.cap_offset = 0;
> +			} else {
> +				vfio_info_cap_shift(&caps, sizeof(info));
> +				if (copy_to_user((void __user *)arg +
> +						sizeof(info), caps.buf,
> +						caps.size)) {
> +					kfree(caps.buf);
> +					return -EFAULT;
> +				}
> +				info.cap_offset = sizeof(info);
> +			}
> +
> +			kfree(caps.buf);
> +		}
> +
>  		return copy_to_user((void __user *)arg, &info, minsz) ?
>  			-EFAULT : 0;
>  
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index e3301db..c4e338b 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -517,7 +517,20 @@ struct vfio_iommu_type1_info {
>  	__u32	argsz;
>  	__u32	flags;
>  #define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
> +#define VFIO_IOMMU_INFO_CAPS	(1 << 1)	/* Info supports caps */
>  	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
> +	__u32   cap_offset;	/* Offset within info struct of first cap */
> +	__u32	__resv;

Hmm, I'm not sure why the additional reserved field here is necessary.
I guess we need 8-byte alignment of this iova range capability, but
that should probably be accounted for explicitly as the capability is
constructed in the buffer rather than implicitly by the ending offset
of the static structure.

> +};
> +
> +/*
> + * The IOVA_RANGE capability allows to report the IOVA range(s),
> + */
> +#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE  1
> +struct vfio_iommu_type1_info_cap_iova_range {
> +	struct vfio_info_cap_header header;
> +	__u64 start;
> +	__u64 end;
>  };
>  

It should be noted that this is version 1 of this structure.

>  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)

Thanks,
Alex
Alex Williamson Dec. 7, 2017, 6:37 p.m. UTC | #4
On Thu, 7 Dec 2017 13:55:33 +0100
Pierre Morel <pmorel@linux.vnet.ibm.com> wrote:

> On 06/12/2017 17:15, Shameerali Kolothum Thodi wrote:
> > Hi Pierre,
> >   
> >> -----Original Message-----
> >> From: Shameerali Kolothum Thodi
> >> Sent: Wednesday, December 06, 2017 4:08 PM
> >> To: alex.williamson@redhat.com; eric.auger@redhat.com;
> >> pmorel@linux.vnet.ibm.com
> >> Cc: kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
> >> <linuxarm@huawei.com>; Shameerali Kolothum Thodi
> >> <shameerali.kolothum.thodi@huawei.com>
> >> Subject: [RFC] vfio/type1: Add IOVA_RANGE capability support
> >>
> >> This patch allows the user-space to retrieve the supported
> >> IOVA range(s), excluding any reserved regions. The implementation
> >> is based on capability chains, added to the VFIO_IOMMU_GET_INFO ioctl.
> >>
> >> This is following the discussions here[1] and is based on the RFC patch[2].
> >>
> >> ToDo:
> >>   - This currently derives the default supported iova range from the first
> >>     iommu domain. This needs to be changed to go through the domain_list
> >>     instead.
> >>   - Sync with Pierre's patch[3].  
> > 
> > Thanks to Eric[1], came to know that you have posted a patch to retrieve the
> > iommu aperture info. This RFC does a similar thing but try to take care of
> > any reserved regions and adds to the capability chain.
> > 
> > Please take a look and if there is a possibility to sync up your next revision
> > and this, please let me know.  
> 
> Hi Shameer,
> 
> Indeed it is close to what I was developing.
> 
> I have a single concern, the aperture is strongly hardware related while 
> reserved areas are much more flexible.
> 
> As a consequence, I think it would be better if they were handled in 
> separate capabilities and let the user space decide if it wants to know 
> about one or the other.
> 
> For my immediate needs, this patch would be OK since we (s390x) do not 
> use reserved regions.
> 
> @Alex: what do you prefer
> If we need two capabilities, I will send the patch serie I made on the 
> aperture capability for VFIO IOMMU.
> If not I will use Shameer's patch.

I think your suggestion boils down to mirroring the IOMMU API of
exposing base geometry and reserved ranges more directly to userspace.
I don't think that creates a stable user API.  Users would need to be
updated in lockstep for new types of reserved ranges in order to
exclude them from the base geometry.  The vfio kernel code (proposed
in this patch) can be updated in lockstep as it is part of the kernel.
Therefore the idea here is that this capability exposes only apertures
which are available for standard IOVA map and unmap requests.  It's
vfio in the host kernel that's responsible for pruning out new reserved
types.

Beyond that, we can certainly add new capabilities that identify
certain types of reserved areas within the IOVA gaps of the above
capability.  This allows the vfio API to be extended in a compatible
way for users, the IOVA ranges capability accounts for all types of
special ranges that aren't available for standard IOVA mappings and
address limitations of the IOMMU itself, then new capabilities can be
added if there's a need to expose specific types of excluded ranges to
the user.  Thanks,

Alex
Shameerali Kolothum Thodi Dec. 8, 2017, 9:44 a.m. UTC | #5
Hi Alex,

> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Thursday, December 07, 2017 4:08 PM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: eric.auger@redhat.com; pmorel@linux.vnet.ibm.com;
> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
> <linuxarm@huawei.com>
> Subject: Re: [RFC] vfio/type1: Add IOVA_RANGE capability support
> 
> On Wed, 6 Dec 2017 16:07:36 +0000
> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:
> 
> > This patch allows the user-space to retrieve the supported
> > IOVA range(s), excluding any reserved regions. The implementation
> > is based on capability chains, added to the VFIO_IOMMU_GET_INFO ioctl.
> >
> > This is following the discussions here[1] and is based on the RFC patch[2].
> >
> > ToDo:
> >  - This currently derives the default supported iova range from the first
> >    iommu domain. This needs to be changed to go through the domain_list
> >    instead.
> >  - Sync with Pierre's patch[3].
> >
> > 1.https://lists.gnu.org/archive/html/qemu-devel/2017-11/msg03651.html
> > 2.https://lists.linuxfoundation.org/pipermail/iommu/2016-
> November/019002.html
> > 3.https://patchwork.kernel.org/patch/10084655/
> >
> > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 172
> +++++++++++++++++++++++++++++++++++++++-
> >  include/uapi/linux/vfio.h       |  13 +++
> >  2 files changed, 184 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> b/drivers/vfio/vfio_iommu_type1.c
> > index e30e29a..72ca78a 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -28,6 +28,7 @@
> >  #include <linux/device.h>
> >  #include <linux/fs.h>
> >  #include <linux/iommu.h>
> > +#include <linux/list_sort.h>
> >  #include <linux/module.h>
> >  #include <linux/mm.h>
> >  #include <linux/rbtree.h>
> > @@ -92,6 +93,12 @@ struct vfio_group {
> >  	struct list_head	next;
> >  };
> >
> > +struct vfio_iommu_iova {
> > +	struct list_head	list;
> > +	phys_addr_t		start;
> > +	phys_addr_t		end;
> > +};
> > +
> >  /*
> >   * Guest RAM pinning working set or DMA target
> >   */
> > @@ -1537,6 +1544,144 @@ static int
> vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
> >  	return ret;
> >  }
> >
> > +static int vfio_add_iova_cap(struct vfio_info_cap *caps, u64 start, u64 end)
> > +{
> > +	struct vfio_iommu_type1_info_cap_iova_range *cap;
> > +	struct vfio_info_cap_header *header;
> > +
> > +	header = vfio_info_cap_add(caps, sizeof(*cap),
> > +			VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
> > +	if (IS_ERR(header))
> > +		return PTR_ERR(header);
> > +
> > +	cap = container_of(header,
> > +			   struct vfio_iommu_type1_info_cap_iova_range,
> > +			   header);
> > +
> > +	cap->start = start;
> > +	cap->end = end;
> > +
> > +	return 0;
> > +}
> > +
> > +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> > +				struct list_head *head)
> > +{
> > +	struct vfio_iommu_iova *region;
> > +
> > +	region = kzalloc(sizeof(*region), GFP_KERNEL);
> 
> You're initializing every field, so a zero'd allocation is not
> necessary here.

Ok.

> > +	if (!region)
> > +		return -ENOMEM;
> > +
> > +	INIT_LIST_HEAD(&region->list);
> > +	region->start = start;
> > +	region->end = end;
> > +
> > +	list_add_tail(&region->list, head);
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Check and update iova region list in case a reserved region
> > + * overlaps the iommu iova range.
> > + */
> > +static int vfio_update_iommu_iova_range(phys_addr_t start, phys_addr_t
> end,
> > +					struct list_head *iova)
> > +{
> > +	struct vfio_iommu_iova *node;
> > +	phys_addr_t a, b;
> > +	int ret = 0;
> > +
> > +	if (list_empty(iova))
> > +		return -ENODEV;
> > +
> > +	node = list_last_entry(iova, struct vfio_iommu_iova, list);
> > +	a = node->start;
> > +	b = node->end;
> > +
> > +	/* No overlap */
> > +	if ((start > b) || (end < a))
> > +		return 0;
> > +
> > +	if (start > a)
> > +		ret = vfio_insert_iova(a, start - 1, &node->list);
> > +	if (ret)
> > +		goto done;
> > +	if (end < b)
> > +		ret = vfio_insert_iova(end + 1, b, &node->list);
> > +
> > +done:
> > +	list_del(&node->list);
> > +	kfree(node);
> > +
> > +	return ret;
> > +}
> > +
> > +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
> > +{
> > +	struct iommu_resv_region *ra, *rb;
> > +
> > +	ra = container_of(a, struct iommu_resv_region, list);
> > +	rb = container_of(b, struct iommu_resv_region, list);
> > +
> > +	if (ra->start < rb->start)
> > +		return -1;
> > +	if (ra->start > rb->start)
> > +		return 1;
> > +	return 0;
> > +}
> > +
> > +static int vfio_build_iommu_iova_caps(struct vfio_iommu *iommu,
> > +				struct vfio_info_cap *caps)
> > +{
> > +	struct iommu_resv_region *resv, *resv_next;
> > +	struct vfio_iommu_iova *iova, *iova_next;
> > +	struct list_head group_resv_regions, vfio_iova_regions;
> > +	struct vfio_domain *domain;
> > +	struct vfio_group *g;
> > +	phys_addr_t start, end;
> > +	int ret = 0;
> > +
> > +	domain = list_first_entry(&iommu->domain_list,
> > +				  struct vfio_domain, next);
> 
> How do you resolve that we can have multiple domains in a container and
> each my provide different apertures?  Eric noted that the attach group
> function attempts to do compatibility checks, so we need to figure out
> how we determine IOVA apertures are compatible.  The most obvious
> answer seems to be that we should look through the dma_list on the
> vfio_iommu and determine if there are existing mappings that are
> incompatible with the new domain.  That also suggests that we should
> maintain and update a list of valid iova ranges as we go such that we
> can reject mappings outside of those valid ranges.

Right. I will go through this in detail and see how to accommodate this.

> > +	/* Get the default iova range supported */
> > +	start = domain->domain->geometry.aperture_start;
> > +	end = domain->domain->geometry.aperture_end;
> 
> There's an IOMMU API for this, iommu_domain_get_attr() with
> DOMAIN_ATTR_GEOMETRY.
> 
> > +	INIT_LIST_HEAD(&vfio_iova_regions);
> > +	vfio_insert_iova(start, end, &vfio_iova_regions);
> > +
> > +	/* Get reserved regions if any */
> > +	INIT_LIST_HEAD(&group_resv_regions);
> > +	list_for_each_entry(g, &domain->group_list, next)
> > +		iommu_get_group_resv_regions(g->iommu_group,
> > +						&group_resv_regions);
> 
> Reserved ranges also need to be accounted for as groups are added to a
> domain.  Again, if dma list includes any mappings overlapping reserved
> ranges (ie. holes in the iova space), the group attach should fail.

Ok.

> > +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
> > +
> > +	/* Update iova range excluding reserved regions */
> > +	list_for_each_entry(resv, &group_resv_regions, list) {
> > +		ret = vfio_update_iommu_iova_range(resv->start,
> > +				resv->start + resv->length - 1,
> > +				&vfio_iova_regions);
> > +		if (ret)
> > +			goto done;
> > +	}
> > +
> > +	list_for_each_entry(iova, &vfio_iova_regions, list) {
> > +		ret = vfio_add_iova_cap(caps, iova->start, iova->end);
> 
> It seems like a fair bit of overhead and nuisance for the user to have
> each iova range added as a separate capability.  I think I'd prefer to
> see the capability size be based on a number of entries field.

Ok. I think the suggestion is to have something similar to the sparse mmap
capability implementation. 
 
> > +		if (ret)
> > +			goto done;
> > +	}
> > +
> > +done:
> > +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> > +		kfree(resv);
> > +
> > +	list_for_each_entry_safe(iova, iova_next, &vfio_iova_regions, list)
> > +		kfree(iova);
> > +
> > +	return ret;
> > +}
> > +
> >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  				   unsigned int cmd, unsigned long arg)
> >  {
> > @@ -1558,8 +1703,10 @@ static long vfio_iommu_type1_ioctl(void
> *iommu_data,
> >  		}
> >  	} else if (cmd == VFIO_IOMMU_GET_INFO) {
> >  		struct vfio_iommu_type1_info info;
> > +		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
> > +		int ret;
> >
> > -		minsz = offsetofend(struct vfio_iommu_type1_info,
> iova_pgsizes);
> > +		minsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
> 
> This is incompatible, it will break existing userspace.  See
> include/uapi/linux/vfio.h:

Right. 

> /*
>  * Callers of INFO ioctls passing insufficiently sized buffers will see
>  * the capability chain flag bit set, a zero value for the first capability
>  * offset (if available within the provided argsz), and argsz will be
>  * updated to report the necessary buffer size.  For compatibility, the
>  * INFO ioctl will not report error in this case, but the capability chain
>  * will not be available.
>  */
> 
> >
> >  		if (copy_from_user(&info, (void __user *)arg, minsz))
> >  			return -EFAULT;
> > @@ -1571,6 +1718,29 @@ static long vfio_iommu_type1_ioctl(void
> *iommu_data,
> >
> >  		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
> >
> > +		ret = vfio_build_iommu_iova_caps(iommu, &caps);
> > +		if (ret)
> > +			return ret;
> > +
> > +		if (caps.size) {
> > +			info.flags |= VFIO_IOMMU_INFO_CAPS;
> > +			if (info.argsz < sizeof(info) + caps.size) {
> > +				info.argsz = sizeof(info) + caps.size;
> > +				info.cap_offset = 0;
> > +			} else {
> > +				vfio_info_cap_shift(&caps, sizeof(info));
> > +				if (copy_to_user((void __user *)arg +
> > +						sizeof(info), caps.buf,
> > +						caps.size)) {
> > +					kfree(caps.buf);
> > +					return -EFAULT;
> > +				}
> > +				info.cap_offset = sizeof(info);
> > +			}
> > +
> > +			kfree(caps.buf);
> > +		}
> > +
> >  		return copy_to_user((void __user *)arg, &info, minsz) ?
> >  			-EFAULT : 0;
> >
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index e3301db..c4e338b 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -517,7 +517,20 @@ struct vfio_iommu_type1_info {
> >  	__u32	argsz;
> >  	__u32	flags;
> >  #define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
> > +#define VFIO_IOMMU_INFO_CAPS	(1 << 1)	/* Info supports caps */
> >  	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
> > +	__u32   cap_offset;	/* Offset within info struct of first cap */
> > +	__u32	__resv;
> 
> Hmm, I'm not sure why the additional reserved field here is necessary.
> I guess we need 8-byte alignment of this iova range capability, but
> that should probably be accounted for explicitly as the capability is
> constructed in the buffer rather than implicitly by the ending offset
> of the static structure.

Ok.

> > +};
> > +
> > +/*
> > + * The IOVA_RANGE capability allows reporting of the supported IOVA range(s).
> > + */
> > +#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE  1
> > +struct vfio_iommu_type1_info_cap_iova_range {
> > +	struct vfio_info_cap_header header;
> > +	__u64 start;
> > +	__u64 end;
> >  };
> >
> 
> It should be noted that this is version 1 of this structure.

Ok. Will update that.

Many thanks for going through this and the feedback.

Cheers,
Shameer
diff mbox

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e30e29a..72ca78a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -28,6 +28,7 @@ 
 #include <linux/device.h>
 #include <linux/fs.h>
 #include <linux/iommu.h>
+#include <linux/list_sort.h>
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/rbtree.h>
@@ -92,6 +93,12 @@  struct vfio_group {
 	struct list_head	next;
 };
 
+struct vfio_iommu_iova {
+	struct list_head	list;
+	phys_addr_t		start;
+	phys_addr_t		end;
+};
+
 /*
  * Guest RAM pinning working set or DMA target
  */
@@ -1537,6 +1544,144 @@  static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
 	return ret;
 }
 
+static int vfio_add_iova_cap(struct vfio_info_cap *caps, u64 start, u64 end)
+{
+	struct vfio_iommu_type1_info_cap_iova_range *cap;
+	struct vfio_info_cap_header *header;
+
+	header = vfio_info_cap_add(caps, sizeof(*cap),
+			VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
+	if (IS_ERR(header))
+		return PTR_ERR(header);
+
+	cap = container_of(header,
+			   struct vfio_iommu_type1_info_cap_iova_range,
+			   header);
+
+	cap->start = start;
+	cap->end = end;
+
+	return 0;
+}
+
+static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
+				struct list_head *head)
+{
+	struct vfio_iommu_iova *region;
+
+	region = kzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&region->list);
+	region->start = start;
+	region->end = end;
+
+	list_add_tail(&region->list, head);
+	return 0;
+}
+
+/*
+ * Check and update iova region list in case a reserved region
+ * overlaps the iommu iova range.
+ */
+static int vfio_update_iommu_iova_range(phys_addr_t start, phys_addr_t end,
+					struct list_head *iova)
+{
+	struct vfio_iommu_iova *node;
+	phys_addr_t a, b;
+	int ret = 0;
+
+	if (list_empty(iova))
+		return -ENODEV;
+
+	node = list_last_entry(iova, struct vfio_iommu_iova, list);
+	a = node->start;
+	b = node->end;
+
+	/* No overlap */
+	if ((start > b) || (end < a))
+		return 0;
+
+	if (start > a)
+		ret = vfio_insert_iova(a, start - 1, &node->list);
+	if (ret)
+		goto done;
+	if (end < b)
+		ret = vfio_insert_iova(end + 1, b, &node->list);
+
+done:
+	list_del(&node->list);
+	kfree(node);
+
+	return ret;
+}
+
+static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct iommu_resv_region *ra, *rb;
+
+	ra = container_of(a, struct iommu_resv_region, list);
+	rb = container_of(b, struct iommu_resv_region, list);
+
+	if (ra->start < rb->start)
+		return -1;
+	if (ra->start > rb->start)
+		return 1;
+	return 0;
+}
+
+static int vfio_build_iommu_iova_caps(struct vfio_iommu *iommu,
+				struct vfio_info_cap *caps)
+{
+	struct iommu_resv_region *resv, *resv_next;
+	struct vfio_iommu_iova *iova, *iova_next;
+	struct list_head group_resv_regions, vfio_iova_regions;
+	struct vfio_domain *domain;
+	struct vfio_group *g;
+	phys_addr_t start, end;
+	int ret = 0;
+
+	domain = list_first_entry(&iommu->domain_list,
+				  struct vfio_domain, next);
+	/* Get the default iova range supported */
+	start = domain->domain->geometry.aperture_start;
+	end = domain->domain->geometry.aperture_end;
+	INIT_LIST_HEAD(&vfio_iova_regions);
+	vfio_insert_iova(start, end, &vfio_iova_regions);
+
+	/* Get reserved regions if any */
+	INIT_LIST_HEAD(&group_resv_regions);
+	list_for_each_entry(g, &domain->group_list, next)
+		iommu_get_group_resv_regions(g->iommu_group,
+						&group_resv_regions);
+	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
+
+	/* Update iova range excluding reserved regions */
+	list_for_each_entry(resv, &group_resv_regions, list) {
+		ret = vfio_update_iommu_iova_range(resv->start,
+				resv->start + resv->length - 1,
+				&vfio_iova_regions);
+		if (ret)
+			goto done;
+	}
+
+	list_for_each_entry(iova, &vfio_iova_regions, list) {
+		ret = vfio_add_iova_cap(caps, iova->start, iova->end);
+		if (ret)
+			goto done;
+	}
+
+done:
+	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
+		kfree(resv);
+
+	list_for_each_entry_safe(iova, iova_next, &vfio_iova_regions, list)
+		kfree(iova);
+
+	return ret;
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
 				   unsigned int cmd, unsigned long arg)
 {
@@ -1558,8 +1703,10 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 		}
 	} else if (cmd == VFIO_IOMMU_GET_INFO) {
 		struct vfio_iommu_type1_info info;
+		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+		int ret;
 
-		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
+		minsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
 
 		if (copy_from_user(&info, (void __user *)arg, minsz))
 			return -EFAULT;
@@ -1571,6 +1718,29 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 
 		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
 
+		ret = vfio_build_iommu_iova_caps(iommu, &caps);
+		if (ret)
+			return ret;
+
+		if (caps.size) {
+			info.flags |= VFIO_IOMMU_INFO_CAPS;
+			if (info.argsz < sizeof(info) + caps.size) {
+				info.argsz = sizeof(info) + caps.size;
+				info.cap_offset = 0;
+			} else {
+				vfio_info_cap_shift(&caps, sizeof(info));
+				if (copy_to_user((void __user *)arg +
+						sizeof(info), caps.buf,
+						caps.size)) {
+					kfree(caps.buf);
+					return -EFAULT;
+				}
+				info.cap_offset = sizeof(info);
+			}
+
+			kfree(caps.buf);
+		}
+
 		return copy_to_user((void __user *)arg, &info, minsz) ?
 			-EFAULT : 0;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index e3301db..c4e338b 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -517,7 +517,20 @@  struct vfio_iommu_type1_info {
 	__u32	argsz;
 	__u32	flags;
 #define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
+#define VFIO_IOMMU_INFO_CAPS	(1 << 1)	/* Info supports caps */
 	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
+	__u32   cap_offset;	/* Offset within info struct of first cap */
+	__u32	__resv;
+};
+
+/*
+ * The IOVA_RANGE capability allows reporting of the supported IOVA range(s).
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE  1
+struct vfio_iommu_type1_info_cap_iova_range {
+	struct vfio_info_cap_header header;
+	__u64 start;
+	__u64 end;
 };
 
 #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)