diff mbox

[kernel] vfio-pci: Allow mapping MSIX BAR

Message ID 20171208041829.27520-1-aik@ozlabs.ru (mailing list archive)
State New, archived
Headers show

Commit Message

Alexey Kardashevskiy Dec. 8, 2017, 4:18 a.m. UTC
By default VFIO disables mapping of MSIX BAR to the userspace as
the userspace may program it in a way allowing spurious interrupts;
instead the userspace uses the VFIO_DEVICE_SET_IRQS ioctl.
In order to eliminate guessing from the userspace about what is
mmapable, VFIO also advertises a sparse list of regions allowed to mmap.

This works fine as long as the system page size equals to the MSIX
alignment requirement which is 4KB. However with a bigger page size
the existing code prohibits mapping non-MSIX parts of a page with MSIX
structures so these parts have to be emulated via slow reads/writes on
a VFIO device fd. If these emulated bits are accessed often, this has
serious impact on performance.

This allows mmap of the entire BAR containing MSIX vector table.

This removes the sparse capability for PCI devices as it becomes useless.

As the userspace needs to know for sure whether mmapping of the MSIX
vector containing data can succeed, this adds a new capability -
VFIO_REGION_INFO_CAP_MSIX_MAPPABLE - which explicitly tells the userspace
that the entire BAR can be mmapped.

This does not touch the MSIX mangling in the BAR read/write handlers as
we are doing this just to enable direct access to non MSIX registers.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 include/uapi/linux/vfio.h   | 10 +++++++
 drivers/vfio/pci/vfio_pci.c | 65 +++++++--------------------------------------
 drivers/vfio/vfio.c         |  9 +++++++
 3 files changed, 29 insertions(+), 55 deletions(-)

Comments

Alex Williamson Dec. 12, 2017, 12:09 a.m. UTC | #1
On Fri,  8 Dec 2017 15:18:29 +1100
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:

> By default VFIO disables mapping of MSIX BAR to the userspace as
> the userspace may program it in a way allowing spurious interrupts;
> instead the userspace uses the VFIO_DEVICE_SET_IRQS ioctl.
> In order to eliminate guessing from the userspace about what is
> mmapable, VFIO also advertises a sparse list of regions allowed to mmap.
> 
> This works fine as long as the system page size equals to the MSIX
> alignment requirement which is 4KB. However with a bigger page size
> the existing code prohibits mapping non-MSIX parts of a page with MSIX
> structures so these parts have to be emulated via slow reads/writes on
> a VFIO device fd. If these emulated bits are accessed often, this has
> serious impact on performance.
> 
> This allows mmap of the entire BAR containing MSIX vector table.
> 
> This removes the sparse capability for PCI devices as it becomes useless.
> 
> As the userspace needs to know for sure whether mmapping of the MSIX
> vector containing data can succeed, this adds a new capability -
> VFIO_REGION_INFO_CAP_MSIX_MAPPABLE - which explicitly tells the userspace
> that the entire BAR can be mmapped.
> 
> This does not touch the MSIX mangling in the BAR read/write handlers as
> we are doing this just to enable direct access to non MSIX registers.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  include/uapi/linux/vfio.h   | 10 +++++++
>  drivers/vfio/pci/vfio_pci.c | 65 +++++++--------------------------------------
>  drivers/vfio/vfio.c         |  9 +++++++
>  3 files changed, 29 insertions(+), 55 deletions(-)
> 
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index ae46105..0fb4407 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -300,6 +300,16 @@ struct vfio_region_info_cap_type {
>  #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG	(2)
>  #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG	(3)
>  
> +/*
> + * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
> + * which allows direct access to non-MSIX registers which happened to be within
> + * the same system page.
> + *
> + * Even though the userspace gets direct access to the MSIX data, the existing
> + * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration.
> + */
> +#define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE	3
> +
>  /**
>   * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
>   *				    struct vfio_irq_info)
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index a98681d..9bcc1e1 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -565,46 +565,16 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
>  	return walk.ret;
>  }
>  
> -static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev,
> -				struct vfio_info_cap *caps)
> +static int msix_sparse_msix_mmappable_cap(struct vfio_pci_device *vdev,
> +		struct vfio_info_cap *caps)
>  {
> -	struct vfio_region_info_cap_sparse_mmap *sparse;
> -	size_t end, size;
> -	int nr_areas = 2, i = 0, ret;
> +	struct vfio_info_cap_header header = {
> +		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
> +		.version = 1
> +	};
>  
> -	end = pci_resource_len(vdev->pdev, vdev->msix_bar);
> -
> -	/* If MSI-X table is aligned to the start or end, only one area */
> -	if (((vdev->msix_offset & PAGE_MASK) == 0) ||
> -	    (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) >= end))
> -		nr_areas = 1;
> -
> -	size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas));
> -
> -	sparse = kzalloc(size, GFP_KERNEL);
> -	if (!sparse)
> -		return -ENOMEM;
> -
> -	sparse->nr_areas = nr_areas;
> -
> -	if (vdev->msix_offset & PAGE_MASK) {
> -		sparse->areas[i].offset = 0;
> -		sparse->areas[i].size = vdev->msix_offset & PAGE_MASK;
> -		i++;
> -	}
> -
> -	if (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) < end) {
> -		sparse->areas[i].offset = PAGE_ALIGN(vdev->msix_offset +
> -						     vdev->msix_size);
> -		sparse->areas[i].size = end - sparse->areas[i].offset;
> -		i++;
> -	}
> -
> -	ret = vfio_info_add_capability(caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP,
> -				       sparse);
> -	kfree(sparse);
> -
> -	return ret;
> +	return vfio_info_add_capability(caps,
> +			VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, &header);
>  }
>  
>  int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
> @@ -695,7 +665,8 @@ static long vfio_pci_ioctl(void *device_data,
>  			if (vdev->bar_mmap_supported[info.index]) {
>  				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
>  				if (info.index == vdev->msix_bar) {
> -					ret = msix_sparse_mmap_cap(vdev, &caps);
> +					ret = msix_sparse_msix_mmappable_cap(
> +							vdev, &caps);
>  					if (ret)
>  						return ret;
>  				}
> @@ -1125,22 +1096,6 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
>  	if (req_start + req_len > phys_len)
>  		return -EINVAL;
>  
> -	if (index == vdev->msix_bar) {
> -		/*
> -		 * Disallow mmaps overlapping the MSI-X table; users don't
> -		 * get to touch this directly.  We could find somewhere
> -		 * else to map the overlap, but page granularity is only
> -		 * a recommendation, not a requirement, so the user needs
> -		 * to know which bits are real.  Requiring them to mmap
> -		 * around the table makes that clear.
> -		 */
> -
> -		/* If neither entirely above nor below, then it overlaps */
> -		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
> -		      req_start + req_len <= vdev->msix_offset))
> -			return -EINVAL;
> -	}
> -
>  	/*
>  	 * Even though we don't make use of the barmap for the mmap,
>  	 * we need to request the region and the barmap tracks that.
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index f5a86f6..376b4e7 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1898,6 +1898,7 @@ int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
>  			     void *cap_type)
>  {
>  	int ret = -EINVAL;
> +	struct vfio_info_cap_header *header, *cap;
>  
>  	if (!cap_type)
>  		return 0;
> @@ -1910,6 +1911,14 @@ int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
>  	case VFIO_REGION_INFO_CAP_TYPE:
>  		ret = region_type_cap(caps, cap_type);
>  		break;
> +
> +	case VFIO_REGION_INFO_CAP_MSIX_MAPPABLE:
> +		cap = cap_type;
> +		header = vfio_info_cap_add(caps, sizeof(*header), cap->id,
> +				cap->version);
> +		if (IS_ERR(header))
> +			return PTR_ERR(header);
> +		return 0;
>  	}
>  
>  	return ret;

Why does this capability need to use the interface differently than the
existing callers?  The existing users pass a third arg with the data
structure, minus the header, filled in.  Logically, for a capability
which is only a header, the third arg could be NULL.  We could then
have a function:

int header_cap(struct vfio_info_cap *caps,
	       int cap_type_id, int cap_type_ver)

which does what you're doing here but plays nicely and sets ret and
breaks out to the common return.  Please also post the QEMU changes so
we can see the end to end changes.  Thanks,

Alex
Alexey Kardashevskiy Dec. 12, 2017, 2 a.m. UTC | #2
On 12/12/17 11:09, Alex Williamson wrote:
> On Fri,  8 Dec 2017 15:18:29 +1100
> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> 
>> By default VFIO disables mapping of MSIX BAR to the userspace as
>> the userspace may program it in a way allowing spurious interrupts;
>> instead the userspace uses the VFIO_DEVICE_SET_IRQS ioctl.
>> In order to eliminate guessing from the userspace about what is
>> mmapable, VFIO also advertises a sparse list of regions allowed to mmap.
>>
>> This works fine as long as the system page size equals to the MSIX
>> alignment requirement which is 4KB. However with a bigger page size
>> the existing code prohibits mapping non-MSIX parts of a page with MSIX
>> structures so these parts have to be emulated via slow reads/writes on
>> a VFIO device fd. If these emulated bits are accessed often, this has
>> serious impact on performance.
>>
>> This allows mmap of the entire BAR containing MSIX vector table.
>>
>> This removes the sparse capability for PCI devices as it becomes useless.
>>
>> As the userspace needs to know for sure whether mmapping of the MSIX
>> vector containing data can succeed, this adds a new capability -
>> VFIO_REGION_INFO_CAP_MSIX_MAPPABLE - which explicitly tells the userspace
>> that the entire BAR can be mmapped.
>>
>> This does not touch the MSIX mangling in the BAR read/write handlers as
>> we are doing this just to enable direct access to non MSIX registers.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  include/uapi/linux/vfio.h   | 10 +++++++
>>  drivers/vfio/pci/vfio_pci.c | 65 +++++++--------------------------------------
>>  drivers/vfio/vfio.c         |  9 +++++++
>>  3 files changed, 29 insertions(+), 55 deletions(-)
>>
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index ae46105..0fb4407 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -300,6 +300,16 @@ struct vfio_region_info_cap_type {
>>  #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG	(2)
>>  #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG	(3)
>>  
>> +/*
>> + * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
>> + * which allows direct access to non-MSIX registers which happened to be within
>> + * the same system page.
>> + *
>> + * Even though the userspace gets direct access to the MSIX data, the existing
>> + * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration.
>> + */
>> +#define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE	3
>> +
>>  /**
>>   * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
>>   *				    struct vfio_irq_info)
>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>> index a98681d..9bcc1e1 100644
>> --- a/drivers/vfio/pci/vfio_pci.c
>> +++ b/drivers/vfio/pci/vfio_pci.c
>> @@ -565,46 +565,16 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
>>  	return walk.ret;
>>  }
>>  
>> -static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev,
>> -				struct vfio_info_cap *caps)
>> +static int msix_sparse_msix_mmappable_cap(struct vfio_pci_device *vdev,
>> +		struct vfio_info_cap *caps)
>>  {
>> -	struct vfio_region_info_cap_sparse_mmap *sparse;
>> -	size_t end, size;
>> -	int nr_areas = 2, i = 0, ret;
>> +	struct vfio_info_cap_header header = {
>> +		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
>> +		.version = 1
>> +	};
>>  
>> -	end = pci_resource_len(vdev->pdev, vdev->msix_bar);
>> -
>> -	/* If MSI-X table is aligned to the start or end, only one area */
>> -	if (((vdev->msix_offset & PAGE_MASK) == 0) ||
>> -	    (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) >= end))
>> -		nr_areas = 1;
>> -
>> -	size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas));
>> -
>> -	sparse = kzalloc(size, GFP_KERNEL);
>> -	if (!sparse)
>> -		return -ENOMEM;
>> -
>> -	sparse->nr_areas = nr_areas;
>> -
>> -	if (vdev->msix_offset & PAGE_MASK) {
>> -		sparse->areas[i].offset = 0;
>> -		sparse->areas[i].size = vdev->msix_offset & PAGE_MASK;
>> -		i++;
>> -	}
>> -
>> -	if (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) < end) {
>> -		sparse->areas[i].offset = PAGE_ALIGN(vdev->msix_offset +
>> -						     vdev->msix_size);
>> -		sparse->areas[i].size = end - sparse->areas[i].offset;
>> -		i++;
>> -	}
>> -
>> -	ret = vfio_info_add_capability(caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP,
>> -				       sparse);
>> -	kfree(sparse);
>> -
>> -	return ret;
>> +	return vfio_info_add_capability(caps,
>> +			VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, &header);
>>  }
>>  
>>  int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
>> @@ -695,7 +665,8 @@ static long vfio_pci_ioctl(void *device_data,
>>  			if (vdev->bar_mmap_supported[info.index]) {
>>  				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
>>  				if (info.index == vdev->msix_bar) {
>> -					ret = msix_sparse_mmap_cap(vdev, &caps);
>> +					ret = msix_sparse_msix_mmappable_cap(
>> +							vdev, &caps);
>>  					if (ret)
>>  						return ret;
>>  				}
>> @@ -1125,22 +1096,6 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
>>  	if (req_start + req_len > phys_len)
>>  		return -EINVAL;
>>  
>> -	if (index == vdev->msix_bar) {
>> -		/*
>> -		 * Disallow mmaps overlapping the MSI-X table; users don't
>> -		 * get to touch this directly.  We could find somewhere
>> -		 * else to map the overlap, but page granularity is only
>> -		 * a recommendation, not a requirement, so the user needs
>> -		 * to know which bits are real.  Requiring them to mmap
>> -		 * around the table makes that clear.
>> -		 */
>> -
>> -		/* If neither entirely above nor below, then it overlaps */
>> -		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
>> -		      req_start + req_len <= vdev->msix_offset))
>> -			return -EINVAL;
>> -	}
>> -
>>  	/*
>>  	 * Even though we don't make use of the barmap for the mmap,
>>  	 * we need to request the region and the barmap tracks that.
>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>> index f5a86f6..376b4e7 100644
>> --- a/drivers/vfio/vfio.c
>> +++ b/drivers/vfio/vfio.c
>> @@ -1898,6 +1898,7 @@ int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
>>  			     void *cap_type)
>>  {
>>  	int ret = -EINVAL;
>> +	struct vfio_info_cap_header *header, *cap;
>>  
>>  	if (!cap_type)
>>  		return 0;
>> @@ -1910,6 +1911,14 @@ int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
>>  	case VFIO_REGION_INFO_CAP_TYPE:
>>  		ret = region_type_cap(caps, cap_type);
>>  		break;
>> +
>> +	case VFIO_REGION_INFO_CAP_MSIX_MAPPABLE:
>> +		cap = cap_type;
>> +		header = vfio_info_cap_add(caps, sizeof(*header), cap->id,
>> +				cap->version);
>> +		if (IS_ERR(header))
>> +			return PTR_ERR(header);
>> +		return 0;
>>  	}
>>  
>>  	return ret;
> 
> Why does this capability need to use the interface differently than the
> existing callers?  The existing users pass a third arg with the data
> structure, minus the header, filled in. 

The third arg points to the entire capability including the
vfio_info_cap_header, why minus (I read "minus" as if
vfio_region_info_cap_type did not have header embedded into it)? The header
is not initialized though.


> Logically, for a capability
> which is only a header, the third arg could be NULL.  We could then
> have a function:
> 
> int header_cap(struct vfio_info_cap *caps,
> 	       int cap_type_id, int cap_type_ver)

This is what I miss in the existing design - the caller of
vfio_info_add_capability passes the entire capability structure including
the id and version fields but it also passes the id separately and hard
codes the version to 1 in all current handlers, and overwrites the header
values. It should either use the passed header or take a version explicitly
imho, my try was to use the header but yeah, the cumbersome one.

Why was the version left out in the first place? Or a header not used (I
assume this is because we still to have initialize "next" so it would be
incomplete anyway so just forget that there is a header)?



> which does what you're doing here but plays nicely and sets ret and
> breaks out to the common return.  Please also post the QEMU changes so
> we can see the end to end changes.  Thanks,

I will post it but since we are simply enabling mmap, it is not much to
look at there...
David Gibson Dec. 15, 2017, 4:13 a.m. UTC | #3
On Fri, Dec 08, 2017 at 03:18:29PM +1100, Alexey Kardashevskiy wrote:
> By default VFIO disables mapping of MSIX BAR to the userspace as
> the userspace may program it in a way allowing spurious interrupts;
> instead the userspace uses the VFIO_DEVICE_SET_IRQS ioctl.
> In order to eliminate guessing from the userspace about what is
> mmapable, VFIO also advertises a sparse list of regions allowed to mmap.
> 
> This works fine as long as the system page size equals to the MSIX
> alignment requirement which is 4KB. However with a bigger page size
> the existing code prohibits mapping non-MSIX parts of a page with MSIX
> structures so these parts have to be emulated via slow reads/writes on
> a VFIO device fd. If these emulated bits are accessed often, this has
> serious impact on performance.
> 
> This allows mmap of the entire BAR containing MSIX vector table.
> 
> This removes the sparse capability for PCI devices as it becomes useless.
> 
> As the userspace needs to know for sure whether mmapping of the MSIX
> vector containing data can succeed, this adds a new capability -
> VFIO_REGION_INFO_CAP_MSIX_MAPPABLE - which explicitly tells the userspace
> that the entire BAR can be mmapped.
> 
> This does not touch the MSIX mangling in the BAR read/write handlers as
> we are doing this just to enable direct access to non MSIX registers.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

This seems reasonable, except for the interface issue that Alex
pointed out already.

TBH, I'm not really sold on the need for a new kernel cap, rather than
just having qemu attempt the mamp() and fall back if it fails.  But
I'm not really fussed either way.

> ---
>  include/uapi/linux/vfio.h   | 10 +++++++
>  drivers/vfio/pci/vfio_pci.c | 65 +++++++--------------------------------------
>  drivers/vfio/vfio.c         |  9 +++++++
>  3 files changed, 29 insertions(+), 55 deletions(-)
> 
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index ae46105..0fb4407 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -300,6 +300,16 @@ struct vfio_region_info_cap_type {
>  #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG	(2)
>  #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG	(3)
>  
> +/*
> + * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
> + * which allows direct access to non-MSIX registers which happened to be within
> + * the same system page.
> + *
> + * Even though the userspace gets direct access to the MSIX data, the existing
> + * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration.
> + */
> +#define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE	3
> +
>  /**
>   * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
>   *				    struct vfio_irq_info)
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index a98681d..9bcc1e1 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -565,46 +565,16 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
>  	return walk.ret;
>  }
>  
> -static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev,
> -				struct vfio_info_cap *caps)
> +static int msix_sparse_msix_mmappable_cap(struct vfio_pci_device *vdev,
> +		struct vfio_info_cap *caps)
>  {
> -	struct vfio_region_info_cap_sparse_mmap *sparse;
> -	size_t end, size;
> -	int nr_areas = 2, i = 0, ret;
> +	struct vfio_info_cap_header header = {
> +		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
> +		.version = 1
> +	};
>  
> -	end = pci_resource_len(vdev->pdev, vdev->msix_bar);
> -
> -	/* If MSI-X table is aligned to the start or end, only one area */
> -	if (((vdev->msix_offset & PAGE_MASK) == 0) ||
> -	    (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) >= end))
> -		nr_areas = 1;
> -
> -	size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas));
> -
> -	sparse = kzalloc(size, GFP_KERNEL);
> -	if (!sparse)
> -		return -ENOMEM;
> -
> -	sparse->nr_areas = nr_areas;
> -
> -	if (vdev->msix_offset & PAGE_MASK) {
> -		sparse->areas[i].offset = 0;
> -		sparse->areas[i].size = vdev->msix_offset & PAGE_MASK;
> -		i++;
> -	}
> -
> -	if (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) < end) {
> -		sparse->areas[i].offset = PAGE_ALIGN(vdev->msix_offset +
> -						     vdev->msix_size);
> -		sparse->areas[i].size = end - sparse->areas[i].offset;
> -		i++;
> -	}
> -
> -	ret = vfio_info_add_capability(caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP,
> -				       sparse);
> -	kfree(sparse);
> -
> -	return ret;
> +	return vfio_info_add_capability(caps,
> +			VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, &header);
>  }
>  
>  int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
> @@ -695,7 +665,8 @@ static long vfio_pci_ioctl(void *device_data,
>  			if (vdev->bar_mmap_supported[info.index]) {
>  				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
>  				if (info.index == vdev->msix_bar) {
> -					ret = msix_sparse_mmap_cap(vdev, &caps);
> +					ret = msix_sparse_msix_mmappable_cap(
> +							vdev, &caps);
>  					if (ret)
>  						return ret;
>  				}
> @@ -1125,22 +1096,6 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
>  	if (req_start + req_len > phys_len)
>  		return -EINVAL;
>  
> -	if (index == vdev->msix_bar) {
> -		/*
> -		 * Disallow mmaps overlapping the MSI-X table; users don't
> -		 * get to touch this directly.  We could find somewhere
> -		 * else to map the overlap, but page granularity is only
> -		 * a recommendation, not a requirement, so the user needs
> -		 * to know which bits are real.  Requiring them to mmap
> -		 * around the table makes that clear.
> -		 */
> -
> -		/* If neither entirely above nor below, then it overlaps */
> -		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
> -		      req_start + req_len <= vdev->msix_offset))
> -			return -EINVAL;
> -	}
> -
>  	/*
>  	 * Even though we don't make use of the barmap for the mmap,
>  	 * we need to request the region and the barmap tracks that.
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index f5a86f6..376b4e7 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1898,6 +1898,7 @@ int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
>  			     void *cap_type)
>  {
>  	int ret = -EINVAL;
> +	struct vfio_info_cap_header *header, *cap;
>  
>  	if (!cap_type)
>  		return 0;
> @@ -1910,6 +1911,14 @@ int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
>  	case VFIO_REGION_INFO_CAP_TYPE:
>  		ret = region_type_cap(caps, cap_type);
>  		break;
> +
> +	case VFIO_REGION_INFO_CAP_MSIX_MAPPABLE:
> +		cap = cap_type;
> +		header = vfio_info_cap_add(caps, sizeof(*header), cap->id,
> +				cap->version);
> +		if (IS_ERR(header))
> +			return PTR_ERR(header);
> +		return 0;
>  	}
>  
>  	return ret;
Alex Williamson Dec. 15, 2017, 3:56 p.m. UTC | #4
On Fri, 15 Dec 2017 15:13:22 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Fri, Dec 08, 2017 at 03:18:29PM +1100, Alexey Kardashevskiy wrote:
> > By default VFIO disables mapping of MSIX BAR to the userspace as
> > the userspace may program it in a way allowing spurious interrupts;
> > instead the userspace uses the VFIO_DEVICE_SET_IRQS ioctl.
> > In order to eliminate guessing from the userspace about what is
> > mmapable, VFIO also advertises a sparse list of regions allowed to mmap.
> > 
> > This works fine as long as the system page size equals to the MSIX
> > alignment requirement which is 4KB. However with a bigger page size
> > the existing code prohibits mapping non-MSIX parts of a page with MSIX
> > structures so these parts have to be emulated via slow reads/writes on
> > a VFIO device fd. If these emulated bits are accessed often, this has
> > serious impact on performance.
> > 
> > This allows mmap of the entire BAR containing MSIX vector table.
> > 
> > This removes the sparse capability for PCI devices as it becomes useless.
> > 
> > As the userspace needs to know for sure whether mmapping of the MSIX
> > vector containing data can succeed, this adds a new capability -
> > VFIO_REGION_INFO_CAP_MSIX_MAPPABLE - which explicitly tells the userspace
> > that the entire BAR can be mmapped.
> > 
> > This does not touch the MSIX mangling in the BAR read/write handlers as
> > we are doing this just to enable direct access to non MSIX registers.
> > 
> > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>  
> 
> This seems reasonable, except for the interface issue that Alex
> pointed out already.
> 
> TBH, I'm not really sold on the need for a new kernel cap, rather than
> just having qemu attempt the mamp() and fall back if it fails.  But
> I'm not really fussed either way.

Userspace could never handle an mmap failure as an error in this case,
they'd always just assume the old interface.  If the user sees the
capability and the mmap semantics don't match, something is wrong.
Userspace is of course free to try to mmap and fall back to the old
assumptions regardless of this capability, but having this capability
provides more deterministic behavior.  Thanks,

Alex

> > ---
> >  include/uapi/linux/vfio.h   | 10 +++++++
> >  drivers/vfio/pci/vfio_pci.c | 65 +++++++--------------------------------------
> >  drivers/vfio/vfio.c         |  9 +++++++
> >  3 files changed, 29 insertions(+), 55 deletions(-)
> > 
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index ae46105..0fb4407 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -300,6 +300,16 @@ struct vfio_region_info_cap_type {
> >  #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG	(2)
> >  #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG	(3)
> >  
> > +/*
> > + * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
> > + * which allows direct access to non-MSIX registers which happened to be within
> > + * the same system page.
> > + *
> > + * Even though the userspace gets direct access to the MSIX data, the existing
> > + * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration.
> > + */
> > +#define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE	3
> > +
> >  /**
> >   * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
> >   *				    struct vfio_irq_info)
> > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> > index a98681d..9bcc1e1 100644
> > --- a/drivers/vfio/pci/vfio_pci.c
> > +++ b/drivers/vfio/pci/vfio_pci.c
> > @@ -565,46 +565,16 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
> >  	return walk.ret;
> >  }
> >  
> > -static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev,
> > -				struct vfio_info_cap *caps)
> > +static int msix_sparse_msix_mmappable_cap(struct vfio_pci_device *vdev,
> > +		struct vfio_info_cap *caps)
> >  {
> > -	struct vfio_region_info_cap_sparse_mmap *sparse;
> > -	size_t end, size;
> > -	int nr_areas = 2, i = 0, ret;
> > +	struct vfio_info_cap_header header = {
> > +		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
> > +		.version = 1
> > +	};
> >  
> > -	end = pci_resource_len(vdev->pdev, vdev->msix_bar);
> > -
> > -	/* If MSI-X table is aligned to the start or end, only one area */
> > -	if (((vdev->msix_offset & PAGE_MASK) == 0) ||
> > -	    (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) >= end))
> > -		nr_areas = 1;
> > -
> > -	size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas));
> > -
> > -	sparse = kzalloc(size, GFP_KERNEL);
> > -	if (!sparse)
> > -		return -ENOMEM;
> > -
> > -	sparse->nr_areas = nr_areas;
> > -
> > -	if (vdev->msix_offset & PAGE_MASK) {
> > -		sparse->areas[i].offset = 0;
> > -		sparse->areas[i].size = vdev->msix_offset & PAGE_MASK;
> > -		i++;
> > -	}
> > -
> > -	if (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) < end) {
> > -		sparse->areas[i].offset = PAGE_ALIGN(vdev->msix_offset +
> > -						     vdev->msix_size);
> > -		sparse->areas[i].size = end - sparse->areas[i].offset;
> > -		i++;
> > -	}
> > -
> > -	ret = vfio_info_add_capability(caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP,
> > -				       sparse);
> > -	kfree(sparse);
> > -
> > -	return ret;
> > +	return vfio_info_add_capability(caps,
> > +			VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, &header);
> >  }
> >  
> >  int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
> > @@ -695,7 +665,8 @@ static long vfio_pci_ioctl(void *device_data,
> >  			if (vdev->bar_mmap_supported[info.index]) {
> >  				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
> >  				if (info.index == vdev->msix_bar) {
> > -					ret = msix_sparse_mmap_cap(vdev, &caps);
> > +					ret = msix_sparse_msix_mmappable_cap(
> > +							vdev, &caps);
> >  					if (ret)
> >  						return ret;
> >  				}
> > @@ -1125,22 +1096,6 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
> >  	if (req_start + req_len > phys_len)
> >  		return -EINVAL;
> >  
> > -	if (index == vdev->msix_bar) {
> > -		/*
> > -		 * Disallow mmaps overlapping the MSI-X table; users don't
> > -		 * get to touch this directly.  We could find somewhere
> > -		 * else to map the overlap, but page granularity is only
> > -		 * a recommendation, not a requirement, so the user needs
> > -		 * to know which bits are real.  Requiring them to mmap
> > -		 * around the table makes that clear.
> > -		 */
> > -
> > -		/* If neither entirely above nor below, then it overlaps */
> > -		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
> > -		      req_start + req_len <= vdev->msix_offset))
> > -			return -EINVAL;
> > -	}
> > -
> >  	/*
> >  	 * Even though we don't make use of the barmap for the mmap,
> >  	 * we need to request the region and the barmap tracks that.
> > diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> > index f5a86f6..376b4e7 100644
> > --- a/drivers/vfio/vfio.c
> > +++ b/drivers/vfio/vfio.c
> > @@ -1898,6 +1898,7 @@ int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
> >  			     void *cap_type)
> >  {
> >  	int ret = -EINVAL;
> > +	struct vfio_info_cap_header *header, *cap;
> >  
> >  	if (!cap_type)
> >  		return 0;
> > @@ -1910,6 +1911,14 @@ int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
> >  	case VFIO_REGION_INFO_CAP_TYPE:
> >  		ret = region_type_cap(caps, cap_type);
> >  		break;
> > +
> > +	case VFIO_REGION_INFO_CAP_MSIX_MAPPABLE:
> > +		cap = cap_type;
> > +		header = vfio_info_cap_add(caps, sizeof(*header), cap->id,
> > +				cap->version);
> > +		if (IS_ERR(header))
> > +			return PTR_ERR(header);
> > +		return 0;
> >  	}
> >  
> >  	return ret;  
>
David Gibson Dec. 18, 2017, 2:46 a.m. UTC | #5
On Fri, Dec 15, 2017 at 08:56:51AM -0700, Alex Williamson wrote:
> On Fri, 15 Dec 2017 15:13:22 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Fri, Dec 08, 2017 at 03:18:29PM +1100, Alexey Kardashevskiy wrote:
> > > By default VFIO disables mapping of MSIX BAR to the userspace as
> > > the userspace may program it in a way allowing spurious interrupts;
> > > instead the userspace uses the VFIO_DEVICE_SET_IRQS ioctl.
> > > In order to eliminate guessing from the userspace about what is
> > > mmapable, VFIO also advertises a sparse list of regions allowed to mmap.
> > > 
> > > This works fine as long as the system page size equals to the MSIX
> > > alignment requirement which is 4KB. However with a bigger page size
> > > the existing code prohibits mapping non-MSIX parts of a page with MSIX
> > > structures so these parts have to be emulated via slow reads/writes on
> > > a VFIO device fd. If these emulated bits are accessed often, this has
> > > serious impact on performance.
> > > 
> > > This allows mmap of the entire BAR containing MSIX vector table.
> > > 
> > > This removes the sparse capability for PCI devices as it becomes useless.
> > > 
> > > As the userspace needs to know for sure whether mmapping of the MSIX
> > > vector containing data can succeed, this adds a new capability -
> > > VFIO_REGION_INFO_CAP_MSIX_MAPPABLE - which explicitly tells the userspace
> > > that the entire BAR can be mmapped.
> > > 
> > > This does not touch the MSIX mangling in the BAR read/write handlers as
> > > we are doing this just to enable direct access to non MSIX registers.
> > > 
> > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>  
> > 
> > This seems reasonable, except for the interface issue that Alex
> > pointed out already.
> > 
> > TBH, I'm not really sold on the need for a new kernel cap, rather than
> > just having qemu attempt the mamp() and fall back if it fails.  But
> > I'm not really fussed either way.
> 
> Userspace could never handle an mmap failure as an error in this case,
> they'd always just assume the old interface.  If the user sees the
> capability and the mmap semantics don't match, something is wrong.
> Userspace is of course free to try to mmap and fall back to the old
> assumptions regardless of this capability, but having this capability
> provides more deterministic behavior.  Thanks,

Hm, yeah, I guess so.  Like I say, not really fussed either way.
diff mbox

Patch

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ae46105..0fb4407 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -300,6 +300,16 @@  struct vfio_region_info_cap_type {
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG	(2)
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG	(3)
 
+/*
+ * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
+ * which allows direct access to non-MSIX registers which happened to be within
+ * the same system page.
+ *
+ * Even though the userspace gets direct access to the MSIX data, the existing
+ * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration.
+ */
+#define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE	3
+
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
  *				    struct vfio_irq_info)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index a98681d..9bcc1e1 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -565,46 +565,16 @@  static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
 	return walk.ret;
 }
 
-static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev,
-				struct vfio_info_cap *caps)
+static int msix_sparse_msix_mmappable_cap(struct vfio_pci_device *vdev,
+		struct vfio_info_cap *caps)
 {
-	struct vfio_region_info_cap_sparse_mmap *sparse;
-	size_t end, size;
-	int nr_areas = 2, i = 0, ret;
+	struct vfio_info_cap_header header = {
+		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
+		.version = 1
+	};
 
-	end = pci_resource_len(vdev->pdev, vdev->msix_bar);
-
-	/* If MSI-X table is aligned to the start or end, only one area */
-	if (((vdev->msix_offset & PAGE_MASK) == 0) ||
-	    (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) >= end))
-		nr_areas = 1;
-
-	size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas));
-
-	sparse = kzalloc(size, GFP_KERNEL);
-	if (!sparse)
-		return -ENOMEM;
-
-	sparse->nr_areas = nr_areas;
-
-	if (vdev->msix_offset & PAGE_MASK) {
-		sparse->areas[i].offset = 0;
-		sparse->areas[i].size = vdev->msix_offset & PAGE_MASK;
-		i++;
-	}
-
-	if (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) < end) {
-		sparse->areas[i].offset = PAGE_ALIGN(vdev->msix_offset +
-						     vdev->msix_size);
-		sparse->areas[i].size = end - sparse->areas[i].offset;
-		i++;
-	}
-
-	ret = vfio_info_add_capability(caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP,
-				       sparse);
-	kfree(sparse);
-
-	return ret;
+	return vfio_info_add_capability(caps,
+			VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, &header);
 }
 
 int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
@@ -695,7 +665,8 @@  static long vfio_pci_ioctl(void *device_data,
 			if (vdev->bar_mmap_supported[info.index]) {
 				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
 				if (info.index == vdev->msix_bar) {
-					ret = msix_sparse_mmap_cap(vdev, &caps);
+					ret = msix_sparse_msix_mmappable_cap(
+							vdev, &caps);
 					if (ret)
 						return ret;
 				}
@@ -1125,22 +1096,6 @@  static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
 	if (req_start + req_len > phys_len)
 		return -EINVAL;
 
-	if (index == vdev->msix_bar) {
-		/*
-		 * Disallow mmaps overlapping the MSI-X table; users don't
-		 * get to touch this directly.  We could find somewhere
-		 * else to map the overlap, but page granularity is only
-		 * a recommendation, not a requirement, so the user needs
-		 * to know which bits are real.  Requiring them to mmap
-		 * around the table makes that clear.
-		 */
-
-		/* If neither entirely above nor below, then it overlaps */
-		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
-		      req_start + req_len <= vdev->msix_offset))
-			return -EINVAL;
-	}
-
 	/*
 	 * Even though we don't make use of the barmap for the mmap,
 	 * we need to request the region and the barmap tracks that.
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index f5a86f6..376b4e7 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1898,6 +1898,7 @@  int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
 			     void *cap_type)
 {
 	int ret = -EINVAL;
+	struct vfio_info_cap_header *header, *cap;
 
 	if (!cap_type)
 		return 0;
@@ -1910,6 +1911,14 @@  int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
 	case VFIO_REGION_INFO_CAP_TYPE:
 		ret = region_type_cap(caps, cap_type);
 		break;
+
+	case VFIO_REGION_INFO_CAP_MSIX_MAPPABLE:
+		cap = cap_type;
+		header = vfio_info_cap_add(caps, sizeof(*header), cap->id,
+				cap->version);
+		if (IS_ERR(header))
+			return PTR_ERR(header);
+		return 0;
 	}
 
 	return ret;