diff mbox

[v2] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive

Message ID 1462966459-12459-1-git-send-email-xyjxie@linux.vnet.ibm.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Yongji Xie May 11, 2016, 11:34 a.m. UTC
The current vfio-pci implementation disallows mmapping
sub-page (size < PAGE_SIZE) MMIO BARs because these BARs' mmio
page may be shared with other BARs. This causes some
performance issues when we pass through a PCI device with
this kind of BAR: the guest will not be able to handle the mmio
accesses to the BARs, which leads to mmio emulation in the host.

However, not all sub-page BARs share a page with other BARs.
We should allow mmapping the sub-page MMIO BARs which we can
make sure will not share a page with other BARs.

This patch adds support for this case. We also try to add some
dummy resources to reserve the remainder of the page, into which
a hot-added device's BAR might otherwise be assigned.

Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
---
 drivers/vfio/pci/vfio_pci.c         |   85 ++++++++++++++++++++++++++++++++---
 drivers/vfio/pci/vfio_pci_private.h |    8 ++++
 2 files changed, 86 insertions(+), 7 deletions(-)

Comments

Bjorn Helgaas May 11, 2016, 3:10 p.m. UTC | #1
On Wed, May 11, 2016 at 07:34:19PM +0800, Yongji Xie wrote:
> Current vfio-pci implementation disallows to mmap
> sub-page(size < PAGE_SIZE) MMIO BARs because these BARs' mmio
> page may be shared with other BARs. This will cause some
> performance issues when we passthrough a PCI device with
> this kind of BARs. Guest will be not able to handle the mmio
> accesses to the BARs which leads to mmio emulations in host.
> 
> However, not all sub-page BARs will share page with other BARs.
> We should allow to mmap the sub-page MMIO BARs which we can
> make sure will not share page with other BARs.
> 
> This patch adds support for this case. And we try to add some
> dummy resources to reserve the remainder of the page which
> hot-add device's BAR might be assigned into.

This is starting to look more reasonable from a safety perspective.
At least I don't have an allergic reaction to mapping a page that may
contain BARs from other random devices :)

> Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
> ---
>  drivers/vfio/pci/vfio_pci.c         |   85 ++++++++++++++++++++++++++++++++---
>  drivers/vfio/pci/vfio_pci_private.h |    8 ++++
>  2 files changed, 86 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 98059df..33282b8 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -110,16 +110,77 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
>  	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
>  }
>  
> +static bool vfio_pci_bar_mmap_supported(struct vfio_pci_device *vdev, int index)
> +{
> +	struct resource *res = vdev->pdev->resource + index;
> +	struct vfio_pci_dummy_resource *dummy_res1 = NULL;
> +	struct vfio_pci_dummy_resource *dummy_res2 = NULL;
> +
> +	if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) && res->flags & IORESOURCE_MEM &&
> +		resource_size(res) > 0) {
> +		if (resource_size(res) >= PAGE_SIZE)
> +			return true;
> +
> +		if ((res->start & ~PAGE_MASK)) {
> +			/*
> +			 * Add a dummy resource to reserve the portion
> +			 * before res->start in exclusive page in case
> +			 * that hot-add device's bar is assigned into it.
> +			 */
> +			dummy_res1 = kzalloc(sizeof(*dummy_res1), GFP_KERNEL);

Should check for kzalloc() failure here.

> +			dummy_res1->resource.start = res->start & PAGE_MASK;
> +			dummy_res1->resource.end = res->start - 1;
> +			dummy_res1->resource.flags = res->flags;
> +			if (request_resource(res->parent,
> +						&dummy_res1->resource)) {
> +				kfree(dummy_res1);
> +				return false;
> +			}
> +			dummy_res1->index = index;
> +			list_add(&dummy_res1->res_next,
> +					&vdev->dummy_resources_list);

The main body of this function is unnecessarily indented.  If you did
this it would be less indented:

  if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
    return false;

  if (!(res->flags & IORESOURCE_MEM))
    return false;

  /*
   * Not sure this is necessary; the PCI core *shouldn't* set up a
   * resource with a type but zero size.  But there may be bugs that
   * cause us to do that.
   */
  if (!resource_size(res))
    return false;

  if (resource_size(res) >= PAGE_SIZE)
    return true;

  if ((res->start & ~PAGE_MASK)) {
    ...

> +		}
> +		if (((res->end + 1) & ~PAGE_MASK)) {
> +			/*
> +			 * Add a dummy resource to reserve the portion
> +			 * after res->end.
> +			 */
> +			dummy_res2 = kzalloc(sizeof(*dummy_res2), GFP_KERNEL);
> +			dummy_res2->resource.start = res->end + 1;
> +			dummy_res2->resource.end = (res->start & PAGE_MASK) +
> +							PAGE_SIZE - 1;
> +			dummy_res2->resource.flags = res->flags;
> +			if (request_resource(res->parent,
> +						&dummy_res2->resource)) {
> +				if (dummy_res1) {
> +					list_del(&dummy_res1->res_next);
> +					release_resource(&dummy_res1->resource);
> +					kfree(dummy_res1);
> +				}
> +				kfree(dummy_res2);
> +				return false;
> +			}
> +			dummy_res2->index = index;
> +			list_add(&dummy_res2->res_next,
> +					&vdev->dummy_resources_list);
> +		}
> +		return true;
> +	}
> +	return false;
> +}
> +
>  static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
>  static void vfio_pci_disable(struct vfio_pci_device *vdev);
>  
>  static int vfio_pci_enable(struct vfio_pci_device *vdev)
>  {
>  	struct pci_dev *pdev = vdev->pdev;
> -	int ret;
> +	int ret, bar;
>  	u16 cmd;
>  	u8 msix_pos;
>  
> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
> +
>  	pci_set_power_state(pdev, PCI_D0);
>  
>  	/* Don't allow our initial saved state to include busmaster */
> @@ -183,12 +244,17 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
>  		}
>  	}
>  
> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
> +		vdev->bar_mmap_supported[bar] =
> +				vfio_pci_bar_mmap_supported(vdev, bar);
> +	}
>  	return 0;
>  }
>  
>  static void vfio_pci_disable(struct vfio_pci_device *vdev)
>  {
>  	struct pci_dev *pdev = vdev->pdev;
> +	struct vfio_pci_dummy_resource *dummy_res, *tmp;
>  	int i, bar;
>  
>  	/* Stop the device from further DMA */
> @@ -217,6 +283,13 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>  		vdev->barmap[bar] = NULL;
>  	}
>  
> +	list_for_each_entry_safe(dummy_res, tmp,
> +				 &vdev->dummy_resources_list, res_next) {
> +		list_del(&dummy_res->res_next);
> +		release_resource(&dummy_res->resource);
> +		kfree(dummy_res);
> +	}
> +
>  	vdev->needs_reset = true;
>  
>  	/*
> @@ -587,9 +660,7 @@ static long vfio_pci_ioctl(void *device_data,
>  
>  			info.flags = VFIO_REGION_INFO_FLAG_READ |
>  				     VFIO_REGION_INFO_FLAG_WRITE;
> -			if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) &&
> -			    pci_resource_flags(pdev, info.index) &
> -			    IORESOURCE_MEM && info.size >= PAGE_SIZE) {
> +			if (vdev->bar_mmap_supported[info.index]) {
>  				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
>  				if (info.index == vdev->msix_bar) {
>  					ret = msix_sparse_mmap_cap(vdev, &caps);
> @@ -1011,16 +1082,16 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
>  		return -EINVAL;
>  	if (index >= VFIO_PCI_ROM_REGION_INDEX)
>  		return -EINVAL;
> -	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
> +	if (!vdev->bar_mmap_supported[index])
>  		return -EINVAL;
>  
> -	phys_len = pci_resource_len(pdev, index);
> +	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
>  	req_len = vma->vm_end - vma->vm_start;
>  	pgoff = vma->vm_pgoff &
>  		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
>  	req_start = pgoff << PAGE_SHIFT;
>  
> -	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
> +	if (req_start + req_len > phys_len)
>  		return -EINVAL;
>  
>  	if (index == vdev->msix_bar) {
> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> index 8a7d546..a4233a8 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -57,9 +57,16 @@ struct vfio_pci_region {
>  	u32				flags;
>  };
>  
> +struct vfio_pci_dummy_resource {
> +	struct resource		resource;
> +	int			index;
> +	struct list_head	res_next;
> +};
> +
>  struct vfio_pci_device {
>  	struct pci_dev		*pdev;
>  	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
> +	bool			bar_mmap_supported[PCI_STD_RESOURCE_END + 1];
>  	u8			*pci_config_map;
>  	u8			*vconfig;
>  	struct perm_bits	*msi_perm;
> @@ -87,6 +94,7 @@ struct vfio_pci_device {
>  	int			refcnt;
>  	struct eventfd_ctx	*err_trigger;
>  	struct eventfd_ctx	*req_trigger;
> +	struct list_head	dummy_resources_list;
>  };
>  
>  #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
> -- 
> 1.7.9.5
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yongji Xie May 12, 2016, 5:56 a.m. UTC | #2
On 2016/5/11 23:10, Bjorn Helgaas wrote:

> On Wed, May 11, 2016 at 07:34:19PM +0800, Yongji Xie wrote:
>> Current vfio-pci implementation disallows to mmap
>> sub-page(size < PAGE_SIZE) MMIO BARs because these BARs' mmio
>> page may be shared with other BARs. This will cause some
>> performance issues when we passthrough a PCI device with
>> this kind of BARs. Guest will be not able to handle the mmio
>> accesses to the BARs which leads to mmio emulations in host.
>>
>> However, not all sub-page BARs will share page with other BARs.
>> We should allow to mmap the sub-page MMIO BARs which we can
>> make sure will not share page with other BARs.
>>
>> This patch adds support for this case. And we try to add some
>> dummy resources to reserve the remainder of the page which
>> hot-add device's BAR might be assigned into.
> This is starting to look more reasonable from a safety perspective.
> At least I don't have an allergic reaction to mapping a page that may
> contain BARs from other random devices :)
>
>> Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
>> ---
>>   drivers/vfio/pci/vfio_pci.c         |   85 ++++++++++++++++++++++++++++++++---
>>   drivers/vfio/pci/vfio_pci_private.h |    8 ++++
>>   2 files changed, 86 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>> index 98059df..33282b8 100644
>> --- a/drivers/vfio/pci/vfio_pci.c
>> +++ b/drivers/vfio/pci/vfio_pci.c
>> @@ -110,16 +110,77 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
>>   	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
>>   }
>>   
>> +static bool vfio_pci_bar_mmap_supported(struct vfio_pci_device *vdev, int index)
>> +{
>> +	struct resource *res = vdev->pdev->resource + index;
>> +	struct vfio_pci_dummy_resource *dummy_res1 = NULL;
>> +	struct vfio_pci_dummy_resource *dummy_res2 = NULL;
>> +
>> +	if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) && res->flags & IORESOURCE_MEM &&
>> +		resource_size(res) > 0) {
>> +		if (resource_size(res) >= PAGE_SIZE)
>> +			return true;
>> +
>> +		if ((res->start & ~PAGE_MASK)) {
>> +			/*
>> +			 * Add a dummy resource to reserve the portion
>> +			 * before res->start in exclusive page in case
>> +			 * that hot-add device's bar is assigned into it.
>> +			 */
>> +			dummy_res1 = kzalloc(sizeof(*dummy_res1), GFP_KERNEL);
> Should check for kzalloc() failure here.
>
>> +			dummy_res1->resource.start = res->start & PAGE_MASK;
>> +			dummy_res1->resource.end = res->start - 1;
>> +			dummy_res1->resource.flags = res->flags;
>> +			if (request_resource(res->parent,
>> +						&dummy_res1->resource)) {
>> +				kfree(dummy_res1);
>> +				return false;
>> +			}
>> +			dummy_res1->index = index;
>> +			list_add(&dummy_res1->res_next,
>> +					&vdev->dummy_resources_list);
> The main body of this function is unnecessarily indented.  If you did
> this it would be less indented:
>
>    if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
>      return false;
>
>    if (!(res->flags & IORESOURCE_MEM))
>      return false;
>
>    /*
>     * Not sure this is necessary; the PCI core *shouldn't* set up a
>     * resource with a type but zero size.  But there may be bugs that
>     * cause us to do that.
>     */
>    if (!resource_size(res))
>      return false;
>
>    if (resource_size(res) >= PAGE_SIZE)
>      return true;
>
>    if ((res->start & ~PAGE_MASK)) {
>      ...
>

Thanks for your comments. I'll send a v3 soon.

Regards,
Yongji

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 98059df..33282b8 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -110,16 +110,77 @@  static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
 	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
 }
 
+static bool vfio_pci_bar_mmap_supported(struct vfio_pci_device *vdev, int index)
+{
+	struct resource *res = vdev->pdev->resource + index;
+	struct vfio_pci_dummy_resource *dummy_res1 = NULL;
+	struct vfio_pci_dummy_resource *dummy_res2 = NULL;
+
+	if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) && res->flags & IORESOURCE_MEM &&
+		resource_size(res) > 0) {
+		if (resource_size(res) >= PAGE_SIZE)
+			return true;
+
+		if ((res->start & ~PAGE_MASK)) {
+			/*
+			 * Add a dummy resource to reserve the portion
+			 * before res->start in exclusive page in case
+			 * that hot-add device's bar is assigned into it.
+			 */
+			dummy_res1 = kzalloc(sizeof(*dummy_res1), GFP_KERNEL);
+			dummy_res1->resource.start = res->start & PAGE_MASK;
+			dummy_res1->resource.end = res->start - 1;
+			dummy_res1->resource.flags = res->flags;
+			if (request_resource(res->parent,
+						&dummy_res1->resource)) {
+				kfree(dummy_res1);
+				return false;
+			}
+			dummy_res1->index = index;
+			list_add(&dummy_res1->res_next,
+					&vdev->dummy_resources_list);
+		}
+		if (((res->end + 1) & ~PAGE_MASK)) {
+			/*
+			 * Add a dummy resource to reserve the portion
+			 * after res->end.
+			 */
+			dummy_res2 = kzalloc(sizeof(*dummy_res2), GFP_KERNEL);
+			dummy_res2->resource.start = res->end + 1;
+			dummy_res2->resource.end = (res->start & PAGE_MASK) +
+							PAGE_SIZE - 1;
+			dummy_res2->resource.flags = res->flags;
+			if (request_resource(res->parent,
+						&dummy_res2->resource)) {
+				if (dummy_res1) {
+					list_del(&dummy_res1->res_next);
+					release_resource(&dummy_res1->resource);
+					kfree(dummy_res1);
+				}
+				kfree(dummy_res2);
+				return false;
+			}
+			dummy_res2->index = index;
+			list_add(&dummy_res2->res_next,
+					&vdev->dummy_resources_list);
+		}
+		return true;
+	}
+	return false;
+}
+
 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
 static void vfio_pci_disable(struct vfio_pci_device *vdev);
 
 static int vfio_pci_enable(struct vfio_pci_device *vdev)
 {
 	struct pci_dev *pdev = vdev->pdev;
-	int ret;
+	int ret, bar;
 	u16 cmd;
 	u8 msix_pos;
 
+	INIT_LIST_HEAD(&vdev->dummy_resources_list);
+
 	pci_set_power_state(pdev, PCI_D0);
 
 	/* Don't allow our initial saved state to include busmaster */
@@ -183,12 +244,17 @@  static int vfio_pci_enable(struct vfio_pci_device *vdev)
 		}
 	}
 
+	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
+		vdev->bar_mmap_supported[bar] =
+				vfio_pci_bar_mmap_supported(vdev, bar);
+	}
 	return 0;
 }
 
 static void vfio_pci_disable(struct vfio_pci_device *vdev)
 {
 	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_pci_dummy_resource *dummy_res, *tmp;
 	int i, bar;
 
 	/* Stop the device from further DMA */
@@ -217,6 +283,13 @@  static void vfio_pci_disable(struct vfio_pci_device *vdev)
 		vdev->barmap[bar] = NULL;
 	}
 
+	list_for_each_entry_safe(dummy_res, tmp,
+				 &vdev->dummy_resources_list, res_next) {
+		list_del(&dummy_res->res_next);
+		release_resource(&dummy_res->resource);
+		kfree(dummy_res);
+	}
+
 	vdev->needs_reset = true;
 
 	/*
@@ -587,9 +660,7 @@  static long vfio_pci_ioctl(void *device_data,
 
 			info.flags = VFIO_REGION_INFO_FLAG_READ |
 				     VFIO_REGION_INFO_FLAG_WRITE;
-			if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) &&
-			    pci_resource_flags(pdev, info.index) &
-			    IORESOURCE_MEM && info.size >= PAGE_SIZE) {
+			if (vdev->bar_mmap_supported[info.index]) {
 				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
 				if (info.index == vdev->msix_bar) {
 					ret = msix_sparse_mmap_cap(vdev, &caps);
@@ -1011,16 +1082,16 @@  static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
 		return -EINVAL;
 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
 		return -EINVAL;
-	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
+	if (!vdev->bar_mmap_supported[index])
 		return -EINVAL;
 
-	phys_len = pci_resource_len(pdev, index);
+	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
 	req_len = vma->vm_end - vma->vm_start;
 	pgoff = vma->vm_pgoff &
 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
 	req_start = pgoff << PAGE_SHIFT;
 
-	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
+	if (req_start + req_len > phys_len)
 		return -EINVAL;
 
 	if (index == vdev->msix_bar) {
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 8a7d546..a4233a8 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -57,9 +57,16 @@  struct vfio_pci_region {
 	u32				flags;
 };
 
+struct vfio_pci_dummy_resource {
+	struct resource		resource;
+	int			index;
+	struct list_head	res_next;
+};
+
 struct vfio_pci_device {
 	struct pci_dev		*pdev;
 	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
+	bool			bar_mmap_supported[PCI_STD_RESOURCE_END + 1];
 	u8			*pci_config_map;
 	u8			*vconfig;
 	struct perm_bits	*msi_perm;
@@ -87,6 +94,7 @@  struct vfio_pci_device {
 	int			refcnt;
 	struct eventfd_ctx	*err_trigger;
 	struct eventfd_ctx	*req_trigger;
+	struct list_head	dummy_resources_list;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)