diff mbox

[v6,2/4] vfio: VFIO driver for mediated PCI device

Message ID 1470251034-1555-3-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived
Headers show

Commit Message

Kirti Wankhede Aug. 3, 2016, 7:03 p.m. UTC
MPCI VFIO driver registers with MDEV core driver. MDEV core driver creates
mediated device and calls probe routine of MPCI VFIO driver. This driver
adds mediated device to VFIO core module.
Main aim of this module is to manage all VFIO APIs for each mediated PCI
device. Those are:
- get region information from vendor driver.
- trap and emulate PCI config space and BAR region.
- Send interrupt configuration information to vendor driver.
- Device reset
- mmap mappable region with invalidate mapping and fault on access to
  remap pfns. If validate_map_request() is not provided by vendor driver,
  fault handler maps physical devices region.
- Add and delete mappable region's physical mappings to mdev's mapping
  tracking logic.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I583f4734752971d3d112324d69e2508c88f359ec
---
 drivers/vfio/mdev/Kconfig           |   6 +
 drivers/vfio/mdev/Makefile          |   1 +
 drivers/vfio/mdev/vfio_mpci.c       | 536 ++++++++++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci_private.h |   6 -
 drivers/vfio/pci/vfio_pci_rdwr.c    |   1 +
 include/linux/vfio.h                |   7 +
 6 files changed, 551 insertions(+), 6 deletions(-)
 create mode 100644 drivers/vfio/mdev/vfio_mpci.c

Comments

kernel test robot Aug. 3, 2016, 9:03 p.m. UTC | #1
Hi Kirti,

[auto build test WARNING on vfio/next]
[also build test WARNING on v4.7 next-20160803]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Kirti-Wankhede/Add-Mediated-device-support/20160804-032209
base:   https://github.com/awilliam/linux-vfio.git next
config: i386-allmodconfig (attached as .config)
compiler: gcc-6 (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All warnings (new ones prefixed by >>):

   drivers/vfio/mdev/vfio_mpci.c: In function 'mdev_dev_mmio_fault':
>> drivers/vfio/mdev/vfio_mpci.c:384:17: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
     u64 virtaddr = (u64)vmf->virtual_address;
                    ^
   In file included from drivers/vfio/mdev/vfio_mpci.c:19:0:
>> include/linux/vfio.h:23:46: warning: right shift count >= width of type [-Wshift-count-overflow]
    #define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
                                                 ^
>> drivers/vfio/mdev/vfio_mpci.c:424:11: note: in expansion of macro 'VFIO_PCI_OFFSET_TO_INDEX'
      index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT);
              ^~~~~~~~~~~~~~~~~~~~~~~~

vim +384 drivers/vfio/mdev/vfio_mpci.c

   378	static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
   379	{
   380		int ret;
   381		struct vfio_mdev *vmdev = vma->vm_private_data;
   382		struct mdev_device *mdev;
   383		struct parent_device *parent;
 > 384		u64 virtaddr = (u64)vmf->virtual_address;
   385		unsigned long req_size, pgoff = 0;
   386		pgprot_t pg_prot;
   387		unsigned int index;
   388	
   389		if (!vmdev && !vmdev->mdev)
   390			return -EINVAL;
   391	
   392		mdev = vmdev->mdev;
   393		parent  = mdev->parent;
   394	
   395		pg_prot  = vma->vm_page_prot;
   396	
   397		if (parent->ops->validate_map_request) {
   398			u64 offset;
   399			loff_t pos;
   400	
   401			offset   = virtaddr - vma->vm_start;
   402			req_size = vma->vm_end - virtaddr;
   403			pos = (vma->vm_pgoff << PAGE_SHIFT) + offset;
   404	
   405			ret = parent->ops->validate_map_request(mdev, pos, &virtaddr,
   406							&pgoff, &req_size, &pg_prot);
   407			if (ret)
   408				return ret;
   409	
   410			/*
   411			 * Verify pgoff and req_size are valid and virtaddr is within
   412			 * vma range
   413			 */
   414			if (!pgoff || !req_size || (virtaddr < vma->vm_start) ||
   415			    ((virtaddr + req_size) >= vma->vm_end))
   416				return -EINVAL;
   417		} else {
   418			struct pci_dev *pdev;
   419	
   420			virtaddr = vma->vm_start;
   421			req_size = vma->vm_end - vma->vm_start;
   422	
   423			pdev = to_pci_dev(parent->dev);
 > 424			index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT);
   425			pgoff = pci_resource_start(pdev, index) >> PAGE_SHIFT;
   426		}
   427	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot Aug. 4, 2016, 12:19 a.m. UTC | #2
Hi Kirti,

[auto build test WARNING on vfio/next]
[also build test WARNING on v4.7 next-20160803]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Kirti-Wankhede/Add-Mediated-device-support/20160804-032209
base:   https://github.com/awilliam/linux-vfio.git next
config: i386-allyesconfig (attached as .config)
compiler: gcc-6 (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All warnings (new ones prefixed by >>):

   drivers/vfio/mdev/vfio_mpci.c: In function 'mdev_dev_mmio_fault':
   drivers/vfio/mdev/vfio_mpci.c:384:17: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
     u64 virtaddr = (u64)vmf->virtual_address;
                    ^
>> drivers/vfio/mdev/vfio_mpci.c:424:32: warning: right shift count >= width of type [-Wshift-count-overflow]
      index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT);
                                   ^~

vim +424 drivers/vfio/mdev/vfio_mpci.c

   378	static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
   379	{
   380		int ret;
   381		struct vfio_mdev *vmdev = vma->vm_private_data;
   382		struct mdev_device *mdev;
   383		struct parent_device *parent;
 > 384		u64 virtaddr = (u64)vmf->virtual_address;
   385		unsigned long req_size, pgoff = 0;
   386		pgprot_t pg_prot;
   387		unsigned int index;
   388	
   389		if (!vmdev && !vmdev->mdev)
   390			return -EINVAL;
   391	
   392		mdev = vmdev->mdev;
   393		parent  = mdev->parent;
   394	
   395		pg_prot  = vma->vm_page_prot;
   396	
   397		if (parent->ops->validate_map_request) {
   398			u64 offset;
   399			loff_t pos;
   400	
   401			offset   = virtaddr - vma->vm_start;
   402			req_size = vma->vm_end - virtaddr;
   403			pos = (vma->vm_pgoff << PAGE_SHIFT) + offset;
   404	
   405			ret = parent->ops->validate_map_request(mdev, pos, &virtaddr,
   406							&pgoff, &req_size, &pg_prot);
   407			if (ret)
   408				return ret;
   409	
   410			/*
   411			 * Verify pgoff and req_size are valid and virtaddr is within
   412			 * vma range
   413			 */
   414			if (!pgoff || !req_size || (virtaddr < vma->vm_start) ||
   415			    ((virtaddr + req_size) >= vma->vm_end))
   416				return -EINVAL;
   417		} else {
   418			struct pci_dev *pdev;
   419	
   420			virtaddr = vma->vm_start;
   421			req_size = vma->vm_end - vma->vm_start;
   422	
   423			pdev = to_pci_dev(parent->dev);
 > 424			index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT);
   425			pgoff = pci_resource_start(pdev, index) >> PAGE_SHIFT;
   426		}
   427	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Alex Williamson Aug. 9, 2016, 7 p.m. UTC | #3
On Thu, 4 Aug 2016 00:33:52 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> MPCI VFIO driver registers with MDEV core driver. MDEV core driver creates
> mediated device and calls probe routine of MPCI VFIO driver. This driver
> adds mediated device to VFIO core module.
> Main aim of this module is to manage all VFIO APIs for each mediated PCI
> device. Those are:
> - get region information from vendor driver.
> - trap and emulate PCI config space and BAR region.
> - Send interrupt configuration information to vendor driver.
> - Device reset
> - mmap mappable region with invalidate mapping and fault on access to
>   remap pfns. If validate_map_request() is not provided by vendor driver,
>   fault handler maps physical devices region.
> - Add and delete mappable region's physical mappings to mdev's mapping
>   tracking logic.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I583f4734752971d3d112324d69e2508c88f359ec
> ---
>  drivers/vfio/mdev/Kconfig           |   6 +
>  drivers/vfio/mdev/Makefile          |   1 +
>  drivers/vfio/mdev/vfio_mpci.c       | 536 ++++++++++++++++++++++++++++++++++++
>  drivers/vfio/pci/vfio_pci_private.h |   6 -
>  drivers/vfio/pci/vfio_pci_rdwr.c    |   1 +
>  include/linux/vfio.h                |   7 +
>  6 files changed, 551 insertions(+), 6 deletions(-)
>  create mode 100644 drivers/vfio/mdev/vfio_mpci.c
> 
> diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
> index a34fbc66f92f..431ed595c8da 100644
> --- a/drivers/vfio/mdev/Kconfig
> +++ b/drivers/vfio/mdev/Kconfig
> @@ -9,4 +9,10 @@ config VFIO_MDEV
>  
>          If you don't know what do here, say N.
>  
> +config VFIO_MPCI
> +    tristate "VFIO support for Mediated PCI devices"
> +    depends on VFIO && PCI && VFIO_MDEV
> +    default n
> +    help
> +        VFIO based driver for mediated PCI devices.
>  
> diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
> index 56a75e689582..264fb03dd0e3 100644
> --- a/drivers/vfio/mdev/Makefile
> +++ b/drivers/vfio/mdev/Makefile
> @@ -2,4 +2,5 @@
>  mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o
>  
>  obj-$(CONFIG_VFIO_MDEV) += mdev.o
> +obj-$(CONFIG_VFIO_MPCI) += vfio_mpci.o
>  
> diff --git a/drivers/vfio/mdev/vfio_mpci.c b/drivers/vfio/mdev/vfio_mpci.c
> new file mode 100644
> index 000000000000..9da94b76ae3e
> --- /dev/null
> +++ b/drivers/vfio/mdev/vfio_mpci.c
> @@ -0,0 +1,536 @@
> +/*
> + * VFIO based Mediated PCI device driver
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/slab.h>
> +#include <linux/uuid.h>
> +#include <linux/vfio.h>
> +#include <linux/iommu.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "NVIDIA Corporation"
> +#define DRIVER_DESC     "VFIO based Mediated PCI device driver"
> +
> +struct vfio_mdev {
> +	struct iommu_group *group;
> +	struct mdev_device *mdev;
> +	int		    refcnt;
> +	struct vfio_region_info vfio_region_info[VFIO_PCI_NUM_REGIONS];
> +	struct mutex	    vfio_mdev_lock;
> +};
> +
> +static int vfio_mpci_open(void *device_data)
> +{
> +	int ret = 0;
> +	struct vfio_mdev *vmdev = device_data;
> +	struct parent_device *parent = vmdev->mdev->parent;
> +
> +	if (!try_module_get(THIS_MODULE))
> +		return -ENODEV;
> +
> +	mutex_lock(&vmdev->vfio_mdev_lock);
> +	if (!vmdev->refcnt && parent->ops->get_region_info) {
> +		int index;
> +
> +		for (index = VFIO_PCI_BAR0_REGION_INDEX;
> +		     index < VFIO_PCI_NUM_REGIONS; index++) {
> +			ret = parent->ops->get_region_info(vmdev->mdev, index,
> +					      &vmdev->vfio_region_info[index]);
> +			if (ret)
> +				goto open_error;
> +		}
> +	}
> +
> +	vmdev->refcnt++;
> +
> +open_error:
> +	mutex_unlock(&vmdev->vfio_mdev_lock);
> +	if (ret)
> +		module_put(THIS_MODULE);
> +
> +	return ret;
> +}
> +
> +static void vfio_mpci_close(void *device_data)
> +{
> +	struct vfio_mdev *vmdev = device_data;
> +
> +	mutex_lock(&vmdev->vfio_mdev_lock);
> +	vmdev->refcnt--;
> +	if (!vmdev->refcnt) {
> +		memset(&vmdev->vfio_region_info, 0,
> +			sizeof(vmdev->vfio_region_info));
> +	}
> +	mutex_unlock(&vmdev->vfio_mdev_lock);
> +	module_put(THIS_MODULE);
> +}
> +
> +static u8 mpci_find_pci_capability(struct mdev_device *mdev, u8 capability)
> +{
> +	loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX);

This creates a fixed ABI between vfio-mdev-pci and vendor drivers that
a given region starts at a pre-defined offset.  We have the offset
stored in vfio_mdev.region_info[VFIO_PCI_CONFIG_REGION_INDEX].offset,
use it.  It's just as unacceptable to impose this fixed relationship
with a vendor driver here as if a userspace driver were to do the same.

> +	struct parent_device *parent = mdev->parent;
> +	u16 status;
> +	u8  cap_ptr, cap_id = 0xff;
> +
> +	parent->ops->read(mdev, (char *)&status, sizeof(status),
> +			  pos + PCI_STATUS);
> +	if (!(status & PCI_STATUS_CAP_LIST))
> +		return 0;
> +
> +	parent->ops->read(mdev, &cap_ptr, sizeof(cap_ptr),
> +			  pos + PCI_CAPABILITY_LIST);
> +
> +	do {
> +		cap_ptr &= 0xfc;
> +		parent->ops->read(mdev, &cap_id, sizeof(cap_id),
> +				  pos + cap_ptr + PCI_CAP_LIST_ID);
> +		if (cap_id == capability)
> +			return cap_ptr;
> +		parent->ops->read(mdev, &cap_ptr, sizeof(cap_ptr),
> +				  pos + cap_ptr + PCI_CAP_LIST_NEXT);
> +	} while (cap_ptr && cap_id != 0xff);
> +
> +	return 0;
> +}
> +
> +static int mpci_get_irq_count(struct vfio_mdev *vmdev, int irq_type)
> +{
> +	loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX);
> +	struct mdev_device *mdev = vmdev->mdev;
> +	struct parent_device *parent = mdev->parent;
> +
> +	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
> +		u8 pin;
> +
> +		parent->ops->read(mdev, &pin, sizeof(pin),
> +				  pos + PCI_INTERRUPT_PIN);
> +		if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && pin)
> +			return 1;
> +
> +	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
> +		u8 cap_ptr;
> +		u16 flags;
> +
> +		cap_ptr = mpci_find_pci_capability(mdev, PCI_CAP_ID_MSI);
> +		if (cap_ptr) {
> +			parent->ops->read(mdev, (char *)&flags, sizeof(flags),
> +					pos + cap_ptr + PCI_MSI_FLAGS);
> +			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
> +		}
> +	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
> +		u8 cap_ptr;
> +		u16 flags;
> +
> +		cap_ptr = mpci_find_pci_capability(mdev, PCI_CAP_ID_MSIX);
> +		if (cap_ptr) {
> +			parent->ops->read(mdev, (char *)&flags, sizeof(flags),
> +					pos + cap_ptr + PCI_MSIX_FLAGS);
> +
> +			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
> +		}
> +	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
> +		u8 cap_ptr;
> +
> +		cap_ptr = mpci_find_pci_capability(mdev, PCI_CAP_ID_EXP);
> +		if (cap_ptr)
> +			return 1;
> +	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
> +		return 1;
> +	}

Much better than previous versions, but use the region_info provided by
the vendor driver.  Maybe you want helpers such as
mpci_config_{read,write}{b,w,l}.

> +
> +	return 0;
> +}
> +
> +static long vfio_mpci_unlocked_ioctl(void *device_data,
> +				     unsigned int cmd, unsigned long arg)
> +{
> +	int ret = 0;
> +	struct vfio_mdev *vmdev = device_data;
> +	unsigned long minsz;
> +
> +	switch (cmd) {
> +	case VFIO_DEVICE_GET_INFO:
> +	{
> +		struct vfio_device_info info;
> +		struct parent_device *parent = vmdev->mdev->parent;
> +
> +		minsz = offsetofend(struct vfio_device_info, num_irqs);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		info.flags = VFIO_DEVICE_FLAGS_PCI;
> +
> +		if (parent->ops->reset)
> +			info.flags |= VFIO_DEVICE_FLAGS_RESET;
> +
> +		info.num_regions = VFIO_PCI_NUM_REGIONS;
> +		info.num_irqs = VFIO_PCI_NUM_IRQS;
> +
> +		return copy_to_user((void __user *)arg, &info, minsz);
> +	}
> +	case VFIO_DEVICE_GET_REGION_INFO:
> +	{
> +		struct vfio_region_info info;
> +
> +		minsz = offsetofend(struct vfio_region_info, offset);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		switch (info.index) {
> +		case VFIO_PCI_CONFIG_REGION_INDEX:
> +		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> +			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);

No, vmdev->vfio_region_info[info.index].offset

> +			info.size = vmdev->vfio_region_info[info.index].size;
> +			if (!info.size) {
> +				info.flags = 0;
> +				break;
> +			}
> +
> +			info.flags = vmdev->vfio_region_info[info.index].flags;
> +			break;
> +		case VFIO_PCI_VGA_REGION_INDEX:
> +		case VFIO_PCI_ROM_REGION_INDEX:

Why?  Let the vendor driver decide.

> +		default:
> +			return -EINVAL;
> +		}
> +
> +		return copy_to_user((void __user *)arg, &info, minsz);
> +	}
> +	case VFIO_DEVICE_GET_IRQ_INFO:
> +	{
> +		struct vfio_irq_info info;
> +
> +		minsz = offsetofend(struct vfio_irq_info, count);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
> +			return -EINVAL;
> +
> +		switch (info.index) {
> +		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX:
> +		case VFIO_PCI_REQ_IRQ_INDEX:
> +			break;
> +			/* pass thru to return error */
> +		case VFIO_PCI_MSIX_IRQ_INDEX:

???

> +		default:
> +			return -EINVAL;
> +		}
> +
> +		info.flags = VFIO_IRQ_INFO_EVENTFD;
> +		info.count = mpci_get_irq_count(vmdev, info.index);
> +
> +		if (info.count == -1)
> +			return -EINVAL;
> +
> +		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
> +			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
> +					VFIO_IRQ_INFO_AUTOMASKED);
> +		else
> +			info.flags |= VFIO_IRQ_INFO_NORESIZE;
> +
> +		return copy_to_user((void __user *)arg, &info, minsz);
> +	}
> +	case VFIO_DEVICE_SET_IRQS:
> +	{
> +		struct vfio_irq_set hdr;
> +		struct mdev_device *mdev = vmdev->mdev;
> +		struct parent_device *parent = mdev->parent;
> +		u8 *data = NULL, *ptr = NULL;
> +
> +		minsz = offsetofend(struct vfio_irq_set, count);
> +
> +		if (copy_from_user(&hdr, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
> +		    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
> +				  VFIO_IRQ_SET_ACTION_TYPE_MASK))
> +			return -EINVAL;
> +
> +		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
> +			size_t size;
> +			int max = mpci_get_irq_count(vmdev, hdr.index);
> +
> +			if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
> +				size = sizeof(uint8_t);
> +			else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
> +				size = sizeof(int32_t);
> +			else
> +				return -EINVAL;
> +
> +			if (hdr.argsz - minsz < hdr.count * size ||
> +			    hdr.start >= max || hdr.start + hdr.count > max)
> +				return -EINVAL;
> +
> +			ptr = data = memdup_user((void __user *)(arg + minsz),
> +						 hdr.count * size);
> +			if (IS_ERR(data))
> +				return PTR_ERR(data);
> +		}
> +
> +		if (parent->ops->set_irqs)
> +			ret = parent->ops->set_irqs(mdev, hdr.flags, hdr.index,
> +						    hdr.start, hdr.count, data);
> +
> +		kfree(ptr);
> +		return ret;

Return success if no set_irqs callback?

> +	}
> +	case VFIO_DEVICE_RESET:
> +	{
> +		struct parent_device *parent = vmdev->mdev->parent;
> +
> +		if (parent->ops->reset)
> +			return parent->ops->reset(vmdev->mdev);
> +
> +		return -EINVAL;
> +	}
> +	}
> +	return -ENOTTY;
> +}
> +
> +static ssize_t vfio_mpci_read(void *device_data, char __user *buf,
> +			      size_t count, loff_t *ppos)
> +{
> +	struct vfio_mdev *vmdev = device_data;
> +	struct mdev_device *mdev = vmdev->mdev;
> +	struct parent_device *parent = mdev->parent;
> +	int ret = 0;
> +
> +	if (!count)
> +		return 0;
> +
> +	if (parent->ops->read) {
> +		char *ret_data, *ptr;
> +
> +		ptr = ret_data = kzalloc(count, GFP_KERNEL);

Do we really need to support arbitrary lengths in one shot?  Seems like
we could just use a 4 or 8 byte variable on the stack and iterate until
done.

> +
> +		if (!ret_data)
> +			return  -ENOMEM;
> +
> +		ret = parent->ops->read(mdev, ret_data, count, *ppos);
> +
> +		if (ret > 0) {
> +			if (copy_to_user(buf, ret_data, ret))
> +				ret = -EFAULT;
> +			else
> +				*ppos += ret;
> +		}
> +		kfree(ptr);
> +	}
> +
> +	return ret;
> +}
> +
> +static ssize_t vfio_mpci_write(void *device_data, const char __user *buf,
> +			       size_t count, loff_t *ppos)
> +{
> +	struct vfio_mdev *vmdev = device_data;
> +	struct mdev_device *mdev = vmdev->mdev;
> +	struct parent_device *parent = mdev->parent;
> +	int ret = 0;
> +
> +	if (!count)
> +		return 0;
> +
> +	if (parent->ops->write) {
> +		char *usr_data, *ptr;
> +
> +		ptr = usr_data = memdup_user(buf, count);

Same here, how much do we care to let the user write in one pass and is
there any advantage to it?  When QEMU is our userspace we're only
likely to see 4-byte accesses anyway.

> +		if (IS_ERR(usr_data))
> +			return PTR_ERR(usr_data);
> +
> +		ret = parent->ops->write(mdev, usr_data, count, *ppos);
> +
> +		if (ret > 0)
> +			*ppos += ret;
> +
> +		kfree(ptr);
> +	}
> +
> +	return ret;
> +}
> +
> +static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	int ret;
> +	struct vfio_mdev *vmdev = vma->vm_private_data;
> +	struct mdev_device *mdev;
> +	struct parent_device *parent;
> +	u64 virtaddr = (u64)vmf->virtual_address;
> +	unsigned long req_size, pgoff = 0;
> +	pgprot_t pg_prot;
> +	unsigned int index;
> +
> +	if (!vmdev && !vmdev->mdev)
> +		return -EINVAL;
> +
> +	mdev = vmdev->mdev;
> +	parent  = mdev->parent;
> +
> +	pg_prot  = vma->vm_page_prot;
> +
> +	if (parent->ops->validate_map_request) {
> +		u64 offset;
> +		loff_t pos;
> +
> +		offset   = virtaddr - vma->vm_start;
> +		req_size = vma->vm_end - virtaddr;
> +		pos = (vma->vm_pgoff << PAGE_SHIFT) + offset;
> +
> +		ret = parent->ops->validate_map_request(mdev, pos, &virtaddr,
> +						&pgoff, &req_size, &pg_prot);
> +		if (ret)
> +			return ret;
> +
> +		/*
> +		 * Verify pgoff and req_size are valid and virtaddr is within
> +		 * vma range
> +		 */
> +		if (!pgoff || !req_size || (virtaddr < vma->vm_start) ||
> +		    ((virtaddr + req_size) >= vma->vm_end))
> +			return -EINVAL;
> +	} else {
> +		struct pci_dev *pdev;
> +
> +		virtaddr = vma->vm_start;
> +		req_size = vma->vm_end - vma->vm_start;
> +
> +		pdev = to_pci_dev(parent->dev);
> +		index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT);

Iterate through region_info[*].offset/size provided by vendor driver.

> +		pgoff = pci_resource_start(pdev, index) >> PAGE_SHIFT;
> +	}
> +
> +	ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
> +
> +	return ret | VM_FAULT_NOPAGE;
> +}
> +
> +void mdev_dev_mmio_close(struct vm_area_struct *vma)
> +{
> +	struct vfio_mdev *vmdev = vma->vm_private_data;
> +	struct mdev_device *mdev = vmdev->mdev;
> +
> +	mdev_del_phys_mapping(mdev, vma->vm_pgoff << PAGE_SHIFT);
> +}
> +
> +static const struct vm_operations_struct mdev_dev_mmio_ops = {
> +	.fault = mdev_dev_mmio_fault,
> +	.close = mdev_dev_mmio_close,
> +};
> +
> +static int vfio_mpci_mmap(void *device_data, struct vm_area_struct *vma)
> +{
> +	unsigned int index;
> +	struct vfio_mdev *vmdev = device_data;
> +	struct mdev_device *mdev = vmdev->mdev;
> +
> +	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
> +
> +	if (index >= VFIO_PCI_ROM_REGION_INDEX)
> +		return -EINVAL;
> +
> +	vma->vm_private_data = vmdev;
> +	vma->vm_ops = &mdev_dev_mmio_ops;
> +
> +	return mdev_add_phys_mapping(mdev, vma->vm_file->f_mapping,
> +				     vma->vm_pgoff << PAGE_SHIFT,
> +				     vma->vm_end - vma->vm_start);
> +}
> +
> +static const struct vfio_device_ops vfio_mpci_dev_ops = {
> +	.name		= "vfio-mpci",
> +	.open		= vfio_mpci_open,
> +	.release	= vfio_mpci_close,
> +	.ioctl		= vfio_mpci_unlocked_ioctl,
> +	.read		= vfio_mpci_read,
> +	.write		= vfio_mpci_write,
> +	.mmap		= vfio_mpci_mmap,
> +};
> +
> +int vfio_mpci_probe(struct device *dev)
> +{
> +	struct vfio_mdev *vmdev;
> +	struct mdev_device *mdev = to_mdev_device(dev);
> +	int ret;
> +
> +	vmdev = kzalloc(sizeof(*vmdev), GFP_KERNEL);
> +	if (IS_ERR(vmdev))
> +		return PTR_ERR(vmdev);
> +
> +	vmdev->mdev = mdev_get_device(mdev);
> +	vmdev->group = mdev->group;
> +	mutex_init(&vmdev->vfio_mdev_lock);
> +
> +	ret = vfio_add_group_dev(dev, &vfio_mpci_dev_ops, vmdev);
> +	if (ret)
> +		kfree(vmdev);
> +
> +	mdev_put_device(mdev);
> +	return ret;
> +}
> +
> +void vfio_mpci_remove(struct device *dev)
> +{
> +	struct vfio_mdev *vmdev;
> +
> +	vmdev = vfio_del_group_dev(dev);
> +	kfree(vmdev);
> +}
> +
> +int vfio_mpci_match(struct device *dev)
> +{
> +	if (dev_is_pci(dev->parent))

This is the wrong test, there's really no requirement that a pci mdev
device is hosted by a real pci device.  Can't we check that the device
is on an mdev_pci_bus_type?

> +		return 1;
> +
> +	return 0;
> +}
> +
> +struct mdev_driver vfio_mpci_driver = {
> +	.name	= "vfio_mpci",
> +	.probe	= vfio_mpci_probe,
> +	.remove	= vfio_mpci_remove,
> +	.match	= vfio_mpci_match,
> +};
> +
> +static int __init vfio_mpci_init(void)
> +{
> +	return mdev_register_driver(&vfio_mpci_driver, THIS_MODULE);
> +}
> +
> +static void __exit vfio_mpci_exit(void)
> +{
> +	mdev_unregister_driver(&vfio_mpci_driver);
> +}
> +
> +module_init(vfio_mpci_init)
> +module_exit(vfio_mpci_exit)
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> index 8a7d546d18a0..04a450908ffb 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -19,12 +19,6 @@
>  #ifndef VFIO_PCI_PRIVATE_H
>  #define VFIO_PCI_PRIVATE_H
>  
> -#define VFIO_PCI_OFFSET_SHIFT   40
> -
> -#define VFIO_PCI_OFFSET_TO_INDEX(off)	(off >> VFIO_PCI_OFFSET_SHIFT)
> -#define VFIO_PCI_INDEX_TO_OFFSET(index)	((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
> -#define VFIO_PCI_OFFSET_MASK	(((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
> -
>  /* Special capability IDs predefined access */
>  #define PCI_CAP_ID_INVALID		0xFF	/* default raw access */
>  #define PCI_CAP_ID_INVALID_VIRT		0xFE	/* default virt access */
> diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
> index 5ffd1d9ad4bd..5b912be9d9c3 100644
> --- a/drivers/vfio/pci/vfio_pci_rdwr.c
> +++ b/drivers/vfio/pci/vfio_pci_rdwr.c
> @@ -18,6 +18,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/io.h>
>  #include <linux/vgaarb.h>
> +#include <linux/vfio.h>
>  
>  #include "vfio_pci_private.h"
>  
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0ecae0b1cd34..431b824b0d3e 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -18,6 +18,13 @@
>  #include <linux/poll.h>
>  #include <uapi/linux/vfio.h>
>  
> +#define VFIO_PCI_OFFSET_SHIFT   40
> +
> +#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
> +#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
> +
> +

Nak this, I'm not interested in making this any sort of ABI.

>  /**
>   * struct vfio_device_ops - VFIO bus driver device callbacks
>   *

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kirti Wankhede Aug. 10, 2016, 9:23 p.m. UTC | #4
On 8/10/2016 12:30 AM, Alex Williamson wrote:
> On Thu, 4 Aug 2016 00:33:52 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 

...

>> +
>> +		switch (info.index) {
>> +		case VFIO_PCI_CONFIG_REGION_INDEX:
>> +		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
>> +			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
> 
> No, vmdev->vfio_region_info[info.index].offset
>

Ok.

>> +			info.size = vmdev->vfio_region_info[info.index].size;
>> +			if (!info.size) {
>> +				info.flags = 0;
>> +				break;
>> +			}
>> +
>> +			info.flags = vmdev->vfio_region_info[info.index].flags;
>> +			break;
>> +		case VFIO_PCI_VGA_REGION_INDEX:
>> +		case VFIO_PCI_ROM_REGION_INDEX:
> 
> Why?  Let the vendor driver decide.
> 

Ok.

>> +		switch (info.index) {
>> +		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX:
>> +		case VFIO_PCI_REQ_IRQ_INDEX:
>> +			break;
>> +			/* pass thru to return error */
>> +		case VFIO_PCI_MSIX_IRQ_INDEX:
> 
> ???

Sorry, I missed to update this. Updating it.

>> +	case VFIO_DEVICE_SET_IRQS:
>> +	{
...
>> +
>> +		if (parent->ops->set_irqs)
>> +			ret = parent->ops->set_irqs(mdev, hdr.flags, hdr.index,
>> +						    hdr.start, hdr.count, data);
>> +
>> +		kfree(ptr);
>> +		return ret;
> 
> Return success if no set_irqs callback?
>

Ideally, vendor driver should provide this function. If vendor driver
doesn't provide it, do we really need to fail here?


>> +static ssize_t vfio_mpci_read(void *device_data, char __user *buf,
>> +			      size_t count, loff_t *ppos)
>> +{
>> +	struct vfio_mdev *vmdev = device_data;
>> +	struct mdev_device *mdev = vmdev->mdev;
>> +	struct parent_device *parent = mdev->parent;
>> +	int ret = 0;
>> +
>> +	if (!count)
>> +		return 0;
>> +
>> +	if (parent->ops->read) {
>> +		char *ret_data, *ptr;
>> +
>> +		ptr = ret_data = kzalloc(count, GFP_KERNEL);
> 
> Do we really need to support arbitrary lengths in one shot?  Seems like
> we could just use a 4 or 8 byte variable on the stack and iterate until
> done.
> 

We just want to pass the arguments to vendor driver as is here. Vendor
driver could take care of that.

>> +
>> +static ssize_t vfio_mpci_write(void *device_data, const char __user *buf,
>> +			       size_t count, loff_t *ppos)
>> +{
>> +	struct vfio_mdev *vmdev = device_data;
>> +	struct mdev_device *mdev = vmdev->mdev;
>> +	struct parent_device *parent = mdev->parent;
>> +	int ret = 0;
>> +
>> +	if (!count)
>> +		return 0;
>> +
>> +	if (parent->ops->write) {
>> +		char *usr_data, *ptr;
>> +
>> +		ptr = usr_data = memdup_user(buf, count);
> 
> Same here, how much do we care to let the user write in one pass and is
> there any advantage to it?  When QEMU is our userspace we're only
> likely to see 4-byte accesses anyway.

Same as above.

>> +
>> +static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>> +{
...
>> +	} else {
>> +		struct pci_dev *pdev;
>> +
>> +		virtaddr = vma->vm_start;
>> +		req_size = vma->vm_end - vma->vm_start;
>> +
>> +		pdev = to_pci_dev(parent->dev);
>> +		index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT);
> 
> Iterate through region_info[*].offset/size provided by vendor driver.
> 

Yes, makes sense.

>> +
>> +int vfio_mpci_match(struct device *dev)
>> +{
>> +	if (dev_is_pci(dev->parent))
> 
> This is the wrong test, there's really no requirement that a pci mdev
> device is hosted by a real pci device.  

Ideally this module is for the mediated device whose parent is PCI
device. And we are relying on kernel functions like
pci_resource_start(), to_pci_dev() in this module, so better to check it
while loading.


> Can't we check that the device
> is on an mdev_pci_bus_type?
> 

I didn't get this part.

Each mediated device is of mdev_bus_type. But VFIO module could be
different based on parent device type and loaded at the same time. For
example, there should be different modules for channel IO or any other
type of devices and could be loaded at the same time. Then when mdev
device is created based on check in match() function of each module, and
proper driver would be linked for that mdev device.

If this check is not based on parent device type, do you expect to set
parent device type by vendor driver and accordingly load corresponding
VFIO driver?


>> @@ -18,6 +18,7 @@
>>  #include <linux/uaccess.h>
>>  #include <linux/io.h>
>>  #include <linux/vgaarb.h>
>> +#include <linux/vfio.h>
>>  
>>  #include "vfio_pci_private.h"
>>  
>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>> index 0ecae0b1cd34..431b824b0d3e 100644
>> --- a/include/linux/vfio.h
>> +++ b/include/linux/vfio.h
>> @@ -18,6 +18,13 @@
>>  #include <linux/poll.h>
>>  #include <uapi/linux/vfio.h>
>>  
>> +#define VFIO_PCI_OFFSET_SHIFT   40
>> +
>> +#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
>> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
>> +#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
>> +
>> +
> 
> Nak this, I'm not interested in making this any sort of ABI.
> 

These macros are used by drivers/vfio/pci/vfio_pci.c and
drivers/vfio/mdev/vfio_mpci.c and to use those in both these modules,
they should be moved to common place as you suggested in earlier
reviews. I think this is better common place. Are there any other
suggestion?

>> +static u8 mpci_find_pci_capability(struct mdev_device *mdev, u8 capability)
>> +{
>> +	loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX);
> 
> This creates a fixed ABI between vfio-mdev-pci and vendor drivers that
> a given region starts at a pre-defined offset.  We have the offset
> stored in vfio_mdev.region_info[VFIO_PCI_CONFIG_REGION_INDEX].offset,
> use it.  It's just as unacceptable to impose this fixed relationship
> with a vendor driver here as if a userspace driver were to do the same.
> 

In the v5 version, where config space was cached in this module,
suggestion was to don't care about data or caching it at read/write,
just pass it through. Now since VFIO_PCI_* macros are also available
here, vendor driver can use it to decode pos to find region index and
offset of access. Then vendor driver itself add
vmdev->vfio_region_info[info.index].offset, which is known to him.
Either we do this in VFIO module or vendor driver?

Thanks,
Kirti.







--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Aug. 10, 2016, 11 p.m. UTC | #5
On Thu, 11 Aug 2016 02:53:10 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 8/10/2016 12:30 AM, Alex Williamson wrote:
> > On Thu, 4 Aug 2016 00:33:52 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> 
> ...
> 
> >> +
> >> +		switch (info.index) {
> >> +		case VFIO_PCI_CONFIG_REGION_INDEX:
> >> +		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> >> +			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);  
> > 
> > No, vmdev->vfio_region_info[info.index].offset
> >  
> 
> Ok.
> 
> >> +			info.size = vmdev->vfio_region_info[info.index].size;
> >> +			if (!info.size) {
> >> +				info.flags = 0;
> >> +				break;
> >> +			}
> >> +
> >> +			info.flags = vmdev->vfio_region_info[info.index].flags;
> >> +			break;
> >> +		case VFIO_PCI_VGA_REGION_INDEX:
> >> +		case VFIO_PCI_ROM_REGION_INDEX:  
> > 
> > Why?  Let the vendor driver decide.
> >   
> 
> Ok.
> 
> >> +		switch (info.index) {
> >> +		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX:
> >> +		case VFIO_PCI_REQ_IRQ_INDEX:
> >> +			break;
> >> +			/* pass thru to return error */
> >> +		case VFIO_PCI_MSIX_IRQ_INDEX:  
> > 
> > ???  
> 
> Sorry, I missed to update this. Updating it.
> 
> >> +	case VFIO_DEVICE_SET_IRQS:
> >> +	{  
> ...
> >> +
> >> +		if (parent->ops->set_irqs)
> >> +			ret = parent->ops->set_irqs(mdev, hdr.flags, hdr.index,
> >> +						    hdr.start, hdr.count, data);
> >> +
> >> +		kfree(ptr);
> >> +		return ret;  
> > 
> > Return success if no set_irqs callback?
> >  
> 
> Ideally, vendor driver should provide this function. If vendor driver
> doesn't provide it, do we really need to fail here?

Wouldn't you as a user expect to get an error if you try to call an
ioctl that has no backing rather than assume success and never receive
and interrupt?
 
> >> +static ssize_t vfio_mpci_read(void *device_data, char __user *buf,
> >> +			      size_t count, loff_t *ppos)
> >> +{
> >> +	struct vfio_mdev *vmdev = device_data;
> >> +	struct mdev_device *mdev = vmdev->mdev;
> >> +	struct parent_device *parent = mdev->parent;
> >> +	int ret = 0;
> >> +
> >> +	if (!count)
> >> +		return 0;
> >> +
> >> +	if (parent->ops->read) {
> >> +		char *ret_data, *ptr;
> >> +
> >> +		ptr = ret_data = kzalloc(count, GFP_KERNEL);  
> > 
> > Do we really need to support arbitrary lengths in one shot?  Seems like
> > we could just use a 4 or 8 byte variable on the stack and iterate until
> > done.
> >   
> 
> We just want to pass the arguments to vendor driver as is here. Vendor
> driver could take care of that.

But I think this is exploitable, it lets the user make the kernel
allocate an arbitrarily sized buffer.
 
> >> +
> >> +static ssize_t vfio_mpci_write(void *device_data, const char __user *buf,
> >> +			       size_t count, loff_t *ppos)
> >> +{
> >> +	struct vfio_mdev *vmdev = device_data;
> >> +	struct mdev_device *mdev = vmdev->mdev;
> >> +	struct parent_device *parent = mdev->parent;
> >> +	int ret = 0;
> >> +
> >> +	if (!count)
> >> +		return 0;
> >> +
> >> +	if (parent->ops->write) {
> >> +		char *usr_data, *ptr;
> >> +
> >> +		ptr = usr_data = memdup_user(buf, count);  
> > 
> > Same here, how much do we care to let the user write in one pass and is
> > there any advantage to it?  When QEMU is our userspace we're only
> > likely to see 4-byte accesses anyway.  
> 
> Same as above.
> 
> >> +
> >> +static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> >> +{  
> ...
> >> +	} else {
> >> +		struct pci_dev *pdev;
> >> +
> >> +		virtaddr = vma->vm_start;
> >> +		req_size = vma->vm_end - vma->vm_start;
> >> +
> >> +		pdev = to_pci_dev(parent->dev);
> >> +		index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT);  
> > 
> > Iterate through region_info[*].offset/size provided by vendor driver.
> >   
> 
> Yes, makes sense.
> 
> >> +
> >> +int vfio_mpci_match(struct device *dev)
> >> +{
> >> +	if (dev_is_pci(dev->parent))  
> > 
> > This is the wrong test, there's really no requirement that a pci mdev
> > device is hosted by a real pci device.    
> 
> Ideally this module is for the mediated device whose parent is PCI
> device. And we are relying on kernel functions like
> pci_resource_start(), to_pci_dev() in this module, so better to check it
> while loading.

IMO, we don't want to care what the parent device is, it's not ideal,
it's actually a limitation to impose that it is a PCI device.  I want to
be able to make purely virtual mediated devices.  I only see that you
use these functions in the mmio fault handling.  Is it useful to assume
that on mmio fault we map to the parent device PCI BAR regions?  Just
require that the vendor driver provides a fault mapping function or
SIGBUS if we get a fault and it doesn't.

> > Can't we check that the device
> > is on an mdev_pci_bus_type?
> >   
> 
> I didn't get this part.
> 
> Each mediated device is of mdev_bus_type. But VFIO module could be
> different based on parent device type and loaded at the same time. For
> example, there should be different modules for channel IO or any other
> type of devices and could be loaded at the same time. Then when mdev
> device is created based on check in match() function of each module, and
> proper driver would be linked for that mdev device.
> 
> If this check is not based on parent device type, do you expect to set
> parent device type by vendor driver and accordingly load corresponding
> VFIO driver?

mdev_pci_bus_type was an off the cuff response since the driver.bus
controls which devices a probe function will see.  If we have a unique
bus for a driver and create devices appropriately, we really don't
even need a match function.  That would still work, but what if you
made a get_device_info callback to the vendor driver rather than
creating that info in the mediated bus driver layer.  Then the probe
function here could simply check the flags to see if the device is
VFIO_DEVICE_FLAGS_PCI?

> >> @@ -18,6 +18,7 @@
> >>  #include <linux/uaccess.h>
> >>  #include <linux/io.h>
> >>  #include <linux/vgaarb.h>
> >> +#include <linux/vfio.h>
> >>  
> >>  #include "vfio_pci_private.h"
> >>  
> >> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> >> index 0ecae0b1cd34..431b824b0d3e 100644
> >> --- a/include/linux/vfio.h
> >> +++ b/include/linux/vfio.h
> >> @@ -18,6 +18,13 @@
> >>  #include <linux/poll.h>
> >>  #include <uapi/linux/vfio.h>
> >>  
> >> +#define VFIO_PCI_OFFSET_SHIFT   40
> >> +
> >> +#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
> >> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
> >> +#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
> >> +
> >> +  
> > 
> > Nak this, I'm not interested in making this any sort of ABI.
> >   
> 
> These macros are used by drivers/vfio/pci/vfio_pci.c and
> drivers/vfio/mdev/vfio_mpci.c and to use those in both these modules,
> they should be moved to common place as you suggested in earlier
> reviews. I think this is better common place. Are there any other
> suggestion?

They're only used in ways that I objected to above and you've agreed
to.  These define implementation details that must not become part of
the mediated vendor driver ABI.  A vendor driver is free to redefine
this the same if they want, but as we can see with how easily they slip
into code where they don't belong, the only way to make sure they don't
become ABI is to keep them in private headers.
 
> >> +static u8 mpci_find_pci_capability(struct mdev_device *mdev, u8 capability)
> >> +{
> >> +	loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX);  
> > 
> > This creates a fixed ABI between vfio-mdev-pci and vendor drivers that
> > a given region starts at a pre-defined offset.  We have the offset
> > stored in vfio_mdev.region_info[VFIO_PCI_CONFIG_REGION_INDEX].offset,
> > use it.  It's just as unacceptable to impose this fixed relationship
> > with a vendor driver here as if a userspace driver were to do the same.
> >   
> 
> In the v5 version, where config space was cached in this module,
> suggestion was to don't care about data or caching it at read/write,
> just pass it through. Now since VFIO_PCI_* macros are also available
> here, vendor driver can use it to decode pos to find region index and
> offset of access. Then vendor driver itself add
> vmdev->vfio_region_info[info.index].offset, which is known to him.
> Either we do this in VFIO module or vendor driver?

As I say above, a vendor driver is absolutely free to use the same
index/offset scheme, but it absolutely must not be part of the ABI
between vendor drivers and the mediated driver core.  It's up to the
vendor driver to define that relation and moving these to a common
header is clearly too dangerous.  I'm sorry if I've said otherwise in
the past, but I've only recently discovered a userspace driver (DPDK)
copying these defines and ignoring the index offsets reported through
the REGION_INFO API.  So I'm now bitterly aware how an internal
implementation detail can be abused and if we don't catch them, it's
going to lock us into an implementation that was designed to be
flexible.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kirti Wankhede Aug. 11, 2016, 3:59 p.m. UTC | #6
On 8/11/2016 4:30 AM, Alex Williamson wrote:
> On Thu, 11 Aug 2016 02:53:10 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 8/10/2016 12:30 AM, Alex Williamson wrote:
>>> On Thu, 4 Aug 2016 00:33:52 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>   
>>
>> ...
>>
>>>> +
>>>> +		switch (info.index) {
>>>> +		case VFIO_PCI_CONFIG_REGION_INDEX:
>>>> +		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
>>>> +			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);  
>>>
>>> No, vmdev->vfio_region_info[info.index].offset
>>>  
>>
>> Ok.
>>
>>>> +			info.size = vmdev->vfio_region_info[info.index].size;
>>>> +			if (!info.size) {
>>>> +				info.flags = 0;
>>>> +				break;
>>>> +			}
>>>> +
>>>> +			info.flags = vmdev->vfio_region_info[info.index].flags;
>>>> +			break;
>>>> +		case VFIO_PCI_VGA_REGION_INDEX:
>>>> +		case VFIO_PCI_ROM_REGION_INDEX:  
>>>
>>> Why?  Let the vendor driver decide.
>>>   
>>
>> Ok.
>>
>>>> +		switch (info.index) {
>>>> +		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX:
>>>> +		case VFIO_PCI_REQ_IRQ_INDEX:
>>>> +			break;
>>>> +			/* pass thru to return error */
>>>> +		case VFIO_PCI_MSIX_IRQ_INDEX:  
>>>
>>> ???  
>>
>> Sorry, I missed to update this. Updating it.
>>
>>>> +	case VFIO_DEVICE_SET_IRQS:
>>>> +	{  
>> ...
>>>> +
>>>> +		if (parent->ops->set_irqs)
>>>> +			ret = parent->ops->set_irqs(mdev, hdr.flags, hdr.index,
>>>> +						    hdr.start, hdr.count, data);
>>>> +
>>>> +		kfree(ptr);
>>>> +		return ret;  
>>>
>>> Return success if no set_irqs callback?
>>>  
>>
>> Ideally, vendor driver should provide this function. If vendor driver
>> doesn't provide it, do we really need to fail here?
> 
> Wouldn't you as a user expect to get an error if you try to call an
> ioctl that has no backing rather than assume success and never receive
> and interrupt?
>  

If we really don't want to proceed if set_irqs() is not provided then
its better to add it in mandatory list in mdev_register_device() in
mdev_core.c and fail earlier, i.e. fail to register the device.


>>>> +static ssize_t vfio_mpci_read(void *device_data, char __user *buf,
>>>> +			      size_t count, loff_t *ppos)
>>>> +{
>>>> +	struct vfio_mdev *vmdev = device_data;
>>>> +	struct mdev_device *mdev = vmdev->mdev;
>>>> +	struct parent_device *parent = mdev->parent;
>>>> +	int ret = 0;
>>>> +
>>>> +	if (!count)
>>>> +		return 0;
>>>> +
>>>> +	if (parent->ops->read) {
>>>> +		char *ret_data, *ptr;
>>>> +
>>>> +		ptr = ret_data = kzalloc(count, GFP_KERNEL);  
>>>
>>> Do we really need to support arbitrary lengths in one shot?  Seems like
>>> we could just use a 4 or 8 byte variable on the stack and iterate until
>>> done.
>>>   
>>
>> We just want to pass the arguments to vendor driver as is here. Vendor
>> driver could take care of that.
> 
> But I think this is exploitable, it lets the user make the kernel
> allocate an arbitrarily sized buffer.
>  
>>>> +
>>>> +static ssize_t vfio_mpci_write(void *device_data, const char __user *buf,
>>>> +			       size_t count, loff_t *ppos)
>>>> +{
>>>> +	struct vfio_mdev *vmdev = device_data;
>>>> +	struct mdev_device *mdev = vmdev->mdev;
>>>> +	struct parent_device *parent = mdev->parent;
>>>> +	int ret = 0;
>>>> +
>>>> +	if (!count)
>>>> +		return 0;
>>>> +
>>>> +	if (parent->ops->write) {
>>>> +		char *usr_data, *ptr;
>>>> +
>>>> +		ptr = usr_data = memdup_user(buf, count);  
>>>
>>> Same here, how much do we care to let the user write in one pass and is
>>> there any advantage to it?  When QEMU is our userspace we're only
>>> likely to see 4-byte accesses anyway.  
>>
>> Same as above.
>>
>>>> +
>>>> +static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>>>> +{  
>> ...
>>>> +	} else {
>>>> +		struct pci_dev *pdev;
>>>> +
>>>> +		virtaddr = vma->vm_start;
>>>> +		req_size = vma->vm_end - vma->vm_start;
>>>> +
>>>> +		pdev = to_pci_dev(parent->dev);
>>>> +		index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT);  
>>>
>>> Iterate through region_info[*].offset/size provided by vendor driver.
>>>   
>>
>> Yes, makes sense.
>>
>>>> +
>>>> +int vfio_mpci_match(struct device *dev)
>>>> +{
>>>> +	if (dev_is_pci(dev->parent))  
>>>
>>> This is the wrong test, there's really no requirement that a pci mdev
>>> device is hosted by a real pci device.    
>>
>> Ideally this module is for the mediated device whose parent is PCI
>> device. And we are relying on kernel functions like
>> pci_resource_start(), to_pci_dev() in this module, so better to check it
>> while loading.
> 
> IMO, we don't want to care what the parent device is, it's not ideal,
> it's actually a limitation to impose that it is a PCI device.  I want to
> be able to make purely virtual mediated devices.  I only see that you
> use these functions in the mmio fault handling.  Is it useful to assume
> that on mmio fault we map to the parent device PCI BAR regions?  Just
> require that the vendor driver provides a fault mapping function or
> SIGBUS if we get a fault and it doesn't.
> 
>>> Can't we check that the device
>>> is on an mdev_pci_bus_type?
>>>   
>>
>> I didn't get this part.
>>
>> Each mediated device is of mdev_bus_type. But VFIO module could be
>> different based on parent device type and loaded at the same time. For
>> example, there should be different modules for channel IO or any other
>> type of devices and could be loaded at the same time. Then when mdev
>> device is created based on check in match() function of each module, and
>> proper driver would be linked for that mdev device.
>>
>> If this check is not based on parent device type, do you expect to set
>> parent device type by vendor driver and accordingly load corresponding
>> VFIO driver?
> 
> mdev_pci_bus_type was an off the cuff response since the driver.bus
> controls which devices a probe function will see.  If we have a unique
> bus for a driver and create devices appropriately, we really don't
> even need a match function. 

I still think that all types of mdev devices should have unique bus type
so that VFIO IOMMU module could be used for any type of mediated device
without any change. Otherwise we have to add checks for all supported
bus types in vfio_iommu_type1_attach_group().

> That would still work, but what if you
> made a get_device_info callback to the vendor driver rather than
> creating that info in the mediated bus driver layer.  Then the probe
> function here could simply check the flags to see if the device is
> VFIO_DEVICE_FLAGS_PCI?
> 

Right. get_device_info() would be a mandatory callback and it would be
vendor driver's responsibility to return proper flag.


>>>> @@ -18,6 +18,7 @@
>>>>  #include <linux/uaccess.h>
>>>>  #include <linux/io.h>
>>>>  #include <linux/vgaarb.h>
>>>> +#include <linux/vfio.h>
>>>>  
>>>>  #include "vfio_pci_private.h"
>>>>  
>>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>>>> index 0ecae0b1cd34..431b824b0d3e 100644
>>>> --- a/include/linux/vfio.h
>>>> +++ b/include/linux/vfio.h
>>>> @@ -18,6 +18,13 @@
>>>>  #include <linux/poll.h>
>>>>  #include <uapi/linux/vfio.h>
>>>>  
>>>> +#define VFIO_PCI_OFFSET_SHIFT   40
>>>> +
>>>> +#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
>>>> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
>>>> +#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
>>>> +
>>>> +  
>>>
>>> Nak this, I'm not interested in making this any sort of ABI.
>>>   
>>
>> These macros are used by drivers/vfio/pci/vfio_pci.c and
>> drivers/vfio/mdev/vfio_mpci.c and to use those in both these modules,
>> they should be moved to common place as you suggested in earlier
>> reviews. I think this is better common place. Are there any other
>> suggestion?
> 
> They're only used in ways that I objected to above and you've agreed
> to.  These define implementation details that must not become part of
> the mediated vendor driver ABI.  A vendor driver is free to redefine
> this the same if they want, but as we can see with how easily they slip
> into code where they don't belong, the only way to make sure they don't
> become ABI is to keep them in private headers.
>  

Then I think, I can't use these macros in mdev modules, they are defined
in drivers/vfio/pci/vfio_pci_private.h
I have to define similar macros in drivers/vfio/mdev/mdev_private.h?

parent->ops->get_region_info() is called from vfio_mpci_open() that is
before PCI config space is setup. Main expectation from
get_region_info() was to get flags and size. At this point of time
vendor driver also don't know about the base addresses of regions.

    case VFIO_DEVICE_GET_REGION_INFO:
...

        info.offset = vmdev->vfio_region_info[info.index].offset;

In that case, as suggested in previous reply, above is not going to work.
I'll define such macros in drivers/vfio/mdev/mdev_private.h, set above
offset according to these macros. Then on first access to any BAR
region, i.e. after PCI config space is populated, call
parent->ops->get_region_info() again so that
vfio_region_info[index].offset for all regions are set by vendor driver.
Then use these offsets to calculate 'pos' for
read/write/validate_map_request(). Does this seems reasonable?

Thanks,
Kirti

>>>> +static u8 mpci_find_pci_capability(struct mdev_device *mdev, u8 capability)
>>>> +{
>>>> +	loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX);  
>>>
>>> This creates a fixed ABI between vfio-mdev-pci and vendor drivers that
>>> a given region starts at a pre-defined offset.  We have the offset
>>> stored in vfio_mdev.region_info[VFIO_PCI_CONFIG_REGION_INDEX].offset,
>>> use it.  It's just as unacceptable to impose this fixed relationship
>>> with a vendor driver here as if a userspace driver were to do the same.
>>>   
>>
>> In the v5 version, where config space was cached in this module,
>> suggestion was to don't care about data or caching it at read/write,
>> just pass it through. Now since VFIO_PCI_* macros are also available
>> here, vendor driver can use it to decode pos to find region index and
>> offset of access. Then vendor driver itself add
>> vmdev->vfio_region_info[info.index].offset, which is known to him.
>> Either we do this in VFIO module or vendor driver?
> 
> As I say above, a vendor driver is absolutely free to use the same
> index/offset scheme, but it absolutely must not be part of the ABI
> between vendor drivers and the mediated driver core.  It's up to the
> vendor driver to define that relation and moving these to a common
> header is clearly too dangerous.  I'm sorry if I've said otherwise in
> the past, but I've only recently discovered a userspace driver (DPDK)
> copying these defines and ignoring the index offsets reported through
> the REGION_INFO API.  So I'm now bitterly aware how an internal
> implementation detail can be abused and if we don't catch them, it's
> going to lock us into an implementation that was designed to be
> flexible.  Thanks,
> 
> Alex
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Aug. 11, 2016, 4:24 p.m. UTC | #7
On Thu, 11 Aug 2016 21:29:35 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 8/11/2016 4:30 AM, Alex Williamson wrote:
> > On Thu, 11 Aug 2016 02:53:10 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 8/10/2016 12:30 AM, Alex Williamson wrote:  
> >>> On Thu, 4 Aug 2016 00:33:52 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>     
> >>
> >> ...
> >>  
> >>>> +
> >>>> +		switch (info.index) {
> >>>> +		case VFIO_PCI_CONFIG_REGION_INDEX:
> >>>> +		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> >>>> +			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);    
> >>>
> >>> No, vmdev->vfio_region_info[info.index].offset
> >>>    
> >>
> >> Ok.
> >>  
> >>>> +			info.size = vmdev->vfio_region_info[info.index].size;
> >>>> +			if (!info.size) {
> >>>> +				info.flags = 0;
> >>>> +				break;
> >>>> +			}
> >>>> +
> >>>> +			info.flags = vmdev->vfio_region_info[info.index].flags;
> >>>> +			break;
> >>>> +		case VFIO_PCI_VGA_REGION_INDEX:
> >>>> +		case VFIO_PCI_ROM_REGION_INDEX:    
> >>>
> >>> Why?  Let the vendor driver decide.
> >>>     
> >>
> >> Ok.
> >>  
> >>>> +		switch (info.index) {
> >>>> +		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX:
> >>>> +		case VFIO_PCI_REQ_IRQ_INDEX:
> >>>> +			break;
> >>>> +			/* pass thru to return error */
> >>>> +		case VFIO_PCI_MSIX_IRQ_INDEX:    
> >>>
> >>> ???    
> >>
> >> Sorry, I missed to update this. Updating it.
> >>  
> >>>> +	case VFIO_DEVICE_SET_IRQS:
> >>>> +	{    
> >> ...  
> >>>> +
> >>>> +		if (parent->ops->set_irqs)
> >>>> +			ret = parent->ops->set_irqs(mdev, hdr.flags, hdr.index,
> >>>> +						    hdr.start, hdr.count, data);
> >>>> +
> >>>> +		kfree(ptr);
> >>>> +		return ret;    
> >>>
> >>> Return success if no set_irqs callback?
> >>>    
> >>
> >> Ideally, vendor driver should provide this function. If vendor driver
> >> doesn't provide it, do we really need to fail here?  
> > 
> > Wouldn't you as a user expect to get an error if you try to call an
> > ioctl that has no backing rather than assume success and never receive
> > and interrupt?
> >    
> 
> If we really don't want to proceed if set_irqs() is not provided then
> its better to add it in mandatory list in mdev_register_device() in
> mdev_core.c and fail earlier, i.e. fail to register the device.

Is a device required to implement some form of interrupt to be useful?
What if there's a memory-only device that does not report INTx or
provide MSI or MSI-X capabilities?  It could still be PCI spec
complaint.  Really though it's just a matter of whether we're going to
require the mediated driver to provide a set_irqs() stub or let them
skip it and return error ourselves.  Either is really fine with me, but
we can't return success for an ioctl that has no backing.
 
> >>>> +static ssize_t vfio_mpci_read(void *device_data, char __user *buf,
> >>>> +			      size_t count, loff_t *ppos)
> >>>> +{
> >>>> +	struct vfio_mdev *vmdev = device_data;
> >>>> +	struct mdev_device *mdev = vmdev->mdev;
> >>>> +	struct parent_device *parent = mdev->parent;
> >>>> +	int ret = 0;
> >>>> +
> >>>> +	if (!count)
> >>>> +		return 0;
> >>>> +
> >>>> +	if (parent->ops->read) {
> >>>> +		char *ret_data, *ptr;
> >>>> +
> >>>> +		ptr = ret_data = kzalloc(count, GFP_KERNEL);    
> >>>
> >>> Do we really need to support arbitrary lengths in one shot?  Seems like
> >>> we could just use a 4 or 8 byte variable on the stack and iterate until
> >>> done.
> >>>     
> >>
> >> We just want to pass the arguments to vendor driver as is here. Vendor
> >> driver could take care of that.  
> > 
> > But I think this is exploitable, it lets the user make the kernel
> > allocate an arbitrarily sized buffer.
> >    
> >>>> +
> >>>> +static ssize_t vfio_mpci_write(void *device_data, const char __user *buf,
> >>>> +			       size_t count, loff_t *ppos)
> >>>> +{
> >>>> +	struct vfio_mdev *vmdev = device_data;
> >>>> +	struct mdev_device *mdev = vmdev->mdev;
> >>>> +	struct parent_device *parent = mdev->parent;
> >>>> +	int ret = 0;
> >>>> +
> >>>> +	if (!count)
> >>>> +		return 0;
> >>>> +
> >>>> +	if (parent->ops->write) {
> >>>> +		char *usr_data, *ptr;
> >>>> +
> >>>> +		ptr = usr_data = memdup_user(buf, count);    
> >>>
> >>> Same here, how much do we care to let the user write in one pass and is
> >>> there any advantage to it?  When QEMU is our userspace we're only
> >>> likely to see 4-byte accesses anyway.    
> >>
> >> Same as above.
> >>  
> >>>> +
> >>>> +static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> >>>> +{    
> >> ...  
> >>>> +	} else {
> >>>> +		struct pci_dev *pdev;
> >>>> +
> >>>> +		virtaddr = vma->vm_start;
> >>>> +		req_size = vma->vm_end - vma->vm_start;
> >>>> +
> >>>> +		pdev = to_pci_dev(parent->dev);
> >>>> +		index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT);    
> >>>
> >>> Iterate through region_info[*].offset/size provided by vendor driver.
> >>>     
> >>
> >> Yes, makes sense.
> >>  
> >>>> +
> >>>> +int vfio_mpci_match(struct device *dev)
> >>>> +{
> >>>> +	if (dev_is_pci(dev->parent))    
> >>>
> >>> This is the wrong test, there's really no requirement that a pci mdev
> >>> device is hosted by a real pci device.      
> >>
> >> Ideally this module is for the mediated device whose parent is PCI
> >> device. And we are relying on kernel functions like
> >> pci_resource_start(), to_pci_dev() in this module, so better to check it
> >> while loading.  
> > 
> > IMO, we don't want to care what the parent device is, it's not ideal,
> > it's actually a limitation to impose that it is a PCI device.  I want to
> > be able to make purely virtual mediated devices.  I only see that you
> > use these functions in the mmio fault handling.  Is it useful to assume
> > that on mmio fault we map to the parent device PCI BAR regions?  Just
> > require that the vendor driver provides a fault mapping function or
> > SIGBUS if we get a fault and it doesn't.
> >   
> >>> Can't we check that the device
> >>> is on an mdev_pci_bus_type?
> >>>     
> >>
> >> I didn't get this part.
> >>
> >> Each mediated device is of mdev_bus_type. But VFIO module could be
> >> different based on parent device type and loaded at the same time. For
> >> example, there should be different modules for channel IO or any other
> >> type of devices and could be loaded at the same time. Then when mdev
> >> device is created based on check in match() function of each module, and
> >> proper driver would be linked for that mdev device.
> >>
> >> If this check is not based on parent device type, do you expect to set
> >> parent device type by vendor driver and accordingly load corresponding
> >> VFIO driver?  
> > 
> > mdev_pci_bus_type was an off the cuff response since the driver.bus
> > controls which devices a probe function will see.  If we have a unique
> > bus for a driver and create devices appropriately, we really don't
> > even need a match function.   
> 
> I still think that all types of mdev devices should have unique bus type
> so that VFIO IOMMU module could be used for any type of mediated device
> without any change. Otherwise we have to add checks for all supported
> bus types in vfio_iommu_type1_attach_group().

Good point, so perhaps the vendor driver reporting the type through
vfio_device_info.flags is the way to go.

> > That would still work, but what if you
> > made a get_device_info callback to the vendor driver rather than
> > creating that info in the mediated bus driver layer.  Then the probe
> > function here could simply check the flags to see if the device is
> > VFIO_DEVICE_FLAGS_PCI?
> >   
> 
> Right. get_device_info() would be a mandatory callback and it would be
> vendor driver's responsibility to return proper flag.

Yep, then we don't care what the parent device is, the flags will tell
us that the mediated device adheres to PCI and that's all we care about
for binding here.

> >>>> @@ -18,6 +18,7 @@
> >>>>  #include <linux/uaccess.h>
> >>>>  #include <linux/io.h>
> >>>>  #include <linux/vgaarb.h>
> >>>> +#include <linux/vfio.h>
> >>>>  
> >>>>  #include "vfio_pci_private.h"
> >>>>  
> >>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> >>>> index 0ecae0b1cd34..431b824b0d3e 100644
> >>>> --- a/include/linux/vfio.h
> >>>> +++ b/include/linux/vfio.h
> >>>> @@ -18,6 +18,13 @@
> >>>>  #include <linux/poll.h>
> >>>>  #include <uapi/linux/vfio.h>
> >>>>  
> >>>> +#define VFIO_PCI_OFFSET_SHIFT   40
> >>>> +
> >>>> +#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
> >>>> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
> >>>> +#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
> >>>> +
> >>>> +    
> >>>
> >>> Nak this, I'm not interested in making this any sort of ABI.
> >>>     
> >>
> >> These macros are used by drivers/vfio/pci/vfio_pci.c and
> >> drivers/vfio/mdev/vfio_mpci.c and to use those in both these modules,
> >> they should be moved to common place as you suggested in earlier
> >> reviews. I think this is better common place. Are there any other
> >> suggestion?  
> > 
> > They're only used in ways that I objected to above and you've agreed
> > to.  These define implementation details that must not become part of
> > the mediated vendor driver ABI.  A vendor driver is free to redefine
> > this the same if they want, but as we can see with how easily they slip
> > into code where they don't belong, the only way to make sure they don't
> > become ABI is to keep them in private headers.
> >    
> 
> Then I think, I can't use these macros in mdev modules, they are defined
> in drivers/vfio/pci/vfio_pci_private.h
> I have to define similar macros in drivers/vfio/mdev/mdev_private.h?
> 
> parent->ops->get_region_info() is called from vfio_mpci_open() that is
> before PCI config space is setup. Main expectation from
> get_region_info() was to get flags and size. At this point of time
> vendor driver also don't know about the base addresses of regions.
> 
>     case VFIO_DEVICE_GET_REGION_INFO:
> ...
> 
>         info.offset = vmdev->vfio_region_info[info.index].offset;
> 
> In that case, as suggested in previous reply, above is not going to work.
> I'll define such macros in drivers/vfio/mdev/mdev_private.h, set above
> offset according to these macros. Then on first access to any BAR
> region, i.e. after PCI config space is populated, call
> parent->ops->get_region_info() again so that
> vfio_region_info[index].offset for all regions are set by vendor driver.
> Then use these offsets to calculate 'pos' for
> read/write/validate_map_request(). Does this seems reasonable?

This doesn't make any sense to me, there should be absolutely no reason
for the mid-layer mediated device infrastructure to impose region
offsets.  vfio-pci is a leaf driver, like the mediated vendor driver.
Only the leaf drivers can define how they layout the offsets within the
device file descriptor.  Being a VFIO_PCI device only defines region
indexes to resources, not offsets (ie. region 0 is BAR0, region 1 is
BAR1,... region 7 is PCI config space).  If this mid-layer even needs
to know region offsets, then caching them on opening the vendor device
is certainly sufficient.  Remember we're talking about the offset into
the vfio device file descriptor, how that potentially maps onto a
physical MMIO space later doesn't matter here.  It seems like maybe
we're confusing those points.  Anyway, the more I hear about needing to
reproduce these INDEX/OFFSET translation macros in places they
shouldn't be used, the more confident I am in keeping them private.
Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kirti Wankhede Aug. 11, 2016, 5:46 p.m. UTC | #8
On 8/11/2016 9:54 PM, Alex Williamson wrote:
> On Thu, 11 Aug 2016 21:29:35 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 8/11/2016 4:30 AM, Alex Williamson wrote:
>>> On Thu, 11 Aug 2016 02:53:10 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>   
>>>> On 8/10/2016 12:30 AM, Alex Williamson wrote:  
>>>>> On Thu, 4 Aug 2016 00:33:52 +0530
>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>     
>>>>
>>>> ...
>>>>>>  #include "vfio_pci_private.h"
>>>>>>  
>>>>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>>>>>> index 0ecae0b1cd34..431b824b0d3e 100644
>>>>>> --- a/include/linux/vfio.h
>>>>>> +++ b/include/linux/vfio.h
>>>>>> @@ -18,6 +18,13 @@
>>>>>>  #include <linux/poll.h>
>>>>>>  #include <uapi/linux/vfio.h>
>>>>>>  
>>>>>> +#define VFIO_PCI_OFFSET_SHIFT   40
>>>>>> +
>>>>>> +#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
>>>>>> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
>>>>>> +#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
>>>>>> +
>>>>>> +    
>>>>>
>>>>> Nak this, I'm not interested in making this any sort of ABI.
>>>>>     
>>>>
>>>> These macros are used by drivers/vfio/pci/vfio_pci.c and
>>>> drivers/vfio/mdev/vfio_mpci.c and to use those in both these modules,
>>>> they should be moved to common place as you suggested in earlier
>>>> reviews. I think this is better common place. Are there any other
>>>> suggestion?  
>>>
>>> They're only used in ways that I objected to above and you've agreed
>>> to.  These define implementation details that must not become part of
>>> the mediated vendor driver ABI.  A vendor driver is free to redefine
>>> this the same if they want, but as we can see with how easily they slip
>>> into code where they don't belong, the only way to make sure they don't
>>> become ABI is to keep them in private headers.
>>>    
>>
>> Then I think, I can't use these macros in mdev modules, they are defined
>> in drivers/vfio/pci/vfio_pci_private.h
>> I have to define similar macros in drivers/vfio/mdev/mdev_private.h?
>>
>> parent->ops->get_region_info() is called from vfio_mpci_open() that is
>> before PCI config space is setup. Main expectation from
>> get_region_info() was to get flags and size. At this point of time
>> vendor driver also don't know about the base addresses of regions.
>>
>>     case VFIO_DEVICE_GET_REGION_INFO:
>> ...
>>
>>         info.offset = vmdev->vfio_region_info[info.index].offset;
>>
>> In that case, as suggested in previous reply, above is not going to work.
>> I'll define such macros in drivers/vfio/mdev/mdev_private.h, set above
>> offset according to these macros. Then on first access to any BAR
>> region, i.e. after PCI config space is populated, call
>> parent->ops->get_region_info() again so that
>> vfio_region_info[index].offset for all regions are set by vendor driver.
>> Then use these offsets to calculate 'pos' for
>> read/write/validate_map_request(). Does this seems reasonable?
> 
> This doesn't make any sense to me, there should be absolutely no reason
> for the mid-layer mediated device infrastructure to impose region
> offsets.  vfio-pci is a leaf driver, like the mediated vendor driver.
> Only the leaf drivers can define how they layout the offsets within the
> device file descriptor.  Being a VFIO_PCI device only defines region
> indexes to resources, not offsets (ie. region 0 is BAR0, region 1 is
> BAR1,... region 7 is PCI config space).  If this mid-layer even needs
> to know region offsets, then caching them on opening the vendor device
> is certainly sufficient.  Remember we're talking about the offset into
> the vfio device file descriptor, how that potentially maps onto a
> physical MMIO space later doesn't matter here.  It seems like maybe
> we're confusing those points.  Anyway, the more I hear about needing to
> reproduce these INDEX/OFFSET translation macros in places they
> shouldn't be used, the more confident I am in keeping them private.

If vendor driver defines the offsets into vfio device file descriptor,
it will be vendor drivers responsibility that the ranges defined (offset
to offset + size) are not overlapping with other regions ranges. There
will be no validation in vfio-mpci, right?

In current implementation there is a provision that if
validate_map_request() callback is not provided, map it to physical
device's region and start of physical device's BAR address is queried
using pci_resource_start(). Since with the above change that you are
proposing, index could not be extracted from offset. Then if vendor
driver doesn't provide validate_map_request(), return SIGBUS from fault
handler.
So that impose indirect requirement that if vendor driver sets
VFIO_REGION_INFO_FLAG_MMAP for any region, they should provide
validate_map_request().

Thanks,
Kirti.

> Thanks,
> 
> Alex
> 

-----------------------------------------------------------------------------------
This email message is for the sole use of the intended recipient(s) and may contain
confidential information.  Any unauthorized review, use, disclosure or distribution
is prohibited.  If you are not the intended recipient, please contact the sender by
reply email and destroy all copies of the original message.
-----------------------------------------------------------------------------------
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Aug. 11, 2016, 6:43 p.m. UTC | #9
On Thu, 11 Aug 2016 23:16:06 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 8/11/2016 9:54 PM, Alex Williamson wrote:
> > On Thu, 11 Aug 2016 21:29:35 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 8/11/2016 4:30 AM, Alex Williamson wrote:  
> >>> On Thu, 11 Aug 2016 02:53:10 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>     
> >>>> On 8/10/2016 12:30 AM, Alex Williamson wrote:    
> >>>>> On Thu, 4 Aug 2016 00:33:52 +0530
> >>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>       
> >>>>
> >>>> ...  
> >>>>>>  #include "vfio_pci_private.h"
> >>>>>>  
> >>>>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> >>>>>> index 0ecae0b1cd34..431b824b0d3e 100644
> >>>>>> --- a/include/linux/vfio.h
> >>>>>> +++ b/include/linux/vfio.h
> >>>>>> @@ -18,6 +18,13 @@
> >>>>>>  #include <linux/poll.h>
> >>>>>>  #include <uapi/linux/vfio.h>
> >>>>>>  
> >>>>>> +#define VFIO_PCI_OFFSET_SHIFT   40
> >>>>>> +
> >>>>>> +#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
> >>>>>> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
> >>>>>> +#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
> >>>>>> +
> >>>>>> +      
> >>>>>
> >>>>> Nak this, I'm not interested in making this any sort of ABI.
> >>>>>       
> >>>>
> >>>> These macros are used by drivers/vfio/pci/vfio_pci.c and
> >>>> drivers/vfio/mdev/vfio_mpci.c and to use those in both these modules,
> >>>> they should be moved to common place as you suggested in earlier
> >>>> reviews. I think this is better common place. Are there any other
> >>>> suggestion?    
> >>>
> >>> They're only used in ways that I objected to above and you've agreed
> >>> to.  These define implementation details that must not become part of
> >>> the mediated vendor driver ABI.  A vendor driver is free to redefine
> >>> this the same if they want, but as we can see with how easily they slip
> >>> into code where they don't belong, the only way to make sure they don't
> >>> become ABI is to keep them in private headers.
> >>>      
> >>
> >> Then I think, I can't use these macros in mdev modules, they are defined
> >> in drivers/vfio/pci/vfio_pci_private.h
> >> I have to define similar macros in drivers/vfio/mdev/mdev_private.h?
> >>
> >> parent->ops->get_region_info() is called from vfio_mpci_open() that is
> >> before PCI config space is setup. Main expectation from
> >> get_region_info() was to get flags and size. At this point of time
> >> vendor driver also don't know about the base addresses of regions.
> >>
> >>     case VFIO_DEVICE_GET_REGION_INFO:
> >> ...
> >>
> >>         info.offset = vmdev->vfio_region_info[info.index].offset;
> >>
> >> In that case, as suggested in previous reply, above is not going to work.
> >> I'll define such macros in drivers/vfio/mdev/mdev_private.h, set above
> >> offset according to these macros. Then on first access to any BAR
> >> region, i.e. after PCI config space is populated, call
> >> parent->ops->get_region_info() again so that
> >> vfio_region_info[index].offset for all regions are set by vendor driver.
> >> Then use these offsets to calculate 'pos' for
> >> read/write/validate_map_request(). Does this seems reasonable?  
> > 
> > This doesn't make any sense to me, there should be absolutely no reason
> > for the mid-layer mediated device infrastructure to impose region
> > offsets.  vfio-pci is a leaf driver, like the mediated vendor driver.
> > Only the leaf drivers can define how they layout the offsets within the
> > device file descriptor.  Being a VFIO_PCI device only defines region
> > indexes to resources, not offsets (ie. region 0 is BAR0, region 1 is
> > BAR1,... region 7 is PCI config space).  If this mid-layer even needs
> > to know region offsets, then caching them on opening the vendor device
> > is certainly sufficient.  Remember we're talking about the offset into
> > the vfio device file descriptor, how that potentially maps onto a
> > physical MMIO space later doesn't matter here.  It seems like maybe
> > we're confusing those points.  Anyway, the more I hear about needing to
> > reproduce these INDEX/OFFSET translation macros in places they
> > shouldn't be used, the more confident I am in keeping them private.  
> 
> If vendor driver defines the offsets into vfio device file descriptor,
> it will be vendor drivers responsibility that the ranges defined (offset
> to offset + size) are not overlapping with other regions ranges. There
> will be no validation in vfio-mpci, right?

Right, this seems like a pretty basic requirement of the vendor driver
to offer region ranges that do not overlap and there's plenty else
about the vendor driver that the mid-layer can't validate its
behavior...

> In current implementation there is a provision that if
> validate_map_request() callback is not provided, map it to physical
> device's region and start of physical device's BAR address is queried
> using pci_resource_start(). Since with the above change that you are
> proposing, index could not be extracted from offset. Then if vendor
> driver doesn't provide validate_map_request(), return SIGBUS from fault
> handler.
> So that impose indirect requirement that if vendor driver sets
> VFIO_REGION_INFO_FLAG_MMAP for any region, they should provide
> validate_map_request().

TBH, I don't see how providing a default implementation of
validate_map_request() is useful.  How many mediated devices are going
to want to identity map resources from the parent?  Even if they do, it
seems we can only support a single mediated device per parent device
since each will map the same parent resource offset. Let's not even try
to define a default.  If we get a fault and the vendor driver hasn't
provided a handler, send a SIGBUS.  I expect we should also allow
vendor drivers to fill the mapping at mmap() time rather than expecting
this map on fault scheme.  Maybe the mid-level driver should not even be
interacting with mmap() and should let the vendor driver entirely
determine the handling.

For the most part these mid-level drivers, like mediated pci, should be
as thin as possible, and to some extent I wonder if we need them at
all.  We mostly want user interaction with the vfio device file
descriptor to pass directly to the vendor driver and we should only be
adding logic to the mid-level driver when it actually provides some
useful and generic simplification to the vendor driver.  Things like
this default fault handling scheme don't appear to be generic at all,
it's actually a very unique use case I think.  For the most part
I think the mediated interface is just a shim to standardize the
lifecycle of a mediated device for management purposes,
integrate "fake/virtual" devices into the vfio infrastructure,
provide common page tracking, pinning and mapping services, but
the device interface itself should mostly just pass through the
vfio device API straight through to the vendor driver.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kirti Wankhede Aug. 12, 2016, 5:57 p.m. UTC | #10
On 8/12/2016 12:13 AM, Alex Williamson wrote:

> 
> TBH, I don't see how providing a default implementation of
> validate_map_request() is useful.  How many mediated devices are going
> to want to identity map resources from the parent?  Even if they do, it
> seems we can only support a single mediated device per parent device
> since each will map the same parent resource offset. Let's not even try
> to define a default.  If we get a fault and the vendor driver hasn't
> provided a handler, send a SIGBUS.  I expect we should also allow
> vendor drivers to fill the mapping at mmap() time rather than expecting
> this map on fault scheme.  Maybe the mid-level driver should not even be
> interacting with mmap() and should let the vendor driver entirely
> determine the handling.
>

Should we go ahead with pass through mmap() call to vendor driver and
let vendor driver decide what to do in mmap() call, either
remap_pfn_range in mmap() or do fault on access and handle the fault in
their driver. In that case we don't need to track mappings in mdev core.
Let vendor driver do that on their own, right?



> For the most part these mid-level drivers, like mediated pci, should be
> as thin as possible, and to some extent I wonder if we need them at
> all.  We mostly want user interaction with the vfio device file
> descriptor to pass directly to the vendor driver and we should only be
> adding logic to the mid-level driver when it actually provides some
> useful and generic simplification to the vendor driver.  Things like
> this default fault handling scheme don't appear to be generic at all,
> it's actually a very unique use case I think.  For the most part
> I think the mediated interface is just a shim to standardize the
> lifecycle of a mediated device for management purposes,
> integrate "fake/virtual" devices into the vfio infrastructure,
> provide common page tracking, pinning and mapping services, but
> the device interface itself should mostly just pass through the
> vfio device API straight through to the vendor driver.  Thanks,
> 
> Alex
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Aug. 12, 2016, 9:25 p.m. UTC | #11
On Fri, 12 Aug 2016 23:27:01 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 8/12/2016 12:13 AM, Alex Williamson wrote:
> 
> > 
> > TBH, I don't see how providing a default implementation of
> > validate_map_request() is useful.  How many mediated devices are going
> > to want to identity map resources from the parent?  Even if they do, it
> > seems we can only support a single mediated device per parent device
> > since each will map the same parent resource offset. Let's not even try
> > to define a default.  If we get a fault and the vendor driver hasn't
> > provided a handler, send a SIGBUS.  I expect we should also allow
> > vendor drivers to fill the mapping at mmap() time rather than expecting
> > this map on fault scheme.  Maybe the mid-level driver should not even be
> > interacting with mmap() and should let the vendor driver entirely
> > determine the handling.
> >  
> 
> Should we go ahead with pass through mmap() call to vendor driver and
> let vendor driver decide what to do in mmap() call, either
> remap_pfn_range in mmap() or do fault on access and handle the fault in
> their driver. In that case we don't need to track mappings in mdev core.
> Let vendor driver do that on their own, right?

This sounds right to me, I don't think we want to impose either model
on the vendor driver.  The vendor driver owns the vfio device file
descriptor and is responsible for managing it should they expose mmap
support for regions on the file descriptor.  They either need to insert
mappings at the point where mmap() is called or setup fault handlers to
insert them on demand.  If we can provide helper functions so that each
vendor driver doesn't need to re-invent either of those, that would be
a bonus.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kirti Wankhede Aug. 13, 2016, 12:42 a.m. UTC | #12
On 8/13/2016 2:55 AM, Alex Williamson wrote:
> On Fri, 12 Aug 2016 23:27:01 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 8/12/2016 12:13 AM, Alex Williamson wrote:
>>
>>>
>>> TBH, I don't see how providing a default implementation of
>>> validate_map_request() is useful.  How many mediated devices are going
>>> to want to identity map resources from the parent?  Even if they do, it
>>> seems we can only support a single mediated device per parent device
>>> since each will map the same parent resource offset. Let's not even try
>>> to define a default.  If we get a fault and the vendor driver hasn't
>>> provided a handler, send a SIGBUS.  I expect we should also allow
>>> vendor drivers to fill the mapping at mmap() time rather than expecting
>>> this map on fault scheme.  Maybe the mid-level driver should not even be
>>> interacting with mmap() and should let the vendor driver entirely
>>> determine the handling.
>>>  
>>
>> Should we go ahead with pass through mmap() call to vendor driver and
>> let vendor driver decide what to do in mmap() call, either
>> remap_pfn_range in mmap() or do fault on access and handle the fault in
>> their driver. In that case we don't need to track mappings in mdev core.
>> Let vendor driver do that on their own, right?
> 
> This sounds right to me, I don't think we want to impose either model
> on the vendor driver.  The vendor driver owns the vfio device file
> descriptor and is responsible for managing it should they expose mmap
> support for regions on the file descriptor.  They either need to insert
> mappings at the point where mmap() is called or setup fault handlers to
> insert them on demand.  If we can provide helper functions so that each
> vendor driver doesn't need to re-invent either of those, that would be
> a bonus.  Thanks,
> 

Since mmap() is going to be handled in vendor driver, let vendor driver
do their own tracking logic of mappings based on which way they decide
to go. No need to keep it in mdev coer module and try to handle all the
cases in one function.

Thanks,
Kirti


> Alex
> 

-----------------------------------------------------------------------------------
This email message is for the sole use of the intended recipient(s) and may contain
confidential information.  Any unauthorized review, use, disclosure or distribution
is prohibited.  If you are not the intended recipient, please contact the sender by
reply email and destroy all copies of the original message.
-----------------------------------------------------------------------------------
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
index a34fbc66f92f..431ed595c8da 100644
--- a/drivers/vfio/mdev/Kconfig
+++ b/drivers/vfio/mdev/Kconfig
@@ -9,4 +9,10 @@  config VFIO_MDEV
 
         If you don't know what do here, say N.
 
+config VFIO_MPCI
+    tristate "VFIO support for Mediated PCI devices"
+    depends on VFIO && PCI && VFIO_MDEV
+    default n
+    help
+        VFIO based driver for mediated PCI devices.
 
diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
index 56a75e689582..264fb03dd0e3 100644
--- a/drivers/vfio/mdev/Makefile
+++ b/drivers/vfio/mdev/Makefile
@@ -2,4 +2,5 @@ 
 mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o
 
 obj-$(CONFIG_VFIO_MDEV) += mdev.o
+obj-$(CONFIG_VFIO_MPCI) += vfio_mpci.o
 
diff --git a/drivers/vfio/mdev/vfio_mpci.c b/drivers/vfio/mdev/vfio_mpci.c
new file mode 100644
index 000000000000..9da94b76ae3e
--- /dev/null
+++ b/drivers/vfio/mdev/vfio_mpci.c
@@ -0,0 +1,536 @@ 
+/*
+ * VFIO based Mediated PCI device driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "NVIDIA Corporation"
+#define DRIVER_DESC     "VFIO based Mediated PCI device driver"
+
+struct vfio_mdev {
+	struct iommu_group *group;
+	struct mdev_device *mdev;
+	int		    refcnt;
+	struct vfio_region_info vfio_region_info[VFIO_PCI_NUM_REGIONS];
+	struct mutex	    vfio_mdev_lock;
+};
+
+static int vfio_mpci_open(void *device_data)
+{
+	int ret = 0;
+	struct vfio_mdev *vmdev = device_data;
+	struct parent_device *parent = vmdev->mdev->parent;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	mutex_lock(&vmdev->vfio_mdev_lock);
+	if (!vmdev->refcnt && parent->ops->get_region_info) {
+		int index;
+
+		for (index = VFIO_PCI_BAR0_REGION_INDEX;
+		     index < VFIO_PCI_NUM_REGIONS; index++) {
+			ret = parent->ops->get_region_info(vmdev->mdev, index,
+					      &vmdev->vfio_region_info[index]);
+			if (ret)
+				goto open_error;
+		}
+	}
+
+	vmdev->refcnt++;
+
+open_error:
+	mutex_unlock(&vmdev->vfio_mdev_lock);
+	if (ret)
+		module_put(THIS_MODULE);
+
+	return ret;
+}
+
+static void vfio_mpci_close(void *device_data)
+{
+	struct vfio_mdev *vmdev = device_data;
+
+	mutex_lock(&vmdev->vfio_mdev_lock);
+	vmdev->refcnt--;
+	if (!vmdev->refcnt) {
+		memset(&vmdev->vfio_region_info, 0,
+			sizeof(vmdev->vfio_region_info));
+	}
+	mutex_unlock(&vmdev->vfio_mdev_lock);
+	module_put(THIS_MODULE);
+}
+
+static u8 mpci_find_pci_capability(struct mdev_device *mdev, u8 capability)
+{
+	loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX);
+	struct parent_device *parent = mdev->parent;
+	u16 status;
+	u8  cap_ptr, cap_id = 0xff;
+
+	parent->ops->read(mdev, (char *)&status, sizeof(status),
+			  pos + PCI_STATUS);
+	if (!(status & PCI_STATUS_CAP_LIST))
+		return 0;
+
+	parent->ops->read(mdev, &cap_ptr, sizeof(cap_ptr),
+			  pos + PCI_CAPABILITY_LIST);
+
+	do {
+		cap_ptr &= 0xfc;
+		parent->ops->read(mdev, &cap_id, sizeof(cap_id),
+				  pos + cap_ptr + PCI_CAP_LIST_ID);
+		if (cap_id == capability)
+			return cap_ptr;
+		parent->ops->read(mdev, &cap_ptr, sizeof(cap_ptr),
+				  pos + cap_ptr + PCI_CAP_LIST_NEXT);
+	} while (cap_ptr && cap_id != 0xff);
+
+	return 0;
+}
+
+static int mpci_get_irq_count(struct vfio_mdev *vmdev, int irq_type)
+{
+	loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX);
+	struct mdev_device *mdev = vmdev->mdev;
+	struct parent_device *parent = mdev->parent;
+
+	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
+		u8 pin;
+
+		parent->ops->read(mdev, &pin, sizeof(pin),
+				  pos + PCI_INTERRUPT_PIN);
+		if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && pin)
+			return 1;
+
+	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
+		u8 cap_ptr;
+		u16 flags;
+
+		cap_ptr = mpci_find_pci_capability(mdev, PCI_CAP_ID_MSI);
+		if (cap_ptr) {
+			parent->ops->read(mdev, (char *)&flags, sizeof(flags),
+					pos + cap_ptr + PCI_MSI_FLAGS);
+			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
+		}
+	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
+		u8 cap_ptr;
+		u16 flags;
+
+		cap_ptr = mpci_find_pci_capability(mdev, PCI_CAP_ID_MSIX);
+		if (cap_ptr) {
+			parent->ops->read(mdev, (char *)&flags, sizeof(flags),
+					pos + cap_ptr + PCI_MSIX_FLAGS);
+
+			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
+		}
+	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
+		u8 cap_ptr;
+
+		cap_ptr = mpci_find_pci_capability(mdev, PCI_CAP_ID_EXP);
+		if (cap_ptr)
+			return 1;
+	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
+		return 1;
+	}
+
+	return 0;
+}
+
+static long vfio_mpci_unlocked_ioctl(void *device_data,
+				     unsigned int cmd, unsigned long arg)
+{
+	int ret = 0;
+	struct vfio_mdev *vmdev = device_data;
+	unsigned long minsz;
+
+	switch (cmd) {
+	case VFIO_DEVICE_GET_INFO:
+	{
+		struct vfio_device_info info;
+		struct parent_device *parent = vmdev->mdev->parent;
+
+		minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.flags = VFIO_DEVICE_FLAGS_PCI;
+
+		if (parent->ops->reset)
+			info.flags |= VFIO_DEVICE_FLAGS_RESET;
+
+		info.num_regions = VFIO_PCI_NUM_REGIONS;
+		info.num_irqs = VFIO_PCI_NUM_IRQS;
+
+		return copy_to_user((void __user *)arg, &info, minsz);
+	}
+	case VFIO_DEVICE_GET_REGION_INFO:
+	{
+		struct vfio_region_info info;
+
+		minsz = offsetofend(struct vfio_region_info, offset);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		switch (info.index) {
+		case VFIO_PCI_CONFIG_REGION_INDEX:
+		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+			info.size = vmdev->vfio_region_info[info.index].size;
+			if (!info.size) {
+				info.flags = 0;
+				break;
+			}
+
+			info.flags = vmdev->vfio_region_info[info.index].flags;
+			break;
+		case VFIO_PCI_VGA_REGION_INDEX:
+		case VFIO_PCI_ROM_REGION_INDEX:
+		default:
+			return -EINVAL;
+		}
+
+		return copy_to_user((void __user *)arg, &info, minsz);
+	}
+	case VFIO_DEVICE_GET_IRQ_INFO:
+	{
+		struct vfio_irq_info info;
+
+		minsz = offsetofend(struct vfio_irq_info, count);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
+			return -EINVAL;
+
+		switch (info.index) {
+		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX:
+		case VFIO_PCI_REQ_IRQ_INDEX:
+			break;
+			/* pass thru to return error */
+		case VFIO_PCI_MSIX_IRQ_INDEX:
+		default:
+			return -EINVAL;
+		}
+
+		info.flags = VFIO_IRQ_INFO_EVENTFD;
+		info.count = mpci_get_irq_count(vmdev, info.index);
+
+		if (info.count == -1)
+			return -EINVAL;
+
+		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
+			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
+					VFIO_IRQ_INFO_AUTOMASKED);
+		else
+			info.flags |= VFIO_IRQ_INFO_NORESIZE;
+
+		return copy_to_user((void __user *)arg, &info, minsz);
+	}
+	case VFIO_DEVICE_SET_IRQS:
+	{
+		struct vfio_irq_set hdr;
+		struct mdev_device *mdev = vmdev->mdev;
+		struct parent_device *parent = mdev->parent;
+		u8 *data = NULL, *ptr = NULL;
+
+		minsz = offsetofend(struct vfio_irq_set, count);
+
+		if (copy_from_user(&hdr, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
+		    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
+				  VFIO_IRQ_SET_ACTION_TYPE_MASK))
+			return -EINVAL;
+
+		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
+			size_t size;
+			int max = mpci_get_irq_count(vmdev, hdr.index);
+
+			if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
+				size = sizeof(uint8_t);
+			else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
+				size = sizeof(int32_t);
+			else
+				return -EINVAL;
+
+			if (hdr.argsz - minsz < hdr.count * size ||
+			    hdr.start >= max || hdr.start + hdr.count > max)
+				return -EINVAL;
+
+			ptr = data = memdup_user((void __user *)(arg + minsz),
+						 hdr.count * size);
+			if (IS_ERR(data))
+				return PTR_ERR(data);
+		}
+
+		if (parent->ops->set_irqs)
+			ret = parent->ops->set_irqs(mdev, hdr.flags, hdr.index,
+						    hdr.start, hdr.count, data);
+
+		kfree(ptr);
+		return ret;
+	}
+	case VFIO_DEVICE_RESET:
+	{
+		struct parent_device *parent = vmdev->mdev->parent;
+
+		if (parent->ops->reset)
+			return parent->ops->reset(vmdev->mdev);
+
+		return -EINVAL;
+	}
+	}
+	return -ENOTTY;
+}
+
+static ssize_t vfio_mpci_read(void *device_data, char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	struct vfio_mdev *vmdev = device_data;
+	struct mdev_device *mdev = vmdev->mdev;
+	struct parent_device *parent = mdev->parent;
+	int ret = 0;
+
+	if (!count)
+		return 0;
+
+	if (parent->ops->read) {
+		char *ret_data, *ptr;
+
+		ptr = ret_data = kzalloc(count, GFP_KERNEL);
+
+		if (!ret_data)
+			return  -ENOMEM;
+
+		ret = parent->ops->read(mdev, ret_data, count, *ppos);
+
+		if (ret > 0) {
+			if (copy_to_user(buf, ret_data, ret))
+				ret = -EFAULT;
+			else
+				*ppos += ret;
+		}
+		kfree(ptr);
+	}
+
+	return ret;
+}
+
+static ssize_t vfio_mpci_write(void *device_data, const char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct vfio_mdev *vmdev = device_data;
+	struct mdev_device *mdev = vmdev->mdev;
+	struct parent_device *parent = mdev->parent;
+	int ret = 0;
+
+	if (!count)
+		return 0;
+
+	if (parent->ops->write) {
+		char *usr_data, *ptr;
+
+		ptr = usr_data = memdup_user(buf, count);
+		if (IS_ERR(usr_data))
+			return PTR_ERR(usr_data);
+
+		ret = parent->ops->write(mdev, usr_data, count, *ppos);
+
+		if (ret > 0)
+			*ppos += ret;
+
+		kfree(ptr);
+	}
+
+	return ret;
+}
+
+static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	int ret;
+	struct vfio_mdev *vmdev = vma->vm_private_data;
+	struct mdev_device *mdev;
+	struct parent_device *parent;
+	u64 virtaddr = (u64)vmf->virtual_address;
+	unsigned long req_size, pgoff = 0;
+	pgprot_t pg_prot;
+	unsigned int index;
+
+	if (!vmdev && !vmdev->mdev)
+		return -EINVAL;
+
+	mdev = vmdev->mdev;
+	parent  = mdev->parent;
+
+	pg_prot  = vma->vm_page_prot;
+
+	if (parent->ops->validate_map_request) {
+		u64 offset;
+		loff_t pos;
+
+		offset   = virtaddr - vma->vm_start;
+		req_size = vma->vm_end - virtaddr;
+		pos = (vma->vm_pgoff << PAGE_SHIFT) + offset;
+
+		ret = parent->ops->validate_map_request(mdev, pos, &virtaddr,
+						&pgoff, &req_size, &pg_prot);
+		if (ret)
+			return ret;
+
+		/*
+		 * Verify pgoff and req_size are valid and virtaddr is within
+		 * vma range
+		 */
+		if (!pgoff || !req_size || (virtaddr < vma->vm_start) ||
+		    ((virtaddr + req_size) >= vma->vm_end))
+			return -EINVAL;
+	} else {
+		struct pci_dev *pdev;
+
+		virtaddr = vma->vm_start;
+		req_size = vma->vm_end - vma->vm_start;
+
+		pdev = to_pci_dev(parent->dev);
+		index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT);
+		pgoff = pci_resource_start(pdev, index) >> PAGE_SHIFT;
+	}
+
+	ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
+
+	return ret | VM_FAULT_NOPAGE;
+}
+
+void mdev_dev_mmio_close(struct vm_area_struct *vma)
+{
+	struct vfio_mdev *vmdev = vma->vm_private_data;
+	struct mdev_device *mdev = vmdev->mdev;
+
+	mdev_del_phys_mapping(mdev, vma->vm_pgoff << PAGE_SHIFT);
+}
+
+static const struct vm_operations_struct mdev_dev_mmio_ops = {
+	.fault = mdev_dev_mmio_fault,
+	.close = mdev_dev_mmio_close,
+};
+
+static int vfio_mpci_mmap(void *device_data, struct vm_area_struct *vma)
+{
+	unsigned int index;
+	struct vfio_mdev *vmdev = device_data;
+	struct mdev_device *mdev = vmdev->mdev;
+
+	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+
+	if (index >= VFIO_PCI_ROM_REGION_INDEX)
+		return -EINVAL;
+
+	vma->vm_private_data = vmdev;
+	vma->vm_ops = &mdev_dev_mmio_ops;
+
+	return mdev_add_phys_mapping(mdev, vma->vm_file->f_mapping,
+				     vma->vm_pgoff << PAGE_SHIFT,
+				     vma->vm_end - vma->vm_start);
+}
+
+static const struct vfio_device_ops vfio_mpci_dev_ops = {
+	.name		= "vfio-mpci",
+	.open		= vfio_mpci_open,
+	.release	= vfio_mpci_close,
+	.ioctl		= vfio_mpci_unlocked_ioctl,
+	.read		= vfio_mpci_read,
+	.write		= vfio_mpci_write,
+	.mmap		= vfio_mpci_mmap,
+};
+
+int vfio_mpci_probe(struct device *dev)
+{
+	struct vfio_mdev *vmdev;
+	struct mdev_device *mdev = to_mdev_device(dev);
+	int ret;
+
+	vmdev = kzalloc(sizeof(*vmdev), GFP_KERNEL);
+	if (IS_ERR(vmdev))
+		return PTR_ERR(vmdev);
+
+	vmdev->mdev = mdev_get_device(mdev);
+	vmdev->group = mdev->group;
+	mutex_init(&vmdev->vfio_mdev_lock);
+
+	ret = vfio_add_group_dev(dev, &vfio_mpci_dev_ops, vmdev);
+	if (ret)
+		kfree(vmdev);
+
+	mdev_put_device(mdev);
+	return ret;
+}
+
+void vfio_mpci_remove(struct device *dev)
+{
+	struct vfio_mdev *vmdev;
+
+	vmdev = vfio_del_group_dev(dev);
+	kfree(vmdev);
+}
+
+int vfio_mpci_match(struct device *dev)
+{
+	if (dev_is_pci(dev->parent))
+		return 1;
+
+	return 0;
+}
+
+struct mdev_driver vfio_mpci_driver = {
+	.name	= "vfio_mpci",
+	.probe	= vfio_mpci_probe,
+	.remove	= vfio_mpci_remove,
+	.match	= vfio_mpci_match,
+};
+
+static int __init vfio_mpci_init(void)
+{
+	return mdev_register_driver(&vfio_mpci_driver, THIS_MODULE);
+}
+
+static void __exit vfio_mpci_exit(void)
+{
+	mdev_unregister_driver(&vfio_mpci_driver);
+}
+
+module_init(vfio_mpci_init)
+module_exit(vfio_mpci_exit)
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 8a7d546d18a0..04a450908ffb 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -19,12 +19,6 @@ 
 #ifndef VFIO_PCI_PRIVATE_H
 #define VFIO_PCI_PRIVATE_H
 
-#define VFIO_PCI_OFFSET_SHIFT   40
-
-#define VFIO_PCI_OFFSET_TO_INDEX(off)	(off >> VFIO_PCI_OFFSET_SHIFT)
-#define VFIO_PCI_INDEX_TO_OFFSET(index)	((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
-#define VFIO_PCI_OFFSET_MASK	(((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
-
 /* Special capability IDs predefined access */
 #define PCI_CAP_ID_INVALID		0xFF	/* default raw access */
 #define PCI_CAP_ID_INVALID_VIRT		0xFE	/* default virt access */
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 5ffd1d9ad4bd..5b912be9d9c3 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -18,6 +18,7 @@ 
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/vgaarb.h>
+#include <linux/vfio.h>
 
 #include "vfio_pci_private.h"
 
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0ecae0b1cd34..431b824b0d3e 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -18,6 +18,13 @@ 
 #include <linux/poll.h>
 #include <uapi/linux/vfio.h>
 
+#define VFIO_PCI_OFFSET_SHIFT   40
+
+#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
+#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
+#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
+
+
 /**
  * struct vfio_device_ops - VFIO bus driver device callbacks
  *