diff mbox

[v4,kvmtool,10/12] vfio: Support non-mmappable regions

Message ID 20171122185823.7765-11-jean-philippe.brucker@arm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jean-Philippe Brucker Nov. 22, 2017, 6:58 p.m. UTC
In some cases device regions don't support mmap. They can still be made
available to the guest by trapping all accesses and forwarding reads or
writes to VFIO. Such regions may be:

* PCI I/O port BARs.
* Sub-page regions, for example a 4kB region on a host with 64k pages.
* Similarly, sparse mmap regions. For example when VFIO allows to mmap
  fragments of a PCI BAR and forbids accessing things like MSI-X tables.
  We don't support the sparse capability at the moment, so trap these
  regions instead (if VFIO rejects the mmap).

Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
---
v3->v4: new.
---
 include/kvm/vfio.h |   3 +
 vfio/core.c        | 165 +++++++++++++++++++++++++++++++++++++++++++++++------
 vfio/pci.c         |  41 +++++++------
 3 files changed, 176 insertions(+), 33 deletions(-)

Comments

Punit Agrawal March 14, 2018, 5:42 p.m. UTC | #1
Hi Jean-Philippe,

One comment below.

Jean-Philippe Brucker <jean-philippe.brucker@arm.com> writes:

> In some cases device regions don't support mmap. They can still be made
> available to the guest by trapping all accesses and forwarding reads or
> writes to VFIO. Such regions may be:
>
> * PCI I/O port BARs.
> * Sub-page regions, for example a 4kB region on a host with 64k pages.
> * Similarly, sparse mmap regions. For example when VFIO allows to mmap
>   fragments of a PCI BAR and forbids accessing things like MSI-X tables.
>   We don't support the sparse capability at the moment, so trap these
>   regions instead (if VFIO rejects the mmap).
>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
> ---
> v3->v4: new.
> ---
>  include/kvm/vfio.h |   3 +
>  vfio/core.c        | 165 +++++++++++++++++++++++++++++++++++++++++++++++------
>  vfio/pci.c         |  41 +++++++------
>  3 files changed, 176 insertions(+), 33 deletions(-)
>

[...]

> diff --git a/vfio/pci.c b/vfio/pci.c
> index 698b19582143..9c3ef2f1a373 100644
> --- a/vfio/pci.c
> +++ b/vfio/pci.c

[...]

> @@ -785,6 +788,7 @@ static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
>  				  size_t nr)
>  {
>  	int ret;
> +	u32 bar;
>  	size_t map_size;
>  	struct vfio_pci_device *pdev = &vdev->pci;
>  	struct vfio_region *region = &vdev->regions[nr];
> @@ -792,6 +796,10 @@ static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
>  	if (nr >= vdev->info.num_regions)
>  		return 0;
>  
> +	bar = pdev->hdr.bar[nr];
> +
> +	region->vdev = vdev;
> +	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
>  	region->info = (struct vfio_region_info) {
>  		.argsz = sizeof(*region),

sizeof(region->info)?

Otherwise the patch looks good.

Thanks,
Punit

>  		.index = nr,
> @@ -819,14 +827,13 @@ static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
>  		}
>  	}
>  
> -	/* Grab some MMIO space in the guest */
> -	map_size = ALIGN(region->info.size, PAGE_SIZE);
> -	region->guest_phys_addr = pci_get_io_space_block(map_size);
> +	if (!region->is_ioport) {
> +		/* Grab some MMIO space in the guest */
> +		map_size = ALIGN(region->info.size, PAGE_SIZE);
> +		region->guest_phys_addr = pci_get_io_space_block(map_size);
> +	}
>  
> -	/*
> -	 * Map the BARs into the guest. We'll later need to update
> -	 * configuration space to reflect our allocation.
> -	 */
> +	/* Map the BARs into the guest or setup a trap region. */
>  	ret = vfio_map_region(kvm, vdev, region);
>  	if (ret)
>  		return ret;
Jean-Philippe Brucker March 14, 2018, 6:23 p.m. UTC | #2
On 14/03/18 17:42, Punit Agrawal wrote:
>> +	region->vdev = vdev;
>> +	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
>>  	region->info = (struct vfio_region_info) {
>>  		.argsz = sizeof(*region),
> 
> sizeof(region->info)?

Woops, good catch

Thanks,
Jean
diff mbox

Patch

diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h
index eee821b333d9..0e3366274d0e 100644
--- a/include/kvm/vfio.h
+++ b/include/kvm/vfio.h
@@ -83,8 +83,11 @@  struct vfio_pci_device {
 
 struct vfio_region {
 	struct vfio_region_info		info;
+	struct vfio_device		*vdev;
 	u64				guest_phys_addr;
 	void				*host_addr;
+	u32				port_base;
+	int				is_ioport	:1;
 };
 
 struct vfio_device {
diff --git a/vfio/core.c b/vfio/core.c
index e1b7366b9eda..fade197a1dd0 100644
--- a/vfio/core.c
+++ b/vfio/core.c
@@ -1,5 +1,6 @@ 
 #include "kvm/kvm.h"
 #include "kvm/vfio.h"
+#include "kvm/ioport.h"
 
 #include <linux/list.h>
 
@@ -78,6 +79,141 @@  out_free_buf:
 	return ret;
 }
 
+static bool vfio_ioport_in(struct ioport *ioport, struct kvm_cpu *vcpu,
+			   u16 port, void *data, int len)
+{
+	u32 val;
+	ssize_t nr;
+	struct vfio_region *region = ioport->priv;
+	struct vfio_device *vdev = region->vdev;
+
+	u32 offset = port - region->port_base;
+
+	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_READ))
+		return false;
+
+	nr = pread(vdev->fd, &val, len, region->info.offset + offset);
+	if (nr != len) {
+		dev_err(vdev, "could not read %d bytes from I/O port 0x%x\n",
+			len, port);
+		return false;
+	}
+
+	switch (len) {
+	case 1:
+		ioport__write8(data, val);
+		break;
+	case 2:
+		ioport__write16(data, val);
+		break;
+	case 4:
+		ioport__write32(data, val);
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static bool vfio_ioport_out(struct ioport *ioport, struct kvm_cpu *vcpu,
+			    u16 port, void *data, int len)
+{
+	u32 val;
+	ssize_t nr;
+	struct vfio_region *region = ioport->priv;
+	struct vfio_device *vdev = region->vdev;
+
+	u32 offset = port - region->port_base;
+
+	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_WRITE))
+		return false;
+
+	switch (len) {
+	case 1:
+		val = ioport__read8(data);
+		break;
+	case 2:
+		val = ioport__read16(data);
+		break;
+	case 4:
+		val = ioport__read32(data);
+		break;
+	default:
+		return false;
+	}
+
+	nr = pwrite(vdev->fd, &val, len, region->info.offset + offset);
+	if (nr != len)
+		dev_err(vdev, "could not write %d bytes to I/O port 0x%x", len,
+			port);
+
+	return nr == len;
+}
+
+static struct ioport_operations vfio_ioport_ops = {
+	.io_in	= vfio_ioport_in,
+	.io_out	= vfio_ioport_out,
+};
+
+static void vfio_mmio_access(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len,
+			     u8 is_write, void *ptr)
+{
+	u64 val;
+	ssize_t nr;
+	struct vfio_region *region = ptr;
+	struct vfio_device *vdev = region->vdev;
+
+	u32 offset = addr - region->guest_phys_addr;
+
+	if (len < 1 || len > 8)
+		goto err_report;
+
+	if (is_write) {
+		if (!(region->info.flags & VFIO_REGION_INFO_FLAG_WRITE))
+			goto err_report;
+
+		memcpy(&val, data, len);
+
+		nr = pwrite(vdev->fd, &val, len, region->info.offset + offset);
+		if ((u32)nr != len)
+			goto err_report;
+	} else {
+		if (!(region->info.flags & VFIO_REGION_INFO_FLAG_READ))
+			goto err_report;
+
+		nr = pread(vdev->fd, &val, len, region->info.offset + offset);
+		if ((u32)nr != len)
+			goto err_report;
+
+		memcpy(data, &val, len);
+	}
+
+	return;
+
+err_report:
+	dev_err(vdev, "could not %s %u bytes at 0x%x (0x%llx)", is_write ?
+		"write" : "read", len, offset, addr);
+}
+
+static int vfio_setup_trap_region(struct kvm *kvm, struct vfio_device *vdev,
+				  struct vfio_region *region)
+{
+	if (region->is_ioport) {
+		int port = ioport__register(kvm, IOPORT_EMPTY, &vfio_ioport_ops,
+					    region->info.size, region);
+		if (port < 0)
+			return port;
+
+		region->port_base = port;
+		return 0;
+	}
+
+	return kvm__register_mmio(kvm, region->guest_phys_addr,
+				  region->info.size, false, vfio_mmio_access,
+				  region);
+}
+
 int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
 		    struct vfio_region *region)
 {
@@ -86,17 +222,8 @@  int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
 	/* KVM needs page-aligned regions */
 	u64 map_size = ALIGN(region->info.size, PAGE_SIZE);
 
-	/*
-	 * We don't want to mess about trapping config accesses, so require that
-	 * they can be mmap'd. Note that for PCI, this precludes the use of I/O
-	 * BARs in the guest (we will hide them from Configuration Space, which
-	 * is trapped).
-	 */
-	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) {
-		dev_info(vdev, "ignoring region %u, as it can't be mmap'd",
-			 region->info.index);
-		return 0;
-	}
+	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP))
+		return vfio_setup_trap_region(kvm, vdev, region);
 
 	if (region->info.flags & VFIO_REGION_INFO_FLAG_READ)
 		prot |= PROT_READ;
@@ -106,10 +233,10 @@  int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
 	base = mmap(NULL, region->info.size, prot, MAP_SHARED, vdev->fd,
 		    region->info.offset);
 	if (base == MAP_FAILED) {
-		ret = -errno;
-		dev_err(vdev, "failed to mmap region %u (0x%llx bytes)",
-			region->info.index, region->info.size);
-		return ret;
+		/* TODO: support sparse mmap */
+		dev_warn(vdev, "failed to mmap region %u (0x%llx bytes), falling back to trapping",
+			 region->info.index, region->info.size);
+		return vfio_setup_trap_region(kvm, vdev, region);
 	}
 	region->host_addr = base;
 
@@ -125,7 +252,13 @@  int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
 
 void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region)
 {
-	munmap(region->host_addr, region->info.size);
+	if (region->host_addr) {
+		munmap(region->host_addr, region->info.size);
+	} else if (region->is_ioport) {
+		ioport__unregister(kvm, region->port_base);
+	} else {
+		kvm__deregister_mmio(kvm, region->guest_phys_addr);
+	}
 }
 
 static int vfio_configure_device(struct kvm *kvm, struct vfio_device *vdev)
diff --git a/vfio/pci.c b/vfio/pci.c
index 698b19582143..9c3ef2f1a373 100644
--- a/vfio/pci.c
+++ b/vfio/pci.c
@@ -629,24 +629,27 @@  static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
 	struct vfio_region_info *info;
 	struct vfio_pci_device *pdev = &vdev->pci;
 
-	/* Enable exclusively MMIO and bus mastering */
-	pdev->hdr.command &= ~PCI_COMMAND_IO;
-	pdev->hdr.command |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
-
 	/* Initialise the BARs */
 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+		u64 base;
 		struct vfio_region *region = &vdev->regions[i];
-		u64 base = region->guest_phys_addr;
+
+		/* Construct a fake reg to match what we've mapped. */
+		if (region->is_ioport) {
+			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
+				PCI_BASE_ADDRESS_SPACE_IO;
+		} else {
+			base = (region->guest_phys_addr &
+				PCI_BASE_ADDRESS_MEM_MASK) |
+				PCI_BASE_ADDRESS_SPACE_MEMORY;
+		}
+
+		pdev->hdr.bar[i] = base;
 
 		if (!base)
 			continue;
 
 		pdev->hdr.bar_size[i] = region->info.size;
-
-		/* Construct a fake reg to match what we've mapped. */
-		pdev->hdr.bar[i] = (base & PCI_BASE_ADDRESS_MEM_MASK) |
-					PCI_BASE_ADDRESS_SPACE_MEMORY |
-					PCI_BASE_ADDRESS_MEM_TYPE_32;
 	}
 
 	/* I really can't be bothered to support cardbus. */
@@ -785,6 +788,7 @@  static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
 				  size_t nr)
 {
 	int ret;
+	u32 bar;
 	size_t map_size;
 	struct vfio_pci_device *pdev = &vdev->pci;
 	struct vfio_region *region = &vdev->regions[nr];
@@ -792,6 +796,10 @@  static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
 	if (nr >= vdev->info.num_regions)
 		return 0;
 
+	bar = pdev->hdr.bar[nr];
+
+	region->vdev = vdev;
+	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
 	region->info = (struct vfio_region_info) {
 		.argsz = sizeof(*region),
 		.index = nr,
@@ -819,14 +827,13 @@  static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
 		}
 	}
 
-	/* Grab some MMIO space in the guest */
-	map_size = ALIGN(region->info.size, PAGE_SIZE);
-	region->guest_phys_addr = pci_get_io_space_block(map_size);
+	if (!region->is_ioport) {
+		/* Grab some MMIO space in the guest */
+		map_size = ALIGN(region->info.size, PAGE_SIZE);
+		region->guest_phys_addr = pci_get_io_space_block(map_size);
+	}
 
-	/*
-	 * Map the BARs into the guest. We'll later need to update
-	 * configuration space to reflect our allocation.
-	 */
+	/* Map the BARs into the guest or setup a trap region. */
 	ret = vfio_map_region(kvm, vdev, region);
 	if (ret)
 		return ret;