diff mbox

[v2,kvmtool,06/10] Add PCI device passthrough using VFIO

Message ID 20170622170536.14319-7-jean-philippe.brucker@arm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jean-Philippe Brucker June 22, 2017, 5:05 p.m. UTC
Assigning devices using VFIO allows the guest to have direct access to the
device, whilst filtering accesses to sensitive areas by trapping config
space accesses and mapping DMA with an IOMMU.

This patch adds a new option to lkvm run: --vfio-group=<group_number>.
Before assigning a device to a VM, some preparation is required. As
described in Linux Documentation/vfio.txt, the device driver needs to be
changed to vfio-pci:

  $ dev=0000:00:00.0

  $ echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
  $ echo vfio-pci > /sys/bus/pci/devices/$dev/driver_override
  $ echo $dev > /sys/bus/pci/drivers_probe
  $ readlink /sys/bus/pci/devices/$dev/iommu_group
  ../../../kernel/iommu_groups/5

Adding --vfio[-group]=5 to lkvm-run will pass the device to the guest.
Multiple groups can be passed to the guest by adding more --vfio
parameters.

This patch only implements PCI with INTx. MSI-X routing will be added in a
subsequent patch, and at some point we might add support for passing
platform devices to guests.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
---
 Makefile                 |   2 +
 arm/pci.c                |   1 +
 builtin-run.c            |   5 +
 include/kvm/kvm-config.h |   3 +
 include/kvm/pci.h        |   3 +-
 include/kvm/vfio.h       |  57 +++++++
 vfio/core.c              | 395 +++++++++++++++++++++++++++++++++++++++++++++++
 vfio/pci.c               | 365 +++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 830 insertions(+), 1 deletion(-)
 create mode 100644 include/kvm/vfio.h
 create mode 100644 vfio/core.c
 create mode 100644 vfio/pci.c

Comments

Punit Agrawal July 31, 2017, 5:52 p.m. UTC | #1
Hi Jean-Philippe,

A couple of queries below -

Jean-Philippe Brucker <jean-philippe.brucker@arm.com> writes:

> Assigning devices using VFIO allows the guest to have direct access to the
> device, whilst filtering accesses to sensitive areas by trapping config
> space accesses and mapping DMA with an IOMMU.
>
> This patch adds a new option to lkvm run: --vfio-group=<group_number>.
> Before assigning a device to a VM, some preparation is required. As
> described in Linux Documentation/vfio.txt, the device driver need to be

Nitpick: "needs"

> changed to vfio-pci:
>
>   $ dev=0000:00:00.0
>
>   $ echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
>   $ echo vfio-pci > /sys/bus/pci/devices/$dev/driver_override
>   $ echo $dev > /sys/bus/pci/drivers_probe
>   $ readlink /sys/bus/pci/devices/$dev/iommu_group
>   ../../../kernel/iommu_groups/5
>
> Adding --vfio[-group]=5 to lkvm-run will pass the device to the guest.
> Multiple groups can be passed to the guest by adding more --vfio
> parameters.
>
> This patch only implements PCI with INTx. MSI-X routing will be added in a
> subsequent patch, and at some point we might add support for passing
> platform devices to guests.
>
> Signed-off-by: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
> ---
>  Makefile                 |   2 +
>  arm/pci.c                |   1 +
>  builtin-run.c            |   5 +
>  include/kvm/kvm-config.h |   3 +
>  include/kvm/pci.h        |   3 +-
>  include/kvm/vfio.h       |  57 +++++++
>  vfio/core.c              | 395 +++++++++++++++++++++++++++++++++++++++++++++++
>  vfio/pci.c               | 365 +++++++++++++++++++++++++++++++++++++++++++
>  8 files changed, 830 insertions(+), 1 deletion(-)
>  create mode 100644 include/kvm/vfio.h
>  create mode 100644 vfio/core.c
>  create mode 100644 vfio/pci.c
>

[...]

> +static int vfio_container_init(struct kvm *kvm)
> +{
> +	int api, i, ret, iommu_type;;
> +
> +	/* Create a container for our IOMMU groups */
> +	vfio_container = open(VFIO_DEV_NODE, O_RDWR);
> +	if (vfio_container == -1) {
> +		ret = errno;
> +		pr_err("Failed to open %s", VFIO_DEV_NODE);
> +		return ret;
> +	}
> +
> +	api = ioctl(vfio_container, VFIO_GET_API_VERSION);
> +	if (api != VFIO_API_VERSION) {
> +		pr_err("Unknown VFIO API version %d", api);
> +		return -ENODEV;
> +	}

We are using the VFIO_API_VERSION pulled in from linux/vfio.h. Will
kvmtool be in trouble if for some reason the API version is incompatibly
incremented in the future?

> +
> +	iommu_type = vfio_get_iommu_type();
> +	if (iommu_type < 0) {
> +		pr_err("VFIO type-1 IOMMU not supported on this platform");
> +		return iommu_type;
> +	}
> +
> +	/* Sanity check our groups and add them to the container */
> +	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
> +		ret = vfio_group_init(kvm, &kvm->cfg.vfio_group[i]);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	/* Finalise the container */
> +	if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) {
> +		ret = -errno;
> +		pr_err("Failed to set IOMMU type %d for VFIO container",
> +		       iommu_type);
> +		return ret;

Just checking - is there a need to remove the groups from the containers
(added by VFIO_GROUP_SET_CONTAINER in vfio_group_init()) before erroring
out here? I imagine freeing of the vfio_container when kvmtool is
cleaning up after itself will do the right thing.

Thanks,
Punit

> +	} else {
> +		pr_info("Using IOMMU type %d for VFIO container", iommu_type);
> +	}
> +
> +	return kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_map_mem_bank,
> +				      NULL);
> +}
> +
> +static int vfio__init(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	if (!kvm->cfg.num_vfio_groups)
> +		return 0;
> +
> +	ret = vfio_container_init(kvm);
> +	if (ret)
> +		return ret;
> +
> +	ret = vfio_configure_iommu_groups(kvm);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}
> +dev_base_init(vfio__init);
> +
> +static int vfio__exit(struct kvm *kvm)
> +{
> +	int i;
> +
> +	if (!kvm->cfg.num_vfio_groups)
> +		return 0;
> +
> +	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i)
> +		vfio_group_exit(kvm, &kvm->cfg.vfio_group[i]);
> +
> +	kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_unmap_mem_bank, NULL);
> +	return close(vfio_container);
> +}
> +dev_base_exit(vfio__exit);

[...]
Jean-Philippe Brucker Aug. 2, 2017, 3:17 p.m. UTC | #2
Hi Punit,

Thanks for reviewing and testing

On 31/07/17 18:52, Punit Agrawal wrote:
> Hi Jean-Philippe,
> 
> A couple of queries below -
> 
> Jean-Philippe Brucker <jean-philippe.brucker@arm.com> writes:
> 
>> Assigning devices using VFIO allows the guest to have direct access to the
>> device, whilst filtering accesses to sensitive areas by trapping config
>> space accesses and mapping DMA with an IOMMU.
>>
>> This patch adds a new option to lkvm run: --vfio-group=<group_number>.
>> Before assigning a device to a VM, some preparation is required. As
>> described in Linux Documentation/vfio.txt, the device driver need to be
> 
> Nitpick: "needs"
> 
>> changed to vfio-pci:
>>
>>   $ dev=0000:00:00.0
>>
>>   $ echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
>>   $ echo vfio-pci > /sys/bus/pci/devices/$dev/driver_override
>>   $ echo $dev > /sys/bus/pci/drivers_probe
>>   $ readlink /sys/bus/pci/devices/$dev/iommu_group
>>   ../../../kernel/iommu_groups/5
>>
>> Adding --vfio[-group]=5 to lkvm-run will pass the device to the guest.
>> Multiple groups can be passed to the guest by adding more --vfio
>> parameters.
>>
>> This patch only implements PCI with INTx. MSI-X routing will be added in a
>> subsequent patch, and at some point we might add support for passing
>> platform devices to guests.
>>
>> Signed-off-by: Will Deacon <will.deacon@arm.com>
>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
>> ---
>>  Makefile                 |   2 +
>>  arm/pci.c                |   1 +
>>  builtin-run.c            |   5 +
>>  include/kvm/kvm-config.h |   3 +
>>  include/kvm/pci.h        |   3 +-
>>  include/kvm/vfio.h       |  57 +++++++
>>  vfio/core.c              | 395 +++++++++++++++++++++++++++++++++++++++++++++++
>>  vfio/pci.c               | 365 +++++++++++++++++++++++++++++++++++++++++++
>>  8 files changed, 830 insertions(+), 1 deletion(-)
>>  create mode 100644 include/kvm/vfio.h
>>  create mode 100644 vfio/core.c
>>  create mode 100644 vfio/pci.c
>>
> 
> [...]
> 
>> +static int vfio_container_init(struct kvm *kvm)
>> +{
>> +	int api, i, ret, iommu_type;;
>> +
>> +	/* Create a container for our IOMMU groups */
>> +	vfio_container = open(VFIO_DEV_NODE, O_RDWR);
>> +	if (vfio_container == -1) {
>> +		ret = errno;
>> +		pr_err("Failed to open %s", VFIO_DEV_NODE);
>> +		return ret;
>> +	}
>> +
>> +	api = ioctl(vfio_container, VFIO_GET_API_VERSION);
>> +	if (api != VFIO_API_VERSION) {
>> +		pr_err("Unknown VFIO API version %d", api);
>> +		return -ENODEV;
>> +	}
> 
> We are using the VFIO_API_VERSION pulled in from linux/vfio.h. Will
> kvmtool be in trouble if for some reason the API version is incompatibly
> incremented in the future?

There are no precedents for this, as VFIO version is still 0. Additions to
the API are added in a backward-compatible way (for example by adding new
flags in the info structures and new fields at the end). If something
significant enough happened to require increasing the version number,
then the next version would probably be incompatible, and kvmtool would
have to support it separately.

>> +
>> +	iommu_type = vfio_get_iommu_type();
>> +	if (iommu_type < 0) {
>> +		pr_err("VFIO type-1 IOMMU not supported on this platform");
>> +		return iommu_type;
>> +	}
>> +
>> +	/* Sanity check our groups and add them to the container */
>> +	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
>> +		ret = vfio_group_init(kvm, &kvm->cfg.vfio_group[i]);
>> +		if (ret)
>> +			return ret;
>> +	}
>> +
>> +	/* Finalise the container */
>> +	if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) {
>> +		ret = -errno;
>> +		pr_err("Failed to set IOMMU type %d for VFIO container",
>> +		       iommu_type);
>> +		return ret;
> 
> Just checking - is there a need to remove the groups from the containers
> (added by VFIO_GROUP_SET_CONTAINER in vfio_group_init()) before erroring
> out here? I imagine freeing of the vfio_container when kvmtool is
> cleaning up after itself will do the right thing.

Yes, VFIO cleans everything up when the container and group file
descriptors are released.

Thanks,
Jean
Punit Agrawal Aug. 3, 2017, 9:36 a.m. UTC | #3
Jean-Philippe Brucker <jean-philippe.brucker@arm.com> writes:

> Hi Punit,
>
> Thanks for reviewing and testing
>
> On 31/07/17 18:52, Punit Agrawal wrote:
>> Hi Jean-Philippe,
>> 
>> A couple of queries below -
>> 
>> Jean-Philippe Brucker <jean-philippe.brucker@arm.com> writes:
>> 
>>> Assigning devices using VFIO allows the guest to have direct access to the
>>> device, whilst filtering accesses to sensitive areas by trapping config
>>> space accesses and mapping DMA with an IOMMU.
>>>
>>> This patch adds a new option to lkvm run: --vfio-group=<group_number>.
>>> Before assigning a device to a VM, some preparation is required. As
>>> described in Linux Documentation/vfio.txt, the device driver need to be
>> 
>> Nitpick: "needs"
>> 
>>> changed to vfio-pci:
>>>
>>>   $ dev=0000:00:00.0
>>>
>>>   $ echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
>>>   $ echo vfio-pci > /sys/bus/pci/devices/$dev/driver_override
>>>   $ echo $dev > /sys/bus/pci/drivers_probe
>>>   $ readlink /sys/bus/pci/devices/$dev/iommu_group
>>>   ../../../kernel/iommu_groups/5
>>>
>>> Adding --vfio[-group]=5 to lkvm-run will pass the device to the guest.
>>> Multiple groups can be passed to the guest by adding more --vfio
>>> parameters.
>>>
>>> This patch only implements PCI with INTx. MSI-X routing will be added in a
>>> subsequent patch, and at some point we might add support for passing
>>> platform devices to guests.
>>>
>>> Signed-off-by: Will Deacon <will.deacon@arm.com>
>>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>>> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
>>> ---
>>>  Makefile                 |   2 +
>>>  arm/pci.c                |   1 +
>>>  builtin-run.c            |   5 +
>>>  include/kvm/kvm-config.h |   3 +
>>>  include/kvm/pci.h        |   3 +-
>>>  include/kvm/vfio.h       |  57 +++++++
>>>  vfio/core.c              | 395 +++++++++++++++++++++++++++++++++++++++++++++++
>>>  vfio/pci.c               | 365 +++++++++++++++++++++++++++++++++++++++++++
>>>  8 files changed, 830 insertions(+), 1 deletion(-)
>>>  create mode 100644 include/kvm/vfio.h
>>>  create mode 100644 vfio/core.c
>>>  create mode 100644 vfio/pci.c
>>>
>> 
>> [...]
>> 
>>> +static int vfio_container_init(struct kvm *kvm)
>>> +{
>>> +	int api, i, ret, iommu_type;;
>>> +
>>> +	/* Create a container for our IOMMU groups */
>>> +	vfio_container = open(VFIO_DEV_NODE, O_RDWR);
>>> +	if (vfio_container == -1) {
>>> +		ret = errno;
>>> +		pr_err("Failed to open %s", VFIO_DEV_NODE);
>>> +		return ret;
>>> +	}
>>> +
>>> +	api = ioctl(vfio_container, VFIO_GET_API_VERSION);
>>> +	if (api != VFIO_API_VERSION) {
>>> +		pr_err("Unknown VFIO API version %d", api);
>>> +		return -ENODEV;
>>> +	}
>> 
>> We are using the VFIO_API_VERSION pulled in from linux/vfio.h. Will
>> kvmtool be in trouble if for some reason the API version is incompatibly
>> incremented in the future?
>
> There are no precedents for this, as VFIO version is still 0. Additions to
> the API are added in a backward-compatible way (for example by adding new
> flags in the info structures and new fields at the end). If something
> significant enough happened and required to increase the version number,
> then the next version would probably be incompatible, and kvmtool would
> have to support it separately.

In that case, we should use a local version number macro for comparison
and if ever VFIO api changes kvmtool will explicitly complain instead of
silently dealing with an unknown version.

>
>>> +
>>> +	iommu_type = vfio_get_iommu_type();
>>> +	if (iommu_type < 0) {
>>> +		pr_err("VFIO type-1 IOMMU not supported on this platform");
>>> +		return iommu_type;
>>> +	}
>>> +
>>> +	/* Sanity check our groups and add them to the container */
>>> +	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
>>> +		ret = vfio_group_init(kvm, &kvm->cfg.vfio_group[i]);
>>> +		if (ret)
>>> +			return ret;
>>> +	}
>>> +
>>> +	/* Finalise the container */
>>> +	if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) {
>>> +		ret = -errno;
>>> +		pr_err("Failed to set IOMMU type %d for VFIO container",
>>> +		       iommu_type);
>>> +		return ret;
>> 
>> Just checking - is there a need to remove the groups from the containers
>> (added by VFIO_GROUP_SET_CONTAINER in vfio_group_init()) before erroring
>> out here? I imagine freeing of the vfio_container when kvmtool is
>> cleaning up after itself will do the right thing.
>
> Yes, VFIO cleans everything up when the container and group file
> descriptors are released.

Thanks for confirming!
Jean-Philippe Brucker Aug. 3, 2017, 11:24 a.m. UTC | #4
On 03/08/17 10:36, Punit Agrawal wrote:
> Jean-Philippe Brucker <jean-philippe.brucker@arm.com> writes:
> 
>> Hi Punit,
>>
>> Thanks for reviewing and testing
>>
>> On 31/07/17 18:52, Punit Agrawal wrote:
>>> Hi Jean-Philippe,
>>>
>>> A couple of queries below -
>>>
>>> Jean-Philippe Brucker <jean-philippe.brucker@arm.com> writes:
>>>
>>>> Assigning devices using VFIO allows the guest to have direct access to the
>>>> device, whilst filtering accesses to sensitive areas by trapping config
>>>> space accesses and mapping DMA with an IOMMU.
>>>>
>>>> This patch adds a new option to lkvm run: --vfio-group=<group_number>.
>>>> Before assigning a device to a VM, some preparation is required. As
>>>> described in Linux Documentation/vfio.txt, the device driver need to be
>>>
>>> Nitpick: "needs"
>>>
>>>> changed to vfio-pci:
>>>>
>>>>   $ dev=0000:00:00.0
>>>>
>>>>   $ echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
>>>>   $ echo vfio-pci > /sys/bus/pci/devices/$dev/driver_override
>>>>   $ echo $dev > /sys/bus/pci/drivers_probe
>>>>   $ readlink /sys/bus/pci/devices/$dev/iommu_group
>>>>   ../../../kernel/iommu_groups/5
>>>>
>>>> Adding --vfio[-group]=5 to lkvm-run will pass the device to the guest.
>>>> Multiple groups can be passed to the guest by adding more --vfio
>>>> parameters.
>>>>
>>>> This patch only implements PCI with INTx. MSI-X routing will be added in a
>>>> subsequent patch, and at some point we might add support for passing
>>>> platform devices to guests.
>>>>
>>>> Signed-off-by: Will Deacon <will.deacon@arm.com>
>>>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>>>> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
>>>> ---
>>>>  Makefile                 |   2 +
>>>>  arm/pci.c                |   1 +
>>>>  builtin-run.c            |   5 +
>>>>  include/kvm/kvm-config.h |   3 +
>>>>  include/kvm/pci.h        |   3 +-
>>>>  include/kvm/vfio.h       |  57 +++++++
>>>>  vfio/core.c              | 395 +++++++++++++++++++++++++++++++++++++++++++++++
>>>>  vfio/pci.c               | 365 +++++++++++++++++++++++++++++++++++++++++++
>>>>  8 files changed, 830 insertions(+), 1 deletion(-)
>>>>  create mode 100644 include/kvm/vfio.h
>>>>  create mode 100644 vfio/core.c
>>>>  create mode 100644 vfio/pci.c
>>>>
>>>
>>> [...]
>>>
>>>> +static int vfio_container_init(struct kvm *kvm)
>>>> +{
>>>> +	int api, i, ret, iommu_type;;
>>>> +
>>>> +	/* Create a container for our IOMMU groups */
>>>> +	vfio_container = open(VFIO_DEV_NODE, O_RDWR);
>>>> +	if (vfio_container == -1) {
>>>> +		ret = errno;
>>>> +		pr_err("Failed to open %s", VFIO_DEV_NODE);
>>>> +		return ret;
>>>> +	}
>>>> +
>>>> +	api = ioctl(vfio_container, VFIO_GET_API_VERSION);
>>>> +	if (api != VFIO_API_VERSION) {
>>>> +		pr_err("Unknown VFIO API version %d", api);
>>>> +		return -ENODEV;
>>>> +	}
>>>
>>> We are using the VFIO_API_VERSION pulled in from linux/vfio.h. Will
>>> kvmtool be in trouble if for some reason the API version is incompatibly
>>> incremented in the future?
>>
>> There are no precedents for this, as VFIO version is still 0. Additions to
>> the API are added in a backward-compatible way (for example by adding new
>> flags in the info structures and new fields at the end). If something
>> significant enough happened and required to increase the version number,
>> then the next version would probably be incompatible, and kvmtool would
>> have to support it separately.
> 
> In that case, we should use a local version number macro for comparison
> and if ever VFIO api changes kvmtool will explicitly complain instead of
> silently dealing with an unknown version.

Good idea. I think I'll go one step further and pull the UAPI headers into
include/linux/vfio.h. It seems to be what we usually do for this kind of
situation (and I'll need it anyway when working with SVM virtualization).

Thanks,
Jean
diff mbox

Patch

diff --git a/Makefile b/Makefile
index 57714815..caae6f07 100644
--- a/Makefile
+++ b/Makefile
@@ -59,6 +59,8 @@  OBJS	+= main.o
 OBJS	+= mmio.o
 OBJS	+= pci.o
 OBJS	+= term.o
+OBJS	+= vfio/core.o
+OBJS	+= vfio/pci.o
 OBJS	+= virtio/blk.o
 OBJS	+= virtio/scsi.o
 OBJS	+= virtio/console.o
diff --git a/arm/pci.c b/arm/pci.c
index 744b14c2..557cfa98 100644
--- a/arm/pci.c
+++ b/arm/pci.c
@@ -1,5 +1,6 @@ 
 #include "kvm/devices.h"
 #include "kvm/fdt.h"
+#include "kvm/kvm.h"
 #include "kvm/of_pci.h"
 #include "kvm/pci.h"
 #include "kvm/util.h"
diff --git a/builtin-run.c b/builtin-run.c
index 72b878dc..3ee735d9 100644
--- a/builtin-run.c
+++ b/builtin-run.c
@@ -146,6 +146,11 @@  void kvm_run_set_wrapper_sandbox(void)
 	OPT_BOOLEAN('\0', "no-dhcp", &(cfg)->no_dhcp, "Disable kernel"	\
 			" DHCP in rootfs mode"),			\
 									\
+	OPT_GROUP("VFIO options:"),					\
+	OPT_CALLBACK('\0', "vfio-group", NULL, "group number",		\
+			"Assign a VFIO group to the virtual machine",	\
+			vfio_group_parser, kvm),			\
+									\
 	OPT_GROUP("Debug options:"),					\
 	OPT_BOOLEAN('\0', "debug", &do_debug_print,			\
 			"Enable debug messages"),			\
diff --git a/include/kvm/kvm-config.h b/include/kvm/kvm-config.h
index 386fa8c5..62dc6a2f 100644
--- a/include/kvm/kvm-config.h
+++ b/include/kvm/kvm-config.h
@@ -2,6 +2,7 @@ 
 #define KVM_CONFIG_H_
 
 #include "kvm/disk-image.h"
+#include "kvm/vfio.h"
 #include "kvm/kvm-config-arch.h"
 
 #define DEFAULT_KVM_DEV		"/dev/kvm"
@@ -20,9 +21,11 @@ 
 struct kvm_config {
 	struct kvm_config_arch arch;
 	struct disk_image_params disk_image[MAX_DISK_IMAGES];
+	struct vfio_group vfio_group[MAX_VFIO_GROUPS];
 	u64 ram_size;
 	u8  image_count;
 	u8 num_net_devices;
+	u8 num_vfio_groups;
 	bool virtio_rng;
 	int active_console;
 	int debug_iodelay;
diff --git a/include/kvm/pci.h b/include/kvm/pci.h
index 2950bb10..44e5adff 100644
--- a/include/kvm/pci.h
+++ b/include/kvm/pci.h
@@ -7,7 +7,6 @@ 
 #include <endian.h>
 
 #include "kvm/devices.h"
-#include "kvm/kvm.h"
 #include "kvm/msi.h"
 #include "kvm/fdt.h"
 
@@ -22,6 +21,8 @@ 
 #define PCI_IO_SIZE		0x100
 #define PCI_CFG_SIZE		(1ULL << 24)
 
+struct kvm;
+
 union pci_config_address {
 	struct {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h
new file mode 100644
index 00000000..060f32a3
--- /dev/null
+++ b/include/kvm/vfio.h
@@ -0,0 +1,57 @@ 
+#ifndef KVM__VFIO_H
+#define KVM__VFIO_H
+
+#include "kvm/parse-options.h"
+#include "kvm/pci.h"
+
+#include <linux/vfio.h>
+
+#include <dirent.h>
+
+/*
+ * Logging helpers that prefix the message with the device name.
+ * @vdev is any expression yielding a pointer to an object with a 'name'
+ * string member (struct vfio_device below). It is parenthesised in the
+ * expansion so that arguments such as "&dev" or "a ? b : c" bind correctly.
+ */
+#define dev_err(vdev, fmt, ...)		pr_err("%s: " fmt, (vdev)->name, ##__VA_ARGS__)
+#define dev_warn(vdev, fmt, ...)	pr_warning("%s: " fmt, (vdev)->name, ##__VA_ARGS__)
+#define dev_info(vdev, fmt, ...)	pr_info("%s: " fmt, (vdev)->name, ##__VA_ARGS__)
+#define dev_die(vdev, fmt, ...)		die("%s: " fmt, (vdev)->name, ##__VA_ARGS__)
+
+#define MAX_VFIO_GROUPS			16
+
+/* PCI-specific state embedded in struct vfio_device (member 'pci'). */
+struct vfio_pci_device {
+	/*
+	 * PCI configuration header of the assigned device. The config-space
+	 * handlers in vfio/pci.c recover the owning vfio_pci_device from a
+	 * pointer to this member with container_of().
+	 */
+	struct pci_device_header	hdr;
+};
+
+/* One region of a VFIO device, as used by vfio_map_region(). */
+struct vfio_region {
+	/* VFIO description of the region; flags, index, size and offset are read */
+	struct vfio_region_info		info;
+	/* Guest physical address passed to kvm__register_dev_mem() */
+	u64				guest_phys_addr;
+	/* Address returned by mmap() of the region on the device fd */
+	void				*host_addr;
+};
+
+/* A single device of an assigned IOMMU group. */
+struct vfio_device {
+	/* Generic device header; dev_hdr.dev_num is reported once assigned */
+	struct device_header		dev_hdr;
+
+	/* Device fd obtained with VFIO_GROUP_GET_DEVICE_FD */
+	int				fd;
+	/* Filled in by VFIO_DEVICE_GET_INFO (flags, num_regions, ...) */
+	struct vfio_device_info		info;
+	/* NOTE(review): not populated by vfio/core.c; presumably set by bus code */
+	struct vfio_irq_info		irq_info;
+	/* Array of info.num_regions entries, allocated with calloc() */
+	struct vfio_region		*regions;
+
+	/* Heap copy of the device's entry name in the group's sysfs directory */
+	char				*name;
+	/* Heap copy of "<group devices dir>/<name>" */
+	char				*sysfs_path;
+
+	/* Link in the owning vfio_group's 'devices' list */
+	struct hlist_node		list;
+
+	/* PCI-specific state */
+	struct vfio_pci_device		pci;
+};
+
+/* An IOMMU group handed to the VM with --vfio-group. */
+struct vfio_group {
+	unsigned long			id; /* iommu_group number in sysfs */
+	/* fd of the group node opened under /dev/vfio by vfio_group_init() */
+	int				fd;
+	/* List of struct vfio_device, linked through their 'list' member */
+	struct hlist_head		devices;
+};
+
+int vfio_group_parser(const struct option *opt, const char *arg, int unset);
+int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
+		    struct vfio_region *region);
+void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region);
+int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *device);
+void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev);
+
+#endif /* KVM__VFIO_H */
diff --git a/vfio/core.c b/vfio/core.c
new file mode 100644
index 00000000..7e1ba789
--- /dev/null
+++ b/vfio/core.c
@@ -0,0 +1,395 @@ 
+#include "kvm/kvm.h"
+#include "kvm/vfio.h"
+
+#include <linux/list.h>
+
+#define VFIO_DEV_DIR		"/dev/vfio"
+#define VFIO_DEV_NODE		VFIO_DEV_DIR "/vfio"
+#define IOMMU_GROUP_DIR		"/sys/kernel/iommu_groups"
+
+#define VFIO_PATH_MAX_LEN	16
+
+static int vfio_container;
+
+/*
+ * Option callback for --vfio-group: store the group number found in @arg in
+ * the next free slot of kvm->cfg.vfio_group[] and bump num_vfio_groups.
+ *
+ * @opt:   option descriptor; opt->ptr is the struct kvm being configured
+ * @arg:   option value; only the text before the first ',' is parsed
+ * @unset: unused
+ *
+ * Groups beyond MAX_VFIO_GROUPS are ignored after a single warning.
+ *
+ * Returns 0 on success (or when the group is ignored), -ENOMEM if @arg could
+ * not be duplicated and -EINVAL if @arg contains no group number.
+ */
+int vfio_group_parser(const struct option *opt, const char *arg, int unset)
+{
+	char *cur, *buf;
+	static int idx = 0;
+	struct kvm *kvm = opt->ptr;
+	struct vfio_group *group;
+
+	if (idx >= MAX_VFIO_GROUPS) {
+		/* Warn only for the first group that doesn't fit */
+		if (idx++ == MAX_VFIO_GROUPS)
+			pr_warning("Too many VFIO groups");
+		return 0;
+	}
+
+	/* strtok() modifies its input, so work on a private copy */
+	buf = strdup(arg);
+	if (!buf)
+		return -ENOMEM;
+
+	/* strtok() returns NULL for an empty or comma-only argument */
+	cur = strtok(buf, ",");
+	if (!cur) {
+		pr_err("Invalid VFIO group \"%s\"", arg);
+		free(buf);
+		return -EINVAL;
+	}
+
+	/* Only form the slot pointer once idx is known to be in bounds */
+	group = &kvm->cfg.vfio_group[idx];
+	group->id = strtoul(cur, NULL, 0);
+
+	kvm->cfg.num_vfio_groups = ++idx;
+	free(buf);
+
+	return 0;
+}
+
+/*
+ * mmap() a device region and register the mapping with KVM at
+ * region->guest_phys_addr.
+ *
+ * Regions that cannot be mmap'd are skipped: the function returns 0 and
+ * leaves region->host_addr untouched.
+ *
+ * Returns 0 on success or skip, -errno if mmap() fails, or the error returned
+ * by kvm__register_dev_mem(). On failure nothing stays mapped and
+ * region->host_addr is not left pointing at a stale mapping.
+ */
+int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
+		    struct vfio_region *region)
+{
+	void *base;
+	int ret, prot = 0;
+	/* KVM needs page-aligned regions */
+	u64 map_size = ALIGN(region->info.size, PAGE_SIZE);
+
+	/*
+	 * We don't want to mess about trapping config accesses, so require that
+	 * they can be mmap'd. Note that for PCI, this precludes the use of I/O
+	 * BARs in the guest (we will hide them from Configuration Space, which
+	 * is trapped).
+	 */
+	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) {
+		dev_info(vdev, "ignoring region %u, as it can't be mmap'd",
+			 region->info.index);
+		return 0;
+	}
+
+	/* Mirror the access rights advertised by VFIO */
+	if (region->info.flags & VFIO_REGION_INFO_FLAG_READ)
+		prot |= PROT_READ;
+	if (region->info.flags & VFIO_REGION_INFO_FLAG_WRITE)
+		prot |= PROT_WRITE;
+
+	base = mmap(NULL, region->info.size, prot, MAP_SHARED, vdev->fd,
+		    region->info.offset);
+	if (base == MAP_FAILED) {
+		ret = -errno;
+		dev_err(vdev, "failed to mmap region %u (0x%llx bytes)",
+			region->info.index, region->info.size);
+		return ret;
+	}
+	region->host_addr = base;
+
+	ret = kvm__register_dev_mem(kvm, region->guest_phys_addr, map_size,
+				    region->host_addr);
+	if (ret) {
+		dev_err(vdev, "failed to register region with KVM");
+		/* Don't leak the mapping or leave a dangling host address */
+		munmap(base, region->info.size);
+		region->host_addr = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Undo the mmap() done by vfio_map_region(). Regions that were never mapped
+ * (vfio_map_region() skips regions that can't be mmap'd and leaves host_addr
+ * unset) are ignored, and host_addr is cleared so a second call is harmless.
+ */
+void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region)
+{
+	if (!region->host_addr)
+		return;
+
+	munmap(region->host_addr, region->info.size);
+	region->host_addr = NULL;
+}
+
+/*
+ * Set up one device of an IOMMU group and add it to group->devices.
+ *
+ * @kvm:     VM the device is assigned to
+ * @group:   group owning the device; group->fd is used to get the device fd
+ * @dirpath: sysfs directory listing the group's devices
+ * @name:    entry name of the device inside @dirpath
+ *
+ * Returns 0 on success, and also when no device fd can be obtained (the
+ * device is then skipped). Returns -ENOMEM on allocation failure, -errno if
+ * VFIO_DEVICE_GET_INFO fails, -EINVAL for non-PCI devices, or the error
+ * returned by vfio_pci_setup_device(). On any non-success path everything
+ * allocated here is released again.
+ */
+static int vfio_configure_device(struct kvm *kvm, struct vfio_group *group,
+				 const char *dirpath, const char *name)
+{
+	u32 num_regions;
+	/* Default error for the allocation failures below */
+	int ret = -ENOMEM;
+	char fullpath[PATH_MAX];
+	struct vfio_device *vdev;
+
+	snprintf(fullpath, PATH_MAX, "%s/%s", dirpath, name);
+
+	vdev = calloc(1, sizeof(*vdev));
+	if (!vdev)
+		return -ENOMEM;
+
+	vdev->name = strdup(name);
+	if (!vdev->name)
+		goto err_free_device;
+
+	vdev->sysfs_path = strndup(fullpath, PATH_MAX);
+	if (!vdev->sysfs_path)
+		goto err_free_name;
+
+	vdev->fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+	if (vdev->fd < 0) {
+		dev_err(vdev, "failed to get fd");
+
+		/* The device might be a bridge without an fd */
+		ret = 0;
+		goto err_free_path;
+	}
+
+	/* argsz tells VFIO how large the caller's info structure is */
+	vdev->info.argsz = sizeof(vdev->info);
+	if (ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &vdev->info)) {
+		ret = -errno;
+		dev_err(vdev, "failed to get info");
+		goto err_close_device;
+	}
+
+	/* Reset the device if it supports it; a failed reset is not fatal */
+	if (vdev->info.flags & VFIO_DEVICE_FLAGS_RESET &&
+	    ioctl(vdev->fd, VFIO_DEVICE_RESET) < 0)
+		dev_warn(vdev, "failed to reset device");
+
+	num_regions = vdev->info.num_regions;
+
+	vdev->regions = calloc(num_regions, sizeof(*vdev->regions));
+	if (!vdev->regions) {
+		ret = -ENOMEM;
+		goto err_close_device;
+	}
+
+	/* Now for the bus-specific initialization... */
+	if (vdev->info.flags & VFIO_DEVICE_FLAGS_PCI) {
+		ret = vfio_pci_setup_device(kvm, vdev);
+	} else {
+		dev_warn(vdev, "only vfio-pci is supported");
+		ret = -EINVAL;
+	}
+
+	if (ret)
+		goto err_free_regions;
+
+	dev_info(vdev, "assigned to device number 0x%x in group %lu",
+		 vdev->dev_hdr.dev_num, group->id);
+
+	hlist_add_head(&vdev->list, &group->devices);
+
+	return 0;
+
+	/* Cleanup ladder: each label releases what was acquired before it */
+err_free_regions:
+	free(vdev->regions);
+err_close_device:
+	close(vdev->fd);
+err_free_path:
+	free((void *)vdev->sysfs_path);
+err_free_name:
+	free((void *)vdev->name);
+err_free_device:
+	free(vdev);
+
+	return ret;
+}
+
+/*
+ * For every configured VFIO group, walk its sysfs "devices" directory and
+ * configure each symlink entry found there as a VFIO device.
+ *
+ * Returns 0 on success, -errno if a group directory cannot be opened, or the
+ * first error returned by vfio_configure_device(). The directory stream is
+ * closed on every path, including the error one.
+ */
+static int vfio_configure_iommu_groups(struct kvm *kvm)
+{
+	int i, ret;
+
+	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+		DIR *dir;
+		struct dirent *dirent;
+		char dirpath[PATH_MAX];
+		struct vfio_group *group = &kvm->cfg.vfio_group[i];
+
+		snprintf(dirpath, PATH_MAX, IOMMU_GROUP_DIR "/%lu/devices",
+			 group->id);
+
+		dir = opendir(dirpath);
+		if (!dir) {
+			ret = -errno;
+			pr_err("Failed to open IOMMU group %s", dirpath);
+			return ret;
+		}
+
+		while ((dirent = readdir(dir))) {
+			/* Only symlink entries are treated as devices */
+			if (dirent->d_type != DT_LNK)
+				continue;
+
+			ret = vfio_configure_device(kvm, group, dirpath,
+						    dirent->d_name);
+			if (ret) {
+				/* Don't leak the directory stream on error */
+				closedir(dir);
+				return ret;
+			}
+		}
+
+		if (closedir(dir))
+			pr_warning("Failed to close IOMMU group %s", dirpath);
+	}
+
+	return 0;
+}
+
+/*
+ * Ask the container which IOMMU model it supports and return the first match,
+ * trying the nesting type-1 model, then type-1 v2, then plain type-1.
+ * Returns -ENODEV when VFIO_CHECK_EXTENSION reports none of them.
+ */
+static int vfio_get_iommu_type(void)
+{
+	/* Candidates in order of preference */
+	static const int candidates[] = {
+		VFIO_TYPE1_NESTING_IOMMU,
+		VFIO_TYPE1v2_IOMMU,
+		VFIO_TYPE1_IOMMU,
+	};
+	unsigned int n;
+
+	for (n = 0; n < sizeof(candidates) / sizeof(candidates[0]); n++) {
+		if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, candidates[n]) != 0)
+			return candidates[n];
+	}
+
+	return -ENODEV;
+}
+
+/*
+ * Memory-bank callback: map one bank into the container's IOMMU, readable and
+ * writable, with the guest physical address as IOVA and the bank's host
+ * address as backing memory. @data is unused.
+ *
+ * Returns 0 on success or -errno from VFIO_IOMMU_MAP_DMA.
+ */
+static int vfio_map_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
+{
+	int err;
+	struct vfio_iommu_type1_dma_map dma_map = {
+		.argsz	= sizeof(dma_map),
+		.flags	= VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+		.vaddr	= (unsigned long)bank->host_addr,
+		.iova	= (u64)bank->guest_phys_addr,
+		.size	= bank->size,
+	};
+
+	/* Map the guest memory for DMA (i.e. provide isolation) */
+	if (!ioctl(vfio_container, VFIO_IOMMU_MAP_DMA, &dma_map))
+		return 0;
+
+	/* Capture errno before logging can overwrite it */
+	err = -errno;
+	pr_err("Failed to map 0x%llx -> 0x%llx (%llu) for DMA",
+	       dma_map.iova, dma_map.vaddr, dma_map.size);
+
+	return err;
+}
+
+/*
+ * Memory-bank callback: remove the DMA mapping covering one bank from the
+ * container. The result of the ioctl is ignored and 0 is always returned.
+ * @data is unused.
+ */
+static int vfio_unmap_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
+{
+	struct vfio_iommu_type1_dma_unmap request = {
+		.argsz = sizeof(request),
+		.iova = bank->guest_phys_addr,
+		.size = bank->size,
+	};
+
+	/* Best effort: there is nothing useful to do if unmapping fails */
+	ioctl(vfio_container, VFIO_IOMMU_UNMAP_DMA, &request);
+
+	return 0;
+}
+
+/*
+ * Open the node of an IOMMU group under VFIO_DEV_DIR, check that the group
+ * is viable and attach it to the VFIO container.
+ *
+ * Returns 0 on success. Returns -EINVAL if the group number does not fit in
+ * the path buffer or the group is not viable, and -errno if open() or one of
+ * the ioctls fails. On failure the group fd is closed again and group->fd is
+ * set to -1.
+ */
+static int vfio_group_init(struct kvm *kvm, struct vfio_group *group)
+{
+	int ret, len;
+	char group_node[VFIO_PATH_MAX_LEN];
+	struct vfio_group_status group_status = {
+		.argsz = sizeof(group_status),
+	};
+
+	INIT_HLIST_HEAD(&group->devices);
+	group->fd = -1;
+
+	/*
+	 * A truncated path would name a different, lower-numbered group, so
+	 * refuse group numbers that don't fit rather than open the wrong node.
+	 */
+	len = snprintf(group_node, VFIO_PATH_MAX_LEN, VFIO_DEV_DIR "/%lu",
+		       group->id);
+	if (len < 0 || len >= VFIO_PATH_MAX_LEN) {
+		pr_err("IOMMU group number %lu is out of range", group->id);
+		return -EINVAL;
+	}
+
+	group->fd = open(group_node, O_RDWR);
+	if (group->fd == -1) {
+		ret = -errno;
+		pr_err("Failed to open IOMMU group %s", group_node);
+		return ret;
+	}
+
+	if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) {
+		ret = -errno;
+		pr_err("Failed to determine status of IOMMU group %s",
+		       group_node);
+		goto err_close_group;
+	}
+
+	if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+		pr_err("IOMMU group %s is not viable", group_node);
+		ret = -EINVAL;
+		goto err_close_group;
+	}
+
+	if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &vfio_container)) {
+		ret = -errno;
+		pr_err("Failed to add IOMMU group %s to VFIO container",
+		       group_node);
+		goto err_close_group;
+	}
+
+	return 0;
+
+err_close_group:
+	/* Don't leak the group fd; -1 makes a later close() a harmless no-op */
+	close(group->fd);
+	group->fd = -1;
+
+	return ret;
+}
+
+/*
+ * Release every device of @group, then detach the group from the container
+ * and close its fd.
+ */
+static void vfio_group_exit(struct kvm *kvm, struct vfio_group *group)
+{
+	int fd = group->fd;
+	struct hlist_node *next;
+	struct vfio_device *vdev;
+
+	/* The _safe iterator is required because vdev is freed inside the loop */
+	hlist_for_each_entry_safe(vdev, next, &group->devices, list) {
+		/* Bus-specific teardown runs first, while the device fd is open */
+		if (vdev->info.flags & VFIO_DEVICE_FLAGS_PCI)
+			vfio_pci_teardown_device(kvm, vdev);
+
+		close(vdev->fd);
+
+		free(vdev->regions);
+		free(vdev->name);
+		free(vdev->sysfs_path);
+		free(vdev);
+	}
+
+	/* The result of the detach ioctl is not checked */
+	ioctl(fd, VFIO_GROUP_UNSET_CONTAINER);
+	close(fd);
+}
+
+/*
+ * Open the VFIO container, check its API version, attach every configured
+ * group, select an IOMMU model and finally map all guest RAM banks for DMA.
+ *
+ * Returns 0 on success. Every error path in this function returns a negative
+ * errno value; the final result is whatever kvm__for_each_mem_bank() returns.
+ */
+static int vfio_container_init(struct kvm *kvm)
+{
+	int api, i, ret, iommu_type;
+
+	/* Create a container for our IOMMU groups */
+	vfio_container = open(VFIO_DEV_NODE, O_RDWR);
+	if (vfio_container == -1) {
+		/* Negated, like every other error returned from this file */
+		ret = -errno;
+		pr_err("Failed to open %s", VFIO_DEV_NODE);
+		return ret;
+	}
+
+	api = ioctl(vfio_container, VFIO_GET_API_VERSION);
+	if (api != VFIO_API_VERSION) {
+		pr_err("Unknown VFIO API version %d", api);
+		return -ENODEV;
+	}
+
+	iommu_type = vfio_get_iommu_type();
+	if (iommu_type < 0) {
+		pr_err("VFIO type-1 IOMMU not supported on this platform");
+		return iommu_type;
+	}
+
+	/* Sanity check our groups and add them to the container */
+	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+		ret = vfio_group_init(kvm, &kvm->cfg.vfio_group[i]);
+		if (ret)
+			return ret;
+	}
+
+	/* Finalise the container */
+	if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) {
+		ret = -errno;
+		pr_err("Failed to set IOMMU type %d for VFIO container",
+		       iommu_type);
+		return ret;
+	}
+
+	pr_info("Using IOMMU type %d for VFIO container", iommu_type);
+
+	return kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_map_mem_bank,
+				      NULL);
+}
+
+/*
+ * Init hook: when at least one VFIO group was configured, set up the
+ * container and then the devices of every group. Returns 0 on success or the
+ * first error reported by either step.
+ */
+static int vfio__init(struct kvm *kvm)
+{
+	int err;
+
+	/* Nothing to do when no VFIO group was requested */
+	if (kvm->cfg.num_vfio_groups == 0)
+		return 0;
+
+	err = vfio_container_init(kvm);
+	if (!err)
+		err = vfio_configure_iommu_groups(kvm);
+
+	return err;
+}
+dev_base_init(vfio__init);
+
+/*
+ * Exit hook: release every configured group, unmap the guest RAM banks from
+ * the container and close it. Returns 0 when no group was configured,
+ * otherwise the result of close() on the container fd.
+ */
+static int vfio__exit(struct kvm *kvm)
+{
+	int n = 0;
+
+	if (kvm->cfg.num_vfio_groups == 0)
+		return 0;
+
+	while (n < kvm->cfg.num_vfio_groups) {
+		vfio_group_exit(kvm, &kvm->cfg.vfio_group[n]);
+		n++;
+	}
+
+	kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_unmap_mem_bank, NULL);
+
+	return close(vfio_container);
+}
+dev_base_exit(vfio__exit);
diff --git a/vfio/pci.c b/vfio/pci.c
new file mode 100644
index 00000000..aca43431
--- /dev/null
+++ b/vfio/pci.c
@@ -0,0 +1,365 @@ 
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/vfio.h"
+
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+
+/*
+ * Wrapper around UAPI vfio_irq_set: the request header followed by a single
+ * eventfd. Users in this file hand the whole wrapper to VFIO_DEVICE_SET_IRQS
+ * with argsz = sizeof(wrapper), count = 1 and VFIO_IRQ_SET_DATA_EVENTFD.
+ * NOTE(review): this relies on 'fd' sitting exactly where the kernel expects
+ * the payload of 'irq' - confirm there is no padding between the two members.
+ */
+struct vfio_irq_eventfd {
+	struct vfio_irq_set	irq;	/* UAPI request header */
+	int			fd;	/* eventfd passed along with the header */
+};
+
+/*
+ * Configuration Space read hook. 'data' is left untouched here; the access is
+ * only replayed on the VFIO device fd, into a scratch buffer that is then
+ * discarded, in case reading has side effects on the device. A short or
+ * failed read is reported with a warning.
+ */
+static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			      u8 offset, void *data, int sz)
+{
+	char scratch[sz];
+	ssize_t nr;
+	struct vfio_pci_device *pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
+	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
+	struct vfio_region_info *cfg = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
+	/* The bytes read are thrown away on purpose */
+	nr = pread(vdev->fd, scratch, sz, cfg->offset + offset);
+	if (nr == sz)
+		return;
+
+	dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
+		 sz, offset);
+}
+
+/*
+ * Configuration Space write hook. Forwards the 'sz' bytes at 'data' to the
+ * device's config region at 'offset', then reads the same range back from the
+ * device into the in-memory header so that it holds whatever the device
+ * reports after the write. Both steps warn on a short or failed transfer, and
+ * the read-back is attempted even if the write failed.
+ */
+static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			       u8 offset, void *data, int sz)
+{
+	struct vfio_pci_device *pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
+	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
+	struct vfio_region_info *cfg = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	/* Byte address of the accessed field inside the in-memory header */
+	void *shadow = (u8 *)pci_hdr + offset;
+	/* File offset of the same field in the VFIO config region */
+	u64 where = cfg->offset + offset;
+
+	if (pwrite(vdev->fd, data, sz, where) != sz)
+		dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
+			 sz, offset);
+
+	if (pread(vdev->fd, shadow, sz, where) != sz)
+		dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
+			 sz, offset);
+}
+
+/*
+ * Hide the device's capability list from the in-memory header: when the
+ * status register advertises one, clear PCI_STATUS_CAP_LIST and zero the
+ * capabilities pointer. Always returns 0.
+ */
+static int vfio_pci_parse_caps(struct vfio_device *vdev)
+{
+	struct pci_device_header *hdr = &vdev->pci.hdr;
+
+	if (hdr->status & PCI_STATUS_CAP_LIST) {
+		hdr->status &= ~PCI_STATUS_CAP_LIST;
+		hdr->capabilities = 0;
+
+		/* TODO: install virtual capabilities */
+	}
+
+	return 0;
+}
+
+/*
+ * Query the VFIO Configuration Space region, read PCI_DEV_CFG_SIZE bytes of
+ * it into pdev->hdr and check that the header type is PCI_HEADER_TYPE_NORMAL.
+ * The multifunction bit is stripped from the in-memory header type and the
+ * capability list is hidden via vfio_pci_parse_caps.
+ *
+ * Returns 0 on success or a negative errno-style code.
+ */
+static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
+{
+	struct vfio_region_info *info;
+	ssize_t sz = PCI_DEV_CFG_SIZE;
+	struct vfio_pci_device *pdev = &vdev->pci;
+
+	/*
+	 * num_regions is a count, so the config region only exists when the
+	 * count is strictly greater than its index.
+	 */
+	if (vdev->info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX) {
+		dev_err(vdev, "Config Space not found");
+		return -ENODEV;
+	}
+
+	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	*info = (struct vfio_region_info) {
+			.argsz = sizeof(*info),
+			.index = VFIO_PCI_CONFIG_REGION_INDEX,
+	};
+
+	if (ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info)) {
+		/* Capture errno before logging */
+		int ret = -errno;
+
+		dev_err(vdev, "cannot get info for Config Space region");
+		return ret;
+	}
+
+	if (!info->size) {
+		dev_err(vdev, "Config Space has size zero?!");
+		return -EINVAL;
+	}
+
+	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
+		dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
+		return -EIO;
+	}
+
+	/* Strip bit 7, that indicates multifunction */
+	pdev->hdr.header_type &= 0x7f;
+
+	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
+		dev_err(vdev, "unsupported header type %u",
+			pdev->hdr.header_type);
+		return -EOPNOTSUPP;
+	}
+
+	vfio_pci_parse_caps(vdev);
+
+	return 0;
+}
+
+/*
+ * Rewrite the in-memory Configuration Space header to match what has been set
+ * up for the guest, write the part preceding the 'msix' member back to the
+ * device through VFIO, and install the config access callbacks.
+ *
+ * Returns 0 on success or -EIO when the header cannot be written in full.
+ */
+static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
+{
+	int bar;
+	struct vfio_region_info *cfg;
+	struct vfio_pci_device *pdev = &vdev->pci;
+	ssize_t len = offsetof(struct pci_device_header, msix);
+
+	/* Memory decoding and bus mastering on, I/O port decoding off */
+	pdev->hdr.command &= ~PCI_COMMAND_IO;
+	pdev->hdr.command |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
+
+	for (bar = VFIO_PCI_BAR0_REGION_INDEX; bar <= VFIO_PCI_BAR5_REGION_INDEX; ++bar) {
+		struct vfio_region *region = &vdev->regions[bar];
+		u64 gpa = region->guest_phys_addr;
+
+		/* Regions without a guest address keep their BAR as read */
+		if (gpa) {
+			pdev->hdr.bar_size[bar] = region->info.size;
+
+			/* Advertise a 32-bit memory BAR at the recorded address */
+			pdev->hdr.bar[bar] = (gpa & PCI_BASE_ADDRESS_MEM_MASK) |
+						PCI_BASE_ADDRESS_SPACE_MEMORY |
+						PCI_BASE_ADDRESS_MEM_TYPE_32;
+		}
+	}
+
+	/* CardBus is not supported */
+	pdev->hdr.card_bus = 0;
+
+	/*
+	 * The expansion ROM is hidden for now; exposing it would require
+	 * recording its size and mapping it into the guest.
+	 */
+	pdev->hdr.exp_rom_bar = 0;
+
+	/* Push the edited header, capabilities excluded, to the device */
+	cfg = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	if (pwrite(vdev->fd, &pdev->hdr, len, cfg->offset) != len) {
+		dev_err(vdev, "failed to write %zd bytes to Config Space",
+			len);
+		return -EIO;
+	}
+
+	/* TODO: install virtual capability */
+
+	/* Route guest config accesses through our handlers */
+	pdev->hdr.cfg_ops = (struct pci_config_operations) {
+		.read	= vfio_pci_cfg_read,
+		.write	= vfio_pci_cfg_write,
+	};
+
+	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
+
+	return 0;
+}
+
+/*
+ * Parse the device's Configuration Space, then for each BAR region the device
+ * reports: query its info, reserve a page-aligned block of guest address
+ * space and map the region there with vfio_map_region. Finishes by building
+ * the Configuration Space header presented to the guest.
+ *
+ * Returns 0 on success, a negative errno-style code when the region query
+ * fails, or whatever non-zero value a helper returned.
+ */
+static int vfio_pci_configure_dev_regions(struct kvm *kvm,
+					  struct vfio_device *vdev)
+{
+	u32 i;
+	int ret;
+	size_t map_size;
+
+	ret = vfio_pci_parse_cfg_space(vdev);
+	if (ret)
+		return ret;
+
+	/* First of all, map the BARs directly into the guest */
+	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+		struct vfio_region *region = &vdev->regions[i];
+
+		if (i >= vdev->info.num_regions)
+			break;
+
+		/*
+		 * argsz must describe the buffer handed to the ioctl, which is
+		 * region->info, not the enclosing struct vfio_region.
+		 */
+		region->info = (struct vfio_region_info) {
+			.argsz = sizeof(region->info),
+			.index = i,
+		};
+
+		ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO,
+			    &region->info);
+		if (ret) {
+			/* Capture errno before logging */
+			ret = -errno;
+			dev_err(vdev, "cannot get info for region %u", i);
+			return ret;
+		}
+
+		/* Ignore invalid or unimplemented regions */
+		if (!region->info.size)
+			continue;
+
+		/* Grab some MMIO space in the guest */
+		map_size = ALIGN(region->info.size, PAGE_SIZE);
+		region->guest_phys_addr = pci_get_io_space_block(map_size);
+
+		/*
+		 * Map the BARs into the guest. We'll later need to update
+		 * configuration space to reflect our allocation.
+		 */
+		ret = vfio_map_region(kvm, vdev, region);
+		if (ret)
+			return ret;
+	}
+
+	/* We've configured the BARs, fake up a Configuration Space */
+	return vfio_pci_fixup_cfg_space(vdev);
+}
+
+/*
+ * Wire the device's INTx interrupt to guest interrupt 'gsi' using two
+ * eventfds registered with both irq__add_irqfd and VFIO_DEVICE_SET_IRQS.
+ *
+ * @kvm:   VM the interrupt is delivered to
+ * @devfd: VFIO device file descriptor
+ * @gsi:   guest interrupt number handed to irq__add_irqfd
+ *
+ * Returns 0 on success. On failure everything set up so far is undone and a
+ * negative errno-style code (or the non-zero result of irq__add_irqfd) is
+ * returned.
+ */
+static int vfio_pci_init_irqfd(struct kvm *kvm, int devfd, int gsi)
+{
+	int ret;
+	int trigger_fd, unmask_fd;
+	struct vfio_irq_eventfd	trigger;
+	struct vfio_irq_eventfd	unmask;
+
+	/*
+	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
+	 * signals an interrupt from host to guest, and unmask_fd signals the
+	 * deassertion of the line from guest to host.
+	 */
+	trigger_fd = eventfd(0, 0);
+	if (trigger_fd < 0) {
+		/* Report the real cause rather than eventfd()'s raw -1 */
+		ret = -errno;
+		pr_err("Failed to create trigger eventfd");
+		return ret;
+	}
+
+	unmask_fd = eventfd(0, 0);
+	if (unmask_fd < 0) {
+		ret = -errno;
+		pr_err("Failed to create unmask eventfd");
+		close(trigger_fd);
+		return ret;
+	}
+
+	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
+	if (ret)
+		goto err_close;
+
+	trigger.irq = (struct vfio_irq_set) {
+		.argsz	= sizeof(trigger),
+		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+		.index	= VFIO_PCI_INTX_IRQ_INDEX,
+		.start	= 0,
+		.count	= 1,
+	};
+	trigger.fd = trigger_fd;
+
+	ret = ioctl(devfd, VFIO_DEVICE_SET_IRQS, &trigger);
+	if (ret < 0) {
+		ret = -errno;
+		pr_err("Failed to setup VFIO IRQ");
+		goto err_delete_line;
+	}
+
+	unmask.irq = (struct vfio_irq_set) {
+		.argsz	= sizeof(unmask),
+		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
+		.index	= VFIO_PCI_INTX_IRQ_INDEX,
+		.start	= 0,
+		.count	= 1,
+	};
+	unmask.fd = unmask_fd;
+
+	ret = ioctl(devfd, VFIO_DEVICE_SET_IRQS, &unmask);
+	if (ret < 0) {
+		ret = -errno;
+		pr_err("Failed to setup unmask IRQ");
+		goto err_remove_event;
+	}
+
+	return 0;
+
+err_remove_event:
+	/*
+	 * Remove trigger event. The VFIO UAPI disables a trigger with
+	 * DATA_NONE | ACTION_TRIGGER and count == 0; leaving count at 1 would
+	 * fire the interrupt instead of removing it.
+	 */
+	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+	trigger.irq.count = 0;
+	ioctl(devfd, VFIO_DEVICE_SET_IRQS, &trigger.irq);
+
+err_delete_line:
+	irq__del_irqfd(kvm, gsi, trigger_fd);
+
+err_close:
+	close(trigger_fd);
+	close(unmask_fd);
+	return ret;
+}
+
+/*
+ * Query the device's INTx interrupt through VFIO, check that it can be driven
+ * with eventfds and is auto-masked, then hook it up to the guest interrupt
+ * derived from the header's irq_line.
+ *
+ * Returns 0 on success or a negative errno-style code.
+ */
+static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
+{
+	int ret;
+	struct vfio_pci_device *pdev = &vdev->pci;
+	int gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
+
+	/* Name the queried interrupt explicitly instead of relying on zero-init */
+	vdev->irq_info = (struct vfio_irq_info) {
+		.argsz = sizeof(vdev->irq_info),
+		.index = VFIO_PCI_INTX_IRQ_INDEX,
+	};
+
+	if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &vdev->irq_info)) {
+		/* Capture errno before logging */
+		ret = -errno;
+		dev_err(vdev, "cannot get INTx interrupt info");
+		return ret;
+	}
+
+	if (vdev->irq_info.count == 0) {
+		dev_err(vdev, "no interrupt found by VFIO");
+		return -ENODEV;
+	}
+
+	if (!(vdev->irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+		dev_err(vdev, "interrupt not EVENTFD capable");
+		return -EINVAL;
+	}
+
+	/*
+	 * TODO: add MSI support. Falling back is the expected path for now,
+	 * so this is reported as a warning rather than an error.
+	 */
+	dev_warn(vdev, "MSI-X not available, falling back to INTx");
+
+	if (!(vdev->irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
+		dev_err(vdev, "INTx interrupt not AUTOMASKED");
+		return -EINVAL;
+	}
+
+	return vfio_pci_init_irqfd(kvm, vdev->fd, gsi);
+}
+
+/*
+ * Bring up a VFIO PCI device for the guest in three steps: configure its
+ * regions, register it as a PCI device backed by vdev->pci.hdr, and configure
+ * its interrupts. Each failing step is logged and its result returned.
+ *
+ * Returns 0 on success or the non-zero result of the first failing step.
+ */
+int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
+{
+	int err = vfio_pci_configure_dev_regions(kvm, vdev);
+
+	if (err) {
+		dev_err(vdev, "failed to configure regions");
+		return err;
+	}
+
+	vdev->dev_hdr = (struct device_header) {
+		.bus_type	= DEVICE_BUS_PCI,
+		.data		= &vdev->pci.hdr,
+	};
+
+	err = device__register(&vdev->dev_hdr);
+	if (err) {
+		dev_err(vdev, "failed to register VFIO device");
+		return err;
+	}
+
+	err = vfio_pci_configure_dev_irqs(kvm, vdev);
+	if (err) {
+		dev_err(vdev, "failed to configure IRQs");
+	}
+
+	return err;
+}
+
+/*
+ * Undo vfio_pci_setup_device: call vfio_unmap_region on every region the
+ * device reports, in index order, then unregister the device header.
+ */
+void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
+{
+	size_t idx = 0;
+
+	while (idx < vdev->info.num_regions) {
+		vfio_unmap_region(kvm, &vdev->regions[idx]);
+		idx++;
+	}
+
+	device__unregister(&vdev->dev_hdr);
+}