diff mbox

[RFC,5/5] VFIO based device assignment

Message ID 20100711180942.20121.97368.stgit@localhost6.localdomain6 (mailing list archive)
State New, archived
Headers show

Commit Message

Alex Williamson July 11, 2010, 6:09 p.m. UTC
None
diff mbox

Patch

diff --git a/Makefile.target b/Makefile.target
index 0c1b916..4936d96 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -197,6 +197,7 @@  obj-i386-y += vmmouse.o vmport.o hpet.o
 obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
 obj-i386-y += debugcon.o multiboot.o
 obj-i386-y += pc_piix.o
+obj-i386-y += vfio.o
 
 # shared objects
 obj-ppc-y = ppc.o
diff --git a/hw/linux-vfio.h b/hw/linux-vfio.h
new file mode 100644
index 0000000..06bd3f3
--- /dev/null
+++ b/hw/linux-vfio.h
@@ -0,0 +1,200 @@ 
+/*
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Portions derived from drivers/uio/uio.c:
+ * Copyright(C) 2005, Benedikt Spranger <b.spranger@linutronix.de>
+ * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2006, Hans J. Koch <hjk@linutronix.de>
+ * Copyright(C) 2006, Greg Kroah-Hartman <greg@kroah.com>
+ *
+ * Portions derived from drivers/uio/uio_pci_generic.c:
+ * Copyright (C) 2009 Red Hat, Inc.
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ */
+
+/*
+ * VFIO driver - allow mapping and use of certain PCI devices
+ * in unprivileged user processes. (If IOMMU is present)
+ * Especially useful for Virtual Function parts of SR-IOV devices
+ */
+
+#ifdef __KERNEL__
+
+/*
+ * Per-device state of the VFIO driver.  Declared only under __KERNEL__,
+ * so user-space consumers of this header never see this layout.
+ */
+struct vfio_dev {
+	struct device	*dev;
+	struct pci_dev	*pdev;
+	u8		*pci_config_map;
+	int		pci_config_size;
+	char		name[8];
+	int		devnum;
+	int		pmaster;
+	/* one slot per PCI resource index, up to and including the ROM */
+	void __iomem	*bar[PCI_ROM_RESOURCE+1];
+	spinlock_t	irqlock;	/* guards command register accesses */
+	int		listeners;
+	u32		locked_pages;
+	struct mutex	lgate;		/* listener gate */
+	struct mutex	dgate;		/* dma op gate */
+	struct mutex	igate;		/* intr op gate */
+	struct msix_entry	*msix;
+	int			nvec;
+	struct uiommu_domain	*udomain;
+	int			cachec;
+	/* eventfd contexts, one per interrupt flavour (INTx, MSI, MSI-X) */
+	struct eventfd_ctx	*ev_irq;
+	struct eventfd_ctx	*ev_msi;
+	struct eventfd_ctx	**ev_msix;
+	/*
+	 * NOTE(review): presumably shadow copies of config-space fields
+	 * (6 BARs * 4 bytes, ROM BAR, MSI capability) - confirm against
+	 * the kernel driver.
+	 */
+	struct {
+		u8	intr;
+		u8	bardirty;
+		u8	rombar[4];
+		u8	bar[6*4];
+		u8	msi[24];
+	} vinfo;
+};
+
+/*
+ * Ties a vfio_dev to an mm_struct, a list head (dm_list) and an MMU
+ * notifier.  NOTE(review): presumably one instance per open of the device
+ * node, with dm_list holding that opener's DMA mappings - confirm.
+ */
+struct vfio_listener {
+	struct vfio_dev	*vdev;
+	struct list_head	dm_list;
+	struct mm_struct	*mm;
+	struct mmu_notifier	mmu_notifier;
+};
+
+/*
+ * Structure for keeping track of memory nailed down (pinned) by the
+ * user for DMA.  Kernel-only; the user-visible request structure is
+ * struct vfio_dma_map.
+ */
+struct dma_map_page {
+	struct list_head list;		/* list linkage */
+	struct page     **pages;
+	dma_addr_t      daddr;
+	unsigned long	vaddr;
+	int		npage;		/* presumably entries in pages[] - confirm */
+	int		rdwr;
+};
+
+/*
+ * VFIO class infrastructure: a reference-counted (kref) wrapper around a
+ * struct class pointer, plus the single global instance declared below.
+ */
+struct vfio_class {
+	struct kref kref;
+	struct class *class;
+};
+extern struct vfio_class *vfio_class;
+
+ssize_t vfio_io_readwrite(int, struct vfio_dev *,
+			char __user *, size_t, loff_t *);
+ssize_t vfio_mem_readwrite(int, struct vfio_dev *,
+			char __user *, size_t, loff_t *);
+ssize_t vfio_config_readwrite(int, struct vfio_dev *,
+			char __user *, size_t, loff_t *);
+
+void vfio_disable_msi(struct vfio_dev *);
+void vfio_disable_msix(struct vfio_dev *);
+int vfio_enable_msi(struct vfio_dev *, int);
+int vfio_enable_msix(struct vfio_dev *, int, void __user *);
+
+#ifndef PCI_MSIX_ENTRY_SIZE
+#define	PCI_MSIX_ENTRY_SIZE	16
+#endif
+#ifndef PCI_STATUS_INTERRUPT
+#define	PCI_STATUS_INTERRUPT	0x08
+#endif
+
+struct vfio_dma_map;
+void vfio_dma_unmapall(struct vfio_listener *);
+int vfio_dma_unmap_dm(struct vfio_listener *, struct vfio_dma_map *);
+int vfio_dma_map_common(struct vfio_listener *, unsigned int,
+			struct vfio_dma_map *);
+int vfio_domain_set(struct vfio_dev *, int);
+void vfio_domain_unset(struct vfio_dev *);
+
+int vfio_class_init(void);
+void vfio_class_destroy(void);
+int vfio_dev_add_attributes(struct vfio_dev *);
+extern struct idr vfio_idr;
+extern struct mutex vfio_minor_lock;
+int vfio_build_config_map(struct vfio_dev *);
+
+irqreturn_t vfio_interrupt(int, void *);
+
+#endif	/* __KERNEL__ */
+
+/* Kernel & User level defines for ioctls */
+
+/*
+ * Structure for DMA mapping of user buffers; it is the argument of the
+ * VFIO_DMA_MAP_IOVA and VFIO_DMA_UNMAP ioctls defined in this header.
+ * vaddr, dmaaddr, and size must all be page aligned
+ * buffer may only be larger than 1 page if (a) there is
+ * an iommu in the system, or (b) buffer is part of a huge page
+ *
+ * All members are fixed-width __u64, and the struct sits outside the
+ * __KERNEL__ section so kernel and user space share one definition.
+ */
+struct vfio_dma_map {
+	__u64	vaddr;		/* process virtual addr */
+	__u64	dmaaddr;	/* desired and/or returned dma address */
+	__u64	size;		/* size in bytes */
+	__u64	flags;		/* bool: 0 for r/o; 1 for r/w */
+/* the only flag bit defined in this header */
+#define	VFIO_FLAG_WRITE		0x1	/* req writeable DMA mem */
+};
+
+/* map user pages at specific dma address */
+/* requires previous VFIO_DOMAIN_SET */
+#define	VFIO_DMA_MAP_IOVA	_IOWR(';', 101, struct vfio_dma_map)
+
+/* unmap user pages */
+#define	VFIO_DMA_UNMAP		_IOW(';', 102, struct vfio_dma_map)
+
+/* set device DMA mask & master status */
+#define	VFIO_DMA_MASK		_IOW(';', 103, __u64)
+
+/* request IRQ interrupts; use given eventfd */
+#define	VFIO_EVENTFD_IRQ	_IOW(';', 104, int)
+
+/* request MSI interrupts; use given eventfd */
+#define	VFIO_EVENTFD_MSI	_IOW(';', 105, int)
+
+/* Request MSI-X interrupts: arg[0] is #, arg[1-n] are eventfds */
+#define	VFIO_EVENTFDS_MSIX	_IOW(';', 106, int)
+
+/* Get length of a BAR */
+#define	VFIO_BAR_LEN		_IOWR(';', 107, __u32)
+
+/* Set the IOMMU domain - arg is fd from uiommu driver */
+#define	VFIO_DOMAIN_SET		_IOW(';', 108, int)
+
+/* Unset the IOMMU domain */
+#define	VFIO_DOMAIN_UNSET	_IO(';', 109)
+
+/*
+ * Reads, writes, and mmaps determine which PCI BAR (or config space)
+ * from the high level bits of the file offset
+ */
+#define	VFIO_PCI_BAR0_RESOURCE		0x0
+#define	VFIO_PCI_BAR1_RESOURCE		0x1
+#define	VFIO_PCI_BAR2_RESOURCE		0x2
+#define	VFIO_PCI_BAR3_RESOURCE		0x3
+#define	VFIO_PCI_BAR4_RESOURCE		0x4
+#define	VFIO_PCI_BAR5_RESOURCE		0x5
+#define	VFIO_PCI_ROM_RESOURCE		0x6
+#define	VFIO_PCI_CONFIG_RESOURCE	0xF
+#define	VFIO_PCI_SPACE_SHIFT	32
+#define VFIO_PCI_CONFIG_OFF vfio_pci_space_to_offset(VFIO_PCI_CONFIG_RESOURCE)
+
+/*
+ * Extract the PCI space selector (a VFIO_PCI_*_RESOURCE value) from a
+ * file offset: the four bits starting at bit VFIO_PCI_SPACE_SHIFT.
+ */
+static inline int vfio_offset_to_pci_space(__u64 off)
+{
+	__u64 space = off >> VFIO_PCI_SPACE_SHIFT;
+
+	return space & 0xF;
+}
+
+/*
+ * Build the base file offset of PCI space @sp by placing the selector
+ * at bit VFIO_PCI_SPACE_SHIFT of a 64-bit offset.
+ */
+static inline __u64 vfio_pci_space_to_offset(int sp)
+{
+	__u64 base = (__u64)sp;
+
+	return base << VFIO_PCI_SPACE_SHIFT;
+}
diff --git a/hw/vfio.c b/hw/vfio.c
new file mode 100644
index 0000000..d9ff3d8
--- /dev/null
+++ b/hw/vfio.c
@@ -0,0 +1,1295 @@ 
+/*
+ * vfio based device assignment support
+ *
+ * Copyright Red Hat, Inc. 2010
+ *
+ * Authors:
+ *  Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on qemu-kvm device-assignment:
+ *  Adapted for KVM by Qumranet.
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+
+#include <dirent.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "event_notifier.h"
+#include "hw.h"
+#include "memory.h"
+#include "monitor.h"
+#include "pc.h"
+#include "qemu-error.h"
+#include "vfio.h"
+#include <pci/header.h>
+#include <pci/types.h>
+#include <linux/types.h>
+#include "linux-vfio.h"
+
+//#define DEBUG_VFIO
+#ifdef DEBUG_VFIO
+#define DPRINTF(fmt, ...) \
+    do { printf("vfio: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
+static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
+                                  uint32_t val, int len);
+/*
+ * Generic
+ */
+/*
+ * Walk the capability list held in pdev->config looking for capability
+ * ID @cap.
+ *
+ * Returns the config-space offset of the matching capability, or 0 when
+ * the status register advertises no capability list, the ID is absent,
+ * or the walk stops early (next pointer below 0x40, ID 0xff, or 48
+ * entries visited).
+ */
+static uint8_t pci_find_cap_offset(PCIDevice *pdev, uint8_t cap)
+{
+    int ptr_addr = PCI_CAPABILITY_LIST;   /* where the next pointer lives */
+    int budget;
+
+    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST)) {
+        return 0;
+    }
+
+    /* Bounded walk; presumably a guard against malformed, looping lists. */
+    for (budget = 48; budget > 0; budget--) {
+        int cap_pos = pdev->config[ptr_addr];
+        int cap_id;
+
+        if (cap_pos < 0x40) {
+            return 0;
+        }
+
+        /* The low two bits of the pointer are masked off. */
+        cap_pos &= ~3;
+        cap_id = pdev->config[cap_pos + PCI_CAP_LIST_ID];
+
+        if (cap_id == 0xff) {
+            return 0;
+        }
+        if (cap_id == cap) {
+            return cap_pos;
+        }
+
+        ptr_addr = cap_pos + PCI_CAP_LIST_NEXT;
+    }
+
+    return 0;
+}
+
+/*
+ * Convert one address field to an integer.  The whole field must be
+ * hexadecimal and lie in [0, max]; returns -1 otherwise.
+ */
+static int parse_hostaddr_field(const char *field, long max)
+{
+    char *end;
+    long val = strtol(field, &end, 16);
+
+    if (end == field || *end != '\0' || val < 0 || val > max) {
+        return -1;
+    }
+    return val;
+}
+
+/*
+ * Parse a host PCI address of the form "[seg:]bus:dev.func" (hex fields,
+ * single-digit function) into the PCIHostDevice behind @prop.
+ *
+ * @qdev: device owning the property
+ * @prop: property whose storage receives seg/bus/dev/func
+ * @str:  address string to parse
+ *
+ * Returns 0 on success.  Returns -1 without touching the property when
+ * the string is malformed, a field is not hexadecimal, or a value is out
+ * of range (seg > 0xffff, bus > 0xff, dev > 0x1f, func > 7).  Characters
+ * after the function digit are ignored, as before.
+ */
+static int parse_hostaddr(DeviceState *qdev, Property *prop, const char *str)
+{
+    PCIHostDevice *ptr = qdev_get_prop_ptr(qdev, prop);
+    const char *p = str;
+    int n, seg, bus, dev, func;
+    char field[5];
+
+    if (sscanf(p, "%4[^:]%n", field, &n) != 1 || p[n] != ':') {
+        return -1;
+    }
+
+    seg = parse_hostaddr_field(field, 0xffff);
+    if (seg < 0) {
+        return -1;
+    }
+    p += n + 1;
+
+    if (sscanf(p, "%4[^:]%n", field, &n) != 1) {
+        return -1;
+    }
+
+    if (p[n] == ':') {
+        bus = parse_hostaddr_field(field, 0xff);
+        p += n + 1;
+    } else {
+        /* Short form "bus:dev.func": the first field was the bus. */
+        bus = seg;
+        seg = 0;
+    }
+    if (bus < 0 || bus > 0xff) {
+        return -1;
+    }
+
+    if (sscanf(p, "%4[^.]%n", field, &n) != 1 || p[n] != '.') {
+        return -1;
+    }
+
+    dev = parse_hostaddr_field(field, 0x1f);
+    if (dev < 0) {
+        return -1;
+    }
+    p += n + 1;
+
+    /* A PCI function number is 0-7; '8' and '9' are not valid. */
+    if (*p < '0' || *p > '7') {
+        return -1;
+    }
+
+    func = *p - '0';
+
+    ptr->seg = seg;
+    ptr->bus = bus;
+    ptr->dev = dev;
+    ptr->func = func;
+    return 0;
+}
+
+/*
+ * Format the host PCI address stored behind @prop as "ssss:bb:dd.f" into
+ * @dest (at most @len bytes).  Returns whatever snprintf() returns.
+ */
+static int print_hostaddr(DeviceState *qdev, Property *prop,
+                          char *dest, size_t len)
+{
+    PCIHostDevice *host = qdev_get_prop_ptr(qdev, prop);
+    int written;
+
+    written = snprintf(dest, len, "%04x:%02x:%02x.%x",
+                       host->seg, host->bus, host->dev, host->func);
+    return written;
+}
+
+/*
+ * MSI-X
+ */
+/*
+ * Read @len bytes from the shadow MSI-X table at the page offset given
+ * by the low 12 bits of @addr.  Bytes beyond @len in the result are zero.
+ */
+static uint32_t msix_mmio_read(VFIODevice *vdev,
+                               target_phys_addr_t addr, int len)
+{
+    unsigned int offset = addr & 0xfff;
+    const uint8_t *table_bytes = (uint8_t *)vdev->msix.table;
+    uint32_t val = 0;
+
+    memcpy(&val, table_bytes + offset, len);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, 0x%lx, 0x%x) = 0x%x\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, addr, len, val);
+
+    return val;
+}
+
+/*
+ * Width-specific read callbacks for the MSI-X table page.  @opaque is
+ * the VFIODevice; each one forwards to msix_mmio_read() with its width.
+ */
+static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr)
+{
+    VFIODevice *vdev = opaque;
+
+    return msix_mmio_read(vdev, addr, 4);
+}
+
+static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr)
+{
+    VFIODevice *vdev = opaque;
+
+    return msix_mmio_read(vdev, addr, 2);
+}
+
+static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr)
+{
+    VFIODevice *vdev = opaque;
+
+    return msix_mmio_read(vdev, addr, 1);
+}
+
+/* Read dispatch table, ordered byte, word, long. */
+static CPUReadMemoryFunc *msix_mmio_reads[] = {
+    msix_mmio_readb,
+    msix_mmio_readw,
+    msix_mmio_readl,
+};
+
+/*
+ * Store @len bytes of @val into the shadow MSI-X table at the page
+ * offset taken from the low 12 bits of @addr.  When MSI-X is enabled and
+ * the write lands at offset 0xc within a 16-byte entry, the same bytes
+ * are also written through to the device BAR via the vfio fd.
+ */
+static void msix_mmio_write(VFIODevice *vdev, target_phys_addr_t addr,
+                            uint32_t val, int len)
+{
+    unsigned int offset = addr & 0xfff;
+    uint8_t *table_bytes = (uint8_t *)vdev->msix.table;
+    uint64_t off;
+
+    memcpy(table_bytes + offset, &val, len);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, 0x%lx, 0x%x, 0x%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, addr, val, len);
+
+    /* Only writes at entry offset 0xc, with MSI-X enabled, reach hardware. */
+    if ((offset & 0xf) != 0xc || !vdev->msix.enabled) {
+        return;
+    }
+
+    off = vdev->msix.bar_offset + offset +
+          vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE +
+                                   vdev->msix.bar);
+    if (pwrite(vdev->vfiofd, &val, len, off) != len) {
+        fprintf(stderr, "vfio: Error: Failed to update MSIX table ctrl\n");
+    }
+}
+
+/*
+ * Width-specific write callbacks for the MSI-X table page.  @opaque is
+ * the VFIODevice; each one forwards to msix_mmio_write() with its width.
+ */
+static void msix_mmio_writel(void *opaque,
+                             target_phys_addr_t addr, uint32_t val)
+{
+    VFIODevice *vdev = opaque;
+
+    msix_mmio_write(vdev, addr, val, 4);
+}
+
+static void msix_mmio_writew(void *opaque,
+                             target_phys_addr_t addr, uint32_t val)
+{
+    VFIODevice *vdev = opaque;
+
+    msix_mmio_write(vdev, addr, val, 2);
+}
+
+static void msix_mmio_writeb(void *opaque,
+                             target_phys_addr_t addr, uint32_t val)
+{
+    VFIODevice *vdev = opaque;
+
+    msix_mmio_write(vdev, addr, val, 1);
+}
+
+/* Write dispatch table, ordered byte, word, long. */
+static CPUWriteMemoryFunc *msix_mmio_writes[] = {
+    msix_mmio_writeb,
+    msix_mmio_writew,
+    msix_mmio_writel,
+};
+
+/*
+ * fd handler for one MSI-X vector's eventfd.  @opaque is the MSIXEvent.
+ * If the notifier had fired, deliver the message by storing the table
+ * entry's data word at the entry's 64-bit address in guest memory.
+ */
+static void vfio_msix_interrupt(void *opaque)
+{
+    MSIXEvent *event = opaque;
+    uint64_t msg_addr;
+    uint32_t msg_data;
+
+    if (!event_notifier_test_and_clear(&event->notifier)) {
+        return;
+    }
+
+    /* Table entry fields are little-endian; assemble high:low. */
+    msg_addr = (uint64_t)le32_to_cpu(event->entry->upper_addr) << 32;
+    msg_addr |= le32_to_cpu(event->entry->addr);
+    msg_data = le32_to_cpu(event->entry->data);
+
+    DPRINTF("%s: 0x%x -> 0x%lx\n", __FUNCTION__, msg_data, msg_addr);
+    stl_phys(msg_addr, msg_data);
+}
+
+/*
+ * Enable MSI-X delivery for @vdev.
+ *
+ * Counts the leading shadow-table entries with a non-zero address,
+ * creates one event notifier per such vector, hands the eventfds to the
+ * kernel with VFIO_EVENTFDS_MSIX (fds[0] is the count, fds[1..n] the
+ * descriptors) and writes each entry's ctrl word through to the device.
+ *
+ * On any failure every notifier and fd handler set up so far is torn
+ * down and the events array is freed.  This matters because
+ * vfio_disable_msix() does nothing while msix.enabled is 0, so leftovers
+ * would never be reclaimed.
+ */
+static void vfio_enable_msix(VFIODevice *vdev)
+{
+    int i, vectors, *fds;
+    uint64_t off = vdev->msix.bar_offset +
+                   vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE +
+                                            vdev->msix.bar);
+
+    /* Hmm, it's probably possible for a driver to set up fewer than
+     * the full table of vectors... right?
+     */
+    for (i = 0; i < vdev->msix.table_len; i++) {
+        if (!vdev->msix.table[i].addr) {
+            break;
+        }
+    }
+
+    vectors = i;
+    if (!vectors) {
+        fprintf(stderr, "vfio: Error: no MSIX vectors enabled\n");
+        return;
+    }
+
+    vdev->msix.events = qemu_mallocz(vectors * sizeof(MSIXEvent));
+    vdev->msix.num_events = vectors;
+    fds = qemu_malloc((vectors + 1) * sizeof(int));
+    fds[0] = vectors;
+
+    for (i = 0; i < vectors; i++) {
+        vdev->msix.events[i].entry = &vdev->msix.table[i];
+        if (event_notifier_init(&vdev->msix.events[i].notifier, 0)) {
+            fprintf(stderr, "vfio: Error: event_notifier_init failed\n");
+            goto fail;
+        }
+
+        fds[i + 1] = event_notifier_get_fd(&vdev->msix.events[i].notifier);
+        qemu_set_fd_handler(fds[i + 1], vfio_msix_interrupt, NULL,
+                            &vdev->msix.events[i]);
+    }
+
+    if (ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, fds)) {
+        /* Report before cleanup so errno is still the ioctl's. */
+        fprintf(stderr, "vfio: Error: Failed to setup MSIX fds %s\n",
+                strerror(errno));
+        goto fail;
+    }
+
+    qemu_free(fds);
+
+    for (i = 0; i < vectors; i++) {
+        MSIXTableEntry *te = &vdev->msix.table[i];
+        if (pwrite(vdev->vfiofd, &te->ctrl, sizeof(te->ctrl),
+                   off + (i * sizeof(MSIXTableEntry)) +
+                   offsetof(MSIXTableEntry, ctrl)) != sizeof(te->ctrl)) {
+            fprintf(stderr, "vfio: Error: Failed to update MSIX table ctrl\n");
+        }
+    }
+    vdev->msix.enabled = 1;
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func);
+    return;
+
+fail:
+    /* i is the number of notifiers that were fully set up. */
+    while (i-- > 0) {
+        qemu_set_fd_handler(fds[i + 1], NULL, NULL, NULL);
+        event_notifier_cleanup(&vdev->msix.events[i].notifier);
+    }
+    qemu_free(fds);
+    qemu_free(vdev->msix.events);
+    vdev->msix.events = NULL;
+    vdev->msix.num_events = 0;
+}
+
+/*
+ * Undo vfio_enable_msix(): detach and destroy every vector's notifier,
+ * issue VFIO_EVENTFDS_MSIX with a count of zero, and release the events
+ * array.  Does nothing if MSI-X is not currently enabled.
+ */
+static void vfio_disable_msix(VFIODevice *vdev)
+{
+    uint32_t none = 0;
+    int idx;
+
+    if (!vdev->msix.enabled) {
+        return;
+    }
+
+    for (idx = 0; idx < vdev->msix.num_events; idx++) {
+        MSIXEvent *ev = &vdev->msix.events[idx];
+        int fd = event_notifier_get_fd(&ev->notifier);
+
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+        event_notifier_cleanup(&ev->notifier);
+    }
+
+    ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, &none);
+
+    qemu_free(vdev->msix.events);
+    vdev->msix.events = NULL;
+    vdev->msix.num_events = 0;
+    vdev->msix.enabled = 0;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func);
+}
+
+/*
+ * MSI
+ */
+/*
+ * fd handler for an MSI vector's eventfd.  @opaque is the MSIEvent.
+ * If the notifier had fired, read the message address (64-bit when
+ * upper_addr is set) and the 16-bit data through the event's pointers
+ * and store the data at that address in guest memory.
+ */
+static void vfio_msi_interrupt(void *opaque)
+{
+    MSIEvent *event = opaque;
+    uint64_t msg_addr;
+    uint32_t msg_data;
+
+    if (!event_notifier_test_and_clear(&event->notifier)) {
+        return;
+    }
+
+    msg_addr = pci_get_long(event->addr);
+    if (event->upper_addr) {
+        /* 64-bit capable: the upper dword supplies bits 63:32. */
+        msg_addr |= (uint64_t)pci_get_long(event->upper_addr) << 32;
+    }
+    msg_data = pci_get_word(event->data);
+
+    DPRINTF("%s: 0x%x -> 0x%lx\n", __FUNCTION__, msg_data, msg_addr);
+    stl_phys(msg_addr, msg_data);
+}
+
+/*
+ * Enable MSI delivery for @vdev.
+ *
+ * Reads the MSI flags register to learn how many vectors the guest
+ * configured, creates one event notifier per vector pointing at the MSI
+ * address/data fields in the emulated config space, and registers the
+ * first eventfd with the kernel through VFIO_EVENTFD_MSI.
+ *
+ * More than one configured vector is still fatal (abort()), as before.
+ * On any other failure the notifiers, fd handlers and events array
+ * created here are released, since vfio_disable_msi() does nothing
+ * while msi.enabled is 0.
+ */
+static void vfio_enable_msi(VFIODevice *vdev)
+{
+    int i, vectors, *fds;
+    uint16_t ctrl = vfio_pci_read_config(&vdev->pdev,
+                                         vdev->msi.pos + PCI_MSI_FLAGS,
+                                         sizeof(ctrl));
+    /* 16-bit register: use the 16-bit conversion, as vfio_unmask_intx does */
+    ctrl = le16_to_cpu(ctrl);
+    vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);
+
+    if (vectors > 32) {
+        fprintf(stderr, "vfio: Error: Invalid configured MSI vectors %d\n",
+                vectors);
+        return;
+    }
+
+    vdev->msi.events = qemu_mallocz(vectors * sizeof(MSIEvent));
+    vdev->msi.num_events = vectors;
+    fds = qemu_malloc((vectors + 1) * sizeof(int));
+    fds[0] = vectors;
+
+    for (i = 0; i < vectors; i++) {
+        vdev->msi.events[i].addr = vdev->pdev.config +
+                                   vdev->msi.pos + PCI_MSI_ADDRESS_LO;
+        if (ctrl & PCI_MSI_FLAGS_64BIT) {
+            vdev->msi.events[i].upper_addr = vdev->pdev.config +
+                                             vdev->msi.pos +
+                                             PCI_MSI_ADDRESS_HI;
+            vdev->msi.events[i].data = vdev->pdev.config +
+                                             vdev->msi.pos + PCI_MSI_DATA_64;
+        } else {
+            vdev->msi.events[i].upper_addr = NULL;
+            vdev->msi.events[i].data = vdev->pdev.config +
+                                             vdev->msi.pos + PCI_MSI_DATA_32;
+        }
+
+        if (event_notifier_init(&vdev->msi.events[i].notifier, 0)) {
+            fprintf(stderr, "vfio: Error: event_notifier_init failed\n");
+            goto fail;
+        }
+        fds[i + 1] = event_notifier_get_fd(&vdev->msi.events[i].notifier);
+        qemu_set_fd_handler(fds[i + 1], vfio_msi_interrupt, NULL,
+                            &vdev->msi.events[i]);
+    }
+
+    /* FIXME: current vfio only supports 1 MSI */
+    if (vectors > 1) {
+        fprintf(stderr, "vfio: Error: only support 1 MSI vector, want %d\n",
+                vectors);
+        abort();
+    }
+    if (ioctl(vdev->vfiofd, VFIO_EVENTFD_MSI, &fds[1])) {
+        /* Report before cleanup so errno is still the ioctl's. */
+        fprintf(stderr, "vfio: Error: Failed to setup MSI fds %s\n",
+                strerror(errno));
+        goto fail;
+    }
+
+    qemu_free(fds);
+    vdev->msi.enabled = 1;
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func);
+    return;
+
+fail:
+    /* i is the number of notifiers that were fully set up. */
+    while (i-- > 0) {
+        qemu_set_fd_handler(fds[i + 1], NULL, NULL, NULL);
+        event_notifier_cleanup(&vdev->msi.events[i].notifier);
+    }
+    qemu_free(fds);
+    qemu_free(vdev->msi.events);
+    vdev->msi.events = NULL;
+    vdev->msi.num_events = 0;
+}
+
+/*
+ * Undo vfio_enable_msi(): detach and destroy each vector's notifier,
+ * issue VFIO_EVENTFD_MSI with an argument of -1, and free the events
+ * array.  Does nothing if MSI is not currently enabled.
+ */
+static void vfio_disable_msi(VFIODevice *vdev)
+{
+    uint32_t no_fd = -1;
+    int idx;
+
+    if (!vdev->msi.enabled) {
+        return;
+    }
+
+    for (idx = 0; idx < vdev->msi.num_events; idx++) {
+        MSIEvent *ev = &vdev->msi.events[idx];
+        int fd = event_notifier_get_fd(&ev->notifier);
+
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+        event_notifier_cleanup(&ev->notifier);
+    }
+
+    ioctl(vdev->vfiofd, VFIO_EVENTFD_MSI, &no_fd);
+
+    qemu_free(vdev->msi.events);
+    vdev->msi.events = NULL;
+    vdev->msi.num_events = 0;
+    vdev->msi.enabled = 0;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func);
+}
+
+/*
+ * INTx
+ */
+/*
+ * Clear PCI_COMMAND_INTX_DISABLE in the command register with a
+ * read-modify-write through the vfio config accessors.
+ */
+static void vfio_unmask_intx(VFIODevice *vdev)
+{
+    uint16_t command;
+
+    command = vfio_pci_read_config(&vdev->pdev, PCI_COMMAND,
+                                   sizeof(command));
+    command = cpu_to_le16(le16_to_cpu(command) & ~PCI_COMMAND_INTX_DISABLE);
+    vfio_pci_write_config(&vdev->pdev, PCI_COMMAND, command,
+                          sizeof(command));
+}
+
+/*
+ * fd handler for the INTx eventfd.  @opaque is the VFIODevice.
+ * If the notifier had fired, mark the interrupt pending and raise the
+ * device's emulated INTx line; vfio_eoi() lowers it again.
+ */
+static void vfio_intx_interrupt(void *opaque)
+{
+    VFIODevice *vdev = opaque;
+
+    if (!event_notifier_test_and_clear(&vdev->intx.notifier)) {
+        return;
+    }
+
+    /* DPRINTF takes the format first; a stream argument breaks DEBUG_VFIO builds */
+    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, 'A' + vdev->intx.pin);
+
+    vdev->intx.pending = 1;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1);
+}
+
+/*
+ * IOAPIC end-of-interrupt callback.  When an INTx is pending for the
+ * device owning @client, clear the pending flag, lower the emulated
+ * line and unmask INTx on the device again.
+ */
+static void vfio_eoi(ioapic_eoi_client *client)
+{
+    VFIODevice *vdev = container_of(client, VFIODevice, intx.eoi_client);
+
+    if (vdev->intx.pending) {
+        DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n",
+                __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+                vdev->host.func);
+
+        vdev->intx.pending = 0;
+        qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+        vfio_unmask_intx(vdev);
+    }
+}
+
+/*
+ * Set up legacy INTx forwarding for @vdev.
+ *
+ * Reads the interrupt pin; a device without one (pin 0) needs nothing
+ * and yields 0.  Otherwise registers an IOAPIC EOI client, creates an
+ * event notifier, passes its fd to the kernel with VFIO_EVENTFD_IRQ and
+ * unmasks INTx on the device.
+ *
+ * Returns 0 on success and -1 on failure.  On failure everything
+ * registered here is unwound, because vfio_disable_intx() does nothing
+ * while intx.enabled is 0.
+ */
+static int vfio_enable_intx(VFIODevice *vdev)
+{
+    int fd;
+
+    vdev->intx.pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
+    if (!vdev->intx.pin) {
+        return 0;
+    }
+
+    vdev->intx.pin--; /* Pin A (1) -> irq[0] */
+    vdev->intx.eoi_client.eoi = vfio_eoi;
+    vdev->intx.eoi_client.irq = pci_get_byte(vdev->pdev.config +
+                                             PCI_INTERRUPT_LINE);
+    ioapic_register_eoi_client(&vdev->intx.eoi_client);
+
+    if (event_notifier_init(&vdev->intx.notifier, 0)) {
+        fprintf(stderr, "vfio: Error: event_notifier_init failed\n");
+        ioapic_unregister_eoi_client(&vdev->intx.eoi_client);
+        return -1;
+    }
+
+    fd = event_notifier_get_fd(&vdev->intx.notifier);
+    qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
+    if (ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &fd)) {
+        /* Report before cleanup so errno is still the ioctl's. */
+        fprintf(stderr, "vfio: Error: Failed to setup INTx fd %s\n",
+                strerror(errno));
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+        event_notifier_cleanup(&vdev->intx.notifier);
+        ioapic_unregister_eoi_client(&vdev->intx.eoi_client);
+        return -1;
+    }
+    vfio_unmask_intx(vdev);
+    vdev->intx.enabled = 1;
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func);
+
+    return 0;
+}
+
+/*
+ * Undo vfio_enable_intx(): unregister the EOI client, detach and destroy
+ * the notifier, and issue VFIO_EVENTFD_IRQ with an fd of -1.  Does
+ * nothing if INTx is not currently enabled.
+ */
+static void vfio_disable_intx(VFIODevice *vdev)
+{
+    int notifier_fd;
+    int no_fd = -1;
+
+    if (!vdev->intx.enabled) {
+        return;
+    }
+
+    ioapic_unregister_eoi_client(&vdev->intx.eoi_client);
+
+    notifier_fd = event_notifier_get_fd(&vdev->intx.notifier);
+    qemu_set_fd_handler(notifier_fd, NULL, NULL, NULL);
+    event_notifier_cleanup(&vdev->intx.notifier);
+
+    ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &no_fd);
+    vdev->intx.enabled = 0;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func);
+}
+
+/*
+ * IO Port/MMIO
+ */
+/*
+ * Write @len bytes of @val to offset @addr inside the BAR described by
+ * @res, using pwrite() on the vfio fd.  A failed or short write is
+ * reported on stderr and otherwise ignored.
+ */
+static void vfio_resource_write(PCIResource *res, uint32_t addr,
+                                uint32_t val, int len)
+{
+    /*
+     * The BAR selector sits above bit 32 of the file offset, so the
+     * offset must be 64 bits wide; size_t would truncate it to 0 on
+     * 32-bit hosts.
+     */
+    uint64_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE +
+                                               res->bar);
+
+    if (pwrite(res->vfiofd, &val, len, offset + addr) != len) {
+        fprintf(stderr, "%s(,0x%x, 0x%x, %d) failed: %s\n",
+                __FUNCTION__, addr, val, len, strerror(errno));
+    }
+    DPRINTF("%s(BAR%d+0x%x, 0x%x, %d)\n",
+            __FUNCTION__, res->bar, addr, val, len);
+}
+
+/*
+ * Width-specific MMIO write callbacks for a BAR.  @opaque is the
+ * PCIResource; each one forwards to vfio_resource_write() with its width.
+ */
+static void vfio_resource_writeb(void *opaque, target_phys_addr_t addr,
+                                 uint32_t val)
+{
+    PCIResource *res = opaque;
+
+    vfio_resource_write(res, addr, val, 1);
+}
+
+static void vfio_resource_writew(void *opaque, target_phys_addr_t addr,
+                                 uint32_t val)
+{
+    PCIResource *res = opaque;
+
+    vfio_resource_write(res, addr, val, 2);
+}
+
+static void vfio_resource_writel(void *opaque, target_phys_addr_t addr,
+                                 uint32_t val)
+{
+    PCIResource *res = opaque;
+
+    vfio_resource_write(res, addr, val, 4);
+}
+
+/* Write dispatch table, ordered byte, word, long. */
+static CPUWriteMemoryFunc * const vfio_resource_writes[] = {
+    vfio_resource_writeb,
+    vfio_resource_writew,
+    vfio_resource_writel,
+};
+
+/*
+ * Width-specific I/O port write callbacks.  @opaque is the PCIResource.
+ * The port number is made BAR-relative by subtracting the mapped base
+ * (e_phys) before forwarding to vfio_resource_write().
+ */
+static void vfio_ioport_writeb(void *opaque, uint32_t addr, uint32_t val)
+{
+    PCIResource *res = opaque;
+    uint32_t bar_off = addr - res->e_phys;
+
+    vfio_resource_write(res, bar_off, val, 1);
+}
+
+static void vfio_ioport_writew(void *opaque, uint32_t addr, uint32_t val)
+{
+    PCIResource *res = opaque;
+    uint32_t bar_off = addr - res->e_phys;
+
+    vfio_resource_write(res, bar_off, val, 2);
+}
+
+static void vfio_ioport_writel(void *opaque, uint32_t addr, uint32_t val)
+{
+    PCIResource *res = opaque;
+    uint32_t bar_off = addr - res->e_phys;
+
+    vfio_resource_write(res, bar_off, val, 4);
+}
+
+/*
+ * Read @len bytes from offset @addr inside the BAR described by @res,
+ * using pread() on the vfio fd.
+ *
+ * Returns the value read, with bytes beyond @len zeroed, or 0xffffffff
+ * after reporting on stderr when the read fails or is short.
+ */
+static uint32_t vfio_resource_read(PCIResource *res, uint32_t addr, int len)
+{
+    /*
+     * The BAR selector sits above bit 32 of the file offset, so the
+     * offset must be 64 bits wide; size_t would truncate it to 0 on
+     * 32-bit hosts.
+     */
+    uint64_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE +
+                                               res->bar);
+    /* Zeroed so 1- and 2-byte reads never expose stale stack bytes. */
+    uint32_t val = 0;
+
+    if (pread(res->vfiofd, &val, len, offset + addr) != len) {
+        fprintf(stderr, "%s(,0x%x, %d) failed: %s\n",
+                __FUNCTION__, addr, len, strerror(errno));
+        return 0xffffffffU;
+    }
+    DPRINTF("%s(BAR%d+0x%x, %d) = 0x%x\n",
+            __FUNCTION__, res->bar, addr, len, val);
+    return val;
+}
+
+/*
+ * Width-specific MMIO read callbacks for a BAR.  @opaque is the
+ * PCIResource.  Byte and word results are masked to their width.
+ */
+static uint32_t vfio_resource_readb(void *opaque, target_phys_addr_t addr)
+{
+    PCIResource *res = opaque;
+    uint32_t raw = vfio_resource_read(res, addr, 1);
+
+    return raw & 0xff;
+}
+
+static uint32_t vfio_resource_readw(void *opaque, target_phys_addr_t addr)
+{
+    PCIResource *res = opaque;
+    uint32_t raw = vfio_resource_read(res, addr, 2);
+
+    return raw & 0xffff;
+}
+
+static uint32_t vfio_resource_readl(void *opaque, target_phys_addr_t addr)
+{
+    PCIResource *res = opaque;
+
+    return vfio_resource_read(res, addr, 4);
+}
+
+/* Read dispatch table, ordered byte, word, long. */
+static CPUReadMemoryFunc * const vfio_resource_reads[] = {
+    vfio_resource_readb,
+    vfio_resource_readw,
+    vfio_resource_readl,
+};
+
+/*
+ * Width-specific I/O port read callbacks.  @opaque is the PCIResource.
+ * The port number is made BAR-relative by subtracting the mapped base
+ * (e_phys); byte and word results are masked to their width.
+ */
+static uint32_t vfio_ioport_readb(void *opaque, uint32_t addr)
+{
+    PCIResource *res = opaque;
+    uint32_t bar_off = addr - res->e_phys;
+
+    return vfio_resource_read(res, bar_off, 1) & 0xff;
+}
+
+static uint32_t vfio_ioport_readw(void *opaque, uint32_t addr)
+{
+    PCIResource *res = opaque;
+    uint32_t bar_off = addr - res->e_phys;
+
+    return vfio_resource_read(res, bar_off, 2) & 0xffff;
+}
+
+static uint32_t vfio_ioport_readl(void *opaque, uint32_t addr)
+{
+    PCIResource *res = opaque;
+    uint32_t bar_off = addr - res->e_phys;
+
+    return vfio_resource_read(res, bar_off, 4);
+}
+
+/*
+ * BAR map callback for I/O port BARs: record where BAR @bar was mapped
+ * (@e_phys, @e_size) and register the 1/2/4-byte port write and read
+ * handlers over that range, with the PCIResource as their opaque.
+ */
+static void vfio_ioport_map(PCIDevice *pdev, int bar,
+                           pcibus_t e_phys, pcibus_t e_size, int type)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    PCIResource *bar_res = &vdev->resources[bar];
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus,
+            vdev->host.dev, vdev->host.func, bar, e_phys, e_size, type);
+
+    bar_res->e_phys = e_phys;
+    bar_res->e_size = e_size;
+
+    /* writes */
+    register_ioport_write(e_phys, e_size, 1, vfio_ioport_writeb, bar_res);
+    register_ioport_write(e_phys, e_size, 2, vfio_ioport_writew, bar_res);
+    register_ioport_write(e_phys, e_size, 4, vfio_ioport_writel, bar_res);
+
+    /* reads */
+    register_ioport_read(e_phys, e_size, 1, vfio_ioport_readb, bar_res);
+    register_ioport_read(e_phys, e_size, 2, vfio_ioport_readw, bar_res);
+    register_ioport_read(e_phys, e_size, 4, vfio_ioport_readl, bar_res);
+}
+
+/*
+ * BAR map callback for memory BARs: record where BAR @bar was mapped
+ * and register the backing memory for the guest-physical range.
+ *
+ * A zero-sized mapping only updates the bookkeeping.  A size different
+ * from the resource size is treated as fatal.  For the BAR holding the
+ * MSI-X table, the range is split in up to three pieces: the part below
+ * the table, one page overlaid with the emulated table (msix.index),
+ * and the part above it.  Other BARs are registered whole, through
+ * io_mem when the resource is marked slow, else through memory_index[0].
+ */
+static void vfio_iomem_map(PCIDevice *pdev, int bar,
+                           pcibus_t e_phys, pcibus_t e_size, int type)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    PCIResource *bar_res = &vdev->resources[bar];
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus,
+            vdev->host.dev, vdev->host.func, bar, e_phys, e_size, type);
+
+    bar_res->e_phys = e_phys;
+    bar_res->e_size = e_size;
+
+    if (e_size == 0) {
+        return;
+    }
+
+    if (e_size != bar_res->size) {
+        fprintf(stderr, "vfio: Error: partial BAR map?\n");
+        abort();
+    }
+
+    if (!bar_res->msix) {
+        if (bar_res->slow) {
+            cpu_register_physical_memory(e_phys, e_size, bar_res->io_mem);
+        } else {
+            cpu_register_physical_memory(e_phys, e_size,
+                                         bar_res->memory_index[0]);
+        }
+        return;
+    }
+
+    /* Region below the MSI-X table, if any. */
+    if (bar_res->msix_offset > 0) {
+        cpu_register_physical_memory(e_phys, bar_res->msix_offset,
+                                     bar_res->memory_index[0]);
+    }
+
+    DPRINTF("Overlaying MSI-X table page\n");
+    cpu_register_physical_memory(e_phys + bar_res->msix_offset,
+                                 TARGET_PAGE_SIZE, vdev->msix.index);
+
+    /* Region above the table page, if any. */
+    if (bar_res->size > bar_res->msix_offset + 0x1000) {
+        cpu_register_physical_memory(e_phys + bar_res->msix_offset + 0x1000,
+                                     bar_res->size - bar_res->msix_offset -
+                                     0x1000,
+                                     bar_res->memory_index[1]);
+    }
+}
+
+/*
+ * PCI config space
+ */
+/*
+ * Config-space read handler.
+ *
+ * Performs a @len-byte pread() of the physical device's config space at
+ * @addr.  If that fails or is short, reports on stderr and returns -1
+ * (all ones).  Otherwise the value returned to the caller comes from
+ * pci_default_read_config(), i.e. the emulated config space; the value
+ * read from the device is used only in the debug trace.
+ */
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    uint32_t hw_val = 0;
+    ssize_t got;
+
+    got = pread(vdev->vfiofd, &hw_val, len, VFIO_PCI_CONFIG_OFF + addr);
+    if (got != len) {
+        fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %s\n",
+                __FUNCTION__, vdev->host.seg, vdev->host.bus,
+                vdev->host.dev, vdev->host.func, addr, len, strerror(errno));
+        return -1;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) %x\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus,
+            vdev->host.dev, vdev->host.func, addr, len, hw_val);
+
+    return pci_default_read_config(pdev, addr, len);
+}
+
+/*
+ * Config-space write handler.
+ *
+ * Writes @val through to the physical device (a failure is reported but
+ * not fatal), then reacts to writes that hit exactly the MSI-X or MSI
+ * flags register by enabling or disabling that interrupt mode when the
+ * enable bit changes.  A write to PCI_INTERRUPT_LINE updates the IRQ
+ * tracked by the EOI client.  Finally the emulated config space is
+ * updated via pci_default_write_config().
+ */
+static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
+                                  uint32_t val, int len)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    ssize_t put;
+
+    put = pwrite(vdev->vfiofd, &val, len, VFIO_PCI_CONFIG_OFF + addr);
+    if (put != len) {
+        fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %s\n",
+                __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+                vdev->host.func, addr, val, len, strerror(errno));
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, addr, val, len);
+
+    if (vdev->msix.pos && (addr == vdev->msix.pos + PCI_MSIX_FLAGS)) {
+        int want_msix = (val & PCI_MSIX_FLAGS_ENABLE) != 0;
+
+        if (vdev->msix.enabled && !want_msix) {
+            vfio_disable_msix(vdev);
+        } else if (!vdev->msix.enabled && want_msix) {
+            vfio_enable_msix(vdev);
+        }
+    }
+
+    if (vdev->msi.pos && (addr == vdev->msi.pos + PCI_MSI_FLAGS)) {
+        int want_msi = (val & PCI_MSI_FLAGS_ENABLE) != 0;
+
+        if (vdev->msi.enabled && !want_msi) {
+            vfio_disable_msi(vdev);
+        } else if (!vdev->msi.enabled && want_msi) {
+            vfio_enable_msi(vdev);
+        }
+    }
+
+    if (addr == PCI_INTERRUPT_LINE) {
+        if (len != 1) {
+            fprintf(stderr, "vfio: fixme: INTERRUPT_LINE written as %d bytes\n",
+                    len);
+        }
+        vdev->intx.eoi_client.irq = val;
+    }
+
+    pci_default_write_config(pdev, addr, val, len);
+}
+
+/*
+ * DMA
+ */
+/*
+ * Map (@map != 0) or unmap (@map == 0) every slot on ram_slots.slots for
+ * DMA by the device, using the slot's start address as the DMA address
+ * and requesting writable mappings.
+ *
+ * Returns 0 on success, or -1 as soon as one VFIO_DMA_MAP_IOVA ioctl
+ * fails; slots mapped before the failure stay mapped.  Unmap failures
+ * are ignored.
+ */
+static int vfio_do_map_iommu(VFIODevice *vdev, int map)
+{
+    QemuRamSlot *slot;
+
+    QLIST_FOREACH(slot, &ram_slots.slots, next) {
+        struct vfio_dma_map dma_map;
+
+        /*
+         * Convert through uintptr_t so the pointer-to-integer cast is
+         * size-matched on 32-bit hosts before widening to 64 bits.
+         */
+        dma_map.vaddr = (uint64_t)(uintptr_t)qemu_get_ram_ptr(slot->offset);
+        dma_map.dmaaddr = slot->start_addr;
+        dma_map.size = slot->size;
+        dma_map.flags = VFIO_FLAG_WRITE;
+
+        if (map) {
+            if (ioctl(vdev->vfiofd, VFIO_DMA_MAP_IOVA, &dma_map)) {
+                return -1;
+            }
+        } else {
+            ioctl(vdev->vfiofd, VFIO_DMA_UNMAP, &dma_map);
+        }
+    }
+    return 0;
+}
+
+/* Map all RAM slots for device DMA; returns vfio_do_map_iommu()'s result. */
+static int vfio_map_iommu(VFIODevice *vdev)
+{
+    int rc = vfio_do_map_iommu(vdev, 1);
+
+    return rc;
+}
+
+/* Unmap all RAM slots; the result of vfio_do_map_iommu() is discarded. */
+static void vfio_unmap_iommu(VFIODevice *vdev)
+{
+    (void)vfio_do_map_iommu(vdev, 0);
+}
+
+/*
+ * Interrupt setup
+ */
+/*
+ * Locate the MSI and MSI-X capabilities and prepare MSI-X table emulation.
+ *
+ * For MSI only the capability offset is recorded in vdev->msi.pos.
+ *
+ * For MSI-X the control word and the table/PBA registers are read from the
+ * device through the vfio fd, the table location (BAR and offset) is stored
+ * in vdev->msix and flagged on the owning PCIResource, a zeroed anonymous
+ * 4k page is allocated as the table backing store, and an I/O memory index
+ * is registered for msix_mmio_reads/msix_mmio_writes.
+ *
+ * Returns 0 on success (also when the device has neither capability) and
+ * -1 on failure.  On failure vdev->msix.table is never left as MAP_FAILED.
+ *
+ * NOTE(review): only one 4k page backs the table while table_len entries of
+ * sizeof(MSIXTableEntry) may need more - confirm the mmio handlers bound
+ * their accesses.
+ */
+static int vfio_setup_msi(VFIODevice *vdev)
+{
+    int pos;
+
+    pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSI);
+    if (pos) {
+        DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @%d\n", vdev->host.seg,
+                vdev->host.bus, vdev->host.dev, vdev->host.func, pos);
+        vdev->msi.pos = pos;
+    }
+
+    pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSIX);
+    if (pos) {
+        uint16_t ctrl;
+        uint32_t table, pba, len;
+
+        if (pread(vdev->vfiofd, &ctrl, sizeof(ctrl),
+                  VFIO_PCI_CONFIG_OFF + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
+            return -1;
+        }
+
+        if (pread(vdev->vfiofd, &table, sizeof(table), VFIO_PCI_CONFIG_OFF +
+                  pos + PCI_MSIX_TABLE) != sizeof(table)) {
+            return -1;
+        }
+
+        if (pread(vdev->vfiofd, &pba, sizeof(pba),
+                  VFIO_PCI_CONFIG_OFF + pos + PCI_MSIX_PBA) != sizeof(pba)) {
+            return -1;
+        }
+
+        ctrl = le16_to_cpu(ctrl);
+        table = le32_to_cpu(table);
+        pba = le32_to_cpu(pba);
+
+        /* The BIR indexes vdev->resources[] below; only the ordinary BARs
+         * (slots below the ROM slot) can hold the table. */
+        if ((table & PCI_MSIX_BIR) >= PCI_ROM_SLOT) {
+            fprintf(stderr, "vfio: Error: invalid MSIX table BIR %d\n",
+                    (int)(table & PCI_MSIX_BIR));
+            return -1;
+        }
+
+        vdev->msix.pos = pos;
+        vdev->msix.table_len = (ctrl & PCI_MSIX_TABSIZE) + 1;
+        vdev->msix.bar = table & PCI_MSIX_BIR;
+        vdev->msix.bar_offset = table & ~PCI_MSIX_BIR;
+        vdev->resources[vdev->msix.bar].msix = 1;
+        vdev->resources[vdev->msix.bar].msix_offset = vdev->msix.bar_offset;
+
+        DPRINTF("%04x:%02x:%02x.%x PCI MSI-X CAP @%d, BAR %d, offset 0x%x\n",
+                vdev->host.seg, vdev->host.bus, vdev->host.dev,
+                vdev->host.func, pos, vdev->msix.bar, vdev->msix.bar_offset);
+
+        /* Compare page numbers on both sides; the table offset itself is
+         * not necessarily page aligned. */
+        if ((pba & PCI_MSIX_BIR) == vdev->msix.bar &&
+            ((pba & ~0xfff) == (vdev->msix.bar_offset & ~0xfff))) {
+            fprintf(stderr, "vfio: Error: MSIX Table & PBA reside in the same "
+                    "page, not yet supported\n");
+            return -1;
+        }
+
+        /*
+         * Check if the BAR containing the MSIX table is 4k aligned, if
+         * so we can avoid slow mapping messiness.  This shouldn't fail
+         * for devices that follow the spec recommendations for sizing
+         * and placement. */
+        len = vdev->msix.bar;
+        if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) {
+            fprintf(stderr, "vfio: VFIO_BAR_LEN failed for MSIX BAR\n");
+            return -1;
+        }
+        if (!len || len & 0xfff) {
+            fprintf(stderr, "vfio: MSIX BAR not 4k aligned\n");
+            return -1;
+        }
+
+        /* Anonymous mappings take fd -1 for portability. */
+        vdev->msix.table = mmap(NULL, 0x1000, PROT_READ|PROT_WRITE,
+                                MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+        if (vdev->msix.table == MAP_FAILED) {
+            /* Keep the "no table" state recognizable for teardown. */
+            vdev->msix.table = NULL;
+            fprintf(stderr, "vfio: Failed to allocate MSIX table page\n");
+            return -1;
+        }
+
+        memset(vdev->msix.table, 0, 0x1000);
+        vdev->msix.index = cpu_register_io_memory(msix_mmio_reads,
+                                                  msix_mmio_writes, vdev);
+    }
+    return 0;
+}
+
+/*
+ * Release what vfio_setup_msi() allocated for MSI-X: the 4k table page and
+ * the I/O memory index.  Both fields are reset afterwards, so calling this
+ * again (or when setup never allocated anything) is harmless.
+ */
+static void vfio_teardown_msi(VFIODevice *vdev)
+{
+    if (vdev->msix.table && vdev->msix.table != MAP_FAILED) {
+        munmap(vdev->msix.table, 0x1000);
+    }
+    vdev->msix.table = NULL;
+
+    /* Only a positive value is treated as a registered index; zero means
+     * "never set" and a negative value is not handed to the unregister. */
+    if (vdev->msix.index > 0) {
+        cpu_unregister_io_memory(vdev->msix.index);
+    }
+    vdev->msix.index = 0;
+}
+
+/*
+ * Resource setup
+ */
+/*
+ * Probe every PCI region of the host device and register a matching BAR on
+ * the emulated device.
+ *
+ *  - Regions for which VFIO_BAR_LEN reports 0 are skipped.
+ *  - Memory BARs whose length is a multiple of 4k are mmap()ed from the
+ *    vfio fd and passed to qemu_ram_map().  If the BAR holds the MSI-X
+ *    table, the 4k page at msix_offset is left out and the areas before and
+ *    after it are mapped separately under the names "<name>.0" and
+ *    "<name>.1".
+ *  - Other memory BARs are marked slow and get an I/O memory index served
+ *    by vfio_resource_reads/vfio_resource_writes (reads only for the ROM).
+ *  - I/O port BARs are registered with vfio_ioport_map.
+ *
+ * Returns 0 on success, -1 on failure.
+ * NOTE(review): mappings created before a later failure are not undone
+ * here; confirm the caller's error path covers them.
+ */
+static int vfio_setup_resources(VFIODevice *vdev)
+{
+    int i;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        uint32_t len, bar;
+        PCIResource *res;
+        int space;
+
+        res = &vdev->resources[i];
+        res->vfiofd = vdev->vfiofd;
+        res->bar = len = i;
+
+        if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) {
+            fprintf(stderr, "vfio: VFIO_BAR_LEN failed for BAR %d\n", i);
+            return -1;
+        }
+        if (!len) {
+            continue;
+        }
+
+        if (i == PCI_ROM_SLOT) {
+            /* PCI_BASE_ADDRESS_0 + 4 * PCI_ROM_SLOT is not a BAR register,
+             * so its contents must not be decoded as BAR type bits.  Treat
+             * the ROM as a plain 32-bit, non-prefetchable memory region. */
+            bar = 0;
+        } else {
+            if (pread(vdev->vfiofd, &bar, sizeof(bar), VFIO_PCI_CONFIG_OFF +
+                      PCI_BASE_ADDRESS_0 + (4 * i)) != sizeof(bar)) {
+                fprintf(stderr, "vfio: Failed to read BAR %d\n", i);
+                return -1;
+            }
+            bar = le32_to_cpu(bar);
+        }
+        space = bar & PCI_BASE_ADDRESS_SPACE;
+
+        if (space == PCI_BASE_ADDRESS_SPACE_MEMORY && !(len & 0xfff)) {
+            int off = VFIO_PCI_BAR0_RESOURCE + i;
+            int flags = PROT_READ;
+            char name[32];
+            /* Large enough for name plus a two character suffix, so the
+             * ".0"/".1" variants can neither overflow nor be truncated. */
+            char part[sizeof(name) + 2];
+
+            res->mem = 1;
+            res->size = len;
+
+            if (i != PCI_ROM_SLOT) {
+                flags |= PROT_WRITE;
+            }
+
+            if (vdev->pdev.qdev.info->vmsd) {
+                snprintf(name, sizeof(name), "%s.bar%d",
+                         vdev->pdev.qdev.info->vmsd->name, i);
+            } else {
+                snprintf(name, sizeof(name), "%s.bar%d",
+                         vdev->pdev.qdev.info->name, i);
+            }
+
+            if (res->msix) {
+                /* Area in front of the MSI-X table page, if any. */
+                if (res->msix_offset) {
+                    res->r_virtbase[0] = mmap(NULL, res->msix_offset, flags,
+                                              MAP_SHARED, vdev->vfiofd,
+                                              vfio_pci_space_to_offset(off));
+
+                    if (res->r_virtbase[0] == MAP_FAILED) {
+                        fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i);
+                        return -1;
+                    }
+                    snprintf(part, sizeof(part), "%s.0", name);
+                    res->memory_index[0] = qemu_ram_map(&vdev->pdev.qdev,
+                                                        part, res->msix_offset,
+                                                        res->r_virtbase[0]);
+                }
+                /* Area behind the MSI-X table page, if any. */
+                if (len > res->msix_offset + 0x1000) {
+                    res->r_virtbase[1] = mmap(NULL,
+                                              len - res->msix_offset - 0x1000,
+                                              flags, MAP_SHARED, vdev->vfiofd,
+                                              vfio_pci_space_to_offset(off) +
+                                              res->msix_offset + 0x1000);
+
+                    if (res->r_virtbase[1] == MAP_FAILED) {
+                        fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i);
+                        return -1;
+                    }
+                    snprintf(part, sizeof(part), "%s.1", name);
+                    res->memory_index[1] = qemu_ram_map(&vdev->pdev.qdev, part,
+                                                        len - 0x1000 -
+                                                        res->msix_offset,
+                                                        res->r_virtbase[1]);
+                }
+            } else {
+                res->r_virtbase[0] = mmap(NULL, len, flags, MAP_SHARED,
+                                          vdev->vfiofd,
+                                          vfio_pci_space_to_offset(off));
+
+                if (res->r_virtbase[0] == MAP_FAILED) {
+                    fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i);
+                    return -1;
+                }
+                res->memory_index[0] = qemu_ram_map(&vdev->pdev.qdev, name,
+                                                    len, res->r_virtbase[0]);
+                if (i == PCI_ROM_SLOT) {
+                    res->memory_index[0] |= IO_MEM_ROM;
+                }
+            }
+
+            pci_register_bar(&vdev->pdev, i, res->size,
+                             bar & PCI_BASE_ADDRESS_MEM_PREFETCH ?
+                             PCI_BASE_ADDRESS_MEM_PREFETCH :
+                             PCI_BASE_ADDRESS_SPACE_MEMORY,
+                             vfio_iomem_map);
+
+            /* A 64-bit BAR occupies the next slot as well; skip it. */
+            if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+                i++;
+            }
+        } else if (space == PCI_BASE_ADDRESS_SPACE_MEMORY) {
+            res->mem = 1;
+            res->size = len;
+            res->slow = 1;
+
+            DPRINTF("%s(%04x:%02x:%02x.%x) Using slow mapping for BAR %d\n",
+                    __FUNCTION__, vdev->host.seg, vdev->host.bus,
+                    vdev->host.dev, vdev->host.func, i);
+
+            if (i == PCI_ROM_SLOT) {
+                res->io_mem = cpu_register_io_memory(vfio_resource_reads,
+                                                     NULL, res);
+            } else {
+                res->io_mem = cpu_register_io_memory(vfio_resource_reads,
+                                                     vfio_resource_writes, res);
+            }
+
+            pci_register_bar(&vdev->pdev, i, res->size,
+                             bar & PCI_BASE_ADDRESS_MEM_PREFETCH ?
+                             PCI_BASE_ADDRESS_MEM_PREFETCH :
+                             PCI_BASE_ADDRESS_SPACE_MEMORY,
+                             vfio_iomem_map);
+
+        } else if (space == PCI_BASE_ADDRESS_SPACE_IO) {
+            res->size = len;
+            pci_register_bar(&vdev->pdev, i, res->size,
+                             PCI_BASE_ADDRESS_SPACE_IO, vfio_ioport_map);
+        }
+        res->valid = 1;
+    }
+    return 0;
+}
+
+/*
+ * Undo vfio_setup_resources() for every valid memory region: mmap()ed BARs
+ * are set to IO_MEM_UNASSIGNED at their emulated address, passed to
+ * qemu_ram_unmap() and munmap()ed; slow BARs only drop their I/O memory
+ * index.  For the BAR holding the MSI-X table the areas before and after
+ * the 4k table page are handled separately.
+ */
+static void vfio_unmap_resources(VFIODevice *vdev)
+{
+    int idx;
+
+    for (idx = 0; idx < PCI_NUM_REGIONS; idx++) {
+        PCIResource *r = &vdev->resources[idx];
+
+        if (!r->valid || !r->mem) {
+            continue;
+        }
+
+        if (!r->msix) {
+            if (r->slow) {
+                cpu_unregister_io_memory(r->io_mem);
+            } else {
+                cpu_register_physical_memory(r->e_phys, r->e_size,
+                                             IO_MEM_UNASSIGNED);
+                qemu_ram_unmap(r->memory_index[0]);
+                munmap(r->r_virtbase[0], r->size);
+            }
+            continue;
+        }
+
+        /* Area in front of the MSI-X table page. */
+        if (r->msix_offset) {
+            cpu_register_physical_memory(r->e_phys, r->msix_offset,
+                                         IO_MEM_UNASSIGNED);
+            qemu_ram_unmap(r->memory_index[0]);
+            munmap(r->r_virtbase[0], r->msix_offset);
+        }
+
+        /* Area behind the MSI-X table page. */
+        if (r->size > r->msix_offset + 0x1000) {
+            cpu_register_physical_memory(r->e_phys + 0x1000 + r->msix_offset,
+                                         r->e_size - 0x1000 - r->msix_offset,
+                                         IO_MEM_UNASSIGNED);
+            qemu_ram_unmap(r->memory_index[1]);
+            munmap(r->r_virtbase[1], r->size - 0x1000 - r->msix_offset);
+        }
+    }
+}
+
+/*
+ * General setup
+ */
+/*
+ * Obtain the vfio file descriptor for the device and store it in
+ * vdev->vfiofd.
+ *
+ * If the "vfiofd" property is a non-empty string it is either parsed as a
+ * number (when it starts with a digit) or looked up by name through
+ * monitor_get_fd().  Otherwise the first "vfio*" entry in the device's
+ * sysfs vfio/ directory is located and the matching /dev node is opened
+ * read-write.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int get_vfio_fd(VFIODevice *vdev)
+{
+    char vfio_dir[64], vfio_dev[64];
+    DIR *dir;
+    struct dirent *de;
+    int len;
+
+    if (vdev->vfiofd_name && strlen(vdev->vfiofd_name) > 0) {
+        if (qemu_isdigit(vdev->vfiofd_name[0])) {
+            vdev->vfiofd = strtol(vdev->vfiofd_name, NULL, 0);
+            return 0;
+        }
+
+        vdev->vfiofd = monitor_get_fd(cur_mon, vdev->vfiofd_name);
+        if (vdev->vfiofd < 0) {
+            fprintf(stderr, "%s: (%s) unkown\n", __func__,
+                    vdev->vfiofd_name);
+            return -1;
+        }
+        return 0;
+    }
+
+    snprintf(vfio_dir, sizeof(vfio_dir),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/vfio/",
+             vdev->host.seg, vdev->host.bus,
+             vdev->host.dev, vdev->host.func);
+    dir = opendir(vfio_dir);
+    if (!dir) {
+        error_report("vfio: error: Driver not attached\n");
+        return -1;
+    }
+
+    while ((de = readdir(dir))) {
+        if (de->d_name[0] == '.') {
+            continue;
+        }
+        if (!strncmp(de->d_name, "vfio", 4)) {
+            break;
+        }
+    }
+
+    if (!de) {
+        error_report("vfio: error: Cannot find vfio* in %s\n", vfio_dir);
+        closedir(dir);
+        return -1;
+    }
+
+    /* Build the node path while the entry is still valid, then release the
+     * directory stream on every path. */
+    len = snprintf(vfio_dev, sizeof(vfio_dev), "/dev/%s", de->d_name);
+    closedir(dir);
+    if (len < 0 || len >= (int)sizeof(vfio_dev)) {
+        error_report("vfio: error: vfio device name too long in %s",
+                     vfio_dir);
+        return -1;
+    }
+
+    vdev->vfiofd = open(vfio_dev, O_RDWR);
+    if (vdev->vfiofd < 0) {
+        error_report("pci-assign: vfio: Failed to open %s: %s\n",
+                     vfio_dev, strerror(errno));
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Obtain the uiommu file descriptor and store it in vdev->uiommufd.
+ *
+ * Without a "uiommufd" property (or with an empty one) /dev/uiommu is
+ * opened read-only and uiommufd_name is cleared, which later marks the fd
+ * as one to close.  Otherwise the property is either parsed as a number
+ * (leading digit) or resolved by name through monitor_get_fd().
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int get_uiommu_fd(VFIODevice *vdev)
+{
+    const char *fdname = vdev->uiommufd_name;
+
+    if (!fdname || strlen(fdname) == 0) {
+        vdev->uiommufd = open("/dev/uiommu", O_RDONLY);
+        if (vdev->uiommufd < 0) {
+            return -1;
+        }
+        vdev->uiommufd_name = NULL; /* easier test later */
+        return 0;
+    }
+
+    if (qemu_isdigit(fdname[0])) {
+        vdev->uiommufd = strtol(fdname, NULL, 0);
+        return 0;
+    }
+
+    vdev->uiommufd = monitor_get_fd(cur_mon, fdname);
+    if (vdev->uiommufd >= 0) {
+        return 0;
+    }
+
+    fprintf(stderr, "%s: (%s) unkown\n", __func__, vdev->uiommufd_name);
+    return -1;
+}
+
+/*
+ * qdev init callback for the "vfio" device.
+ *
+ * Checks that the host device exists in sysfs, obtains the uiommu and vfio
+ * file descriptors, attaches the device to the uiommu domain, copies the
+ * host config space into the emulated device, then sets up MSI/MSI-X, the
+ * BARs, the DMA mappings and INTx.
+ *
+ * Returns 0 on success.  On failure everything set up so far is undone in
+ * reverse order and -1 is returned.  The uiommu fd is only closed when this
+ * code opened it itself (uiommufd_name is NULL in that case).
+ */
+static int vfio_initfn(struct PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    char sys[64];
+    struct stat st;
+    int ret;
+
+    /* Check that the host device exists */
+    snprintf(sys, sizeof(sys), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
+             vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
+    if (stat(sys, &st) < 0) {
+        error_report("vfio: error: no such host device "
+                     "%04x:%02x:%02x.%01x", vdev->host.seg, vdev->host.bus,
+                     vdev->host.dev, vdev->host.func);
+        return -1;
+    }
+
+    if (get_uiommu_fd(vdev)) {
+        return -1;
+    }
+
+    if (get_vfio_fd(vdev)) {
+        goto out_close_uiommu;
+    }
+
+    if (ioctl(vdev->vfiofd, VFIO_DOMAIN_SET, &vdev->uiommufd)) {
+        goto out_close_vfiofd;
+    }
+
+    /* Get a copy of config space */
+    ret = pread(vdev->vfiofd, vdev->pdev.config,
+                pci_config_size(&vdev->pdev), VFIO_PCI_CONFIG_OFF);
+    if (ret < pci_config_size(&vdev->pdev)) {
+        fprintf(stderr, "vfio: Failed to read device config space\n");
+        goto out_unset_domain;
+    }
+
+    if (vfio_setup_msi(vdev)) {
+        goto out_unset_domain;
+    }
+
+    if (vfio_setup_resources(vdev)) {
+        goto out_disable_msix;
+    }
+
+    /* Mapping stops at the first slot that fails, so earlier slots may
+     * already be mapped: run the unmap pass for this failure as well. */
+    if (vfio_map_iommu(vdev)) {
+        goto out_unmap_iommu;
+    }
+
+    if (vfio_enable_intx(vdev)) {
+        goto out_unmap_iommu;
+    }
+
+    return 0;
+
+out_unmap_iommu:
+    vfio_unmap_iommu(vdev);
+    vfio_unmap_resources(vdev);
+out_disable_msix:
+    vfio_teardown_msi(vdev);
+out_unset_domain:
+    ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET);
+out_close_vfiofd:
+    close(vdev->vfiofd);
+out_close_uiommu:
+    if (!vdev->uiommufd_name) {
+        close(vdev->uiommufd);
+    }
+    return -1;
+}
+
+/*
+ * qdev exit callback: disable all interrupt modes, drop the DMA mappings
+ * and BAR mappings, release the MSI-X table page and I/O memory index
+ * allocated by vfio_setup_msi(), detach from the uiommu domain and close
+ * the file descriptors.  The uiommu fd is only closed when this code
+ * opened it itself (uiommufd_name is NULL in that case).  Always returns 0.
+ */
+static int vfio_exitfn(struct PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+
+    vfio_disable_intx(vdev);
+    vfio_disable_msi(vdev);
+    vfio_disable_msix(vdev);
+    vfio_unmap_iommu(vdev);
+    vfio_unmap_resources(vdev);
+    /* Mirrors the init error path; without it the MSI-X table page and
+     * its I/O memory index would never be released. */
+    vfio_teardown_msi(vdev);
+    ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET);
+    close(vdev->vfiofd);
+    if (!vdev->uiommufd_name) {
+        close(vdev->uiommufd);
+    }
+    return 0;
+}
+
+/*
+ * Property type backing the "host" property: a PCIHostDevice value that is
+ * converted from and to text by parse_hostaddr()/print_hostaddr().
+ */
+static PropertyInfo qdev_prop_hostaddr = {
+    .name  = "pci-hostaddr",
+    .size  = sizeof(PCIHostDevice),
+    .type  = -1,
+    .print = print_hostaddr,
+    .parse = parse_hostaddr,
+};
+
+/*
+ * Device model description for "-device vfio": qdev identity and
+ * properties first, then the PCI callbacks.
+ */
+static PCIDeviceInfo vfio_info = {
+    .qdev.name    = "vfio",
+    .qdev.desc    = "pass through host pci devices to the guest via vfio",
+    .qdev.size    = sizeof(VFIODevice),
+    .qdev.props   = (Property[]) {
+        DEFINE_PROP("host", VFIODevice, host,
+                    qdev_prop_hostaddr, PCIHostDevice),
+        DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
+        DEFINE_PROP_STRING("uiommufd", VFIODevice, uiommufd_name),
+        DEFINE_PROP_END_OF_LIST(),
+    },
+    .init         = vfio_initfn,
+    .exit         = vfio_exitfn,
+    .config_read  = vfio_pci_read_config,
+    .config_write = vfio_pci_write_config,
+};
+
+/* Registers the "vfio" device description with the PCI qdev layer. */
+static void vfio_register_devices(void)
+{
+    PCIDeviceInfo *info = &vfio_info;
+
+    pci_qdev_register(info);
+}
+
+device_init(vfio_register_devices)
diff --git a/hw/vfio.h b/hw/vfio.h
new file mode 100644
index 0000000..9d05ae1
--- /dev/null
+++ b/hw/vfio.h
@@ -0,0 +1,90 @@ 
+#ifndef __VFIO_H__
+#define __VFIO_H__
+
+#include "qemu-common.h"
+#include "qemu-queue.h"
+#include "pci.h"
+
+/*
+ * Address of the host PCI device to assign.  The vfio code prints it as
+ * seg:bus:dev.func and uses the same form to build the sysfs path.
+ */
+typedef struct PCIHostDevice {
+    uint16_t seg;
+    uint8_t bus;
+    uint8_t dev:5;
+    uint8_t func:3;
+} PCIHostDevice;
+
+/*
+ * Per-region state, one entry for each of the PCI_NUM_REGIONS regions of
+ * the assigned device.  For the BAR holding the MSI-X table the two-element
+ * arrays describe the areas before ([0]) and after ([1]) the table page;
+ * other mmap()ed BARs only use element [0].
+ */
+typedef struct PCIResource {
+    uint8_t valid:1;             /* region has a non-zero length and was set up */
+    uint8_t mem:1;               /* memory BAR (left clear for I/O port BARs) */
+    uint8_t msix:1;              /* BAR contains the MSI-X table */
+    uint8_t bar:3;               /* see vfio_resource_read/write */
+    uint8_t slow:1;              /* use read/write rather than mmap */
+    uint64_t size;               /* region length reported by VFIO_BAR_LEN */
+    ram_addr_t memory_index[2];  /* cpu_register_physical_memory() index */
+    void *r_virtbase[2];         /* mmapped address */
+    int io_mem;                  /* cpu_register_io_memory index */
+    pcibus_t e_phys;             /* emulated base address */
+    pcibus_t e_size;             /* emulated size of region in bytes */
+    uint32_t msix_offset;        /* offset of the MSI-X table within the BAR */
+    int vfiofd;                  /* see vfio_resource_read/write */
+} PCIResource;
+
+/*
+ * Legacy INTx interrupt state.
+ * NOTE(review): enabled/pending/pin and notifier are maintained by the INTx
+ * enable/disable code - confirm their exact meaning there.
+ */
+typedef struct INTx {
+    uint8_t enabled:1;
+    uint8_t pending:1;
+    uint8_t pin:3;
+    EventNotifier notifier;
+    ioapic_eoi_client eoi_client; /* .irq follows guest PCI_INTERRUPT_LINE writes */
+} INTx;
+
+/*
+ * One MSI vector: its event notifier plus pointers to the vector's address
+ * and data values.
+ * NOTE(review): presumably the pointers reference the MSI capability in the
+ * emulated config space - confirm in vfio_enable_msi().
+ */
+typedef struct MSIEvent {
+    EventNotifier notifier;
+    uint8_t *addr;
+    uint8_t *upper_addr;
+    uint8_t *data;
+} MSIEvent;
+
+/* MSI state of the assigned device. */
+typedef struct MSI {
+    uint8_t enabled:1;           /* checked on guest writes to the MSI flags */
+    uint8_t pos;                 /* config offset of the MSI capability, 0 if none */
+    int num_events;
+    MSIEvent *events;
+} MSI;
+
+/*
+ * One entry of the emulated MSI-X table: four packed 32-bit words.
+ * NOTE(review): presumably mirrors the in-device MSI-X table entry layout,
+ * so the field order must not change - confirm against the mmio handlers.
+ */
+typedef struct __attribute__((packed)) MSIXTableEntry {
+    uint32_t addr;
+    uint32_t upper_addr;
+    uint32_t data;
+    uint32_t ctrl;
+} MSIXTableEntry;
+
+/* One MSI-X vector: its event notifier and the table entry it belongs to. */
+typedef struct MSIXEvent {
+    EventNotifier notifier;
+    MSIXTableEntry *entry;
+} MSIXEvent;
+
+/* MSI-X state of the assigned device, filled in by vfio_setup_msi(). */
+typedef struct MSIX {
+    uint8_t enabled:1;           /* checked on guest writes to the MSI-X flags */
+    uint8_t bar:3;               /* BAR holding the table (BIR of the table register) */
+    uint8_t pos;                 /* config offset of the MSI-X capability, 0 if none */
+    uint16_t table_len;          /* number of table entries */
+    uint32_t bar_offset;         /* offset of the table within the BAR */
+    MSIXTableEntry *table;       /* anonymous 4k page backing the emulated table */
+    int num_events;
+    MSIXEvent *events;
+    int index;                   /* cpu_register_io_memory index for the table */
+} MSIX;
+
+/* Complete state of one vfio-assigned PCI device. */
+typedef struct VFIODevice {
+    PCIDevice pdev;              /* emulated device; reached via DO_UPCAST */
+    PCIHostDevice host;          /* "host" property: address of the host device */
+    PCIResource resources[PCI_NUM_REGIONS];
+    INTx intx;
+    MSI msi;
+    MSIX msix;
+    int vfiofd;                  /* vfio device file descriptor */
+    int uiommufd;                /* uiommu domain file descriptor */
+    char *vfiofd_name;           /* "vfiofd" property */
+    char *uiommufd_name;         /* "uiommufd" property; NULL once we opened /dev/uiommu */
+} VFIODevice;
+
+#endif /* __VFIO_H__ */