From patchwork Sun Jul 11 18:09:42 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alex Williamson X-Patchwork-Id: 111335 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter.kernel.org (8.14.4/8.14.3) with ESMTP id o6BI9njN023950 for ; Sun, 11 Jul 2010 18:09:50 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754654Ab0GKSJs (ORCPT ); Sun, 11 Jul 2010 14:09:48 -0400 Received: from mx1.redhat.com ([209.132.183.28]:51069 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754635Ab0GKSJp (ORCPT ); Sun, 11 Jul 2010 14:09:45 -0400 Received: from int-mx02.intmail.prod.int.phx2.redhat.com (int-mx02.intmail.prod.int.phx2.redhat.com [10.5.11.12]) by mx1.redhat.com (8.13.8/8.13.8) with ESMTP id o6BI9hN4015828 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=OK); Sun, 11 Jul 2010 14:09:44 -0400 Received: from localhost6.localdomain6 (ovpn01.gateway.prod.ext.phx2.redhat.com [10.5.9.1]) by int-mx02.intmail.prod.int.phx2.redhat.com (8.13.8/8.13.8) with ESMTP id o6BI9gLl012871; Sun, 11 Jul 2010 14:09:43 -0400 From: Alex Williamson Subject: [RFC PATCH 5/5] VFIO based device assignment To: kvm@vger.kernel.org, qemu-devel@nongnu.org Cc: pugs@cisco.com, chrisw@redhat.com, mst@redhat.com, alex.williamson@redhat.com Date: Sun, 11 Jul 2010 12:09:42 -0600 Message-ID: <20100711180942.20121.97368.stgit@localhost6.localdomain6> In-Reply-To: <20100711180910.20121.93313.stgit@localhost6.localdomain6> References: <20100711180910.20121.93313.stgit@localhost6.localdomain6> User-Agent: StGIT/0.14.3 MIME-Version: 1.0 X-Scanned-By: MIMEDefang 2.67 on 10.5.11.12 Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter.kernel.org [140.211.167.41]); Sun, 11 Jul 2010 18:09:50 +0000 (UTC) diff 
--git a/Makefile.target b/Makefile.target index 0c1b916..4936d96 100644 --- a/Makefile.target +++ b/Makefile.target @@ -197,6 +197,7 @@ obj-i386-y += vmmouse.o vmport.o hpet.o obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o obj-i386-y += debugcon.o multiboot.o obj-i386-y += pc_piix.o +obj-i386-y += vfio.o # shared objects obj-ppc-y = ppc.o diff --git a/hw/linux-vfio.h b/hw/linux-vfio.h new file mode 100644 index 0000000..06bd3f3 --- /dev/null +++ b/hw/linux-vfio.h @@ -0,0 +1,200 @@ +/* + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@cisco.com + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Portions derived from drivers/uio/uio.c: + * Copyright(C) 2005, Benedikt Spranger + * Copyright(C) 2005, Thomas Gleixner + * Copyright(C) 2006, Hans J. Koch + * Copyright(C) 2006, Greg Kroah-Hartman + * + * Portions derived from drivers/uio/uio_pci_generic.c: + * Copyright (C) 2009 Red Hat, Inc. + * Author: Michael S. Tsirkin + */ + +/* + * VFIO driver - allow mapping and use of certain PCI devices + * in unprivileged user processes. 
(If IOMMU is present) + * Especially useful for Virtual Function parts of SR-IOV devices + */ + +#ifdef __KERNEL__ + +struct vfio_dev { + struct device *dev; + struct pci_dev *pdev; + u8 *pci_config_map; + int pci_config_size; + char name[8]; + int devnum; + int pmaster; + void __iomem *bar[PCI_ROM_RESOURCE+1]; + spinlock_t irqlock; /* guards command register accesses */ + int listeners; + u32 locked_pages; + struct mutex lgate; /* listener gate */ + struct mutex dgate; /* dma op gate */ + struct mutex igate; /* intr op gate */ + struct msix_entry *msix; + int nvec; + struct uiommu_domain *udomain; + int cachec; + struct eventfd_ctx *ev_irq; + struct eventfd_ctx *ev_msi; + struct eventfd_ctx **ev_msix; + struct { + u8 intr; + u8 bardirty; + u8 rombar[4]; + u8 bar[6*4]; + u8 msi[24]; + } vinfo; +}; + +struct vfio_listener { + struct vfio_dev *vdev; + struct list_head dm_list; + struct mm_struct *mm; + struct mmu_notifier mmu_notifier; +}; + +/* + * Structure for keeping track of memory nailed down by the + * user for DMA + */ +struct dma_map_page { + struct list_head list; + struct page **pages; + dma_addr_t daddr; + unsigned long vaddr; + int npage; + int rdwr; +}; + +/* VFIO class infrastructure */ +struct vfio_class { + struct kref kref; + struct class *class; +}; +extern struct vfio_class *vfio_class; + +ssize_t vfio_io_readwrite(int, struct vfio_dev *, + char __user *, size_t, loff_t *); +ssize_t vfio_mem_readwrite(int, struct vfio_dev *, + char __user *, size_t, loff_t *); +ssize_t vfio_config_readwrite(int, struct vfio_dev *, + char __user *, size_t, loff_t *); + +void vfio_disable_msi(struct vfio_dev *); +void vfio_disable_msix(struct vfio_dev *); +int vfio_enable_msi(struct vfio_dev *, int); +int vfio_enable_msix(struct vfio_dev *, int, void __user *); + +#ifndef PCI_MSIX_ENTRY_SIZE +#define PCI_MSIX_ENTRY_SIZE 16 +#endif +#ifndef PCI_STATUS_INTERRUPT +#define PCI_STATUS_INTERRUPT 0x08 +#endif + +struct vfio_dma_map; +void vfio_dma_unmapall(struct 
vfio_listener *); +int vfio_dma_unmap_dm(struct vfio_listener *, struct vfio_dma_map *); +int vfio_dma_map_common(struct vfio_listener *, unsigned int, + struct vfio_dma_map *); +int vfio_domain_set(struct vfio_dev *, int); +void vfio_domain_unset(struct vfio_dev *); + +int vfio_class_init(void); +void vfio_class_destroy(void); +int vfio_dev_add_attributes(struct vfio_dev *); +extern struct idr vfio_idr; +extern struct mutex vfio_minor_lock; +int vfio_build_config_map(struct vfio_dev *); + +irqreturn_t vfio_interrupt(int, void *); + +#endif /* __KERNEL__ */ + +/* Kernel & User level defines for ioctls */ + +/* + * Structure for DMA mapping of user buffers + * vaddr, dmaaddr, and size must all be page aligned + * buffer may only be larger than 1 page if (a) there is + * an iommu in the system, or (b) buffer is part of a huge page + */ +struct vfio_dma_map { + __u64 vaddr; /* process virtual addr */ + __u64 dmaaddr; /* desired and/or returned dma address */ + __u64 size; /* size in bytes */ + __u64 flags; /* bool: 0 for r/o; 1 for r/w */ +#define VFIO_FLAG_WRITE 0x1 /* req writeable DMA mem */ +}; + +/* map user pages at specific dma address */ +/* requires previous VFIO_DOMAIN_SET */ +#define VFIO_DMA_MAP_IOVA _IOWR(';', 101, struct vfio_dma_map) + +/* unmap user pages */ +#define VFIO_DMA_UNMAP _IOW(';', 102, struct vfio_dma_map) + +/* set device DMA mask & master status */ +#define VFIO_DMA_MASK _IOW(';', 103, __u64) + +/* request IRQ interrupts; use given eventfd */ +#define VFIO_EVENTFD_IRQ _IOW(';', 104, int) + +/* request MSI interrupts; use given eventfd */ +#define VFIO_EVENTFD_MSI _IOW(';', 105, int) + +/* Request MSI-X interrupts: arg[0] is #, arg[1-n] are eventfds */ +#define VFIO_EVENTFDS_MSIX _IOW(';', 106, int) + +/* Get length of a BAR */ +#define VFIO_BAR_LEN _IOWR(';', 107, __u32) + +/* Set the IOMMU domain - arg is fd from uiommu driver */ +#define VFIO_DOMAIN_SET _IOW(';', 108, int) + +/* Unset the IOMMU domain */ +#define VFIO_DOMAIN_UNSET 
_IO(';', 109) + +/* + * Reads, writes, and mmaps determine which PCI BAR (or config space) + * from the high level bits of the file offset + */ +#define VFIO_PCI_BAR0_RESOURCE 0x0 +#define VFIO_PCI_BAR1_RESOURCE 0x1 +#define VFIO_PCI_BAR2_RESOURCE 0x2 +#define VFIO_PCI_BAR3_RESOURCE 0x3 +#define VFIO_PCI_BAR4_RESOURCE 0x4 +#define VFIO_PCI_BAR5_RESOURCE 0x5 +#define VFIO_PCI_ROM_RESOURCE 0x6 +#define VFIO_PCI_CONFIG_RESOURCE 0xF +#define VFIO_PCI_SPACE_SHIFT 32 +#define VFIO_PCI_CONFIG_OFF vfio_pci_space_to_offset(VFIO_PCI_CONFIG_RESOURCE) + +static inline int vfio_offset_to_pci_space(__u64 off) +{ + return (off >> VFIO_PCI_SPACE_SHIFT) & 0xF; +} + +static inline __u64 vfio_pci_space_to_offset(int sp) +{ + return (__u64)(sp) << VFIO_PCI_SPACE_SHIFT; +} diff --git a/hw/vfio.c b/hw/vfio.c new file mode 100644 index 0000000..d9ff3d8 --- /dev/null +++ b/hw/vfio.c @@ -0,0 +1,1295 @@ +/* + * vfio based device assignment support + * + * Copyright Red Hat, Inc. 2010 + * + * Authors: + * Alex Williamson + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Based on qemu-kvm device-assignment: + * Adapted for KVM by Qumranet. + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "event_notifier.h" +#include "hw.h" +#include "memory.h" +#include "monitor.h" +#include "pc.h" +#include "qemu-error.h" +#include "vfio.h" +#include +#include +#include +#include "linux-vfio.h" + +//#define DEBUG_VFIO +#ifdef DEBUG_VFIO +#define DPRINTF(fmt, ...) \ + do { printf("vfio: " fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) 
\ + do { } while (0) +#endif + +static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); +static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, + uint32_t val, int len); +/* + * Generic + */ +static uint8_t pci_find_cap_offset(PCIDevice *pdev, uint8_t cap) +{ + int id; + int max_cap = 48; + int pos = PCI_CAPABILITY_LIST; + int status; + + status = pdev->config[PCI_STATUS]; + if ((status & PCI_STATUS_CAP_LIST) == 0) { + return 0; + } + + while (max_cap--) { + pos = pdev->config[pos]; + if (pos < 0x40) { + break; + } + + pos &= ~3; + id = pdev->config[pos + PCI_CAP_LIST_ID]; + + if (id == 0xff) { + break; + } + if (id == cap) { + return pos; + } + + pos += PCI_CAP_LIST_NEXT; + } + return 0; +} + +static int parse_hostaddr(DeviceState *qdev, Property *prop, const char *str) +{ + PCIHostDevice *ptr = qdev_get_prop_ptr(qdev, prop); + const char *p = str; + int n, seg, bus, dev, func; + char field[5]; + + if (sscanf(p, "%4[^:]%n", field, &n) != 1 || p[n] != ':') { + return -1; + } + + seg = strtol(field, NULL, 16); + p += n + 1; + + if (sscanf(p, "%4[^:]%n", field, &n) != 1) { + return -1; + } + + if (p[n] == ':') { + bus = strtol(field, NULL, 16); + p += n + 1; + } else { + bus = seg; + seg = 0; + } + + if (sscanf(p, "%4[^.]%n", field, &n) != 1 || p[n] != '.') { + return -1; + } + + dev = strtol(field, NULL, 16); + p += n + 1; + + if (!qemu_isdigit(*p)) { + return -1; + } + + func = *p - '0'; + + ptr->seg = seg; + ptr->bus = bus; + ptr->dev = dev; + ptr->func = func; + return 0; +} + +static int print_hostaddr(DeviceState *qdev, Property *prop, + char *dest, size_t len) +{ + PCIHostDevice *ptr = qdev_get_prop_ptr(qdev, prop); + + return snprintf(dest, len, "%04x:%02x:%02x.%x", + ptr->seg, ptr->bus, ptr->dev, ptr->func); +} + +/* + * MSI-X + */ +static uint32_t msix_mmio_read(VFIODevice *vdev, + target_phys_addr_t addr, int len) +{ + unsigned int offset = addr & 0xfff; + uint32_t val = 0; + + memcpy(&val, (void *)&((uint8_t 
*)vdev->msix.table)[offset], len); + DPRINTF("%s(%04x:%02x:%02x.%x, 0x%lx, 0x%x) = 0x%x\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, addr, len, val); + return val; +} + +static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr) +{ + return msix_mmio_read(opaque, addr, 4); +} + +static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr) +{ + return msix_mmio_read(opaque, addr, 2); +} + +static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr) +{ + return msix_mmio_read(opaque, addr, 1); +} + +static CPUReadMemoryFunc *msix_mmio_reads[] = { + msix_mmio_readb, msix_mmio_readw, msix_mmio_readl +}; + +static void msix_mmio_write(VFIODevice *vdev, target_phys_addr_t addr, + uint32_t val, int len) +{ + unsigned int offset = addr & 0xfff; + + memcpy((void *)&((uint8_t *)vdev->msix.table)[offset], &val, len); + + DPRINTF("%s(%04x:%02x:%02x.%x, 0x%lx, 0x%x, 0x%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, addr, val, len); + + if ((offset & 0xf) == 0xc && vdev->msix.enabled) { + uint64_t off = vdev->msix.bar_offset + offset + + vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + + vdev->msix.bar); + if (pwrite(vdev->vfiofd, &val, len, off) != len) { + fprintf(stderr, "vfio: Error: Failed to update MSIX table ctrl\n"); + } + } +} + +static void msix_mmio_writel(void *opaque, + target_phys_addr_t addr, uint32_t val) +{ + msix_mmio_write(opaque, addr, val, 4); +} + +static void msix_mmio_writew(void *opaque, + target_phys_addr_t addr, uint32_t val) +{ + msix_mmio_write(opaque, addr, val, 2); +} + +static void msix_mmio_writeb(void *opaque, + target_phys_addr_t addr, uint32_t val) +{ + msix_mmio_write(opaque, addr, val, 1); +} + +static CPUWriteMemoryFunc *msix_mmio_writes[] = { + msix_mmio_writeb, msix_mmio_writew, msix_mmio_writel +}; + +static void vfio_msix_interrupt(void *opaque) +{ + MSIXEvent *event = opaque; + uint64_t addr; + uint32_t data; + + if 
(!event_notifier_test_and_clear(&event->notifier)) { + return; + } + + addr = le32_to_cpu(event->entry->upper_addr); + addr = (addr << 32) | le32_to_cpu(event->entry->addr); + data = le32_to_cpu(event->entry->data); + DPRINTF("%s: 0x%x -> 0x%lx\n", __FUNCTION__, data, addr); + stl_phys(addr, data); +} + +static void vfio_enable_msix(VFIODevice *vdev) +{ + int i, vectors, *fds; + uint64_t off = vdev->msix.bar_offset + + vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + + vdev->msix.bar); + + /* Hmm, it's probably possible for a driver to setup less then + * the full table of vectors... right? + */ + for (i = 0; i < vdev->msix.table_len; i++) { + if (!vdev->msix.table[i].addr) { + break; + } + } + + vectors = i; + if (!vectors) { + fprintf(stderr, "vfio: Error: no MSIX vectors enabled\n"); + return; + } + + vdev->msix.events = qemu_mallocz(vectors * sizeof(MSIXEvent)); + vdev->msix.num_events = vectors; + fds = qemu_malloc((vectors + 1) * sizeof(int)); + fds[0] = vectors; + + for (i = 0; i < vectors; i++) { + vdev->msix.events[i].entry = &vdev->msix.table[i]; + if (event_notifier_init(&vdev->msix.events[i].notifier, 0)) + fprintf(stderr, "vfio: Error: event_notifier_init failed\n"); + + fds[i + 1] = event_notifier_get_fd(&vdev->msix.events[i].notifier); + qemu_set_fd_handler(fds[i + 1], vfio_msix_interrupt, NULL, + &vdev->msix.events[i]); + } + + if (ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, fds)) { + fprintf(stderr, "vfio: Error: Failed to setup MSIX fds %s\n", + strerror(errno)); + qemu_free(fds); + return; + } + + qemu_free(fds); + + for (i = 0; i < vectors; i++) { + MSIXTableEntry *te = &vdev->msix.table[i]; + if (pwrite(vdev->vfiofd, &te->ctrl, sizeof(te->ctrl), + off + (i * sizeof(MSIXTableEntry)) + + offsetof(MSIXTableEntry, ctrl)) != sizeof(te->ctrl)) { + fprintf(stderr, "vfio: Error: Failed to update MSIX table ctrl\n"); + } + } + vdev->msix.enabled = 1; + DPRINTF("%s(%04x:%02x:%02x.%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + 
vdev->host.func); +} + +static void vfio_disable_msix(VFIODevice *vdev) +{ + uint32_t vectors = 0; + int i; + + if (!vdev->msix.enabled) { + return; + } + + for (i = 0; i < vdev->msix.num_events; i++) { + int fd = event_notifier_get_fd(&vdev->msix.events[i].notifier); + qemu_set_fd_handler(fd, NULL, NULL, NULL); + event_notifier_cleanup(&vdev->msix.events[i].notifier); + } + + ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, &vectors); + qemu_free(vdev->msix.events); + vdev->msix.events = NULL; + vdev->msix.num_events = 0; + vdev->msix.enabled = 0; + DPRINTF("%s(%04x:%02x:%02x.%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func); +} + +/* + * MSI + */ +static void vfio_msi_interrupt(void *opaque) +{ + MSIEvent *event = opaque; + uint64_t addr; + uint32_t data; + + if (!event_notifier_test_and_clear(&event->notifier)) { + return; + } + + if (event->upper_addr) { + addr = pci_get_long(event->upper_addr); + addr = (addr << 32) | pci_get_long(event->addr); + } else { + addr = pci_get_long(event->addr); + } + data = pci_get_word(event->data); + DPRINTF("%s: 0x%x -> 0x%lx\n", __FUNCTION__, data, addr); + stl_phys(addr, data); +} + +static void vfio_enable_msi(VFIODevice *vdev) +{ + int i, vectors, *fds; + uint16_t ctrl = vfio_pci_read_config(&vdev->pdev, + vdev->msi.pos + PCI_MSI_FLAGS, + sizeof(ctrl)); + ctrl = le16_to_cpu(ctrl); /* ctrl is 16 bits wide: a 32-bit swap would truncate to 0 on big-endian hosts; vfio_unmask_intx() converts its 16-bit read the same way */ + vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4); + + if (vectors > 32) { + fprintf(stderr, "vfio: Error: Invalid configured MSI vectors %d\n", + vectors); + return; + } + + vdev->msi.events = qemu_mallocz(vectors * sizeof(MSIEvent)); + vdev->msi.num_events = vectors; + fds = qemu_malloc((vectors + 1) * sizeof(int)); + fds[0] = vectors; + + for (i = 0; i < vectors; i++) { + vdev->msi.events[i].addr = vdev->pdev.config + + vdev->msi.pos + PCI_MSI_ADDRESS_LO; + if (ctrl & PCI_MSI_FLAGS_64BIT) { + vdev->msi.events[i].upper_addr = vdev->pdev.config + + vdev->msi.pos + + PCI_MSI_ADDRESS_HI; + vdev->msi.events[i].data 
= vdev->pdev.config + + vdev->msi.pos + PCI_MSI_DATA_64; + } else { + vdev->msi.events[i].upper_addr = NULL; + vdev->msi.events[i].data = vdev->pdev.config + + vdev->msi.pos + PCI_MSI_DATA_32; + } + + if (event_notifier_init(&vdev->msi.events[i].notifier, 0)) { + fprintf(stderr, "vfio: Error: event_notifier_init failed\n"); + } + fds[i + 1] = event_notifier_get_fd(&vdev->msi.events[i].notifier); + qemu_set_fd_handler(fds[i + 1], vfio_msi_interrupt, NULL, + &vdev->msi.events[i]); + } + + /* FIXME: current vfio only supports 1 MSI */ + if (vectors > 1) { + fprintf(stderr, "vfio: Error: only support 1 MSI vector, want %d\n", + vectors); + abort(); + } + if (ioctl(vdev->vfiofd, VFIO_EVENTFD_MSI, &fds[1])) { + fprintf(stderr, "vfio: Error: Failed to setup MSI fds %s\n", + strerror(errno)); + qemu_free(fds); + return; + } + + qemu_free(fds); + vdev->msi.enabled = 1; + DPRINTF("%s(%04x:%02x:%02x.%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func); +} + +static void vfio_disable_msi(VFIODevice *vdev) +{ + uint32_t vectors = -1; + int i; + + if (!vdev->msi.enabled) { + return; + } + + for (i = 0; i < vdev->msi.num_events; i++) { + int fd = event_notifier_get_fd(&vdev->msi.events[i].notifier); + qemu_set_fd_handler(fd, NULL, NULL, NULL); + event_notifier_cleanup(&vdev->msi.events[i].notifier); + } + + ioctl(vdev->vfiofd, VFIO_EVENTFD_MSI, &vectors); + qemu_free(vdev->msi.events); + vdev->msi.events = NULL; + vdev->msi.num_events = 0; + vdev->msi.enabled = 0; + DPRINTF("%s(%04x:%02x:%02x.%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func); +} + +/* + * INTx + */ +static void vfio_unmask_intx(VFIODevice *vdev) +{ + uint16_t cmd; + + cmd = vfio_pci_read_config(&vdev->pdev, PCI_COMMAND, sizeof(cmd)); + cmd = le16_to_cpu(cmd); + cmd &= ~PCI_COMMAND_INTX_DISABLE; + cmd = cpu_to_le16(cmd); + vfio_pci_write_config(&vdev->pdev, PCI_COMMAND, cmd, sizeof(cmd)); +} + +static void vfio_intx_interrupt(void 
*opaque) +{ + VFIODevice *vdev = opaque; + + if (!event_notifier_test_and_clear(&vdev->intx.notifier)) { + return; + } + + DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, 'A' + vdev->intx.pin); /* DPRINTF() takes the format string first; it does not accept a stream argument */ + + vdev->intx.pending = 1; + qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1); +} + +static void vfio_eoi(ioapic_eoi_client *client) +{ + VFIODevice *vdev = container_of(client, VFIODevice, intx.eoi_client); + + if (!vdev->intx.pending) { + return; + } + + DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func); + + vdev->intx.pending = 0; + qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0); + vfio_unmask_intx(vdev); +} + +static int vfio_enable_intx(VFIODevice *vdev) +{ + int fd; + + if (!(vdev->intx.pin = vfio_pci_read_config(&vdev->pdev, + PCI_INTERRUPT_PIN, 1))) { + return 0; + } + + vdev->intx.pin--; /* Pin A (1) -> irq[0] */ + vdev->intx.eoi_client.eoi = vfio_eoi; + vdev->intx.eoi_client.irq = pci_get_byte(vdev->pdev.config + + PCI_INTERRUPT_LINE); + ioapic_register_eoi_client(&vdev->intx.eoi_client); + + if (event_notifier_init(&vdev->intx.notifier, 0)) { + fprintf(stderr, "vfio: Error: event_notifier_init failed\n"); + return -1; + } + + fd = event_notifier_get_fd(&vdev->intx.notifier); + qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev); + if (ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &fd)) { + fprintf(stderr, "vfio: Error: Failed to setup INTx fd %s\n", + strerror(errno)); + return -1; + } + vfio_unmask_intx(vdev); + vdev->intx.enabled = 1; + DPRINTF("%s(%04x:%02x:%02x.%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func); + + return 0; +} + +static void vfio_disable_intx(VFIODevice *vdev) +{ + int fd; + + if (!vdev->intx.enabled) { + return; + } + + ioapic_unregister_eoi_client(&vdev->intx.eoi_client); + fd = event_notifier_get_fd(&vdev->intx.notifier); + qemu_set_fd_handler(fd, 
NULL, NULL, NULL); + event_notifier_cleanup(&vdev->intx.notifier); + fd = -1; + ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &fd); + vdev->intx.enabled = 0; + DPRINTF("%s(%04x:%02x:%02x.%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func); +} + +/* + * IO Port/MMIO + */ +static void vfio_resource_write(PCIResource *res, uint32_t addr, + uint32_t val, int len) +{ + uint64_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + res->bar); /* 64-bit on purpose: the BAR selector is shifted up by VFIO_PCI_SPACE_SHIFT (32) and would be lost in a 32-bit size_t */ + + if (pwrite(res->vfiofd, &val, len, offset + addr) != len) { + fprintf(stderr, "%s(,0x%x, 0x%x, %d) failed: %s\n", + __FUNCTION__, addr, val, len, strerror(errno)); + } + DPRINTF("%s(BAR%d+0x%x, 0x%x, %d)\n", + __FUNCTION__, res->bar, addr, val, len); +} + +static void vfio_resource_writeb(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + vfio_resource_write(opaque, addr, val, 1); +} + +static void vfio_resource_writew(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + vfio_resource_write(opaque, addr, val, 2); +} + +static void vfio_resource_writel(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + vfio_resource_write(opaque, addr, val, 4); +} + +static CPUWriteMemoryFunc * const vfio_resource_writes[] = { + &vfio_resource_writeb, + &vfio_resource_writew, + &vfio_resource_writel +}; + +static void vfio_ioport_writeb(void *opaque, uint32_t addr, uint32_t val) +{ + PCIResource *res = opaque; + vfio_resource_write(res, addr - res->e_phys, val, 1); +} + +static void vfio_ioport_writew(void *opaque, uint32_t addr, uint32_t val) +{ + PCIResource *res = opaque; + vfio_resource_write(res, addr - res->e_phys, val, 2); +} + +static void vfio_ioport_writel(void *opaque, uint32_t addr, uint32_t val) +{ + PCIResource *res = opaque; + vfio_resource_write(res, addr - res->e_phys, val, 4); +} + +static uint32_t vfio_resource_read(PCIResource *res, uint32_t addr, int len) +{ + uint64_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + res->bar); /* 64-bit on purpose, see vfio_resource_write() */ + uint32_t val = 0; /* reads shorter than 4 bytes fill only part of val */ + + if 
(pread(res->vfiofd, &val, len, offset + addr) != len) { + fprintf(stderr, "%s(,0x%x, %d) failed: %s\n", + __FUNCTION__, addr, len, strerror(errno)); + return 0xffffffffU; + } + DPRINTF("%s(BAR%d+0x%x, %d) = 0x%x\n", + __FUNCTION__, res->bar, addr, len, val); + return val; +} + +static uint32_t vfio_resource_readb(void *opaque, target_phys_addr_t addr) +{ + return vfio_resource_read(opaque, addr, 1) & 0xff; +} + +static uint32_t vfio_resource_readw(void *opaque, target_phys_addr_t addr) +{ + return vfio_resource_read(opaque, addr, 2) & 0xffff; +} + +static uint32_t vfio_resource_readl(void *opaque, target_phys_addr_t addr) +{ + return vfio_resource_read(opaque, addr, 4); +} + +static CPUReadMemoryFunc * const vfio_resource_reads[] = { + &vfio_resource_readb, + &vfio_resource_readw, + &vfio_resource_readl +}; + +static uint32_t vfio_ioport_readb(void *opaque, uint32_t addr) +{ + PCIResource *res = opaque; + return vfio_resource_read(res, addr - res->e_phys, 1) & 0xff; +} + +static uint32_t vfio_ioport_readw(void *opaque, uint32_t addr) +{ + PCIResource *res = opaque; + return vfio_resource_read(res, addr - res->e_phys, 2) & 0xffff; +} + +static uint32_t vfio_ioport_readl(void *opaque, uint32_t addr) +{ + PCIResource *res = opaque; + return vfio_resource_read(res, addr - res->e_phys, 4); +} + +static void vfio_ioport_map(PCIDevice *pdev, int bar, + pcibus_t e_phys, pcibus_t e_size, int type) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + PCIResource *res = &vdev->resources[bar]; + + DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func, bar, e_phys, e_size, type); + + res->e_phys = e_phys; + res->e_size = e_size; + + register_ioport_write(e_phys, e_size, 1, vfio_ioport_writeb, res); + register_ioport_write(e_phys, e_size, 2, vfio_ioport_writew, res); + register_ioport_write(e_phys, e_size, 4, vfio_ioport_writel, res); + register_ioport_read(e_phys, e_size, 1, 
vfio_ioport_readb, res); + register_ioport_read(e_phys, e_size, 2, vfio_ioport_readw, res); + register_ioport_read(e_phys, e_size, 4, vfio_ioport_readl, res); +} + +static void vfio_iomem_map(PCIDevice *pdev, int bar, + pcibus_t e_phys, pcibus_t e_size, int type) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + PCIResource *res = &vdev->resources[bar]; + + DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func, bar, e_phys, e_size, type); + + res->e_phys = e_phys; + res->e_size = e_size; + + if (e_size == 0) { + return; + } + + if (e_size != res->size) { + fprintf(stderr, "vfio: Error: partial BAR map?\n"); + abort(); + } + + if (res->msix) { + if (res->msix_offset > 0) { + cpu_register_physical_memory(e_phys, res->msix_offset, + res->memory_index[0]); + } + + DPRINTF("Overlaying MSI-X table page\n"); + cpu_register_physical_memory(e_phys + res->msix_offset, + TARGET_PAGE_SIZE, vdev->msix.index); + + if (res->size > res->msix_offset + 0x1000) { + cpu_register_physical_memory(e_phys + res->msix_offset + 0x1000, + res->size - res->msix_offset - 0x1000, + res->memory_index[1]); + } + } else { + if (!res->slow) { + cpu_register_physical_memory(e_phys, e_size, res->memory_index[0]); + } else { + cpu_register_physical_memory(e_phys, e_size, res->io_mem); + } + } +} + +/* + * PCI config space + */ +static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + uint32_t val = 0; + + if (pread(vdev->vfiofd, &val, len, VFIO_PCI_CONFIG_OFF + addr) != len) { + fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %s\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func, addr, len, strerror(errno)); + return -1; + } + DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) %x\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func, addr, len, val); + 
return pci_default_read_config(pdev, addr, len); +} + +static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, + uint32_t val, int len) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + + if (pwrite(vdev->vfiofd, &val, len, VFIO_PCI_CONFIG_OFF + addr) != len) { + fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %s\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, addr, val, len, strerror(errno)); + } + + DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, addr, val, len); + + if (vdev->msix.pos && (addr == vdev->msix.pos + PCI_MSIX_FLAGS)) { + if (vdev->msix.enabled) { + if (!(val & PCI_MSIX_FLAGS_ENABLE)) { + vfio_disable_msix(vdev); + } + } else { + if (val & PCI_MSIX_FLAGS_ENABLE) { + vfio_enable_msix(vdev); + } + } + } + + if (vdev->msi.pos && (addr == vdev->msi.pos + PCI_MSI_FLAGS)) { + if (vdev->msi.enabled) { + if (!(val & PCI_MSI_FLAGS_ENABLE)) { + vfio_disable_msi(vdev); + } + } else { + if (val & PCI_MSI_FLAGS_ENABLE) { + vfio_enable_msi(vdev); + } + } + } + + if (addr == PCI_INTERRUPT_LINE) { + if (len != 1) { + fprintf(stderr, "vfio: fixme: INTERRUPT_LINE written as %d bytes\n", + len); + } + vdev->intx.eoi_client.irq = val; + } + + pci_default_write_config(pdev, addr, val, len); +} + +/* + * DMA + */ +static int vfio_do_map_iommu(VFIODevice *vdev, int map) +{ + QemuRamSlot *slot; + + QLIST_FOREACH(slot, &ram_slots.slots, next) { + struct vfio_dma_map dma_map; + + dma_map.vaddr = (uint64_t)qemu_get_ram_ptr(slot->offset); + dma_map.dmaaddr = slot->start_addr; + dma_map.size = slot->size; + dma_map.flags = VFIO_FLAG_WRITE; + + if (map) { + if (ioctl(vdev->vfiofd, VFIO_DMA_MAP_IOVA, &dma_map)) + return -1; + } else { + ioctl(vdev->vfiofd, VFIO_DMA_UNMAP, &dma_map); + } + } + return 0; +} + +static int vfio_map_iommu(VFIODevice *vdev) +{ + return vfio_do_map_iommu(vdev, 1); +} + +static void 
vfio_unmap_iommu(VFIODevice *vdev) +{ + vfio_do_map_iommu(vdev, 0); +} + +/* + * Interrupt setup + */ +static int vfio_setup_msi(VFIODevice *vdev) +{ + int pos; + + if ((pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSI))) { + DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @%d\n", vdev->host.seg, + vdev->host.bus, vdev->host.dev, vdev->host.func, pos); + vdev->msi.pos = pos; + } + + if ((pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSIX))) { + uint16_t ctrl; + uint32_t table, pba, len; + + if (pread(vdev->vfiofd, &ctrl, sizeof(ctrl), + VFIO_PCI_CONFIG_OFF + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { + return -1; + } + + if (pread(vdev->vfiofd, &table, sizeof(table), VFIO_PCI_CONFIG_OFF + + pos + PCI_MSIX_TABLE) != sizeof(table)) { + return -1; + } + + if (pread(vdev->vfiofd, &pba, sizeof(pba), + VFIO_PCI_CONFIG_OFF + pos + PCI_MSIX_PBA) != sizeof(pba)) { + return -1; + } + + ctrl = le16_to_cpu(ctrl); + table = le32_to_cpu(table); + pba = le32_to_cpu(pba); + + vdev->msix.pos = pos; + vdev->msix.table_len = (ctrl & PCI_MSIX_TABSIZE) + 1; + vdev->msix.bar = table & PCI_MSIX_BIR; + vdev->msix.bar_offset = table & ~PCI_MSIX_BIR; + vdev->resources[vdev->msix.bar].msix = 1; + vdev->resources[vdev->msix.bar].msix_offset = vdev->msix.bar_offset; + + DPRINTF("%04x:%02x:%02x.%x PCI MSI-X CAP @%d, BAR %d, offset 0x%x\n", + vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, pos, vdev->msix.bar, vdev->msix.bar_offset); + + if ((pba & PCI_MSIX_BIR) == vdev->msix.bar && + ((pba & ~0xfff) == vdev->msix.bar_offset)) { + fprintf(stderr, "vfio: Error: MSIX Table & PBA reside in the same " + "page, not yet supported\n"); + return -1; + } + + /* + * Check if the BAR containing the MSIX table is 4k aligned, if + * so we can avoid slow mapping messiness. This shouldn't fail + * for devices that follow the spec recommendations for sizing + * and placement. 
*/ + len = vdev->msix.bar; + if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) { + fprintf(stderr, "vfio: VFIO_BAR_LEN failed for MSIX BAR\n"); + return -1; + } + if (!len || len & 0xfff) { + fprintf(stderr, "vfio: MSIX BAR not 4k aligned\n"); + return -1; + } + + vdev->msix.table = mmap(NULL, 0x1000, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); /* anonymous mapping has no backing file: pass fd -1, not stdin */ + if (vdev->msix.table == MAP_FAILED) { + fprintf(stderr, "vfio: Failed to allocate MSIX table page\n"); + return -1; + } + + memset(vdev->msix.table, 0, 0x1000); + vdev->msix.index = cpu_register_io_memory(msix_mmio_reads, + msix_mmio_writes, vdev); + } + return 0; +} + +static void vfio_teardown_msi(VFIODevice *vdev) +{ + if (vdev->msix.table && vdev->msix.table != MAP_FAILED) { /* a failed vfio_setup_msi() leaves MAP_FAILED here, which must not be unmapped */ + munmap(vdev->msix.table, 0x1000); + } + if (vdev->msix.index) { + cpu_unregister_io_memory(vdev->msix.index); + } +} + +/* + * Resource setup + */ +static int vfio_setup_resources(VFIODevice *vdev) +{ + int i; + + for (i = 0; i < PCI_NUM_REGIONS; i++) { + uint32_t len, bar; + PCIResource *res; + uint8_t offset; + int ret, space; + + res = &vdev->resources[i]; + res->vfiofd = vdev->vfiofd; + res->bar = len = i; + + if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) { + fprintf(stderr, "vfio: VFIO_BAR_LEN failed for BAR %d\n", i); + return -1; + } + if (!len) { + continue; + } + + offset = PCI_BASE_ADDRESS_0 + (4 * i); + ret = pread(vdev->vfiofd, &bar, sizeof(bar), + VFIO_PCI_CONFIG_OFF + offset); + if (ret != sizeof(bar)) { + fprintf(stderr, "vfio: Failed to read BAR %d\n", i); + return -1; + } + bar = le32_to_cpu(bar); + space = bar & PCI_BASE_ADDRESS_SPACE; + + if (space == PCI_BASE_ADDRESS_SPACE_MEMORY && !(len & 0xfff)) { + int off = VFIO_PCI_BAR0_RESOURCE + i; + int flags = PROT_READ; + char name[32]; + + res->mem = 1; + res->size = len; + + if (i != PCI_ROM_SLOT) { + flags |= PROT_WRITE; + } + + if (vdev->pdev.qdev.info->vmsd) { + snprintf(name, sizeof(name), "%s.bar%d", + vdev->pdev.qdev.info->vmsd->name, i); + } else { + snprintf(name, sizeof(name), "%s.bar%d", + 
vdev->pdev.qdev.info->name, i); + } + + if (res->msix) { + if (res->msix_offset) { + char *c = &name[strlen(name)]; + + res->r_virtbase[0] = mmap(NULL, res->msix_offset, flags, + MAP_SHARED, vdev->vfiofd, + vfio_pci_space_to_offset(off)); + + if (res->r_virtbase[0] == MAP_FAILED) { + fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i); + return -1; + } + strncat(name, ".0", sizeof(name)); + res->memory_index[0] = qemu_ram_map(&vdev->pdev.qdev, + name, res->msix_offset, + res->r_virtbase[0]); + *c = 0; + } + if (len > res->msix_offset + 0x1000) { + char *c = &name[strlen(name)]; + + res->r_virtbase[1] = mmap(NULL, + len - res->msix_offset - 0x1000, + flags, MAP_SHARED, vdev->vfiofd, + vfio_pci_space_to_offset(off) + + res->msix_offset + 0x1000); + + if (res->r_virtbase[1] == MAP_FAILED) { + fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i); + return -1; + } + strncat(name, ".1", sizeof(name)); + res->memory_index[1] = qemu_ram_map(&vdev->pdev.qdev, name, + len - 0x1000 - + res->msix_offset, + res->r_virtbase[1]); + *c = 0; + } + } else { + res->r_virtbase[0] = mmap(NULL, len, flags, MAP_SHARED, + vdev->vfiofd, + vfio_pci_space_to_offset(off)); + + if (res->r_virtbase[0] == MAP_FAILED) { + fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i); + return -1; + } + res->memory_index[0] = qemu_ram_map(&vdev->pdev.qdev, name, + len, res->r_virtbase[0]); + if (i == PCI_ROM_SLOT) { + res->memory_index[0] |= IO_MEM_ROM; + } + } + + pci_register_bar(&vdev->pdev, i, res->size, + bar & PCI_BASE_ADDRESS_MEM_PREFETCH ? 
+ PCI_BASE_ADDRESS_MEM_PREFETCH : + PCI_BASE_ADDRESS_SPACE_MEMORY, + vfio_iomem_map); + + if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64) { + i++; + } + } else if (space == PCI_BASE_ADDRESS_SPACE_MEMORY) { + res->mem = 1; + res->size = len; + res->slow = 1; + + DPRINTF("%s(%04x:%02x:%02x.%x) Using slow mapping for BAR %d\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func, i); + + if (i == PCI_ROM_SLOT) { + res->io_mem = cpu_register_io_memory(vfio_resource_reads, + NULL, res); + } else { + res->io_mem = cpu_register_io_memory(vfio_resource_reads, + vfio_resource_writes, res); + } + + pci_register_bar(&vdev->pdev, i, res->size, + bar & PCI_BASE_ADDRESS_MEM_PREFETCH ? + PCI_BASE_ADDRESS_MEM_PREFETCH : + PCI_BASE_ADDRESS_SPACE_MEMORY, + vfio_iomem_map); + + } else if (space == PCI_BASE_ADDRESS_SPACE_IO) { + res->size = len; + pci_register_bar(&vdev->pdev, i, res->size, + PCI_BASE_ADDRESS_SPACE_IO, vfio_ioport_map); + } + res->valid = 1; + } + return 0; +} + +static void vfio_unmap_resources(VFIODevice *vdev) +{ + int i; + PCIResource *res = vdev->resources; + + for (i = 0; i < PCI_NUM_REGIONS; i++, res++) { + if (res->valid && res->mem) { + if (res->msix) { + if (res->msix_offset) { + cpu_register_physical_memory(res->e_phys, res->msix_offset, + IO_MEM_UNASSIGNED); + qemu_ram_unmap(res->memory_index[0]); + munmap(res->r_virtbase[0], res->msix_offset); + } + if (res->size > res->msix_offset + 0x1000) { + cpu_register_physical_memory(res->e_phys + 0x1000 + + res->msix_offset, + res->e_size - 0x1000 - + res->msix_offset, + IO_MEM_UNASSIGNED); + qemu_ram_unmap(res->memory_index[1]); + munmap(res->r_virtbase[1], + res->size - 0x1000 - res->msix_offset); + } + } else { + if (!res->slow) { + cpu_register_physical_memory(res->e_phys, res->e_size, + IO_MEM_UNASSIGNED); + qemu_ram_unmap(res->memory_index[0]); + munmap(res->r_virtbase[0], res->size); + } else { + cpu_unregister_io_memory(res->io_mem); + } + } + } + } +} + +/* + * General setup + */ 
+static int get_vfio_fd(VFIODevice *vdev) +{ + if (vdev->vfiofd_name && strlen(vdev->vfiofd_name) > 0) { + if (qemu_isdigit(vdev->vfiofd_name[0])) { + vdev->vfiofd = strtol(vdev->vfiofd_name, NULL, 0); + return 0; + } else { + vdev->vfiofd = monitor_get_fd(cur_mon, vdev->vfiofd_name); + if (vdev->vfiofd < 0) { + fprintf(stderr, "%s: (%s) unkown\n", __func__, + vdev->vfiofd_name); + return -1; + } + return 0; + } + } else { + char vfio_dir[64], vfio_dev[16]; + DIR *dir; + struct dirent *de; + + sprintf(vfio_dir, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/vfio/", + vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func); + dir = opendir(vfio_dir); + if (!dir) { + error_report("vfio: error: Driver not attached\n"); + return -1; + } + + while ((de = readdir(dir))) { + if (de->d_name[0] == '.') + continue; + if (!strncmp(de->d_name, "vfio", 4)) + break; + } + + if (!de) { + error_report("vfio: error: Cannot find vfio* in %s\n", vfio_dir); + return -1; + } + + sprintf(vfio_dev, "/dev/%s", de->d_name); + vdev->vfiofd = open(vfio_dev, O_RDWR); + if (vdev->vfiofd < 0) { + error_report("pci-assign: vfio: Failed to open %s: %s\n", + vfio_dev, strerror(errno)); + return -1; + } + return 0; + } +} + +static int get_uiommu_fd(VFIODevice *vdev) +{ + if (vdev->uiommufd_name && strlen(vdev->uiommufd_name) > 0) { + if (qemu_isdigit(vdev->uiommufd_name[0])) { + vdev->uiommufd = strtol(vdev->uiommufd_name, NULL, 0); + return 0; + } else { + vdev->uiommufd = monitor_get_fd(cur_mon, vdev->uiommufd_name); + if (vdev->uiommufd < 0) { + fprintf(stderr, "%s: (%s) unkown\n", __func__, + vdev->uiommufd_name); + return -1; + } + return 0; + } + } else { + vdev->uiommufd = open("/dev/uiommu", O_RDONLY); + if (vdev->uiommufd < 0) { + return -1; + } + vdev->uiommufd_name = NULL; /* easier test later */ + return 0; + } +} + +static int vfio_initfn(struct PCIDevice *pdev) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + char sys[64]; + struct stat st; + int ret; + + /* Check 
that the host device exists */ + sprintf(sys, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/", + vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func); + if (stat(sys, &st) < 0) { + error_report("vfio: error: no such host device " + "%04x:%02x:%02x.%01x", vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func); + return -1; + } + + if (get_uiommu_fd(vdev)) + return -1; + + if (get_vfio_fd(vdev)) + goto out_close_uiommu; + + if (ioctl(vdev->vfiofd, VFIO_DOMAIN_SET, &vdev->uiommufd)) + goto out_close_vfiofd; + + /* Get a copy of config space */ + ret = pread(vdev->vfiofd, vdev->pdev.config, + pci_config_size(&vdev->pdev), VFIO_PCI_CONFIG_OFF); + if (ret < pci_config_size(&vdev->pdev)) { + fprintf(stderr, "vfio: Failed to read device config space\n"); + goto out_unset_domain; + } + + if (vfio_setup_msi(vdev)) + goto out_unset_domain; + + if (vfio_setup_resources(vdev)) + goto out_disable_msix; + + if (vfio_map_iommu(vdev)) + goto out_unmap_resources; + + if (vfio_enable_intx(vdev)) + goto out_unmap_iommu; + + return 0; + +out_unmap_iommu: + vfio_unmap_iommu(vdev); +out_unmap_resources: + vfio_unmap_resources(vdev); +out_disable_msix: + vfio_teardown_msi(vdev); +out_unset_domain: + ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET); +out_close_vfiofd: + close(vdev->vfiofd); +out_close_uiommu: + if (!vdev->uiommufd_name) + close(vdev->uiommufd); + return -1; +} + +static int vfio_exitfn(struct PCIDevice *pdev) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + + vfio_disable_intx(vdev); + vfio_disable_msi(vdev); + vfio_disable_msix(vdev); + vfio_unmap_iommu(vdev); + vfio_unmap_resources(vdev); + ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET); + close(vdev->vfiofd); + if (!vdev->uiommufd_name) + close(vdev->uiommufd); + return 0; +} + +static PropertyInfo qdev_prop_hostaddr = { + .name = "pci-hostaddr", + .type = -1, + .size = sizeof(PCIHostDevice), + .parse = parse_hostaddr, + .print = print_hostaddr, +}; + +static PCIDeviceInfo vfio_info = { + .qdev.name = 
"vfio", + .qdev.desc = "pass through host pci devices to the guest via vfio", + .qdev.size = sizeof(VFIODevice), + .init = vfio_initfn, + .exit = vfio_exitfn, + .config_read = vfio_pci_read_config, + .config_write = vfio_pci_write_config, + .qdev.props = (Property[]) { + DEFINE_PROP("host", VFIODevice, host, + qdev_prop_hostaddr, PCIHostDevice), + DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name), + DEFINE_PROP_STRING("uiommufd", VFIODevice, uiommufd_name), + DEFINE_PROP_END_OF_LIST(), + }, +}; + +static void vfio_register_devices(void) +{ + pci_qdev_register(&vfio_info); +} + +device_init(vfio_register_devices) diff --git a/hw/vfio.h b/hw/vfio.h new file mode 100644 index 0000000..9d05ae1 --- /dev/null +++ b/hw/vfio.h @@ -0,0 +1,90 @@ +#ifndef __VFIO_H__ +#define __VFIO_H__ + +#include "qemu-common.h" +#include "qemu-queue.h" +#include "pci.h" + +typedef struct PCIHostDevice { + uint16_t seg; + uint8_t bus; + uint8_t dev:5; + uint8_t func:3; +} PCIHostDevice; + +typedef struct PCIResource { + uint8_t valid:1; + uint8_t mem:1; + uint8_t msix:1; + uint8_t bar:3; /* see vfio_resource_read/write */ + uint8_t slow:1; /* use read/write rather than mmap */ + uint64_t size; + ram_addr_t memory_index[2]; /* cpu_register_physical_memory() index */ + void *r_virtbase[2]; /* mmapped address */ + int io_mem; /* cpu_register_io_memory index */ + pcibus_t e_phys; /* emulated base address */ + pcibus_t e_size; /* emulated size of region in bytes */ + uint32_t msix_offset; + int vfiofd; /* see vfio_resource_read/write */ +} PCIResource; + +typedef struct INTx { + uint8_t enabled:1; + uint8_t pending:1; + uint8_t pin:3; + EventNotifier notifier; + ioapic_eoi_client eoi_client; +} INTx; + +typedef struct MSIEvent { + EventNotifier notifier; + uint8_t *addr; + uint8_t *upper_addr; + uint8_t *data; +} MSIEvent; + +typedef struct MSI { + uint8_t enabled:1; + uint8_t pos; + int num_events; + MSIEvent *events; +} MSI; + +typedef struct __attribute__((packed)) MSIXTableEntry { + 
uint32_t addr; + uint32_t upper_addr; + uint32_t data; + uint32_t ctrl; +} MSIXTableEntry; + +typedef struct MSIXEvent { + EventNotifier notifier; + MSIXTableEntry *entry; +} MSIXEvent; + +typedef struct MSIX { + uint8_t enabled:1; + uint8_t bar:3; + uint8_t pos; + uint16_t table_len; + uint32_t bar_offset; + MSIXTableEntry *table; + int num_events; + MSIXEvent *events; + int index; +} MSIX; + +typedef struct VFIODevice { + PCIDevice pdev; + PCIHostDevice host; + PCIResource resources[PCI_NUM_REGIONS]; + INTx intx; + MSI msi; + MSIX msix; + int vfiofd; + int uiommufd; + char *vfiofd_name; + char *uiommufd_name; +} VFIODevice; + +#endif /* __VFIO_H__ */