From patchwork Mon Oct 15 20:28:15 2012
X-Patchwork-Submitter: Alex Williamson
X-Patchwork-Id: 1596521
From: Alex Williamson <alex.williamson@redhat.com>
Subject: [PATCH] vfio-pci: Add KVM INTx acceleration
To: alex.williamson@redhat.com
Cc: qemu-devel@nongnu.org, kvm@vger.kernel.org, mst@redhat.com
Date: Mon, 15 Oct 2012 14:28:15 -0600
Message-ID: <20121015202031.23323.72827.stgit@bling.home>
User-Agent: StGIT/0.14.3

This makes use of the new level irqfd support, enabling bypass of QEMU
userspace on both INTx injection and unmask.  This significantly boosts
the performance of devices making use of legacy interrupts.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

My INTx routing workaround below will probably raise some eyebrows, but
I don't feel it's worth subjecting users to core dumps if they want to
try vfio-pci on new platforms.  INTx routing is part of some larger
plan, but until that plan materializes we have to try to avoid the API
unless we think there's a good chance it might be there.  I'll accept
the maintenance of updating a whitelist in the interim.  Thanks,

Alex

 hw/vfio_pci.c |  224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 224 insertions(+)
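For reference, the level/resample irqfd pairing this relies on boils down
to something like the sketch below.  This is illustrative only and not
part of the patch; vm_fd, gsi and setup_level_irqfd() are placeholders
for a KVM VM descriptor, a guest IRQ number and a helper name invented
for the example.

    /*
     * Standalone sketch of registering a level-triggered irqfd with a
     * resample eventfd (assumes a kernel with KVM_CAP_IRQFD_RESAMPLE).
     */
    #include <sys/eventfd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int setup_level_irqfd(int vm_fd, unsigned int gsi,
                                 int *trigger_fd, int *resample_fd)
    {
        struct kvm_irqfd irqfd = { .gsi = gsi, .flags = KVM_IRQFD_FLAG_RESAMPLE };

        /* eventfd the device backend signals to assert the level interrupt */
        *trigger_fd = eventfd(0, EFD_CLOEXEC);
        /* eventfd KVM signals on guest EOI so the device can be unmasked */
        *resample_fd = eventfd(0, EFD_CLOEXEC);
        if (*trigger_fd < 0 || *resample_fd < 0) {
            return -1;
        }

        irqfd.fd = *trigger_fd;
        irqfd.resamplefd = *resample_fd;

        /* Once registered, assert -> EOI -> unmask stays in the kernel */
        return ioctl(vm_fd, KVM_IRQFD, &irqfd);
    }

In the patch below, the trigger side is vdev->intx.interrupt and the
resample side is vdev->intx.unmask, which is also handed to VFIO via
VFIO_IRQ_SET_ACTION_UNMASK so the device interrupt is re-enabled on EOI
without bouncing through userspace.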
diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index 639371e..777a5f8 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -154,6 +154,53 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
 static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
 
 /*
+ * PCI code refuses to make it possible to probe whether the chipset
+ * supports pci_device_route_intx_to_irq() and booby traps the call
+ * to assert if it doesn't.  For us, this is just an optimization, so
+ * only enable it when we know it's present.  Unfortunately PCIBus is
+ * private, so we can't just look at the function pointer.
+ */
+static bool vfio_pci_bus_has_intx_route(PCIDevice *pdev)
+{
+#ifdef CONFIG_KVM
+    BusState *bus = qdev_get_parent_bus(&pdev->qdev);
+    DeviceState *dev;
+
+    if (!kvm_irqchip_in_kernel() ||
+        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
+        return false;
+    }
+
+    for (; bus->parent; bus = qdev_get_parent_bus(dev)) {
+
+        dev = bus->parent;
+
+        if (!strncmp("i440FX-pcihost", object_get_typename(OBJECT(dev)), 14)) {
+            return true;
+        }
+    }
+
+    error_report("vfio-pci: VM chipset does not support INTx routing, "
+                 "using slow INTx mode\n");
+#endif
+    return false;
+}
+
+static PCIINTxRoute vfio_pci_device_route_intx_to_irq(PCIDevice *pdev, int pin)
+{
+    if (!vfio_pci_bus_has_intx_route(pdev)) {
+        return (PCIINTxRoute) { .mode = PCI_INTX_DISABLED, .irq = -1 };
+    }
+
+    return pci_device_route_intx_to_irq(pdev, pin);
+}
+
+static bool vfio_pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new)
+{
+    return old->mode != new->mode || old->irq != new->irq;
+}
+
+/*
  * Common VFIO interrupt disable
  */
 static void vfio_disable_irqindex(VFIODevice *vdev, int index)
@@ -185,6 +232,21 @@ static void vfio_unmask_intx(VFIODevice *vdev)
     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 }
 
+#ifdef CONFIG_KVM
+static void vfio_mask_intx(VFIODevice *vdev)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
+        .index = VFIO_PCI_INTX_IRQ_INDEX,
+        .start = 0,
+        .count = 1,
+    };
+
+    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+}
+#endif
+
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
  * also be a huge overhead.  We try to get the best of both worlds by
@@ -248,6 +310,161 @@ static void vfio_eoi(VFIODevice *vdev)
     vfio_unmask_intx(vdev);
 }
 
+static void vfio_enable_intx_kvm(VFIODevice *vdev)
+{
+#ifdef CONFIG_KVM
+    struct kvm_irqfd irqfd = {
+        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
+        .gsi = vdev->intx.route.irq,
+        .flags = KVM_IRQFD_FLAG_RESAMPLE,
+    };
+    struct vfio_irq_set *irq_set;
+    int ret, argsz;
+    int32_t *pfd;
+
+    if (!kvm_irqchip_in_kernel() ||
+        vdev->intx.route.mode != PCI_INTX_ENABLED ||
+        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
+        return;
+    }
+
+    /* Get to a known interrupt state */
+    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
+    vfio_mask_intx(vdev);
+    vdev->intx.pending = false;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+
+    /* Get an eventfd for resample/unmask */
+    if (event_notifier_init(&vdev->intx.unmask, 0)) {
+        error_report("vfio: Error: event_notifier_init failed eoi\n");
+        goto fail;
+    }
+
+    /* KVM triggers it, VFIO listens for it */
+    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);
+
+    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
+        error_report("vfio: Error: Failed to setup resample irqfd: %m\n");
+        goto fail_irqfd;
+    }
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
+    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+
+    *pfd = irqfd.resamplefd;
+
+    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    g_free(irq_set);
+    if (ret) {
+        error_report("vfio: Error: Failed to setup INTx unmask fd: %m\n");
+        goto fail_vfio;
+    }
+
+    /* Let'em rip */
+    vfio_unmask_intx(vdev);
+
+    vdev->intx.kvm_accel = true;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
+            __func__, vdev->host.domain, vdev->host.bus,
+            vdev->host.slot, vdev->host.function);
+
+    return;
+
+fail_vfio:
+    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
+    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
+fail_irqfd:
+    event_notifier_cleanup(&vdev->intx.unmask);
+fail:
+    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
+    vfio_unmask_intx(vdev);
+#endif
+}
+
+static void vfio_disable_intx_kvm(VFIODevice *vdev)
+{
+#ifdef CONFIG_KVM
+    struct kvm_irqfd irqfd = {
+        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
+        .gsi = vdev->intx.route.irq,
+        .flags = KVM_IRQFD_FLAG_DEASSIGN,
+    };
+
+    if (!vdev->intx.kvm_accel) {
+        return;
+    }
+
+    /*
+     * Get to a known state, hardware masked, QEMU ready to accept new
+     * interrupts, QEMU IRQ de-asserted.
+     */
+    vfio_mask_intx(vdev);
+    vdev->intx.pending = false;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+
+    /* Tell KVM to stop listening for an INTx irqfd */
+    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
+        error_report("vfio: Error: Failed to disable INTx irqfd: %m\n");
+    }
+
+    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
+    event_notifier_cleanup(&vdev->intx.unmask);
+
+    /* QEMU starts listening for interrupt events. */
+    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
+
+    vdev->intx.kvm_accel = false;
+
+    /* If we've missed an event, let it re-fire through QEMU */
+    vfio_unmask_intx(vdev);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
+            __func__, vdev->host.domain, vdev->host.bus,
+            vdev->host.slot, vdev->host.function);
+#endif
+}
+
+static void vfio_update_irq(PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    PCIINTxRoute route;
+
+    if (vdev->interrupt != VFIO_INT_INTx) {
+        return;
+    }
+
+    route = vfio_pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
+
+    if (!vfio_pci_intx_route_changed(&vdev->intx.route, &route)) {
+        return; /* Nothing changed */
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vdev->intx.route.irq, route.irq);
+
+    vfio_disable_intx_kvm(vdev);
+
+    vdev->intx.route = route;
+
+    if (route.mode != PCI_INTX_ENABLED) {
+        return;
+    }
+
+    vfio_enable_intx_kvm(vdev);
+
+    /* Re-enable the interrupt in case we missed an EOI */
+    vfio_eoi(vdev);
+}
+
 static int vfio_enable_intx(VFIODevice *vdev)
 {
     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
@@ -262,6 +479,9 @@ static int vfio_enable_intx(VFIODevice *vdev)
     vfio_disable_interrupts(vdev);
 
     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
+    vdev->intx.route = vfio_pci_device_route_intx_to_irq(&vdev->pdev,
+                                                         vdev->intx.pin);
+
     ret = event_notifier_init(&vdev->intx.interrupt, 0);
     if (ret) {
         error_report("vfio: Error: event_notifier_init failed\n");
@@ -290,6 +510,8 @@ static int vfio_enable_intx(VFIODevice *vdev)
         return -errno;
     }
 
+    vfio_enable_intx_kvm(vdev);
+
     vdev->interrupt = VFIO_INT_INTx;
 
     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
@@ -303,6 +525,7 @@ static void vfio_disable_intx(VFIODevice *vdev)
     int fd;
 
     qemu_del_timer(vdev->intx.mmap_timer);
+    vfio_disable_intx_kvm(vdev);
     vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
     vdev->intx.pending = false;
     qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
@@ -1870,6 +2093,7 @@ static int vfio_initfn(PCIDevice *pdev)
     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
         vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock,
                                                   vfio_intx_mmap_enable, vdev);
+        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
         ret = vfio_enable_intx(vdev);
         if (ret) {
             goto out_teardown;