From patchwork Fri Mar 31 17:54:47 2017
X-Patchwork-Id: 9657053
From: Jean-Philippe Brucker
To: kvm@vger.kernel.org
Cc: will.deacon@arm.com, robin.murphy@arm.com
Subject: [PATCH kvmtool 07/10] vfio-pci: add MSI-X support
Date: Fri, 31 Mar 2017 18:54:47 +0100
Message-Id: <20170331175450.24269-8-jean-philippe.brucker@arm.com>
In-Reply-To: <20170331175450.24269-1-jean-philippe.brucker@arm.com>
References: <20170331175450.24269-1-jean-philippe.brucker@arm.com>

Add virtual MSI-X tables for PCI devices, and create IRQFD routes to
let the kernel inject MSIs from a physical device directly into the
guest.
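In outline: the BARs containing the MSI-X table and PBA are not mapped
straight into the guest; accesses to them trap into the MMIO handlers
added below. When the guest writes an entry's Vector Control word, we
lazily create a KVM routing entry for that MSI and attach the eventfd
that VFIO signals for the vector. A minimal sketch of that write path,
condensed from vfio_pci_msix_table_access() below (error handling
elided):

	/* Guest wrote the Vector Control word of MSI-X entry "vector" */
	struct vfio_pci_msix_entry *entry = &table->entries[vector];

	if (entry->gsi < 0) {
		/* First unmask: allocate a GSI routed to this MSI message */
		entry->gsi = irq__add_msix_route(kvm, &entry->config.msg,
						 device->dev_hdr.dev_num << 3);
		/* Let KVM inject the MSI whenever VFIO signals the eventfd */
		irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
	} else {
		/* The address/data pair may have changed: update the route */
		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
	}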
Signed-off-by: Jean-Philippe Brucker
---
 include/kvm/vfio.h |  24 ++++
 vfio.c             | 366 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 369 insertions(+), 21 deletions(-)

diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h
index 6d2666b0..68535963 100644
--- a/include/kvm/vfio.h
+++ b/include/kvm/vfio.h
@@ -8,8 +8,32 @@
 
 #define MAX_VFIO_GROUPS			16
 
+struct vfio_pci_msix_entry {
+	struct msix_table config;
+	int gsi;
+	int eventfd;
+};
+
+struct vfio_pci_msix_table {
+	size_t nr_entries;
+	size_t size;
+	unsigned int bar;
+	u32 guest_phys_addr;
+	struct vfio_pci_msix_entry *entries;
+};
+
+struct vfio_pci_msix_pba {
+	size_t size;
+	off_t offset; /* in VFIO device fd */
+	unsigned int bar;
+	u32 guest_phys_addr;
+};
+
 struct vfio_pci_device {
 	struct pci_device_header	hdr;
+
+	struct vfio_pci_msix_table	msix_table;
+	struct vfio_pci_msix_pba	msix_pba;
 };
 
 struct vfio_region {
diff --git a/vfio.c b/vfio.c
index 0f5bc3dd..85d1ea8b 100644
--- a/vfio.c
+++ b/vfio.c
@@ -50,6 +50,70 @@ int vfio_group_parser(const struct option *opt, const char *arg, int unset)
 	return 0;
 }
 
+static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
+				     u32 len, u8 is_write, void *ptr)
+{
+	struct vfio_pci_device *pdev = ptr;
+	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
+	u64 offset = addr - pba->guest_phys_addr;
+	struct vfio_device *device = container_of(pdev, struct vfio_device, pci);
+
+	if (is_write)
+		return;
+
+	if (pread(device->fd, data, len, pba->offset + offset) != len)
+		pr_err("cannot access MSIX PBA\n");
+}
+
+static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
+				       u32 len, u8 is_write, void *ptr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct vfio_pci_device *pdev = ptr;
+	struct vfio_pci_msix_entry *entry;
+	struct vfio_pci_msix_table *table = &pdev->msix_table;
+	struct vfio_device *device = container_of(pdev, struct vfio_device, pci);
+
+	u64 offset = addr - table->guest_phys_addr;
+
+	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
+	/* PCI spec says that software must use aligned 4- or 8-byte accesses */
+	off_t field = offset % PCI_MSIX_ENTRY_SIZE;
+	entry = &table->entries[vector];
+
+	if (!is_write) {
+		memcpy(data, (void *)&entry->config + field, len);
+		return;
+	}
+
+	memcpy((void *)&entry->config + field, data, len);
+
+	if (field != PCI_MSIX_ENTRY_VECTOR_CTRL)
+		return;
+
+	if (entry->gsi < 0) {
+		int ret = irq__add_msix_route(kvm, &entry->config.msg,
+					      device->dev_hdr.dev_num << 3);
+		if (ret < 0) {
+			pr_err("cannot create MSI-X route");
+		} else {
+			entry->gsi = ret;
+
+			ret = irq__add_irqfd(kvm, ret, entry->eventfd, -1);
+			if (ret < 0)
+				pr_err("Cannot setup irqfd");
+		}
+
+		if (ret < 0)
+			/* Not much we can do here. Mask the vector. */
+			entry->config.ctrl = 1;
+
+		return;
+	}
+
+	irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
+}
+
 static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
 			      u8 offset, void *data, int sz)
 {
@@ -89,17 +153,94 @@ static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hd
 			sz, offset);
 }
 
+static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
+{
+	switch (cap_hdr->type) {
+	case PCI_CAP_ID_MSIX:
+		return PCI_CAP_MSIX_SIZEOF;
+	default:
+		pr_err("unknown PCI capability %u", cap_hdr->type);
+		return 0;
+	}
+}
+
+/*
+ * Copy capability from physical header into virtual header, and add it to the
+ * virtual capability list.
+ *
+ * @fd_offset: offset of pci header into vfio device fd
+ * @pos: offset of capability from start of header
+ */
+static int vfio_pci_add_cap(struct vfio_device *device, struct pci_cap_hdr *cap_hdr,
+			    off_t fd_offset, off_t pos)
+{
+	int i;
+	ssize_t size = vfio_pci_cap_size(cap_hdr);
+	struct pci_device_header *hdr = &device->pci.hdr;
+	struct pci_cap_hdr *out = (void *)hdr + pos;
+
+	if (pread(device->fd, out, size, fd_offset + pos) != size)
+		return -errno;
+
+	out->next = 0;
+
+	if (!hdr->capabilities) {
+		hdr->capabilities = pos;
+		hdr->status |= PCI_STATUS_CAP_LIST;
+	} else {
+		/* Add cap at end of list */
+		struct pci_cap_hdr *last;
+
+		pci_for_each_cap(i, last, hdr)
+			;
+		last->next = pos;
+	}
+
+	return 0;
+}
+
 static int vfio_pci_parse_caps(struct vfio_device *device)
 {
+	u8 pos;
+	int ret;
+	struct pci_cap_hdr cap;
+	ssize_t sz = sizeof(cap);
+	struct vfio_region_info *info;
 	struct vfio_pci_device *pdev = &device->pci;
 
 	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
 		return 0;
 
+	pos = pdev->hdr.capabilities & ~3;
+	info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
 	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
 	pdev->hdr.capabilities = 0;
 
-	/* TODO: install virtual capabilities */
+	for (; pos; pos = cap.next) {
+		if (pos >= PCI_DEV_CFG_SIZE) {
+			pr_warning("Ignoring cap outside of config space");
+			return -EINVAL;
+		}
+
+		if (pread(device->fd, &cap, sz, info->offset + pos) != sz) {
+			pr_warning("Failed to read from capabilities pointer (0x%x)",
+				   pos);
+			return -EINVAL;
+		}
+
+		switch (cap.type) {
+		case PCI_CAP_ID_MSIX:
+			ret = vfio_pci_add_cap(device, &cap, info->offset, pos);
+			if (ret) {
+				pr_warning("Failed to read MSI-X capability structure");
+				return ret;
+			}
+			break;
+
+		/* Any other capability is hidden */
+		}
+	}
 
 	return 0;
 }
@@ -150,7 +291,11 @@ static int vfio_pci_parse_cfg_space(struct vfio_device *device)
 static int vfio_pci_fixup_cfg_space(struct vfio_device *device)
 {
 	int i;
+	int pos;
 	ssize_t hdr_sz;
+	ssize_t cap_sz;
+	struct pci_cap_hdr *cap;
+	struct msix_cap *msix;
 	struct vfio_region_info *info;
 	struct vfio_pci_device *pdev = &device->pci;
 
@@ -183,6 +328,22 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *device)
 	 */
 	pdev->hdr.exp_rom_bar = 0;
 
+	/* Plumb in our fake MSI-X capability, if we have it. */
+	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
+	if (msix) {
+		/* Add a shortcut to the PBA region for the MMIO handler */
+		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
+		pdev->msix_pba.offset = device->regions[pba_index].info.offset +
+					(msix->pba_offset & PCI_MSIX_PBA_OFFSET);
+
+		/* Tidy up the capability */
+		msix->table_offset &= PCI_MSIX_TABLE_BIR;
+		msix->pba_offset &= PCI_MSIX_PBA_BIR;
+		if (pdev->msix_table.bar == pdev->msix_pba.bar)
+			msix->pba_offset |= pdev->msix_table.size &
+					    PCI_MSIX_PBA_OFFSET;
+	}
+
 	/* Install our fake Configuration Space, without the caps */
 	info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
 	hdr_sz = offsetof(struct pci_device_header, msix);
@@ -191,7 +352,17 @@
 		return -EIO;
 	}
 
-	/* TODO: install virtual capabilities */
+	/* Install the fake capability list */
+	pci_for_each_cap(pos, cap, &pdev->hdr) {
+		cap_sz = vfio_pci_cap_size(cap);
+
+		if (pwrite(device->fd, cap, cap_sz, info->offset + pos) !=
+		    cap_sz) {
+			pr_err("Failed to write capability %u", cap->type);
+			return -EIO;
+		}
+	}
+
 	/* Register callbacks for cfg accesses */
 	pdev->hdr.cfg_ops = (struct pci_config_operations) {
 		.read	= vfio_pci_cfg_read,
@@ -250,16 +421,97 @@ static int vfio_pci_map_bar(struct kvm *kvm, int fd, struct vfio_region *region)
 	return 0;
 }
 
+static int vfio_pci_create_msix_table(struct kvm *kvm,
+				      struct vfio_pci_device *pdev,
+				      struct msix_cap *msix)
+{
+	int ret;
+	size_t i;
+	size_t nr_entries;
+	size_t table_size;
+	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
+	struct vfio_pci_msix_table *table = &pdev->msix_table;
+
+	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
+	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;
+
+	/*
+	 * KVM needs memory regions to be a multiple of and aligned on PAGE_SIZE.
+	 */
+	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
+	table_size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
+
+	table->entries = calloc(nr_entries, sizeof(struct vfio_pci_msix_entry));
+	if (!table->entries)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_entries; i++)
+		table->entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
+
+	table->nr_entries = nr_entries;
+	table->size = table_size;
+
+	/*
+	 * To ease MSI-X cap configuration in case they share the same BAR,
+	 * collapse table and pending array. According to PCI, address spaces
+	 * must be a power of two. Since nr_entries is a power of two, and PBA
+	 * size is less than table_size, reserve 2*table_size.
+	 */
+	table->guest_phys_addr = pci_get_io_space_block(2 * table_size);
+	if (!table->guest_phys_addr) {
+		pr_err("cannot allocate IO space");
+		ret = -ENOMEM;
+		goto out_free;
+	}
+	pba->guest_phys_addr = table->guest_phys_addr + table->size;
+
+	ret = kvm__register_mmio(kvm, table->guest_phys_addr, table_size, false,
+				 vfio_pci_msix_table_access, pdev);
+	if (ret < 0)
+		goto out_free;
+
+	/*
+	 * We could map the physical PBA directly into the guest, but it's
+	 * likely smaller than a page, and we can only hand full pages to the
+	 * guest. Even though the PCI spec disallows sharing a page used for
+	 * MSI-X with any other resource, it allows sharing the same page
+	 * between MSI-X table and PBA. For the sake of isolation, create a
+	 * virtual PBA.
+	 */
+	pba->size = nr_entries / 8;
+
+	ret = kvm__register_mmio(kvm, pba->guest_phys_addr, pba->size, false,
+				 vfio_pci_msix_pba_access, pdev);
+	if (ret < 0)
+		goto out_free;
+
+	return 0;
+
+out_free:
+	free(table->entries);
+
+	return ret;
+}
+
 static int vfio_pci_configure_dev_regions(struct kvm *kvm,
 					  struct vfio_device *device)
 {
 	int ret;
+	struct msix_cap *msix;
+	struct vfio_pci_device *pdev = &device->pci;
 	u32 i, num_regions = device->info.num_regions;
 
 	ret = vfio_pci_parse_cfg_space(device);
 	if (ret)
 		return ret;
 
+	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
+	if (msix) {
+		ret = vfio_pci_create_msix_table(kvm, pdev, msix);
+		if (ret)
+			return ret;
+	}
+
 	/* First of all, map the BARs directly into the guest */
 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
 		struct vfio_region *region;
@@ -278,6 +530,16 @@ static int vfio_pci_configure_dev_regions(struct kvm *kvm,
 		if (!region->info.size)
 			continue;
 
+		if (msix) {
+			if (i == pdev->msix_table.bar) {
+				region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
+				continue;
+			} else if (i == pdev->msix_pba.bar) {
+				region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
+				continue;
+			}
+		}
+
 		/*
 		 * Map the BARs into the guest. We'll later need to update
 		 * configuration space to reflect our allocation.
@@ -314,6 +576,64 @@ static int vfio_configure_dev_regions(struct kvm *kvm,
 	return vfio_pci_configure_dev_regions(kvm, device);
 }
 
+static int vfio_pci_init_msix_irqfd(struct kvm *kvm,
+				    struct vfio_device *device)
+{
+	int ret;
+	size_t i;
+	int *eventfds;
+	size_t irq_set_size;
+	struct vfio_irq_set *irq_set;
+	struct vfio_pci_msix_table *table = &device->pci.msix_table;
+
+	/*
+	 * We likely have VFIO_IRQ_INFO_NORESIZE for MSI-X, and we don't want to
+	 * enable/disable MSIs every time the guest requests a new one. Set up
+	 * IRQFD for all vectors upfront.
+	 *
+	 * We cannot start creating the MSI-X routes in KVM just now. First we
+	 * need to wait for all devices to allocate their IRQ lines, and only
+	 * after that number is frozen will we be able to allocate MSI numbers.
+	 * A bit unfortunate (it would be much easier to handle initialization
+	 * errors here), but okay. Store the eventfd until we're ready to create
+	 * the routes.
+	 */
+	irq_set_size = sizeof(struct vfio_irq_set) +
+		       table->nr_entries * sizeof(int);
+	irq_set = malloc(irq_set_size);
+	if (!irq_set)
+		return -ENOMEM;
+
+	*irq_set = (struct vfio_irq_set) {
+		.argsz	= irq_set_size,
+		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+		.index	= VFIO_PCI_MSIX_IRQ_INDEX,
+		.start	= 0,
+		.count	= table->nr_entries,
+	};
+	eventfds = (void *)irq_set + sizeof(struct vfio_irq_set);
+
+	for (i = 0; i < table->nr_entries; i++) {
+		eventfds[i] = eventfd(0, 0);
+		if (eventfds[i] < 0) {
+			pr_err("cannot create eventfd (try to increase RLIMIT_NOFILE)");
+			ret = -errno;
+			goto out_free;
+		}
+
+		table->entries[i].gsi = -1;
+		table->entries[i].eventfd = eventfds[i];
+	}
+
+	ret = ioctl(device->fd, VFIO_DEVICE_SET_IRQS, irq_set);
+	if (ret < 0)
+		pr_err("Cannot register vfio_irq_set");
+
+out_free:
+	free(irq_set);
+	return ret;
+}
+
 static int vfio_init_irqfd(struct kvm *kvm, int devfd, int gsi)
 {
 	int ret;
@@ -393,31 +713,37 @@ static int vfio_configure_dev_irqs(struct kvm *kvm, struct vfio_device *device)
 {
 	int ret;
 	struct vfio_pci_device *pdev = &device->pci;
+	struct msix_cap *msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
 
 	device->irq_info = (struct vfio_irq_info) {
-		.argsz = sizeof(device->irq_info)
+		.argsz = sizeof(device->irq_info),
+		.index = msix ? VFIO_PCI_MSIX_IRQ_INDEX :
+				VFIO_PCI_INTX_IRQ_INDEX,
 	};
 
-	if (pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX)) {
-		/* TODO: set up shadow PBA/table structures for MSI-X. */
+	ioctl(device->fd, VFIO_DEVICE_GET_IRQ_INFO, &device->irq_info);
+	if (device->irq_info.count == 0) {
+		pr_err("No interrupt found by VFIO");
+		return -ENODEV;
+	}
+
+	if (!(device->irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+		pr_err("Interrupt not EVENTFD capable");
+		return -EINVAL;
+	}
+
+	if (msix) {
+		if (device->irq_info.count != pdev->msix_table.nr_entries) {
+			pr_err("Invalid number of MSI-X reported by VFIO");
+			return -EINVAL;
+		}
+
+		ret = vfio_pci_init_msix_irqfd(kvm, device);
 	} else {
 		int gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
 
-		/* We don't have MSI-X, so fall back on INTx */
 		pr_info("MSI-X not available for device 0x%x, falling back to INTx",
 			device->dev_hdr.dev_num);
-		device->irq_info.index = VFIO_PCI_INTX_IRQ_INDEX;
-		ioctl(device->fd, VFIO_DEVICE_GET_IRQ_INFO, &device->irq_info);
-
-		if (device->irq_info.count != 1) {
-			pr_err("No INTx interrupts found");
-			return -ENODEV;
-		}
-
-		if (!(device->irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
-			pr_err("INTx interrupt not EVENTFD capable");
-			return -EINVAL;
-		}
 
 		if (!(device->irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
 			pr_err("INTx interrupt not AUTOMASKED");
@@ -425,11 +751,9 @@ static int vfio_configure_dev_irqs(struct kvm *kvm, struct vfio_device *device)
 		}
 
 		ret = vfio_init_irqfd(kvm, device->fd, gsi);
-		if (ret)
-			return ret;
 	}
 
-	return 0;
+	return ret;
 }
 
 static int vfio_configure_device(struct kvm *kvm, struct vfio_group *group,
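For reference, the guest-physical window reserved by
vfio_pci_create_msix_table() can be worked through with hypothetical
numbers (assuming a 4 KiB PAGE_SIZE and a device advertising 32
vectors; the actual base address comes from pci_get_io_space_block()):

	nr_entries = 32;		/* (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1 */
	table_size = ALIGN(32 * PCI_MSIX_ENTRY_SIZE,	/* 512 bytes...    */
			   PAGE_SIZE);			/* ...becomes 4 KiB */

	/* One 8 KiB block: table in the first page, PBA in the second */
	table->guest_phys_addr = pci_get_io_space_block(2 * table_size);
	pba->guest_phys_addr = table->guest_phys_addr + table_size;
	pba->size = 32 / 8;		/* 4 bytes of pending bits */

Both windows are registered with kvm__register_mmio(): writes to the
table update the virtual entries and routes as sketched above, while
reads of the virtual PBA are serviced by forwarding them to the
physical device's PBA through the VFIO region offset stored in
pba->offset.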