Add virtual MSI-X tables for PCI devices, and create IRQFD routes to let
the kernel inject MSIs from a physical device directly into the guest.

Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
---
 include/kvm/vfio.h |  24 ++++
 vfio.c             | 366 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 369 insertions(+), 21 deletions(-)

--- a/include/kvm/vfio.h
+++ b/include/kvm/vfio.h
@@ -8,8 +8,32 @@
#define MAX_VFIO_GROUPS 16
+struct vfio_pci_msix_entry {
+ struct msix_table config;
+ int gsi;
+ int eventfd;
+};
+
+struct vfio_pci_msix_table {
+ size_t nr_entries;
+ size_t size;
+ unsigned int bar;
+ u32 guest_phys_addr;
+ struct vfio_pci_msix_entry *entries;
+};
+
+struct vfio_pci_msix_pba {
+ size_t size;
+ off_t offset; /* in VFIO device fd */
+ unsigned int bar;
+ u32 guest_phys_addr;
+};
+
struct vfio_pci_device {
struct pci_device_header hdr;
+
+ struct vfio_pci_msix_table msix_table;
+ struct vfio_pci_msix_pba msix_pba;
};
struct vfio_region {
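
For reference, each vfio_pci_msix_entry above shadows one 16-byte MSI-X
table entry as defined by the PCI spec: a message address/data pair plus
a vector control word. A sketch of that layout with illustrative names;
the real definition used by this patch is kvmtool's struct msix_table:

#include <stdint.h>

/* Illustrative layout of one MSI-X table entry (PCI 3.0, sec. 6.8.2) */
struct example_msi_msg {
	uint32_t address_lo;	/* message address, low dword */
	uint32_t address_hi;	/* message address, high dword */
	uint32_t data;		/* message data the device writes */
};

struct example_msix_entry {
	struct example_msi_msg msg;
	uint32_t ctrl;		/* bit 0: vector masked */
};
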
--- a/vfio.c
+++ b/vfio.c
@@ -50,6 +50,70 @@ int vfio_group_parser(const struct option *opt, const char *arg, int unset)
return 0;
}
+static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
+ u32 len, u8 is_write, void *ptr)
+{
+ struct vfio_pci_device *pdev = ptr;
+ struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
+ u64 offset = addr - pba->guest_phys_addr;
+ struct vfio_device *device = container_of(pdev, struct vfio_device, pci);
+
+ if (is_write)
+ return;
+
+ if (pread(device->fd, data, len, pba->offset + offset) != len)
+ pr_err("cannot access MSIX PBA\n");
+}
+
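The PBA itself is just a bitmap, one pending bit per vector, packed into
little-endian QWORDs; the handler above forwards guest reads straight to
the device file. A minimal sketch of how a consumer would test a
vector's pending bit (illustrative helper, not part of the patch):

#include <stdbool.h>
#include <stdint.h>

/* Test the pending bit for 'vector' in a PBA read from the device */
static bool msix_vector_pending(const uint64_t *pba, unsigned int vector)
{
	return pba[vector / 64] & (1ULL << (vector % 64));
}
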
+static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
+ u32 len, u8 is_write, void *ptr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct vfio_pci_device *pdev = ptr;
+ struct vfio_pci_msix_entry *entry;
+ struct vfio_pci_msix_table *table = &pdev->msix_table;
+ struct vfio_device *device = container_of(pdev, struct vfio_device, pci);
+
+ u64 offset = addr - table->guest_phys_addr;
+
+ size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
+ /* The PCI spec requires aligned 4- or 8-byte accesses to the table */
+ off_t field = offset % PCI_MSIX_ENTRY_SIZE;
+ entry = &table->entries[vector];
+
+ if (!is_write) {
+ memcpy(data, (void *)&entry->config + field, len);
+ return;
+ }
+
+ memcpy((void *)&entry->config + field, data, len);
+
+ if (field != PCI_MSIX_ENTRY_VECTOR_CTRL)
+ return;
+
+ if (entry->gsi < 0) {
+ int ret = irq__add_msix_route(kvm, &entry->config.msg,
+ device->dev_hdr.dev_num << 3);
+ if (ret < 0) {
+ pr_err("cannot create MSI-X route");
+ } else {
+ entry->gsi = ret;
+
+ ret = irq__add_irqfd(kvm, ret, entry->eventfd, -1);
+ if (ret < 0)
+ pr_err("Cannot setup irqfd");
+ }
+
+ if (ret < 0)
+ /* Not much we can do here. Mask the vector. */
+ entry->config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
+
+ return;
+ }
+
+ irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
+}
+
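To illustrate the decoding above: with the PCI-mandated 16-byte entry
size, a guest write to guest_phys_addr + 0x2c lands in vector 2 at field
offset 0xc, the vector control word, and therefore triggers route setup.
A self-contained check of that arithmetic (constants restated locally
for the example):

#include <assert.h>

#define ENTRY_SIZE	16	/* PCI_MSIX_ENTRY_SIZE */
#define VECTOR_CTRL	12	/* PCI_MSIX_ENTRY_VECTOR_CTRL */

int main(void)
{
	unsigned long offset = 0x2c;

	assert(offset / ENTRY_SIZE == 2);		/* vector index */
	assert(offset % ENTRY_SIZE == VECTOR_CTRL);	/* field */
	return 0;
}
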
static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
u8 offset, void *data, int sz)
{
@@ -89,17 +153,94 @@ static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hd
sz, offset);
}
+static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
+{
+ switch (cap_hdr->type) {
+ case PCI_CAP_ID_MSIX:
+ return PCI_CAP_MSIX_SIZEOF;
+ default:
+ pr_err("unknown PCI capability %u", cap_hdr->type);
+ return 0;
+ }
+}
+
+/*
+ * Copy capability from physical header into virtual header, and add it to the
+ * virtual capability list.
+ *
+ * @fd_offset: offset of pci header into vfio device fd
+ * @pos: offset of capability from start of header
+ */
+static int vfio_pci_add_cap(struct vfio_device *device, struct pci_cap_hdr *cap_hdr,
+ off_t fd_offset, off_t pos)
+{
+ int i;
+ ssize_t size = vfio_pci_cap_size(cap_hdr);
+ struct pci_device_header *hdr = &device->pci.hdr;
+ struct pci_cap_hdr *out = (void *)hdr + pos;
+
+ if (pread(device->fd, out, size, fd_offset + pos) != size)
+ return -errno;
+
+ out->next = 0;
+
+ if (!hdr->capabilities) {
+ hdr->capabilities = pos;
+ hdr->status |= PCI_STATUS_CAP_LIST;
+ } else {
+ /* Walk to the tail of the capability list and append the new cap */
+ struct pci_cap_hdr *last;
+
+ pci_for_each_cap(i, last, hdr)
+ ; /* empty body: we only need the final value of 'last' */
+ last->next = pos;
+ }
+
+ return 0;
+}
+
static int vfio_pci_parse_caps(struct vfio_device *device)
{
+ u8 pos;
+ int ret;
+ struct pci_cap_hdr cap;
+ ssize_t sz = sizeof(cap);
+ struct vfio_region_info *info;
struct vfio_pci_device *pdev = &device->pci;
if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
return 0;
+ pos = pdev->hdr.capabilities & ~3;
+ info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
pdev->hdr.capabilities = 0;
- /* TODO: install virtual capabilities */
+ for (; pos; pos = cap.next) {
+ if (pos >= PCI_DEV_CFG_SIZE) {
+ pr_warning("Capability pointer 0x%x outside of config space", pos);
+ return -EINVAL;
+ }
+
+ if (pread(device->fd, &cap, sz, info->offset + pos) != sz) {
+ pr_warning("Failed to read from capabilities pointer (0x%x)",
+ pos);
+ return -EINVAL;
+ }
+
+ switch (cap.type) {
+ case PCI_CAP_ID_MSIX:
+ ret = vfio_pci_add_cap(device, &cap, info->offset, pos);
+ if (ret) {
+ pr_warning("Failed to read MSI-X capability structure");
+ return ret;
+ }
+ break;
+
+ /* Any other capability is hidden from the guest */
+ }
+ }
return 0;
}
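
The walk above follows the standard config-space capability chain: byte
0x34 holds the capabilities pointer, each capability starts with an ID
byte followed by a next pointer, and the bottom two bits of the pointer
are reserved, hence the "& ~3" masking. A toy standalone illustration
(config-space byte values below are made up for the example):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t cfg[256] = {0};

	cfg[0x34] = 0x41;	/* capabilities pointer, stray low bits set */
	cfg[0x40] = 0x11;	/* PCI_CAP_ID_MSIX */
	cfg[0x41] = 0x00;	/* next = 0 terminates the list */

	for (uint8_t pos = cfg[0x34] & ~3; pos; pos = cfg[pos + 1] & ~3)
		printf("cap 0x%02x at 0x%02x\n", cfg[pos], pos);

	return 0;
}
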
@@ -150,7 +291,11 @@ static int vfio_pci_parse_cfg_space(struct vfio_device *device)
static int vfio_pci_fixup_cfg_space(struct vfio_device *device)
{
int i;
+ int pos;
ssize_t hdr_sz;
+ ssize_t cap_sz;
+ struct pci_cap_hdr *cap;
+ struct msix_cap *msix;
struct vfio_region_info *info;
struct vfio_pci_device *pdev = &device->pci;
@@ -183,6 +328,22 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *device)
*/
pdev->hdr.exp_rom_bar = 0;
+ /* Plumb in our fake MSI-X capability, if we have it. */
+ msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
+ if (msix) {
+ /* Add a shortcut to the PBA region for the MMIO handler */
+ int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
+ pdev->msix_pba.offset = device->regions[pba_index].info.offset +
+ (msix->pba_offset & PCI_MSIX_PBA_OFFSET);
+
+ /* Tidy up the capability */
+ msix->table_offset &= PCI_MSIX_TABLE_BIR;
+ msix->pba_offset &= PCI_MSIX_PBA_BIR;
+ if (pdev->msix_table.bar == pdev->msix_pba.bar)
+ msix->pba_offset |= pdev->msix_table.size &
+ PCI_MSIX_PBA_OFFSET;
+ }
+
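The pba_offset register packs two fields: the low three bits are the BAR
indicator (PCI_MSIX_PBA_BIR) and the rest is the offset of the PBA
within that BAR (PCI_MSIX_PBA_OFFSET). A quick check of the fixup above,
assuming table and PBA share BAR 0 and a one-page table (mask values
restated locally):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t pba_offset = 0;	/* BIR 0 */
	uint32_t table_size = 4096;	/* one page, for the example */

	pba_offset |= table_size & 0xfffffff8;	/* PCI_MSIX_PBA_OFFSET */

	assert((pba_offset & 0x7) == 0);	/* PBA lives in BAR 0 */
	assert((pba_offset & ~0x7u) == 4096);	/* right after the table */
	return 0;
}
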
/* Install our fake Configuration Space, without the caps */
info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
hdr_sz = offsetof(struct pci_device_header, msix);
@@ -191,7 +352,17 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *device)
return -EIO;
}
- /* TODO: install virtual capabilities */
+ /* Install the fake capability list */
+ pci_for_each_cap(pos, cap, &pdev->hdr) {
+ cap_sz = vfio_pci_cap_size(cap);
+
+ if (pwrite(device->fd, cap, cap_sz, info->offset + pos) !=
+ cap_sz) {
+ pr_err("Failed to write capability %u", cap->type);
+ return -EIO;
+ }
+ }
+
/* Register callbacks for cfg accesses */
pdev->hdr.cfg_ops = (struct pci_config_operations) {
.read = vfio_pci_cfg_read,
@@ -250,16 +421,97 @@ static int vfio_pci_map_bar(struct kvm *kvm, int fd, struct vfio_region *region)
return 0;
}
+static int vfio_pci_create_msix_table(struct kvm *kvm,
+ struct vfio_pci_device *pdev,
+ struct msix_cap *msix)
+{
+ int ret;
+ size_t i;
+ size_t nr_entries;
+ size_t table_size;
+ struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
+ struct vfio_pci_msix_table *table = &pdev->msix_table;
+
+ table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
+ pba->bar = msix->pba_offset & PCI_MSIX_PBA_BIR;
+
+ /*
+ * KVM needs memory regions to be a multiple of, and aligned on,
+ * PAGE_SIZE.
+ */
+ nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
+ table_size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
+
+ table->entries = calloc(nr_entries, sizeof(struct vfio_pci_msix_entry));
+ if (!table->entries)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_entries; i++)
+ table->entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
+
+ table->nr_entries = nr_entries;
+ table->size = table_size;
+
+ /*
+ * To ease MSI-X cap configuration when table and PBA share the same
+ * BAR, collapse them into a single region. According to PCI, a BAR's
+ * address space must be a power of two. Since the PBA is smaller than
+ * the table, reserving 2 * table_size covers both.
+ */
+ table->guest_phys_addr = pci_get_io_space_block(2 * table_size);
+ if (!table->guest_phys_addr) {
+ pr_err("cannot allocate IO space");
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ pba->guest_phys_addr = table->guest_phys_addr + table->size;
+
+ ret = kvm__register_mmio(kvm, table->guest_phys_addr, table_size, false,
+ vfio_pci_msix_table_access, pdev);
+ if (ret < 0)
+ goto out_free;
+
+ /*
+ * We could map the physical PBA directly into the guest, but it's
+ * likely smaller than a page, and we can only hand full pages to the
+ * guest. Even though the PCI spec disallows sharing a page used for
+ * MSI-X with any other resource, it does allow the MSI-X table and
+ * PBA to share the same page. For the sake of isolation, create a
+ * virtual PBA.
+ */
+ pba->size = ALIGN(nr_entries, 64) / 8; /* one bit per vector, QWORD padded */
+
+ ret = kvm__register_mmio(kvm, pba->guest_phys_addr, pba->size, false,
+ vfio_pci_msix_pba_access, pdev);
+ if (ret < 0)
+ goto out_free;
+
+ return 0;
+
+out_free:
+ free(table->entries);
+
+ return ret;
+}
+
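A worked example of the sizing above, assuming 4 KiB pages and a
hypothetical device advertising 33 vectors: the raw table needs 33 * 16
= 528 bytes, which rounds up to one page; the window reserved for table
plus PBA is then two pages, and the virtual PBA needs a single QWORD:

#include <assert.h>

#define PAGE_SZ		4096
#define ENTRY_SZ	16	/* PCI_MSIX_ENTRY_SIZE */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long nr = 33;	/* (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1 */
	unsigned long table_size = ALIGN_UP(nr * ENTRY_SZ, PAGE_SZ);

	assert(table_size == PAGE_SZ);		/* 528 B -> one page */
	assert(2 * table_size == 2 * PAGE_SZ);	/* table + PBA window */
	assert(ALIGN_UP(nr, 64) / 8 == 8);	/* PBA: one bit per vector */
	return 0;
}
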
static int vfio_pci_configure_dev_regions(struct kvm *kvm,
struct vfio_device *device)
{
int ret;
+ struct msix_cap *msix;
+ struct vfio_pci_device *pdev = &device->pci;
u32 i, num_regions = device->info.num_regions;
ret = vfio_pci_parse_cfg_space(device);
if (ret)
return ret;
+ msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
+ if (msix) {
+ ret = vfio_pci_create_msix_table(kvm, pdev, msix);
+ if (ret)
+ return ret;
+ }
+
/* First of all, map the BARs directly into the guest */
for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
struct vfio_region *region;
@@ -278,6 +530,16 @@ static int vfio_pci_configure_dev_regions(struct kvm *kvm,
if (!region->info.size)
continue;
+ if (msix) {
+ if (i == pdev->msix_table.bar) {
+ region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
+ continue;
+ } else if (i == pdev->msix_pba.bar) {
+ region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
+ continue;
+ }
+ }
+
/*
* Map the BARs into the guest. We'll later need to update
* configuration space to reflect our allocation.
@@ -314,6 +576,64 @@ static int vfio_configure_dev_regions(struct kvm *kvm,
return vfio_pci_configure_dev_regions(kvm, device);
}
+static int vfio_pci_init_msix_irqfd(struct kvm *kvm,
+ struct vfio_device *device)
+{
+ int ret;
+ size_t i;
+ int *eventfds;
+ size_t irq_set_size;
+ struct vfio_irq_set *irq_set;
+ struct vfio_pci_msix_table *table = &device->pci.msix_table;
+
+ /*
+ * We likely have VFIO_IRQ_INFO_NORESIZE for MSI-X, and we don't want
+ * to enable/disable MSIs every time the guest requests a new one. Set
+ * up an IRQFD for each vector upfront.
+ *
+ * We cannot create the MSI-X routes in KVM just yet. First we need to
+ * wait for all devices to allocate their IRQ lines, and only once that
+ * number is frozen can we allocate MSI numbers. A bit unfortunate (it
+ * would be much easier to handle initialization errors here), but
+ * okay. Store the eventfds until we're ready to create the routes.
+ */
+ irq_set_size = sizeof(struct vfio_irq_set) +
+ table->nr_entries * sizeof(int);
+ irq_set = malloc(irq_set_size);
+ if (!irq_set)
+ return -ENOMEM;
+
+ *irq_set = (struct vfio_irq_set) {
+ .argsz = irq_set_size,
+ .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+ .index = VFIO_PCI_MSIX_IRQ_INDEX,
+ .start = 0,
+ .count = table->nr_entries,
+ };
+ eventfds = (void *)irq_set + sizeof(struct vfio_irq_set);
+
+ for (i = 0; i < table->nr_entries; i++) {
+ eventfds[i] = eventfd(0, 0);
+ if (eventfds[i] < 0) {
+ pr_err("cannot create eventfd (try to increase RLIMIT_NOFILE)");
+ ret = -errno;
+ goto out_free;
+ }
+
+ table->entries[i].gsi = -1;
+ table->entries[i].eventfd = eventfds[i];
+ }
+
+ ret = ioctl(device->fd, VFIO_DEVICE_SET_IRQS, irq_set);
+ if (ret < 0) {
+ ret = -errno;
+ pr_err("cannot register eventfds with VFIO_DEVICE_SET_IRQS");
+ }
+
+out_free:
+ free(irq_set);
+ return ret;
+}
+
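The payload built above uses the variable-length struct vfio_irq_set
from <linux/vfio.h>: the eventfd array sits in the data[] area right
after the fixed header. A standalone sketch of the same construction for
a hypothetical 3-vector device:

#include <linux/vfio.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t nr = 3;
	size_t sz = sizeof(struct vfio_irq_set) + nr * sizeof(int32_t);
	struct vfio_irq_set *set = calloc(1, sz);

	if (!set)
		return 1;

	set->argsz = sz;
	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	set->start = 0;
	set->count = nr;
	/* int32_t eventfds[nr] would be copied into set->data here */

	printf("argsz = %zu\n", sz);
	free(set);
	return 0;
}
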
static int vfio_init_irqfd(struct kvm *kvm, int devfd, int gsi)
{
int ret;
@@ -393,31 +713,37 @@ static int vfio_configure_dev_irqs(struct kvm *kvm, struct vfio_device *device)
{
int ret;
struct vfio_pci_device *pdev = &device->pci;
+ struct msix_cap *msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
device->irq_info = (struct vfio_irq_info) {
- .argsz = sizeof(device->irq_info)
+ .argsz = sizeof(device->irq_info),
+ .index = msix ? VFIO_PCI_MSIX_IRQ_INDEX :
+ VFIO_PCI_INTX_IRQ_INDEX,
};
- if (pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX)) {
- /* TODO: set up shadow PBA/table structures for MSI-X. */
+ ioctl(device->fd, VFIO_DEVICE_GET_IRQ_INFO, &device->irq_info);
+ if (device->irq_info.count == 0) {
+ pr_err("No interrupt found by VFIO");
+ return -ENODEV;
+ }
+
+ if (!(device->irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+ pr_err("Interrupt not EVENTFD capable");
+ return -EINVAL;
+ }
+
+ if (msix) {
+ if (device->irq_info.count != pdev->msix_table.nr_entries) {
+ pr_err("Invalid number of MSI-X reported by VFIO");
+ return -EINVAL;
+ }
+
+ ret = vfio_pci_init_msix_irqfd(kvm, device);
} else {
int gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
- /* We don't have MSI-X, so fall back on INTx */
pr_info("MSI-X not available for device 0x%x, falling back to INTx",
device->dev_hdr.dev_num);
- device->irq_info.index = VFIO_PCI_INTX_IRQ_INDEX;
- ioctl(device->fd, VFIO_DEVICE_GET_IRQ_INFO, &device->irq_info);
-
- if (device->irq_info.count != 1) {
- pr_err("No INTx interrupts found");
- return -ENODEV;
- }
-
- if (!(device->irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
- pr_err("INTx interrupt not EVENTFD capable");
- return -EINVAL;
- }
if (!(device->irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
pr_err("INTx interrupt not AUTOMASKED");
@@ -425,11 +751,9 @@ static int vfio_configure_dev_irqs(struct kvm *kvm, struct vfio_device *device)
}
ret = vfio_init_irqfd(kvm, device->fd, gsi);
- if (ret)
- return ret;
}
- return 0;
+ return ret;
}
static int vfio_configure_device(struct kvm *kvm, struct vfio_group *group,
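
For context on the routing half: once irq__add_msix_route() has
allocated a GSI for a vector, irq__add_irqfd() attaches the vector's
eventfd to it, so a device interrupt is signalled by VFIO through the
eventfd and injected by KVM without exiting to userspace. A minimal
sketch of the kernel interface underneath (not kvmtool's actual helper):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Attach an eventfd to a GSI: signalling the fd injects the interrupt */
static int kvm_assign_irqfd(int vm_fd, int eventfd, unsigned int gsi)
{
	struct kvm_irqfd irqfd = {
		.fd = eventfd,
		.gsi = gsi,
	};

	return ioctl(vm_fd, KVM_IRQFD, &irqfd);
}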