@@ -59,6 +59,8 @@ OBJS += main.o
OBJS += mmio.o
OBJS += pci.o
OBJS += term.o
+OBJS += vfio/core.o
+OBJS += vfio/pci.o
OBJS += virtio/blk.o
OBJS += virtio/scsi.o
OBJS += virtio/console.o
@@ -1,5 +1,6 @@
#include "kvm/devices.h"
#include "kvm/fdt.h"
+#include "kvm/kvm.h"
#include "kvm/of_pci.h"
#include "kvm/pci.h"
#include "kvm/util.h"
@@ -146,6 +146,11 @@ void kvm_run_set_wrapper_sandbox(void)
OPT_BOOLEAN('\0', "no-dhcp", &(cfg)->no_dhcp, "Disable kernel" \
" DHCP in rootfs mode"), \
\
+ OPT_GROUP("VFIO options:"), \
+ OPT_CALLBACK('\0', "vfio-group", NULL, "group number", \
+ "Assign a VFIO group to the virtual machine", \
+ vfio_group_parser, kvm), \
+ \
OPT_GROUP("Debug options:"), \
OPT_BOOLEAN('\0', "debug", &do_debug_print, \
"Enable debug messages"), \
@@ -2,6 +2,7 @@
#define KVM_CONFIG_H_
#include "kvm/disk-image.h"
+#include "kvm/vfio.h"
#include "kvm/kvm-config-arch.h"
#define DEFAULT_KVM_DEV "/dev/kvm"
@@ -20,9 +21,11 @@
struct kvm_config {
struct kvm_config_arch arch;
struct disk_image_params disk_image[MAX_DISK_IMAGES];
+ struct vfio_group vfio_group[MAX_VFIO_GROUPS];
u64 ram_size;
u8 image_count;
u8 num_net_devices;
+ u8 num_vfio_groups;
bool virtio_rng;
int active_console;
int debug_iodelay;
@@ -7,7 +7,6 @@
#include <endian.h>
#include "kvm/devices.h"
-#include "kvm/kvm.h"
#include "kvm/msi.h"
#include "kvm/fdt.h"
@@ -22,6 +21,8 @@
#define PCI_IO_SIZE 0x100
#define PCI_CFG_SIZE (1ULL << 24)
+struct kvm;
+
union pci_config_address {
struct {
#if __BYTE_ORDER == __LITTLE_ENDIAN
new file mode 100644
@@ -0,0 +1,57 @@
+#ifndef KVM__VFIO_H
+#define KVM__VFIO_H
+
+#include "kvm/parse-options.h"
+#include "kvm/pci.h"
+
+#include <linux/vfio.h>
+
+#include <dirent.h>
+
+/*
+ * Logging helpers that prefix the message with the device name.
+ *
+ * @vdev: expression yielding a pointer to an object with a `name` string
+ *        member (struct vfio_device in this header). It is parenthesised
+ *        so that arguments such as `&dev` or `devs + i` expand correctly.
+ * @fmt:  printf-style format string literal, followed by its arguments.
+ */
+#define dev_err(vdev, fmt, ...)		pr_err("%s: " fmt, (vdev)->name, ##__VA_ARGS__)
+#define dev_warn(vdev, fmt, ...)	pr_warning("%s: " fmt, (vdev)->name, ##__VA_ARGS__)
+#define dev_info(vdev, fmt, ...)	pr_info("%s: " fmt, (vdev)->name, ##__VA_ARGS__)
+#define dev_die(vdev, fmt, ...)		die("%s: " fmt, (vdev)->name, ##__VA_ARGS__)
+
+/* Upper bound on the number of VFIO groups that can be given on the command line */
+#define MAX_VFIO_GROUPS 16
+
+/* PCI-specific state embedded in every struct vfio_device. */
+struct vfio_pci_device {
+ /* Configuration Space header presented to the guest for this device */
+ struct pci_device_header hdr;
+};
+
+/* One region of a VFIO device (for PCI: a BAR or the config space). */
+struct vfio_region {
+ /* Region description as filled in by VFIO_DEVICE_GET_REGION_INFO */
+ struct vfio_region_info info;
+ /* Guest physical address the region is registered at (0 if none was allocated) */
+ u64 guest_phys_addr;
+ /* Address of the region's mmap in this process, set by vfio_map_region() */
+ void *host_addr;
+};
+
+/* A single device obtained from a VFIO group. */
+struct vfio_device {
+ /* Generic device header handed to device__register() */
+ struct device_header dev_hdr;
+
+ /* Device file descriptor returned by VFIO_GROUP_GET_DEVICE_FD */
+ int fd;
+ /* Result of VFIO_DEVICE_GET_INFO (flags, number of regions, ...) */
+ struct vfio_device_info info;
+ /* Result of VFIO_DEVICE_GET_IRQ_INFO */
+ struct vfio_irq_info irq_info;
+ /* Heap-allocated array with info.num_regions entries */
+ struct vfio_region *regions;
+
+ /* Heap-allocated copy of the device name (its entry name in the group's sysfs directory) */
+ char *name;
+ /* Heap-allocated "<group devices dir>/<name>" path */
+ char *sysfs_path;
+
+ /* Link in the owning vfio_group's `devices` list */
+ struct hlist_node list;
+
+ /* PCI-specific state; only PCI devices are set up by this code */
+ struct vfio_pci_device pci;
+};
+
+/* An IOMMU group assigned to the virtual machine. */
+struct vfio_group {
+ unsigned long id; /* iommu_group number in sysfs */
+ /* File descriptor of the "/dev/vfio/<id>" group node */
+ int fd;
+ /* List of struct vfio_device configured from this group */
+ struct hlist_head devices;
+};
+
+/* Command-line callback for --vfio-group: records one group number in the kvm config. */
+int vfio_group_parser(const struct option *opt, const char *arg, int unset);
+/*
+ * mmap a device region and register it with KVM at region->guest_phys_addr.
+ * Returns 0 on success or when the region cannot be mmap'd (it is then
+ * ignored), a negative error code otherwise.
+ */
+int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
+ struct vfio_region *region);
+/* munmap a region previously handled by vfio_map_region(). */
+void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region);
+/* PCI-specific setup: regions, Configuration Space, device registration and IRQs. */
+int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *device);
+/* PCI-specific teardown: unmaps all regions and unregisters the device. */
+void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev);
+
+#endif /* KVM__VFIO_H */
new file mode 100644
@@ -0,0 +1,395 @@
+#include "kvm/kvm.h"
+#include "kvm/vfio.h"
+
+#include <linux/list.h>
+
+/* Directory holding the VFIO device nodes; group nodes are "<dir>/<group id>" */
+#define VFIO_DEV_DIR "/dev/vfio"
+/* Container node, opened by vfio_container_init() */
+#define VFIO_DEV_NODE VFIO_DEV_DIR "/vfio"
+/* sysfs directory with one sub-directory per IOMMU group */
+#define IOMMU_GROUP_DIR "/sys/kernel/iommu_groups"
+
+/* Size of the buffer holding a group node path in vfio_group_init() */
+#define VFIO_PATH_MAX_LEN 16
+
+/* File descriptor of the single VFIO container shared by all groups */
+static int vfio_container;
+
+/*
+ * Option callback for --vfio-group: record one IOMMU group number.
+ *
+ * @opt:   option descriptor; opt->ptr is the struct kvm being configured
+ * @arg:   option argument; the text before the first ',' is parsed as the
+ *         group number (base auto-detected by strtoul)
+ * @unset: unused
+ *
+ * Groups given after MAX_VFIO_GROUPS have been recorded are ignored, with a
+ * single warning.
+ *
+ * Returns 0 on success or when the group is ignored, -ENOMEM if the argument
+ * cannot be duplicated and -EINVAL if it holds no group number.
+ */
+int vfio_group_parser(const struct option *opt, const char *arg, int unset)
+{
+	static int idx = 0;
+	struct kvm *kvm = opt->ptr;
+	char *cur, *end, *buf;
+	unsigned long id;
+	int ret = -EINVAL;
+
+	if (idx >= MAX_VFIO_GROUPS) {
+		/* Warn only the first time the limit is exceeded */
+		if (idx++ == MAX_VFIO_GROUPS)
+			pr_warning("Too many VFIO groups");
+		return 0;
+	}
+
+	if (!arg)
+		return -EINVAL;
+
+	buf = strdup(arg);
+	if (!buf)
+		return -ENOMEM;
+
+	/* strtok() returns NULL for an empty or all-comma argument */
+	cur = strtok(buf, ",");
+	if (!cur)
+		goto out_free;
+
+	id = strtoul(cur, &end, 0);
+	if (end == cur)
+		goto out_free;
+
+	/* Only touch the array once the index is known to be in bounds */
+	kvm->cfg.vfio_group[idx].id = id;
+	kvm->cfg.num_vfio_groups = ++idx;
+	ret = 0;
+
+out_free:
+	free(buf);
+
+	return ret;
+}
+
+/*
+ * Map a VFIO device region into this process and expose it to the guest.
+ *
+ * The region is mmap'd from the device fd with the access rights advertised
+ * in region->info.flags, then registered with KVM at
+ * region->guest_phys_addr. Regions that cannot be mmap'd are ignored and
+ * leave region->host_addr untouched.
+ *
+ * Returns 0 on success or when the region is ignored, a negative error code
+ * otherwise. On failure no mapping is left behind.
+ */
+int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
+		    struct vfio_region *region)
+{
+	void *base;
+	int ret, prot = 0;
+	/* KVM needs page-aligned regions */
+	u64 map_size = ALIGN(region->info.size, PAGE_SIZE);
+
+	/*
+	 * We don't want to mess about trapping config accesses, so require that
+	 * they can be mmap'd. Note that for PCI, this precludes the use of I/O
+	 * BARs in the guest (we will hide them from Configuration Space, which
+	 * is trapped).
+	 */
+	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) {
+		dev_info(vdev, "ignoring region %u, as it can't be mmap'd",
+			 region->info.index);
+		return 0;
+	}
+
+	if (region->info.flags & VFIO_REGION_INFO_FLAG_READ)
+		prot |= PROT_READ;
+	if (region->info.flags & VFIO_REGION_INFO_FLAG_WRITE)
+		prot |= PROT_WRITE;
+
+	base = mmap(NULL, region->info.size, prot, MAP_SHARED, vdev->fd,
+		    region->info.offset);
+	if (base == MAP_FAILED) {
+		/* Save errno before logging can overwrite it */
+		ret = -errno;
+		dev_err(vdev, "failed to mmap region %u (0x%llx bytes)",
+			region->info.index, region->info.size);
+		return ret;
+	}
+	region->host_addr = base;
+
+	ret = kvm__register_dev_mem(kvm, region->guest_phys_addr, map_size,
+				    region->host_addr);
+	if (ret) {
+		dev_err(vdev, "failed to register region with KVM");
+		/* Undo the mmap so a failed registration does not leak it */
+		munmap(base, region->info.size);
+		region->host_addr = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Undo the mmap performed by vfio_map_region().
+ *
+ * Regions that were never mapped (host_addr is NULL, e.g. because they were
+ * ignored or have size zero) are skipped. host_addr is cleared afterwards so
+ * that a repeated call is harmless.
+ */
+void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region)
+{
+	if (!region->host_addr)
+		return;
+
+	munmap(region->host_addr, region->info.size);
+	region->host_addr = NULL;
+}
+
+/*
+ * Create and configure the VFIO device called @name found in the sysfs
+ * directory @dirpath of @group.
+ *
+ * On success the new device is added to group->devices. A device for which
+ * the group cannot provide a file descriptor is skipped (0 is returned and
+ * nothing is added). Any other failure releases everything acquired so far
+ * and returns a negative error code.
+ */
+static int vfio_configure_device(struct kvm *kvm, struct vfio_group *group,
+				 const char *dirpath, const char *name)
+{
+	struct vfio_device *vdev;
+	char sysfs_entry[PATH_MAX];
+	int err;
+
+	snprintf(sysfs_entry, PATH_MAX, "%s/%s", dirpath, name);
+
+	vdev = calloc(1, sizeof(*vdev));
+	if (vdev == NULL)
+		return -ENOMEM;
+
+	/* Both string duplications below can only fail for lack of memory */
+	err = -ENOMEM;
+
+	vdev->name = strdup(name);
+	if (vdev->name == NULL)
+		goto out_free_vdev;
+
+	vdev->sysfs_path = strndup(sysfs_entry, PATH_MAX);
+	if (vdev->sysfs_path == NULL)
+		goto out_free_name;
+
+	vdev->fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+	if (vdev->fd < 0) {
+		dev_err(vdev, "failed to get fd");
+
+		/* The device might be a bridge without an fd */
+		err = 0;
+		goto out_free_path;
+	}
+
+	vdev->info.argsz = sizeof(vdev->info);
+	if (ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &vdev->info) != 0) {
+		err = -errno;
+		dev_err(vdev, "failed to get info");
+		goto out_close_fd;
+	}
+
+	/* Reset is best-effort: a failure is only reported */
+	if ((vdev->info.flags & VFIO_DEVICE_FLAGS_RESET) &&
+	    ioctl(vdev->fd, VFIO_DEVICE_RESET) < 0)
+		dev_warn(vdev, "failed to reset device");
+
+	vdev->regions = calloc(vdev->info.num_regions, sizeof(*vdev->regions));
+	if (vdev->regions == NULL) {
+		err = -ENOMEM;
+		goto out_close_fd;
+	}
+
+	/* Now for the bus-specific initialization... */
+	if (!(vdev->info.flags & VFIO_DEVICE_FLAGS_PCI)) {
+		dev_warn(vdev, "only vfio-pci is supported");
+		err = -EINVAL;
+	} else {
+		err = vfio_pci_setup_device(kvm, vdev);
+	}
+
+	if (err != 0)
+		goto out_free_regions;
+
+	dev_info(vdev, "assigned to device number 0x%x in group %lu",
+		 vdev->dev_hdr.dev_num, group->id);
+
+	hlist_add_head(&vdev->list, &group->devices);
+
+	return 0;
+
+out_free_regions:
+	free(vdev->regions);
+out_close_fd:
+	close(vdev->fd);
+out_free_path:
+	free(vdev->sysfs_path);
+out_free_name:
+	free(vdev->name);
+out_free_vdev:
+	free(vdev);
+
+	return err;
+}
+
+/*
+ * Configure every device of every VFIO group recorded in kvm->cfg.
+ *
+ * For each group, the symbolic links found in
+ * IOMMU_GROUP_DIR/<group id>/devices are handed to vfio_configure_device().
+ *
+ * Returns 0 on success, or the first negative error code encountered. The
+ * directory stream is closed on every path.
+ */
+static int vfio_configure_iommu_groups(struct kvm *kvm)
+{
+	int i, ret;
+
+	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+		DIR *dir;
+		struct dirent *dirent;
+		char dirpath[PATH_MAX];
+		struct vfio_group *group = &kvm->cfg.vfio_group[i];
+
+		snprintf(dirpath, PATH_MAX, IOMMU_GROUP_DIR "/%lu/devices",
+			 group->id);
+
+		dir = opendir(dirpath);
+		if (!dir) {
+			/* Save errno before logging can overwrite it */
+			ret = -errno;
+			pr_err("Failed to open IOMMU group %s", dirpath);
+			return ret;
+		}
+
+		/* The directory may hold no device link at all */
+		ret = 0;
+
+		while ((dirent = readdir(dir))) {
+			/* Devices appear as symbolic links; skip everything else */
+			if (dirent->d_type != DT_LNK)
+				continue;
+
+			ret = vfio_configure_device(kvm, group, dirpath,
+						    dirent->d_name);
+			if (ret)
+				break;
+		}
+
+		/* Close the stream before bailing out so that it is not leaked */
+		if (closedir(dir))
+			pr_warning("Failed to close IOMMU group %s", dirpath);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Pick the IOMMU model to use for the container.
+ *
+ * The candidates are probed with VFIO_CHECK_EXTENSION in order of preference
+ * and the first one reported as supported is returned. Returns -ENODEV when
+ * none of them is.
+ */
+static int vfio_get_iommu_type(void)
+{
+	static const int candidates[] = {
+		VFIO_TYPE1_NESTING_IOMMU,
+		VFIO_TYPE1v2_IOMMU,
+		VFIO_TYPE1_IOMMU,
+	};
+	unsigned int n;
+
+	for (n = 0; n < sizeof(candidates) / sizeof(candidates[0]); n++) {
+		if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, candidates[n]))
+			return candidates[n];
+	}
+
+	return -ENODEV;
+}
+
+/*
+ * Memory bank callback: make one bank of guest memory reachable for device
+ * DMA, using the guest physical address as IOVA.
+ *
+ * @data is unused. Returns 0 on success, -errno if the mapping fails.
+ */
+static int vfio_map_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
+{
+	int err;
+	struct vfio_iommu_type1_dma_map dma_map = {
+		.argsz	= sizeof(dma_map),
+		.flags	= VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+		.iova	= (u64)bank->guest_phys_addr,
+		.vaddr	= (unsigned long)bank->host_addr,
+		.size	= bank->size,
+	};
+
+	/* Map the guest memory for DMA (i.e. provide isolation) */
+	if (!ioctl(vfio_container, VFIO_IOMMU_MAP_DMA, &dma_map))
+		return 0;
+
+	err = -errno;
+	pr_err("Failed to map 0x%llx -> 0x%llx (%llu) for DMA",
+	       dma_map.iova, dma_map.vaddr, dma_map.size);
+
+	return err;
+}
+
+/*
+ * Memory bank callback: remove the DMA mapping of one guest memory bank.
+ *
+ * @data is unused. The result of the unmap request is deliberately ignored,
+ * so this always returns 0.
+ */
+static int vfio_unmap_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
+{
+	struct vfio_iommu_type1_dma_unmap dma_unmap = {
+		.argsz	= sizeof(dma_unmap),
+		.iova	= bank->guest_phys_addr,
+		.size	= bank->size,
+	};
+
+	(void)ioctl(vfio_container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
+
+	return 0;
+}
+
+/*
+ * Open one VFIO group, check that it is viable and attach it to the
+ * container.
+ *
+ * On success group->fd is the open group node. On failure the node is
+ * closed again and group->fd is set to -1, so that no descriptor is leaked.
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+static int vfio_group_init(struct kvm *kvm, struct vfio_group *group)
+{
+	int ret;
+	char group_node[VFIO_PATH_MAX_LEN];
+	struct vfio_group_status group_status = {
+		.argsz = sizeof(group_status),
+	};
+
+	INIT_HLIST_HEAD(&group->devices);
+
+	ret = snprintf(group_node, sizeof(group_node), VFIO_DEV_DIR "/%lu",
+		       group->id);
+	if (ret < 0 || (size_t)ret >= sizeof(group_node)) {
+		/* A truncated path would name a different group */
+		pr_err("IOMMU group number %lu is too large", group->id);
+		return -ENAMETOOLONG;
+	}
+
+	group->fd = open(group_node, O_RDWR);
+	if (group->fd == -1) {
+		/* Save errno before logging can overwrite it */
+		ret = -errno;
+		pr_err("Failed to open IOMMU group %s", group_node);
+		return ret;
+	}
+
+	if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) {
+		ret = -errno;
+		pr_err("Failed to determine status of IOMMU group %s",
+		       group_node);
+		goto err_close;
+	}
+
+	if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+		pr_err("IOMMU group %s is not viable", group_node);
+		ret = -EINVAL;
+		goto err_close;
+	}
+
+	if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &vfio_container)) {
+		ret = -errno;
+		pr_err("Failed to add IOMMU group %s to VFIO container",
+		       group_node);
+		goto err_close;
+	}
+
+	return 0;
+
+err_close:
+	close(group->fd);
+	group->fd = -1;
+
+	return ret;
+}
+
+/*
+ * Release every device of @group, then detach the group from the container
+ * and close its file descriptor.
+ */
+static void vfio_group_exit(struct kvm *kvm, struct vfio_group *group)
+{
+	struct vfio_device *dev;
+	struct hlist_node *tmp;
+	int group_fd = group->fd;
+
+	/* The _safe variant is required: entries are freed while iterating */
+	hlist_for_each_entry_safe(dev, tmp, &group->devices, list) {
+		if (dev->info.flags & VFIO_DEVICE_FLAGS_PCI)
+			vfio_pci_teardown_device(kvm, dev);
+
+		close(dev->fd);
+
+		free(dev->regions);
+		free(dev->name);
+		free(dev->sysfs_path);
+		free(dev);
+	}
+
+	ioctl(group_fd, VFIO_GROUP_UNSET_CONTAINER);
+	close(group_fd);
+}
+
+/*
+ * Create the VFIO container, attach all configured groups to it, select the
+ * IOMMU model and map guest RAM for DMA.
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+static int vfio_container_init(struct kvm *kvm)
+{
+	int api, i, ret, iommu_type;
+
+	/* Create a container for our IOMMU groups */
+	vfio_container = open(VFIO_DEV_NODE, O_RDWR);
+	if (vfio_container == -1) {
+		/* Negative, like every other error returned from here */
+		ret = -errno;
+		pr_err("Failed to open %s", VFIO_DEV_NODE);
+		return ret;
+	}
+
+	api = ioctl(vfio_container, VFIO_GET_API_VERSION);
+	if (api != VFIO_API_VERSION) {
+		pr_err("Unknown VFIO API version %d", api);
+		return -ENODEV;
+	}
+
+	iommu_type = vfio_get_iommu_type();
+	if (iommu_type < 0) {
+		pr_err("VFIO type-1 IOMMU not supported on this platform");
+		return iommu_type;
+	}
+
+	/* Sanity check our groups and add them to the container */
+	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+		ret = vfio_group_init(kvm, &kvm->cfg.vfio_group[i]);
+		if (ret)
+			return ret;
+	}
+
+	/* Finalise the container */
+	if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) {
+		ret = -errno;
+		pr_err("Failed to set IOMMU type %d for VFIO container",
+		       iommu_type);
+		return ret;
+	} else {
+		pr_info("Using IOMMU type %d for VFIO container", iommu_type);
+	}
+
+	return kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_map_mem_bank,
+				      NULL);
+}
+
+/*
+ * Device init hook: set up the container and then every device of the
+ * configured groups. Does nothing when no VFIO group was requested.
+ *
+ * Returns 0 on success, or the error code of the first step that failed.
+ */
+static int vfio__init(struct kvm *kvm)
+{
+	int err;
+
+	if (kvm->cfg.num_vfio_groups == 0)
+		return 0;
+
+	err = vfio_container_init(kvm);
+	if (err == 0)
+		err = vfio_configure_iommu_groups(kvm);
+
+	return err;
+}
+dev_base_init(vfio__init);
+
+/*
+ * Device exit hook: tear down every configured group, drop the DMA mappings
+ * of guest RAM and close the container. Does nothing when no VFIO group was
+ * requested.
+ *
+ * Returns the result of closing the container.
+ */
+static int vfio__exit(struct kvm *kvm)
+{
+	int idx = 0;
+
+	if (kvm->cfg.num_vfio_groups == 0)
+		return 0;
+
+	while (idx < kvm->cfg.num_vfio_groups) {
+		vfio_group_exit(kvm, &kvm->cfg.vfio_group[idx]);
+		++idx;
+	}
+
+	kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_unmap_mem_bank, NULL);
+
+	return close(vfio_container);
+}
+dev_base_exit(vfio__exit);
new file mode 100644
@@ -0,0 +1,365 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/vfio.h"
+
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+
+/*
+ * Wrapper around UAPI vfio_irq_set
+ *
+ * Passed as a whole to VFIO_DEVICE_SET_IRQS, with irq.argsz set to the size
+ * of this wrapper, so that the request header and a single eventfd travel in
+ * one buffer.
+ * NOTE(review): this relies on `fd` sitting where the kernel expects the
+ * payload of struct vfio_irq_set -- confirm against <linux/vfio.h>.
+ */
+struct vfio_irq_eventfd {
+ /* Request header: flags, index, start, count */
+ struct vfio_irq_set irq;
+ /* The eventfd attached to the interrupt */
+ int fd;
+};
+
+/*
+ * Configuration Space read callback.
+ *
+ * Reads @sz bytes at @offset from the device's config region into a scratch
+ * buffer that is then discarded; @data is not touched. A short or failed
+ * read is only reported with a warning.
+ */
+static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			      u8 offset, void *data, int sz)
+{
+	struct vfio_pci_device *pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
+	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
+	struct vfio_region_info *cfg = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	char scratch[sz];
+	ssize_t nread;
+
+	/* Dummy read in case of side-effects */
+	nread = pread(vdev->fd, scratch, sz, cfg->offset + offset);
+	if (nread != sz)
+		dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
+			 sz, offset);
+}
+
+/*
+ * Configuration Space write callback.
+ *
+ * Forwards the @sz bytes in @data to the device's config region at @offset,
+ * then reads the same range back into the header at @pci_hdr so that it
+ * holds whatever the device reports after the write. Failures are only
+ * reported with warnings.
+ */
+static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			       u8 offset, void *data, int sz)
+{
+	struct vfio_pci_device *pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
+	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
+	struct vfio_region_info *cfg = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	void *shadow = pci_hdr;
+	ssize_t done;
+
+	done = pwrite(vdev->fd, data, sz, cfg->offset + offset);
+	if (done != sz)
+		dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
+			 sz, offset);
+
+	done = pread(vdev->fd, shadow + offset, sz, cfg->offset + offset);
+	if (done != sz)
+		dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
+			 sz, offset);
+}
+
+/*
+ * Hide the device's capability list from the header kept in vdev->pci: the
+ * capability-list status bit and the capabilities pointer are both cleared.
+ *
+ * Always returns 0.
+ */
+static int vfio_pci_parse_caps(struct vfio_device *vdev)
+{
+	struct pci_device_header *hdr = &vdev->pci.hdr;
+
+	if (hdr->status & PCI_STATUS_CAP_LIST) {
+		hdr->status &= ~PCI_STATUS_CAP_LIST;
+		hdr->capabilities = 0;
+
+		/* TODO: install virtual capabilities */
+	}
+
+	return 0;
+}
+
+/*
+ * Read the device's Configuration Space header into vdev->pci.hdr.
+ *
+ * Queries the config region, reads PCI_DEV_CFG_SIZE bytes from it, rejects
+ * anything but a normal (type 0) header and hides the capability list.
+ *
+ * Returns 0 on success, -ENODEV if the device has no config region, -EINVAL
+ * if that region is empty, -EIO on a short read and -EOPNOTSUPP for an
+ * unsupported header type.
+ */
+static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
+{
+	struct vfio_region_info *info;
+	ssize_t sz = PCI_DEV_CFG_SIZE;
+	struct vfio_pci_device *pdev = &vdev->pci;
+
+	/*
+	 * The regions array has num_regions entries, so the config region
+	 * index must be strictly below that count.
+	 */
+	if (vdev->info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX) {
+		dev_err(vdev, "Config Space not found");
+		return -ENODEV;
+	}
+
+	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	*info = (struct vfio_region_info) {
+		.argsz = sizeof(*info),
+		.index = VFIO_PCI_CONFIG_REGION_INDEX,
+	};
+
+	/* A failed query leaves size at zero, which is caught just below */
+	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
+	if (!info->size) {
+		dev_err(vdev, "Config Space has size zero?!");
+		return -EINVAL;
+	}
+
+	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
+		dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
+		return -EIO;
+	}
+
+	/* Strip bit 7, that indicates multifunction */
+	pdev->hdr.header_type &= 0x7f;
+
+	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
+		dev_err(vdev, "unsupported header type %u",
+			pdev->hdr.header_type);
+		return -EOPNOTSUPP;
+	}
+
+	vfio_pci_parse_caps(vdev);
+
+	return 0;
+}
+
+/*
+ * Rewrite the Configuration Space header in vdev->pci.hdr so that it matches
+ * what has been mapped into the guest, write it to the device's config
+ * region, and install the config access callbacks.
+ *
+ * Returns 0 on success, -EIO if the header cannot be written in full.
+ */
+static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
+{
+	struct vfio_pci_device *pdev = &vdev->pci;
+	struct pci_device_header *hdr = &pdev->hdr;
+	struct vfio_region_info *cfg;
+	ssize_t len;
+	int bar;
+
+	/* Enable exclusively MMIO and bus mastering */
+	hdr->command &= ~PCI_COMMAND_IO;
+	hdr->command |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
+
+	/* Initialise the BARs */
+	for (bar = VFIO_PCI_BAR0_REGION_INDEX; bar <= VFIO_PCI_BAR5_REGION_INDEX; ++bar) {
+		struct vfio_region *region = &vdev->regions[bar];
+		u64 gpa = region->guest_phys_addr;
+
+		/* Only regions that were given a guest address get a BAR */
+		if (gpa != 0) {
+			hdr->bar_size[bar] = region->info.size;
+
+			/* Construct a fake reg to match what we've mapped. */
+			hdr->bar[bar] = (gpa & PCI_BASE_ADDRESS_MEM_MASK) |
+					PCI_BASE_ADDRESS_SPACE_MEMORY |
+					PCI_BASE_ADDRESS_MEM_TYPE_32;
+		}
+	}
+
+	/* I really can't be bothered to support cardbus. */
+	hdr->card_bus = 0;
+
+	/*
+	 * Nuke the expansion ROM for now. If we want to do this properly,
+	 * we need to save its size somewhere and map into the guest.
+	 */
+	hdr->exp_rom_bar = 0;
+
+	/* Install our fake Configuration Space, without the caps */
+	cfg = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	len = offsetof(struct pci_device_header, msix);
+	if (pwrite(vdev->fd, hdr, len, cfg->offset) != len) {
+		dev_err(vdev, "failed to write %zd bytes to Config Space",
+			len);
+		return -EIO;
+	}
+
+	/* TODO: install virtual capability */
+
+	/* Register callbacks for cfg accesses */
+	hdr->cfg_ops = (struct pci_config_operations) {
+		.read	= vfio_pci_cfg_read,
+		.write	= vfio_pci_cfg_write,
+	};
+
+	hdr->irq_type = IRQ_TYPE_LEVEL_HIGH;
+
+	return 0;
+}
+
+/*
+ * Parse the device's Configuration Space, map each implemented BAR into the
+ * guest, then fix up the Configuration Space to reflect that layout.
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+static int vfio_pci_configure_dev_regions(struct kvm *kvm,
+					  struct vfio_device *vdev)
+{
+	u32 i;
+	int ret;
+	size_t map_size;
+
+	ret = vfio_pci_parse_cfg_space(vdev);
+	if (ret)
+		return ret;
+
+	/* First of all, map the BARs directly into the guest */
+	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+		struct vfio_region *region = &vdev->regions[i];
+
+		if (i >= vdev->info.num_regions)
+			break;
+
+		/* argsz describes the info structure, not the wrapper around it */
+		region->info = (struct vfio_region_info) {
+			.argsz = sizeof(region->info),
+			.index = i,
+		};
+
+		ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO,
+			    &region->info);
+		if (ret) {
+			/* Save errno before logging can overwrite it */
+			ret = -errno;
+			dev_err(vdev, "cannot get info for region %u", i);
+			return ret;
+		}
+
+		/* Ignore invalid or unimplemented regions */
+		if (!region->info.size)
+			continue;
+
+		/* Grab some MMIO space in the guest */
+		map_size = ALIGN(region->info.size, PAGE_SIZE);
+		region->guest_phys_addr = pci_get_io_space_block(map_size);
+
+		/*
+		 * Map the BARs into the guest. We'll later need to update
+		 * configuration space to reflect our allocation.
+		 */
+		ret = vfio_map_region(kvm, vdev, region);
+		if (ret)
+			return ret;
+	}
+
+	/* We've configured the BARs, fake up a Configuration Space */
+	return vfio_pci_fixup_cfg_space(vdev);
+}
+
+/*
+ * Wire the device's INTx interrupt to guest interrupt @gsi.
+ *
+ * Two eventfds are created and registered with KVM through irq__add_irqfd(),
+ * then attached to the device with VFIO_DEVICE_SET_IRQS: one for the trigger
+ * action and one for the unmask action.
+ *
+ * Returns 0 on success. On failure everything set up so far is undone and a
+ * negative error code is returned.
+ */
+static int vfio_pci_init_irqfd(struct kvm *kvm, int devfd, int gsi)
+{
+	int ret;
+	int trigger_fd, unmask_fd;
+	struct vfio_irq_eventfd trigger;
+	struct vfio_irq_eventfd unmask;
+
+	/*
+	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
+	 * signals an interrupt from host to guest, and unmask_fd signals the
+	 * deassertion of the line from guest to host.
+	 */
+	trigger_fd = eventfd(0, 0);
+	if (trigger_fd < 0) {
+		/* Save errno before logging can overwrite it */
+		ret = -errno;
+		pr_err("Failed to create trigger eventfd");
+		return ret;
+	}
+
+	unmask_fd = eventfd(0, 0);
+	if (unmask_fd < 0) {
+		ret = -errno;
+		pr_err("Failed to create unmask eventfd");
+		close(trigger_fd);
+		return ret;
+	}
+
+	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
+	if (ret)
+		goto err_close;
+
+	trigger.irq = (struct vfio_irq_set) {
+		.argsz	= sizeof(trigger),
+		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+		.index	= VFIO_PCI_INTX_IRQ_INDEX,
+		.start	= 0,
+		.count	= 1,
+	};
+	trigger.fd = trigger_fd;
+
+	if (ioctl(devfd, VFIO_DEVICE_SET_IRQS, &trigger) < 0) {
+		ret = -errno;
+		pr_err("Failed to setup VFIO IRQ");
+		goto err_delete_line;
+	}
+
+	unmask.irq = (struct vfio_irq_set) {
+		.argsz	= sizeof(unmask),
+		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
+		.index	= VFIO_PCI_INTX_IRQ_INDEX,
+		.start	= 0,
+		.count	= 1,
+	};
+	unmask.fd = unmask_fd;
+
+	if (ioctl(devfd, VFIO_DEVICE_SET_IRQS, &unmask) < 0) {
+		ret = -errno;
+		pr_err("Failed to setup unmask IRQ");
+		goto err_remove_event;
+	}
+
+	return 0;
+
+err_remove_event:
+	/*
+	 * Remove trigger event. DATA_NONE with a count of zero disables the
+	 * interrupt index; a non-zero count would fire the interrupt instead.
+	 */
+	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+	trigger.irq.count = 0;
+	ioctl(devfd, VFIO_DEVICE_SET_IRQS, &trigger);
+
+err_delete_line:
+	irq__del_irqfd(kvm, gsi, trigger_fd);
+
+err_close:
+	close(trigger_fd);
+	close(unmask_fd);
+	return ret;
+}
+
+/*
+ * Query the device's interrupt information and, if it is usable, route the
+ * interrupt to the guest through vfio_pci_init_irqfd().
+ *
+ * Returns 0 on success, -ENODEV if VFIO reports no interrupt, -EINVAL if the
+ * interrupt cannot be signalled through an eventfd or is not automasked, or
+ * the error returned by vfio_pci_init_irqfd().
+ */
+static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
+{
+	struct vfio_irq_info *irq = &vdev->irq_info;
+
+	*irq = (struct vfio_irq_info) {
+		.argsz = sizeof(vdev->irq_info),
+	};
+
+	/* A failed query leaves count at zero, which is rejected below */
+	ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, irq);
+	if (irq->count == 0) {
+		dev_err(vdev, "no interrupt found by VFIO");
+		return -ENODEV;
+	}
+
+	if ((irq->flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
+		dev_err(vdev, "interrupt not EVENTFD capable");
+		return -EINVAL;
+	}
+
+	/* TODO: add MSI support */
+	dev_err(vdev, "MSI-X not available, falling back to INTx");
+
+	if ((irq->flags & VFIO_IRQ_INFO_AUTOMASKED) == 0) {
+		dev_err(vdev, "INTx interrupt not AUTOMASKED");
+		return -EINVAL;
+	}
+
+	return vfio_pci_init_irqfd(kvm, vdev->fd,
+				   vdev->pci.hdr.irq_line - KVM_IRQ_OFFSET);
+}
+
+/*
+ * PCI-specific device setup: configure the regions and Configuration Space,
+ * register the device on the PCI bus, then set up its interrupt.
+ *
+ * Returns 0 on success, a negative error code otherwise. If interrupt setup
+ * fails the device is unregistered again, so that no registration outlives
+ * a @vdev that the caller is going to release.
+ */
+int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
+{
+	int ret;
+
+	ret = vfio_pci_configure_dev_regions(kvm, vdev);
+	if (ret) {
+		dev_err(vdev, "failed to configure regions");
+		return ret;
+	}
+
+	vdev->dev_hdr = (struct device_header) {
+		.bus_type	= DEVICE_BUS_PCI,
+		.data		= &vdev->pci.hdr,
+	};
+
+	ret = device__register(&vdev->dev_hdr);
+	if (ret) {
+		dev_err(vdev, "failed to register VFIO device");
+		return ret;
+	}
+
+	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
+	if (ret) {
+		dev_err(vdev, "failed to configure IRQs");
+		/* Don't leave a registration pointing into vdev behind */
+		device__unregister(&vdev->dev_hdr);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * PCI-specific device teardown: unmap every region of the device, then
+ * remove the device from the device registry.
+ */
+void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
+{
+	size_t idx = 0;
+
+	while (idx < vdev->info.num_regions) {
+		vfio_unmap_region(kvm, &vdev->regions[idx]);
+		idx++;
+	}
+
+	device__unregister(&vdev->dev_hdr);
+}