@@ -59,6 +59,7 @@ OBJS += main.o
OBJS += mmio.o
OBJS += pci.o
OBJS += term.o
+OBJS += vfio.o
OBJS += virtio/blk.o
OBJS += virtio/scsi.o
OBJS += virtio/console.o
@@ -1,5 +1,6 @@
#include "kvm/devices.h"
#include "kvm/fdt.h"
+#include "kvm/kvm.h"
#include "kvm/of_pci.h"
#include "kvm/pci.h"
#include "kvm/util.h"
@@ -146,6 +146,11 @@ void kvm_run_set_wrapper_sandbox(void)
OPT_BOOLEAN('\0', "no-dhcp", &(cfg)->no_dhcp, "Disable kernel" \
" DHCP in rootfs mode"), \
\
+ OPT_GROUP("VFIO options:"), \
+ OPT_CALLBACK('\0', "vfio-group", NULL, "group number", \
+ "Pass through a VFIO group to the virtual " \
+ "machine", vfio_group_parser, kvm), \
+ \
OPT_GROUP("Debug options:"), \
OPT_BOOLEAN('\0', "debug", &do_debug_print, \
"Enable debug messages"), \
@@ -2,6 +2,7 @@
#define KVM_CONFIG_H_
#include "kvm/disk-image.h"
+#include "kvm/vfio.h"
#include "kvm/kvm-config-arch.h"
#define DEFAULT_KVM_DEV "/dev/kvm"
@@ -20,9 +21,11 @@
struct kvm_config {
struct kvm_config_arch arch;
struct disk_image_params disk_image[MAX_DISK_IMAGES];
+ struct vfio_group vfio_group[MAX_VFIO_GROUPS];
u64 ram_size;
u8 image_count;
u8 num_net_devices;
+ u8 num_vfio_groups;
bool virtio_rng;
int active_console;
int debug_iodelay;
@@ -7,7 +7,6 @@
#include <endian.h>
#include "kvm/devices.h"
-#include "kvm/kvm.h"
#include "kvm/msi.h"
#include "kvm/fdt.h"
@@ -22,6 +21,8 @@
#define PCI_IO_SIZE 0x100
#define PCI_CFG_SIZE (1ULL << 24)
+struct kvm;
+
union pci_config_address {
struct {
#if __BYTE_ORDER == __LITTLE_ENDIAN
new file mode 100644
@@ -0,0 +1,39 @@
+#ifndef KVM__VFIO_H
+#define KVM__VFIO_H
+
+#include "kvm/parse-options.h"
+#include "kvm/pci.h"
+
+#include <linux/vfio.h>
+
+#define MAX_VFIO_GROUPS 16
+
+struct vfio_pci_device {
+ struct pci_device_header hdr;
+};
+
+struct vfio_region {
+ struct vfio_region_info info;
+ u32 guest_phys_addr;
+ void *host_addr;
+};
+
+struct vfio_device {
+ struct device_header dev_hdr;
+
+ int fd;
+ struct vfio_device_info info;
+ struct vfio_irq_info irq_info;
+ struct vfio_region *regions;
+
+ struct vfio_pci_device pci;
+};
+
+struct vfio_group {
+ unsigned long id; /* iommu_group number in sysfs */
+ int fd;
+};
+
+int vfio_group_parser(const struct option *opt, const char *arg, int unset);
+
+#endif /* KVM__VFIO_H */
new file mode 100644
@@ -0,0 +1,721 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/pci.h"
+#include "kvm/util.h"
+#include "kvm/vfio.h"
+
+#include <linux/kvm.h>
+#include <linux/pci_regs.h>
+
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+
+#include <dirent.h>
+#include <pthread.h>
+
+#define VFIO_DEV_DIR "/dev/vfio"
+#define VFIO_DEV_NODE VFIO_DEV_DIR "/vfio"
+#define IOMMU_GROUP_DIR "/sys/kernel/iommu_groups"
+
+#define VFIO_PATH_MAX_LEN 16
+
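+/* Payload for VFIO_DEVICE_SET_IRQS: an irq_set followed by one eventfd */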
+struct vfio_irq_eventfd {
+ struct vfio_irq_set irq;
+ int fd;
+};
+
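+/* A single container holds the IOMMU mappings of every passed-through group */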
+static int vfio_container;
+
+int vfio_group_parser(const struct option *opt, const char *arg, int unset)
+{
+ char *cur, *buf = strdup(arg);
+ static int idx = 0;
+ struct kvm *kvm = opt->ptr;
+ struct vfio_group *group = &kvm->cfg.vfio_group[idx];
+
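+	/* Warn only once when too many groups are passed */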
+ if (idx >= MAX_VFIO_GROUPS) {
+ if (idx++ == MAX_VFIO_GROUPS)
+ pr_warning("Too many VFIO groups");
+ free(buf);
+ return 0;
+ }
+
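+	/* Only the leading group number is used; a comma ends parsing */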
+ cur = strtok(buf, ",");
+ group->id = strtoul(cur, NULL, 0);
+
+ kvm->cfg.num_vfio_groups = ++idx;
+ free(buf);
+
+ return 0;
+}
+
+static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
+ u8 offset, void *data, int sz)
+{
+ struct vfio_region_info *info;
+ struct vfio_pci_device *pdev;
+ struct vfio_device *device;
+ char base[sz];
+
+ pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
+ device = container_of(pdev, struct vfio_device, pci);
+ info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
+	/*
+	 * Dummy read in case of side-effects; the guest's view of Config
+	 * Space is served from the shadow pci_hdr, not from this buffer.
+	 */
+ if (pread(device->fd, base, sz, info->offset + offset) != sz)
+ pr_warning("Failed to read %d bytes from Configuration Space at 0x%x",
+ sz, offset);
+}
+
+static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
+ u8 offset, void *data, int sz)
+{
+ struct vfio_region_info *info;
+ struct vfio_pci_device *pdev;
+ struct vfio_device *device;
+ void *base = pci_hdr;
+
+ pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
+ device = container_of(pdev, struct vfio_device, pci);
+ info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
+ if (pwrite(device->fd, data, sz, info->offset + offset) != sz)
+ pr_warning("Failed to write %d bytes to Configuration Space at 0x%x",
+ sz, offset);
+
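+	/* Read back so the shadow Config Space stays in sync with the device */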
+ if (pread(device->fd, base + offset, sz, info->offset + offset) != sz)
+ pr_warning("Failed to read %d bytes from Configuration Space at 0x%x",
+ sz, offset);
+}
+
+static int vfio_pci_parse_caps(struct vfio_device *device)
+{
+ struct vfio_pci_device *pdev = &device->pci;
+
+ if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
+ return 0;
+
+ pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
+ pdev->hdr.capabilities = 0;
+
+ /* TODO: install virtual capabilities */
+
+ return 0;
+}
+
+static int vfio_pci_parse_cfg_space(struct vfio_device *device)
+{
+ u8 hdr_type;
+ struct vfio_region_info *info;
+ ssize_t sz = PCI_DEV_CFG_SIZE;
+ struct vfio_pci_device *pdev = &device->pci;
+
+	if (device->info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX) {
+ pr_err("Configuration Space not found");
+ return -ENODEV;
+ }
+
+ info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+ *info = (struct vfio_region_info) {
+ .argsz = sizeof(*info),
+ .index = VFIO_PCI_CONFIG_REGION_INDEX,
+ };
+
+ ioctl(device->fd, VFIO_DEVICE_GET_REGION_INFO, info);
+ if (!info->size) {
+ pr_err("Configuration Space has size zero?!");
+ return -EINVAL;
+ }
+
+ if (pread(device->fd, &pdev->hdr, sz, info->offset) != sz) {
+ pr_err("Failed to read %zd bytes of Configuration Space", sz);
+ return -EIO;
+ }
+
+	/* Strip bit 7, which indicates a multifunction device */
+ hdr_type = pdev->hdr.header_type & 0x7f;
+
+ if (hdr_type != PCI_HEADER_TYPE_NORMAL) {
+ pr_err("Unsupported header type %u", hdr_type);
+ return -EOPNOTSUPP;
+ }
+
+ if (vfio_pci_parse_caps(device))
+ pr_warning("Failed to parse device capabilities");
+
+ return 0;
+}
+
+static int vfio_pci_fixup_cfg_space(struct vfio_device *device)
+{
+ int i;
+ ssize_t hdr_sz;
+ struct vfio_region_info *info;
+ struct vfio_pci_device *pdev = &device->pci;
+
+	/* Enable only MMIO access and bus mastering */
+ pdev->hdr.command &= ~PCI_COMMAND_IO;
+ pdev->hdr.command |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
+
+ /* Initialise the BARs */
+ for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+ struct vfio_region *region = &device->regions[i];
+ u32 base = region->guest_phys_addr;
+
+ if (!base)
+ continue;
+
+ pdev->hdr.bar_size[i] = region->info.size;
+
+ /* Construct a fake reg to match what we've mapped. */
+ pdev->hdr.bar[i] = (base & PCI_BASE_ADDRESS_MEM_MASK) |
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_32;
+ }
+
+ /* I really can't be bothered to support cardbus. */
+ pdev->hdr.card_bus = 0;
+
+ /*
+ * Nuke the expansion ROM for now. If we want to do this properly,
+ * we need to save its size somewhere and map into the guest.
+ */
+ pdev->hdr.exp_rom_bar = 0;
+
+ /* Install our fake Configuration Space, without the caps */
+ info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+ hdr_sz = offsetof(struct pci_device_header, msix);
+ if (pwrite(device->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
+ pr_err("Failed to write %zd bytes to Configuration Space", hdr_sz);
+ return -EIO;
+ }
+
+ /* TODO: install virtual capabilities */
+ /* Register callbacks for cfg accesses */
+ pdev->hdr.cfg_ops = (struct pci_config_operations) {
+ .read = vfio_pci_cfg_read,
+ .write = vfio_pci_cfg_write,
+ };
+
+ pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
+
+ return 0;
+}
+
+static int vfio_pci_map_bar(struct kvm *kvm, int fd, struct vfio_region *region)
+{
+ void *base;
+ int ret, prot = 0;
+ u64 map_size = ALIGN(region->info.size, PAGE_SIZE);
+
+ /*
+ * We don't want to mess about trapping BAR accesses, so require
+ * that they can be mmap'd. Note that this precludes the use of
+ * I/O BARs in the guest (we will hide them from Configuration
+ * Space, which is trapped).
+ */
+ if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) {
+ pr_info("Ignoring BAR %u, as it can't be mmap'd",
+ region->info.index);
+ return 0;
+ }
+
+ if (region->info.flags & VFIO_REGION_INFO_FLAG_READ)
+ prot |= PROT_READ;
+ if (region->info.flags & VFIO_REGION_INFO_FLAG_WRITE)
+ prot |= PROT_WRITE;
+
+ base = mmap(NULL, region->info.size, prot, MAP_SHARED, fd,
+ region->info.offset);
+ if (base == MAP_FAILED) {
+ ret = -errno;
+ pr_err("Failed to mmap BAR region %u (0x%llx bytes)",
+ region->info.index, region->info.size);
+ return ret;
+ }
+ region->host_addr = base;
+
+ /* Grab some MMIO space in the guest */
+ region->guest_phys_addr = pci_get_io_space_block(map_size);
+
+ /* Register the BAR as a memory region with KVM */
+ ret = kvm__register_dev_mem(kvm, region->guest_phys_addr, map_size,
+ region->host_addr);
+ if (ret) {
+ pr_err("Failed to register BAR as memory region with KVM");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int vfio_pci_configure_dev_regions(struct kvm *kvm,
+ struct vfio_device *device)
+{
+ int ret;
+ u32 i, num_regions = device->info.num_regions;
+
+ ret = vfio_pci_parse_cfg_space(device);
+ if (ret)
+ return ret;
+
+ /* First of all, map the BARs directly into the guest */
+ for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+ struct vfio_region *region;
+
+		/* Out of regions, but Config Space still needs fixing up */
+		if (i >= num_regions)
+			break;
+
+ region = &device->regions[i];
+ region->info = (struct vfio_region_info) {
+			.argsz = sizeof(region->info),
+ .index = i,
+ };
+
+		ioctl(device->fd, VFIO_DEVICE_GET_REGION_INFO, &region->info);
+ /* Ignore invalid or unimplemented regions */
+ if (!region->info.size)
+ continue;
+
+ /*
+ * Map the BARs into the guest. We'll later need to update
+ * configuration space to reflect our allocation.
+ */
+ ret = vfio_pci_map_bar(kvm, device->fd, region);
+ if (ret)
+ return ret;
+ }
+
+ /* We've configured the BARs, fake up a Configuration Space */
+ return vfio_pci_fixup_cfg_space(device);
+}
+
+static int vfio_configure_dev_regions(struct kvm *kvm,
+ struct vfio_device *device)
+{
+ u32 num_regions = device->info.num_regions;
+
+ /* We only support vfio-pci devices for the moment */
+ if (!(device->info.flags & VFIO_DEVICE_FLAGS_PCI)) {
+ pr_warning("Only vfio-pci devices are supported. "
+ "Ignoring device regions.");
+ device->info.num_regions = 0;
+ return 0;
+ }
+
+ device->regions = calloc(num_regions, sizeof(*device->regions));
+ if (!device->regions) {
+ pr_err("Failed to allocate %u regions for device",
+ num_regions);
+ return -ENOMEM;
+ }
+
+ return vfio_pci_configure_dev_regions(kvm, device);
+}
+
+static int vfio_init_irqfd(struct kvm *kvm, int devfd, int gsi)
+{
+ int ret;
+ int trigger_fd, unmask_fd;
+ struct vfio_irq_eventfd trigger;
+ struct vfio_irq_eventfd unmask;
+
+ /*
+ * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
+ * signals an interrupt from host to guest, and unmask_fd signals the
+ * deassertion of the line from guest to host.
+ */
+ trigger_fd = eventfd(0, 0);
+ if (trigger_fd < 0) {
+ pr_err("Failed to create trigger eventfd");
+ return trigger_fd;
+ }
+
+ unmask_fd = eventfd(0, 0);
+ if (unmask_fd < 0) {
+ pr_err("Failed to create unmask eventfd");
+ close(trigger_fd);
+ return unmask_fd;
+ }
+
+ ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
+ if (ret)
+ goto err_close;
+
+ trigger.irq = (struct vfio_irq_set) {
+ .argsz = sizeof(trigger),
+ .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+ .index = VFIO_PCI_INTX_IRQ_INDEX,
+ .start = 0,
+ .count = 1,
+ };
+ trigger.fd = trigger_fd;
+
+ ret = ioctl(devfd, VFIO_DEVICE_SET_IRQS, &trigger);
+ if (ret < 0) {
+ pr_err("Failed to setup VFIO IRQ");
+ goto err_delete_line;
+ }
+
+ unmask.irq = (struct vfio_irq_set) {
+ .argsz = sizeof(unmask),
+ .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
+ .index = VFIO_PCI_INTX_IRQ_INDEX,
+ .start = 0,
+ .count = 1,
+ };
+ unmask.fd = unmask_fd;
+
+ ret = ioctl(devfd, VFIO_DEVICE_SET_IRQS, &unmask);
+ if (ret < 0) {
+ pr_err("Failed to setup unmask IRQ");
+ goto err_remove_event;
+ }
+
+ return 0;
+
+err_remove_event:
+ /* Remove trigger event */
+ trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+ ioctl(devfd, VFIO_DEVICE_SET_IRQS, &trigger.irq);
+
+err_delete_line:
+ irq__del_irqfd(kvm, gsi);
+
+err_close:
+ close(trigger_fd);
+ close(unmask_fd);
+ return ret;
+}
+
+static int vfio_configure_dev_irqs(struct kvm *kvm, struct vfio_device *device)
+{
+ int ret;
+ struct vfio_pci_device *pdev = &device->pci;
+
+ device->irq_info = (struct vfio_irq_info) {
+ .argsz = sizeof(device->irq_info)
+ };
+
+ if (pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX)) {
+ /* TODO: set up shadow PBA/table structures for MSI-X. */
+ } else {
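+		/* Convert the PCI IRQ line to its GSI number */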
+ int gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
+
+ /* We don't have MSI-X, so fall back on INTx */
+ pr_info("MSI-X not available for device 0x%x, falling back to INTx",
+ device->dev_hdr.dev_num);
+ device->irq_info.index = VFIO_PCI_INTX_IRQ_INDEX;
+ ioctl(device->fd, VFIO_DEVICE_GET_IRQ_INFO, &device->irq_info);
+
+ if (device->irq_info.count != 1) {
+ pr_err("No INTx interrupts found");
+ return -ENODEV;
+ }
+
+ if (!(device->irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+ pr_err("INTx interrupt not EVENTFD capable");
+ return -EINVAL;
+ }
+
+ if (!(device->irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
+ pr_err("INTx interrupt not AUTOMASKED");
+ return -EINVAL;
+ }
+
+ ret = vfio_init_irqfd(kvm, device->fd, gsi);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int vfio_configure_device(struct kvm *kvm, struct vfio_group *group,
+ struct dirent *dirent)
+{
+ int ret;
+ struct vfio_device *device;
+
+ device = calloc(1, sizeof(*device));
+ if (!device) {
+ pr_err("Failed to allocate VFIO device");
+ return -ENOMEM;
+ }
+
+ device->fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, dirent->d_name);
+ if (device->fd < 0) {
+ pr_err("Failed to get FD for device %s in group %lu",
+ dirent->d_name, group->id);
+ free(device);
+
+ /* The device might be a bridge without an fd */
+ return 0;
+ }
+
+ device->info.argsz = sizeof(device->info);
+ if (ioctl(device->fd, VFIO_DEVICE_GET_INFO, &device->info)) {
+ ret = -errno;
+ pr_err("Failed to get info for device %s in group %lu",
+ dirent->d_name, group->id);
+ return ret;
+ }
+
+ if (device->info.flags & VFIO_DEVICE_FLAGS_RESET &&
+ ioctl(device->fd, VFIO_DEVICE_RESET) < 0)
+ pr_warning("Failed to reset device %s in group %lu",
+ dirent->d_name, group->id);
+
+ ret = vfio_configure_dev_regions(kvm, device);
+ if (ret) {
+ pr_err("Failed to configure regions for device %s in group %lu",
+ dirent->d_name, group->id);
+ return ret;
+ }
+
+ device->dev_hdr = (struct device_header) {
+ .bus_type = DEVICE_BUS_PCI,
+ .data = &device->pci.hdr,
+ };
+
+ ret = device__register(&device->dev_hdr);
+ if (ret) {
+ pr_err("Failed to register VFIO device");
+ return ret;
+ }
+
+ ret = vfio_configure_dev_irqs(kvm, device);
+ if (ret) {
+		pr_err("Failed to configure IRQs for device %s in group %lu",
+ dirent->d_name, group->id);
+ return ret;
+ }
+
+ pr_info("Assigned device %s in group %lu to device number 0x%x",
+ dirent->d_name, group->id, device->dev_hdr.dev_num);
+
+ return 0;
+}
+
+static int vfio_configure_iommu_groups(struct kvm *kvm)
+{
+ int i, ret;
+
+ for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+ DIR *dir;
+ struct dirent *dirent;
+ char dirpath[PATH_MAX];
+ struct vfio_group *group = &kvm->cfg.vfio_group[i];
+
+ snprintf(dirpath, PATH_MAX, IOMMU_GROUP_DIR "/%lu/devices",
+ group->id);
+
+ dir = opendir(dirpath);
+ if (!dir) {
+ ret = -errno;
+ pr_err("Failed to open IOMMU group %s", dirpath);
+ return ret;
+ }
+
+ while ((dirent = readdir(dir))) {
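+			/* Devices in the group appear as symlinks to their sysfs nodes */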
+ if (dirent->d_type != DT_LNK)
+ continue;
+
+ ret = vfio_configure_device(kvm, group, dirent);
+ if (ret)
+ return ret;
+ }
+
+ if (closedir(dir))
+ pr_warning("Failed to close IOMMU group %s", dirpath);
+ }
+
+ return 0;
+}
+
+/* TODO: this should be an arch callback, so arm can return HYP only if vsmmu */
+static int vfio_get_iommu_type(void)
+{
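+	/* Prefer nested translation, then Type-1 v2, then plain Type-1 */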
+ if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_NESTING_IOMMU))
+ return VFIO_TYPE1_NESTING_IOMMU;
+
+ if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
+ return VFIO_TYPE1v2_IOMMU;
+
+ if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
+ return VFIO_TYPE1_IOMMU;
+
+ return -ENODEV;
+}
+
+static int vfio_map_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
+{
+ int ret = 0;
+ struct vfio_iommu_type1_dma_map dma_map = {
+ .argsz = sizeof(dma_map),
+ .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+ .vaddr = (u64)bank->host_addr,
+ .iova = (u64)bank->guest_phys_addr,
+ .size = bank->size,
+ };
+
+ /* Map the guest memory for DMA (i.e. provide isolation) */
+ if (ioctl(vfio_container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
+ ret = -errno;
+ pr_err("Failed to map 0x%llx -> 0x%llx (%llu) for DMA",
+ dma_map.iova, dma_map.vaddr, dma_map.size);
+ }
+
+ return ret;
+}
+
+static int vfio_unmap_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
+{
+ struct vfio_iommu_type1_dma_unmap dma_unmap = {
+ .argsz = sizeof(dma_unmap),
+ .size = bank->size,
+ .iova = bank->guest_phys_addr,
+ };
+
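+	/* Best effort: errors are ignored during teardown */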
+ ioctl(vfio_container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
+
+ return 0;
+}
+
+static int vfio_group_init(struct kvm *kvm, struct vfio_group *group)
+{
+ int ret;
+ char group_node[VFIO_PATH_MAX_LEN];
+ struct vfio_group_status group_status = {
+ .argsz = sizeof(group_status),
+ };
+
+ snprintf(group_node, VFIO_PATH_MAX_LEN, VFIO_DEV_DIR "/%lu",
+ group->id);
+
+ group->fd = open(group_node, O_RDWR);
+ if (group->fd == -1) {
+ ret = -errno;
+ pr_err("Failed to open IOMMU group %s", group_node);
+ return ret;
+ }
+
+ if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) {
+ ret = -errno;
+ pr_err("Failed to determine status of IOMMU group %s",
+ group_node);
+ return ret;
+ }
+
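+	/* Viable == every device in the group is bound to VFIO or no driver */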
+ if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+ pr_err("IOMMU group %s is not viable", group_node);
+ return -EINVAL;
+ }
+
+ if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &vfio_container)) {
+ ret = -errno;
+ pr_err("Failed to add IOMMU group %s to VFIO container",
+ group_node);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int vfio_container_init(struct kvm *kvm)
+{
+	int api, i, ret, iommu_type;
+
+ /* Create a container for our IOMMU groups */
+ vfio_container = open(VFIO_DEV_NODE, O_RDWR);
+ if (vfio_container == -1) {
+		ret = -errno;
+ pr_err("Failed to open %s", VFIO_DEV_NODE);
+ return ret;
+ }
+
+ api = ioctl(vfio_container, VFIO_GET_API_VERSION);
+ if (api != VFIO_API_VERSION) {
+ pr_err("Unknown VFIO API version %d", api);
+ return -ENODEV;
+ }
+
+ iommu_type = vfio_get_iommu_type();
+ if (iommu_type < 0) {
+ pr_err("VFIO type-1 IOMMU not supported on this platform");
+ return iommu_type;
+ }
+
+ /* Sanity check our groups and add them to the container */
+ for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+ ret = vfio_group_init(kvm, &kvm->cfg.vfio_group[i]);
+ if (ret)
+ return ret;
+ }
+
+ /* Finalise the container */
+ if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) {
+ ret = -errno;
+ pr_err("Failed to set IOMMU type %d for VFIO container",
+ iommu_type);
+ return ret;
+	}
+
+	pr_info("Using IOMMU type %d for VFIO container", iommu_type);
+
+ return kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_map_mem_bank,
+ NULL);
+}
+
+static int vfio__init(struct kvm *kvm)
+{
+ int ret;
+
+ if (!kvm->cfg.num_vfio_groups)
+ return 0;
+
+ ret = vfio_container_init(kvm);
+ if (ret)
+ return ret;
+
+ ret = vfio_configure_iommu_groups(kvm);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+dev_base_init(vfio__init);
+
+static int vfio__exit(struct kvm *kvm)
+{
+ int i, fd;
+
+ if (!kvm->cfg.num_vfio_groups)
+ return 0;
+
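+	/* Detach each group before destroying the container's mappings */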
+ for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+ fd = kvm->cfg.vfio_group[i].fd;
+ ioctl(fd, VFIO_GROUP_UNSET_CONTAINER);
+ close(fd);
+ }
+
+ kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_unmap_mem_bank, NULL);
+ return close(vfio_container);
+}
+dev_base_exit(vfio__exit);