[v2,kvmtool,06/10] Add PCI device passthrough using VFIO

Message ID	20170622170536.14319-7-jean-philippe.brucker@arm.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <kvm-owner@kernel.org> From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com> To: kvm@vger.kernel.org Cc: will.deacon@arm.com, robin.murphy@arm.com, lorenzo.pieralisi@arm.com, marc.zyngier@arm.com Subject: [PATCH v2 kvmtool 06/10] Add PCI device passthrough using VFIO Date: Thu, 22 Jun 2017 18:05:32 +0100 Message-Id: <20170622170536.14319-7-jean-philippe.brucker@arm.com> In-Reply-To: <20170622170536.14319-1-jean-philippe.brucker@arm.com> References: <20170622170536.14319-1-jean-philippe.brucker@arm.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk

diff --git a/Makefile b/Makefile
index 57714815..caae6f07 100644
--- a/Makefile
+++ b/Makefile
@@ -59,6 +59,8 @@ OBJS += main.o
 OBJS += mmio.o
 OBJS += pci.o
 OBJS += term.o
+OBJS += vfio/core.o
+OBJS += vfio/pci.o
 OBJS += virtio/blk.o
 OBJS += virtio/scsi.o
 OBJS += virtio/console.o
diff --git a/arm/pci.c b/arm/pci.c
index 744b14c2..557cfa98 100644
--- a/arm/pci.c
+++ b/arm/pci.c
@@ -1,5 +1,6 @@
 #include "kvm/devices.h"
 #include "kvm/fdt.h"
+#include "kvm/kvm.h"
 #include "kvm/of_pci.h"
 #include "kvm/pci.h"
 #include "kvm/util.h"
diff --git a/builtin-run.c b/builtin-run.c
index 72b878dc..3ee735d9 100644
--- a/builtin-run.c
+++ b/builtin-run.c
@@ -146,6 +146,11 @@ void kvm_run_set_wrapper_sandbox(void)
 	OPT_BOOLEAN('\0', "no-dhcp", &(cfg)->no_dhcp, "Disable kernel"	\
 			" DHCP in rootfs mode"),			\
 									\
+	OPT_GROUP("VFIO options:"),					\
+	OPT_CALLBACK('\0', "vfio-group", NULL, "group number",		\
+		     "Assign a VFIO group to the virtual machine",	\
+		     vfio_group_parser, kvm),				\
+									\
 	OPT_GROUP("Debug options:"),					\
 	OPT_BOOLEAN('\0', "debug", &do_debug_print,			\
 			"Enable debug messages"),			\
diff --git a/include/kvm/kvm-config.h b/include/kvm/kvm-config.h
index 386fa8c5..62dc6a2f 100644
--- a/include/kvm/kvm-config.h
+++ b/include/kvm/kvm-config.h
@@ -2,6 +2,7 @@
 #define KVM_CONFIG_H_
 
 #include "kvm/disk-image.h"
+#include "kvm/vfio.h"
 #include "kvm/kvm-config-arch.h"
 
 #define DEFAULT_KVM_DEV "/dev/kvm"
@@ -20,9 +21,11 @@
 struct kvm_config {
 	struct kvm_config_arch arch;
 	struct disk_image_params disk_image[MAX_DISK_IMAGES];
+	struct vfio_group vfio_group[MAX_VFIO_GROUPS];
 	u64 ram_size;
 	u8 image_count;
 	u8 num_net_devices;
+	u8 num_vfio_groups;
 	bool virtio_rng;
 	int active_console;
 	int debug_iodelay;
diff --git a/include/kvm/pci.h b/include/kvm/pci.h
index 2950bb10..44e5adff 100644
--- a/include/kvm/pci.h
+++ b/include/kvm/pci.h
@@ -7,7 +7,6 @@
 #include <endian.h>
 
 #include "kvm/devices.h"
-#include "kvm/kvm.h"
 #include "kvm/msi.h"
 #include "kvm/fdt.h"
 
@@ -22,6 +21,8 @@
 #define PCI_IO_SIZE 0x100
 #define PCI_CFG_SIZE (1ULL << 24)
 
+struct kvm;
+
 union pci_config_address {
 	struct {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h
new file mode 100644
index 00000000..060f32a3
--- /dev/null
+++ b/include/kvm/vfio.h
@@ -0,0 +1,63 @@
+#ifndef KVM__VFIO_H
+#define KVM__VFIO_H
+
+#include "kvm/parse-options.h"
+#include "kvm/pci.h"
+
+#include <linux/vfio.h>
+
+#include <dirent.h>
+
+/* Logging helpers that prefix each message with the VFIO device name */
+#define dev_err(vdev, fmt, ...) pr_err("%s: " fmt, (vdev)->name, ##__VA_ARGS__)
+#define dev_warn(vdev, fmt, ...) pr_warning("%s: " fmt, (vdev)->name, ##__VA_ARGS__)
+#define dev_info(vdev, fmt, ...) pr_info("%s: " fmt, (vdev)->name, ##__VA_ARGS__)
+#define dev_die(vdev, fmt, ...) die("%s: " fmt, (vdev)->name, ##__VA_ARGS__)
+
+/* Number of slots in kvm_config.vfio_group[] */
+#define MAX_VFIO_GROUPS 16
+
+/* PCI-specific device state: a copy of the Configuration Space header */
+struct vfio_pci_device {
+	struct pci_device_header hdr;
+};
+
+/* A device region: VFIO's description plus its guest and host addresses */
+struct vfio_region {
+	struct vfio_region_info info;
+	u64 guest_phys_addr;
+	void *host_addr;
+};
+
+/* A device taken from a VFIO group; linked into vfio_group.devices */
+struct vfio_device {
+	struct device_header dev_hdr;
+
+	int fd;
+	struct vfio_device_info info;
+	struct vfio_irq_info irq_info;
+	struct vfio_region *regions;
+
+	char *name;
+	char *sysfs_path;
+
+	struct hlist_node list;
+
+	struct vfio_pci_device pci;
+};
+
+/* A VFIO group given on the command line and the devices set up from it */
+struct vfio_group {
+	unsigned long id; /* iommu_group number in sysfs */
+	int fd;
+	struct hlist_head devices;
+};
+
+int vfio_group_parser(const struct option *opt, const char *arg, int unset);
+int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
+		    struct vfio_region *region);
+void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region);
+int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *device);
+void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev);
+
+#endif /* KVM__VFIO_H */
diff --git a/vfio/core.c b/vfio/core.c
new file mode 100644
index 00000000..7e1ba789
--- /dev/null
+++ b/vfio/core.c
@@ -0,0 +1,458 @@
+#include "kvm/kvm.h"
+#include "kvm/vfio.h"
+
+#include <linux/list.h>
+
+#define VFIO_DEV_DIR "/dev/vfio"
+#define VFIO_DEV_NODE VFIO_DEV_DIR "/vfio"
+#define IOMMU_GROUP_DIR "/sys/kernel/iommu_groups"
+
+#define VFIO_PATH_MAX_LEN 16
+
+/* fd of the VFIO container (VFIO_DEV_NODE) that every group is attached to */
+static int vfio_container;
+
+/*
+ * Option callback for "--vfio-group": store one group number in the next free
+ * slot of kvm->cfg.vfio_group[] and update kvm->cfg.num_vfio_groups.
+ *
+ * Only the text before the first ',' in @arg is parsed. Once MAX_VFIO_GROUPS
+ * groups are recorded, further ones are ignored after a single warning.
+ * Returns 0, -ENOMEM if @arg can't be copied or -EINVAL if it is empty.
+ */
+int vfio_group_parser(const struct option *opt, const char *arg, int unset)
+{
+	char *cur, *buf;
+	static int idx = 0;
+	struct kvm *kvm = opt->ptr;
+	struct vfio_group *group;
+
+	if (idx >= MAX_VFIO_GROUPS) {
+		if (idx++ == MAX_VFIO_GROUPS)
+			pr_warning("Too many VFIO groups");
+		return 0;
+	}
+
+	buf = strdup(arg);
+	if (!buf)
+		return -ENOMEM;
+
+	/* strtok() yields NULL for an empty argument; keep it from strtoul() */
+	cur = strtok(buf, ",");
+	if (!cur) {
+		free(buf);
+		return -EINVAL;
+	}
+
+	group = &kvm->cfg.vfio_group[idx];
+	group->id = strtoul(cur, NULL, 0);
+
+	kvm->cfg.num_vfio_groups = ++idx;
+	free(buf);
+
+	return 0;
+}
+
+/*
+ * mmap() a device region and register the mapping with KVM as device memory
+ * at region->guest_phys_addr.
+ *
+ * Regions that can't be mmap'd are skipped and reported as success. Returns 0
+ * on success or a negative error code.
+ */
+int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
+		    struct vfio_region *region)
+{
+	void *base;
+	int ret, prot = 0;
+	/* KVM needs page-aligned regions */
+	u64 map_size = ALIGN(region->info.size, PAGE_SIZE);
+
+	/*
+	 * We don't want to mess about trapping config accesses, so require that
+	 * they can be mmap'd. Note that for PCI, this precludes the use of I/O
+	 * BARs in the guest (we will hide them from Configuration Space, which
+	 * is trapped).
+	 */
+	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) {
+		dev_info(vdev, "ignoring region %u, as it can't be mmap'd",
+			 region->info.index);
+		return 0;
+	}
+
+	if (region->info.flags & VFIO_REGION_INFO_FLAG_READ)
+		prot |= PROT_READ;
+	if (region->info.flags & VFIO_REGION_INFO_FLAG_WRITE)
+		prot |= PROT_WRITE;
+
+	base = mmap(NULL, region->info.size, prot, MAP_SHARED, vdev->fd,
+		    region->info.offset);
+	if (base == MAP_FAILED) {
+		ret = -errno;
+		dev_err(vdev, "failed to mmap region %u (0x%llx bytes)",
+			region->info.index, region->info.size);
+		return ret;
+	}
+	region->host_addr = base;
+
+	ret = kvm__register_dev_mem(kvm, region->guest_phys_addr, map_size,
+				    region->host_addr);
+	if (ret) {
+		dev_err(vdev, "failed to register region with KVM");
+		/* The mapping is of no use without the registration; drop it */
+		munmap(region->host_addr, region->info.size);
+		region->host_addr = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
+/* Undo the mmap() done by vfio_map_region(); does nothing if never mapped */
+void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region)
+{
+	if (!region->host_addr)
+		return;
+
+	munmap(region->host_addr, region->info.size);
+	region->host_addr = NULL;
+}
+
+/*
+ * Set up the device called @name, found in @dirpath, and add it to @group.
+ *
+ * Returns 0 when the device was assigned, and also when VFIO has no fd for
+ * it (the device is then skipped). Returns a negative error code on failure.
+ */
+static int vfio_configure_device(struct kvm *kvm, struct vfio_group *group,
+				 const char *dirpath, const char *name)
+{
+	u32 num_regions;
+	int ret = -ENOMEM;
+	char fullpath[PATH_MAX];
+	struct vfio_device *vdev;
+
+	snprintf(fullpath, PATH_MAX, "%s/%s", dirpath, name);
+
+	vdev = calloc(1, sizeof(*vdev));
+	if (!vdev)
+		return -ENOMEM;
+
+	vdev->name = strdup(name);
+	if (!vdev->name)
+		goto err_free_device;
+
+	vdev->sysfs_path = strndup(fullpath, PATH_MAX);
+	if (!vdev->sysfs_path)
+		goto err_free_name;
+
+	vdev->fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+	if (vdev->fd < 0) {
+		dev_err(vdev, "failed to get fd");
+
+		/* The device might be a bridge without an fd */
+		ret = 0;
+		goto err_free_path;
+	}
+
+	vdev->info.argsz = sizeof(vdev->info);
+	if (ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &vdev->info)) {
+		ret = -errno;
+		dev_err(vdev, "failed to get info");
+		goto err_close_device;
+	}
+
+	if (vdev->info.flags & VFIO_DEVICE_FLAGS_RESET &&
+	    ioctl(vdev->fd, VFIO_DEVICE_RESET) < 0)
+		dev_warn(vdev, "failed to reset device");
+
+	num_regions = vdev->info.num_regions;
+
+	vdev->regions = calloc(num_regions, sizeof(*vdev->regions));
+	if (!vdev->regions) {
+		ret = -ENOMEM;
+		goto err_close_device;
+	}
+
+	/* Now for the bus-specific initialization... */
+	if (vdev->info.flags & VFIO_DEVICE_FLAGS_PCI) {
+		ret = vfio_pci_setup_device(kvm, vdev);
+	} else {
+		dev_warn(vdev, "only vfio-pci is supported");
+		ret = -EINVAL;
+	}
+
+	if (ret)
+		goto err_free_regions;
+
+	dev_info(vdev, "assigned to device number 0x%x in group %lu",
+		 vdev->dev_hdr.dev_num, group->id);
+
+	hlist_add_head(&vdev->list, &group->devices);
+
+	return 0;
+
+err_free_regions:
+	free(vdev->regions);
+err_close_device:
+	close(vdev->fd);
+err_free_path:
+	free(vdev->sysfs_path);
+err_free_name:
+	free(vdev->name);
+err_free_device:
+	free(vdev);
+
+	return ret;
+}
+
+/*
+ * Walk the sysfs "devices" directory of every configured group and set up
+ * each device linked from it. Stops at the first error.
+ */
+static int vfio_configure_iommu_groups(struct kvm *kvm)
+{
+	int i, ret;
+
+	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+		DIR *dir;
+		struct dirent *dirent;
+		char dirpath[PATH_MAX];
+		struct vfio_group *group = &kvm->cfg.vfio_group[i];
+
+		snprintf(dirpath, PATH_MAX, IOMMU_GROUP_DIR "/%lu/devices",
+			 group->id);
+
+		dir = opendir(dirpath);
+		if (!dir) {
+			ret = -errno;
+			pr_err("Failed to open IOMMU group %s", dirpath);
+			return ret;
+		}
+
+		while ((dirent = readdir(dir))) {
+			if (dirent->d_type != DT_LNK)
+				continue;
+
+			ret = vfio_configure_device(kvm, group, dirpath,
+						    dirent->d_name);
+			if (ret) {
+				closedir(dir);
+				return ret;
+			}
+		}
+
+		if (closedir(dir))
+			pr_warning("Failed to close IOMMU group %s", dirpath);
+	}
+
+	return 0;
+}
+
+/*
+ * Pick the first IOMMU model the container reports support for, trying
+ * nesting type-1, then type-1 v2, then type-1. Returns -ENODEV if none.
+ */
+static int vfio_get_iommu_type(void)
+{
+	if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_NESTING_IOMMU))
+		return VFIO_TYPE1_NESTING_IOMMU;
+
+	if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
+		return VFIO_TYPE1v2_IOMMU;
+
+	if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
+		return VFIO_TYPE1_IOMMU;
+
+	return -ENODEV;
+}
+
+/* Memory bank callback: DMA-map one bank at its guest physical address */
+static int vfio_map_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
+{
+	int ret = 0;
+	struct vfio_iommu_type1_dma_map dma_map = {
+		.argsz = sizeof(dma_map),
+		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+		.vaddr = (unsigned long)bank->host_addr,
+		.iova = (u64)bank->guest_phys_addr,
+		.size = bank->size,
+	};
+
+	/* Map the guest memory for DMA (i.e. provide isolation) */
+	if (ioctl(vfio_container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
+		ret = -errno;
+		pr_err("Failed to map 0x%llx -> 0x%llx (%llu) for DMA",
+		       dma_map.iova, dma_map.vaddr, dma_map.size);
+	}
+
+	return ret;
+}
+
+/* Memory bank callback: remove the DMA mapping of one bank; always returns 0 */
+static int vfio_unmap_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
+{
+	struct vfio_iommu_type1_dma_unmap dma_unmap = {
+		.argsz = sizeof(dma_unmap),
+		.size = bank->size,
+		.iova = bank->guest_phys_addr,
+	};
+
+	ioctl(vfio_container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
+
+	return 0;
+}
+
+/*
+ * Open the group's node under VFIO_DEV_DIR, check that the group is viable
+ * and attach it to the container. Returns 0 or a negative error code.
+ */
+static int vfio_group_init(struct kvm *kvm, struct vfio_group *group)
+{
+	int ret;
+	char group_node[VFIO_PATH_MAX_LEN];
+	struct vfio_group_status group_status = {
+		.argsz = sizeof(group_status),
+	};
+
+	INIT_HLIST_HEAD(&group->devices);
+
+	snprintf(group_node, VFIO_PATH_MAX_LEN, VFIO_DEV_DIR "/%lu",
+		 group->id);
+
+	group->fd = open(group_node, O_RDWR);
+	if (group->fd == -1) {
+		ret = -errno;
+		pr_err("Failed to open IOMMU group %s", group_node);
+		return ret;
+	}
+
+	if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) {
+		ret = -errno;
+		pr_err("Failed to determine status of IOMMU group %s",
+		       group_node);
+		return ret;
+	}
+
+	if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+		pr_err("IOMMU group %s is not viable", group_node);
+		return -EINVAL;
+	}
+
+	if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &vfio_container)) {
+		ret = -errno;
+		pr_err("Failed to add IOMMU group %s to VFIO container",
+		       group_node);
+		return ret;
+	}
+
+	return 0;
+}
+
+/* Tear down and free every device of @group, then release the group fd */
+static void vfio_group_exit(struct kvm *kvm, struct vfio_group *group)
+{
+	int fd = group->fd;
+	struct hlist_node *next;
+	struct vfio_device *vdev;
+
+	hlist_for_each_entry_safe(vdev, next, &group->devices, list) {
+		if (vdev->info.flags & VFIO_DEVICE_FLAGS_PCI)
+			vfio_pci_teardown_device(kvm, vdev);
+
+		close(vdev->fd);
+
+		free(vdev->regions);
+		free(vdev->name);
+		free(vdev->sysfs_path);
+		free(vdev);
+	}
+
+	ioctl(fd, VFIO_GROUP_UNSET_CONTAINER);
+	close(fd);
+}
+
+/*
+ * Open the VFIO container, attach all configured groups, select an IOMMU
+ * model and map guest RAM for DMA. Returns 0 or a negative error code.
+ */
+static int vfio_container_init(struct kvm *kvm)
+{
+	int api, i, ret, iommu_type;
+
+	/* Create a container for our IOMMU groups */
+	vfio_container = open(VFIO_DEV_NODE, O_RDWR);
+	if (vfio_container == -1) {
+		ret = -errno;
+		pr_err("Failed to open %s", VFIO_DEV_NODE);
+		return ret;
+	}
+
+	api = ioctl(vfio_container, VFIO_GET_API_VERSION);
+	if (api != VFIO_API_VERSION) {
+		pr_err("Unknown VFIO API version %d", api);
+		return -ENODEV;
+	}
+
+	iommu_type = vfio_get_iommu_type();
+	if (iommu_type < 0) {
+		pr_err("VFIO type-1 IOMMU not supported on this platform");
+		return iommu_type;
+	}
+
+	/* Sanity check our groups and add them to the container */
+	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+		ret = vfio_group_init(kvm, &kvm->cfg.vfio_group[i]);
+		if (ret)
+			return ret;
+	}
+
+	/* Finalise the container */
+	if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) {
+		ret = -errno;
+		pr_err("Failed to set IOMMU type %d for VFIO container",
+		       iommu_type);
+		return ret;
+	} else {
+		pr_info("Using IOMMU type %d for VFIO container", iommu_type);
+	}
+
+	return kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_map_mem_bank,
+				      NULL);
+}
+
+/* Init hook: set up the container and devices if any VFIO group was given */
+static int vfio__init(struct kvm *kvm)
+{
+	int ret;
+
+	if (!kvm->cfg.num_vfio_groups)
+		return 0;
+
+	ret = vfio_container_init(kvm);
+	if (ret)
+		return ret;
+
+	ret = vfio_configure_iommu_groups(kvm);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+dev_base_init(vfio__init);
+
+/* Exit hook: release all groups, unmap guest RAM and close the container */
+static int vfio__exit(struct kvm *kvm)
+{
+	int i;
+
+	if (!kvm->cfg.num_vfio_groups)
+		return 0;
+
+	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i)
+		vfio_group_exit(kvm, &kvm->cfg.vfio_group[i]);
+
+	kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_unmap_mem_bank, NULL);
+	return close(vfio_container);
+}
+dev_base_exit(vfio__exit);
diff --git a/vfio/pci.c b/vfio/pci.c
new file mode 100644
index 00000000..aca43431
--- /dev/null
+++ b/vfio/pci.c
@@ -0,0 +1,403 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/vfio.h"
+
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+
+/* Wrapper around UAPI vfio_irq_set, followed by its single eventfd payload */
+struct vfio_irq_eventfd {
+	struct vfio_irq_set irq;
+	int fd;
+};
+
+/*
+ * Config read callback. The access is forwarded to the device and the result
+ * discarded; @data is not filled in here.
+ */
+static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			      u8 offset, void *data, int sz)
+{
+	struct vfio_region_info *info;
+	struct vfio_pci_device *pdev;
+	struct vfio_device *vdev;
+	char base[sz];
+
+	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
+	vdev = container_of(pdev, struct vfio_device, pci);
+	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
+	/* Dummy read in case of side-effects */
+	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
+		dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
+			 sz, offset);
+}
+
+/*
+ * Config write callback: forward the write to the device, then read the
+ * register back into the header copy at @pci_hdr.
+ */
+static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			       u8 offset, void *data, int sz)
+{
+	struct vfio_region_info *info;
+	struct vfio_pci_device *pdev;
+	struct vfio_device *vdev;
+	char *base = (char *)pci_hdr;
+
+	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
+	vdev = container_of(pdev, struct vfio_device, pci);
+	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
+	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
+		dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
+			 sz, offset);
+
+	if (pread(vdev->fd, base + offset, sz, info->offset + offset) != sz)
+		dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
+			 sz, offset);
+}
+
+/* Clear the capability list bit and pointer in the header copy */
+static int vfio_pci_parse_caps(struct vfio_device *vdev)
+{
+	struct vfio_pci_device *pdev = &vdev->pci;
+
+	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
+		return 0;
+
+	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
+	pdev->hdr.capabilities = 0;
+
+	/* TODO: install virtual capabilities */
+
+	return 0;
+}
+
+/*
+ * Read the device's Configuration Space header into vdev->pci.hdr. Only
+ * normal (PCI_HEADER_TYPE_NORMAL) headers are accepted. Returns 0 or a
+ * negative error code.
+ */
+static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
+{
+	struct vfio_region_info *info;
+	ssize_t sz = PCI_DEV_CFG_SIZE;
+	struct vfio_pci_device *pdev = &vdev->pci;
+
+	/* regions[] has num_regions entries, so the index must be below that */
+	if (vdev->info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX) {
+		dev_err(vdev, "Config Space not found");
+		return -ENODEV;
+	}
+
+	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	*info = (struct vfio_region_info) {
+		.argsz = sizeof(*info),
+		.index = VFIO_PCI_CONFIG_REGION_INDEX,
+	};
+
+	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
+	if (!info->size) {
+		dev_err(vdev, "Config Space has size zero?!");
+		return -EINVAL;
+	}
+
+	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
+		dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
+		return -EIO;
+	}
+
+	/* Strip bit 7, that indicates multifunction */
+	pdev->hdr.header_type &= 0x7f;
+
+	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
+		dev_err(vdev, "unsupported header type %u",
+			pdev->hdr.header_type);
+		return -EOPNOTSUPP;
+	}
+
+	vfio_pci_parse_caps(vdev);
+
+	return 0;
+}
+
+/*
+ * Rewrite the header copy to match what was set up for the guest (memory
+ * BARs at the allocated addresses; no I/O, cardbus or expansion ROM), write
+ * it to the device and install the config access callbacks.
+ */
+static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
+{
+	int i;
+	ssize_t hdr_sz;
+	struct vfio_region_info *info;
+	struct vfio_pci_device *pdev = &vdev->pci;
+
+	/* Enable exclusively MMIO and bus mastering */
+	pdev->hdr.command &= ~PCI_COMMAND_IO;
+	pdev->hdr.command |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
+
+	/* Initialise the BARs */
+	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+		struct vfio_region *region = &vdev->regions[i];
+		u64 base = region->guest_phys_addr;
+
+		if (!base)
+			continue;
+
+		pdev->hdr.bar_size[i] = region->info.size;
+
+		/* Construct a fake reg to match what we've mapped. */
+		pdev->hdr.bar[i] = (base & PCI_BASE_ADDRESS_MEM_MASK) |
+					PCI_BASE_ADDRESS_SPACE_MEMORY |
+					PCI_BASE_ADDRESS_MEM_TYPE_32;
+	}
+
+	/* I really can't be bothered to support cardbus. */
+	pdev->hdr.card_bus = 0;
+
+	/*
+	 * Nuke the expansion ROM for now. If we want to do this properly,
+	 * we need to save its size somewhere and map into the guest.
+	 */
+	pdev->hdr.exp_rom_bar = 0;
+
+	/* Install our fake Configuration Space, without the caps */
+	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	hdr_sz = offsetof(struct pci_device_header, msix);
+	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
+		dev_err(vdev, "failed to write %zd bytes to Config Space",
+			hdr_sz);
+		return -EIO;
+	}
+
+	/* TODO: install virtual capability */
+
+	/* Register callbacks for cfg accesses */
+	pdev->hdr.cfg_ops = (struct pci_config_operations) {
+		.read = vfio_pci_cfg_read,
+		.write = vfio_pci_cfg_write,
+	};
+
+	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
+
+	return 0;
+}
+
+/*
+ * Parse Configuration Space, then query every BAR region, allocate guest MMIO
+ * space for those with a non-zero size and map them. Returns 0 or a negative
+ * error code.
+ */
+static int vfio_pci_configure_dev_regions(struct kvm *kvm,
+					  struct vfio_device *vdev)
+{
+	u32 i;
+	int ret;
+	size_t map_size;
+
+	ret = vfio_pci_parse_cfg_space(vdev);
+	if (ret)
+		return ret;
+
+	/* First of all, map the BARs directly into the guest */
+	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+		struct vfio_region *region = &vdev->regions[i];
+
+		if (i >= vdev->info.num_regions)
+			break;
+
+		region->info = (struct vfio_region_info) {
+			.argsz = sizeof(region->info),
+			.index = i,
+		};
+
+		ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO,
+			    &region->info);
+		if (ret) {
+			ret = -errno;
+			dev_err(vdev, "cannot get info for region %u", i);
+			return ret;
+		}
+
+		/* Ignore invalid or unimplemented regions */
+		if (!region->info.size)
+			continue;
+
+		/* Grab some MMIO space in the guest */
+		map_size = ALIGN(region->info.size, PAGE_SIZE);
+		region->guest_phys_addr = pci_get_io_space_block(map_size);
+
+		/*
+		 * Map the BARs into the guest. We'll later need to update
+		 * configuration space to reflect our allocation.
+		 */
+		ret = vfio_map_region(kvm, vdev, region);
+		if (ret)
+			return ret;
+	}
+
+	/* We've configured the BARs, fake up a Configuration Space */
+	return vfio_pci_fixup_cfg_space(vdev);
+}
+
+/*
+ * Connect the device's INTx interrupt to @gsi through a trigger eventfd and
+ * an unmask eventfd. Returns 0 on success and non-zero on failure.
+ */
+static int vfio_pci_init_irqfd(struct kvm *kvm, int devfd, int gsi)
+{
+	int ret;
+	int trigger_fd, unmask_fd;
+	struct vfio_irq_eventfd trigger;
+	struct vfio_irq_eventfd unmask;
+
+	/*
+	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
+	 * signals an interrupt from host to guest, and unmask_fd signals the
+	 * deassertion of the line from guest to host.
+	 */
+	trigger_fd = eventfd(0, 0);
+	if (trigger_fd < 0) {
+		pr_err("Failed to create trigger eventfd");
+		return trigger_fd;
+	}
+
+	unmask_fd = eventfd(0, 0);
+	if (unmask_fd < 0) {
+		pr_err("Failed to create unmask eventfd");
+		close(trigger_fd);
+		return unmask_fd;
+	}
+
+	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
+	if (ret)
+		goto err_close;
+
+	trigger.irq = (struct vfio_irq_set) {
+		.argsz = sizeof(trigger),
+		.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+		.index = VFIO_PCI_INTX_IRQ_INDEX,
+		.start = 0,
+		.count = 1,
+	};
+	trigger.fd = trigger_fd;
+
+	ret = ioctl(devfd, VFIO_DEVICE_SET_IRQS, &trigger);
+	if (ret < 0) {
+		pr_err("Failed to setup VFIO IRQ");
+		goto err_delete_line;
+	}
+
+	unmask.irq = (struct vfio_irq_set) {
+		.argsz = sizeof(unmask),
+		.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
+		.index = VFIO_PCI_INTX_IRQ_INDEX,
+		.start = 0,
+		.count = 1,
+	};
+	unmask.fd = unmask_fd;
+
+	ret = ioctl(devfd, VFIO_DEVICE_SET_IRQS, &unmask);
+	if (ret < 0) {
+		pr_err("Failed to setup unmask IRQ");
+		goto err_remove_event;
+	}
+
+	return 0;
+
+err_remove_event:
+	/* DATA_NONE|ACTION_TRIGGER with a count of 0 disables the IRQ index */
+	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+	trigger.irq.count = 0;
+	ioctl(devfd, VFIO_DEVICE_SET_IRQS, &trigger);
+
+err_delete_line:
+	irq__del_irqfd(kvm, gsi, trigger_fd);
+
+err_close:
+	close(trigger_fd);
+	close(unmask_fd);
+	return ret;
+}
+
+/*
+ * Check that the device's INTx interrupt can be signalled through an eventfd
+ * and is automasked, then wire it up. Returns 0 or a negative error code.
+ */
+static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
+{
+	struct vfio_pci_device *pdev = &vdev->pci;
+	int gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
+
+	vdev->irq_info = (struct vfio_irq_info) {
+		.argsz = sizeof(vdev->irq_info),
+		.index = VFIO_PCI_INTX_IRQ_INDEX,
+	};
+
+	ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &vdev->irq_info);
+	if (vdev->irq_info.count == 0) {
+		dev_err(vdev, "no interrupt found by VFIO");
+		return -ENODEV;
+	}
+
+	if (!(vdev->irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+		dev_err(vdev, "interrupt not EVENTFD capable");
+		return -EINVAL;
+	}
+
+	/* TODO: add MSI support */
+	dev_info(vdev, "MSI-X not available, falling back to INTx");
+
+	if (!(vdev->irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
+		dev_err(vdev, "INTx interrupt not AUTOMASKED");
+		return -EINVAL;
+	}
+
+	return vfio_pci_init_irqfd(kvm, vdev->fd, gsi);
+}
+
+/*
+ * PCI-specific part of device setup: map the BARs, register the device with
+ * bus type DEVICE_BUS_PCI and hook up its interrupt. Returns 0 or a negative
+ * error code.
+ */
+int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
+{
+	int ret;
+
+	ret = vfio_pci_configure_dev_regions(kvm, vdev);
+	if (ret) {
+		dev_err(vdev, "failed to configure regions");
+		return ret;
+	}
+
+	vdev->dev_hdr = (struct device_header) {
+		.bus_type = DEVICE_BUS_PCI,
+		.data = &vdev->pci.hdr,
+	};
+
+	ret = device__register(&vdev->dev_hdr);
+	if (ret) {
+		dev_err(vdev, "failed to register VFIO device");
+		return ret;
+	}
+
+	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
+	if (ret) {
+		dev_err(vdev, "failed to configure IRQs");
+		return ret;
+	}
+
+	return 0;
+}
+
+/* Unmap every region of @vdev and unregister the device */
+void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
+{
+	size_t i;
+
+	for (i = 0; i < vdev->info.num_regions; i++)
+		vfio_unmap_region(kvm, &vdev->regions[i]);
+
+	device__unregister(&vdev->dev_hdr);
+}

[v2,kvmtool,06/10] Add PCI device passthrough using VFIO

Commit Message

Comments

Patch