diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -15,6 +15,7 @@ LIBKVM += lib/sparsebit.c
LIBKVM += lib/test_util.c
LIBKVM += lib/ucall_common.c
LIBKVM += lib/userfaultfd_util.c
+LIBKVM += lib/vfio_pci_util.c
LIBKVM_STRING += lib/string_override.c
@@ -133,6 +134,7 @@ TEST_GEN_PROGS_x86 += mmu_stress_test
TEST_GEN_PROGS_x86 += rseq_test
TEST_GEN_PROGS_x86 += set_memory_region_test
TEST_GEN_PROGS_x86 += steal_time
+TEST_GEN_PROGS_x86 += vfio_irq_test
TEST_GEN_PROGS_x86 += kvm_binary_stats_test
TEST_GEN_PROGS_x86 += system_counter_offset_test
TEST_GEN_PROGS_x86 += pre_fault_memory_test
diff --git a/tools/testing/selftests/kvm/include/vfio_pci_util.h b/tools/testing/selftests/kvm/include/vfio_pci_util.h
new file mode 100644
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/vfio_pci_util.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_VFIO_PCI_UTIL_H
+#define SELFTEST_KVM_VFIO_PCI_UTIL_H
+
+#include <linux/pci_regs.h>
+#include <linux/vfio.h>
+
+#include "kvm_util.h"
+#include "test_util.h"
+
+struct vfio_pci_dev {
+ int fd;
+ int group_fd;
+ int container_fd;
+};
+
+struct vfio_pci_dev *__vfio_pci_init(const char *bdf, unsigned long iommu_type);
+void vfio_pci_free(struct vfio_pci_dev *dev);
+
+static inline struct vfio_pci_dev *vfio_pci_init(const char *bdf)
+{
+ return __vfio_pci_init(bdf, VFIO_TYPE1v2_IOMMU);
+}
+
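+/*
+ * __vfio_ioctl() returns the raw ioctl() result so callers can handle errors
+ * themselves; vfio_ioctl() asserts success.  This mirrors the
+ * __kvm_ioctl()/kvm_ioctl() convention used throughout the KVM selftests.
+ */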
+#define __vfio_ioctl(vfio_fd, cmd, arg) \
+({ \
+ __kvm_ioctl(vfio_fd, cmd, arg); \
+})
+
+#define vfio_ioctl(vfio_fd, cmd, arg) \
+({ \
+ int ret = __vfio_ioctl(vfio_fd, cmd, arg); \
+ \
+ TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \
+})
+
+static inline uint32_t vfio_pci_get_nr_irqs(struct vfio_pci_dev *dev,
+ uint32_t irq_type)
+{
+ struct vfio_irq_info irq_info = {
+ .argsz = sizeof(struct vfio_irq_info),
+ .index = irq_type,
+ };
+
+ vfio_ioctl(dev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+
+ TEST_ASSERT(irq_info.flags & VFIO_IRQ_INFO_EVENTFD,
+ "eventfd signalling unsupported by IRQ type '%u'", irq_type);
+ return irq_info.count;
+}
+
+static inline uint32_t vfio_pci_get_nr_msi_irqs(struct vfio_pci_dev *dev)
+{
+ return vfio_pci_get_nr_irqs(dev, VFIO_PCI_MSI_IRQ_INDEX);
+}
+
+static inline uint32_t vfio_pci_get_nr_msix_irqs(struct vfio_pci_dev *dev)
+{
+ return vfio_pci_get_nr_irqs(dev, VFIO_PCI_MSIX_IRQ_INDEX);
+}
+
+static inline void __vfio_pci_irq_eventfd(struct vfio_pci_dev *dev, int eventfd,
+ uint32_t irq_type, uint32_t set)
+{
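+	/*
+	 * struct vfio_irq_set ends in a flexible array; bundle a single
+	 * trailing u32 to hold the eventfd.  The payload is consumed when
+	 * 'set' is VFIO_IRQ_SET_DATA_EVENTFD, and ignored for
+	 * VFIO_IRQ_SET_DATA_NONE.
+	 */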
+ struct {
+ struct vfio_irq_set vfio;
+ uint32_t eventfd;
+ } buffer = {};
+
+ buffer.vfio.argsz = sizeof(buffer);
+ buffer.vfio.flags = set | VFIO_IRQ_SET_ACTION_TRIGGER;
+ buffer.vfio.index = irq_type;
+ buffer.vfio.count = 1;
+ buffer.eventfd = eventfd;
+
+ vfio_ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, &buffer.vfio);
+}
+
+static inline void vfio_pci_assign_irq_eventfd(struct vfio_pci_dev *dev,
+ int eventfd, uint32_t irq_type)
+{
+ __vfio_pci_irq_eventfd(dev, eventfd, irq_type, VFIO_IRQ_SET_DATA_EVENTFD);
+}
+
+static inline void vfio_pci_assign_msix(struct vfio_pci_dev *dev, int eventfd)
+{
+ vfio_pci_assign_irq_eventfd(dev, eventfd, VFIO_PCI_MSIX_IRQ_INDEX);
+}
+
+static inline void vfio_pci_release_irq_eventfds(struct vfio_pci_dev *dev,
+ uint32_t irq_type)
+{
+ struct vfio_irq_set vfio = {
+ .argsz = sizeof(struct vfio_irq_set),
+ .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
+ .index = irq_type,
+ .count = 0,
+ };
+
+ vfio_ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, &vfio);
+}
+
+static inline void vfio_pci_release_msix(struct vfio_pci_dev *dev)
+{
+ vfio_pci_release_irq_eventfds(dev, VFIO_PCI_MSIX_IRQ_INDEX);
+}
+
+static inline void vfio_pci_send_irq_eventfd(struct vfio_pci_dev *dev,
+ int eventfd, uint32_t irq_type)
+{
+ __vfio_pci_irq_eventfd(dev, eventfd, irq_type, VFIO_IRQ_SET_DATA_NONE);
+}
+
+static inline void vfio_pci_send_msix(struct vfio_pci_dev *dev, int eventfd)
+{
+ vfio_pci_send_irq_eventfd(dev, eventfd, VFIO_PCI_MSIX_IRQ_INDEX);
+}
+
+void *vfio_pci_map_bar(struct vfio_pci_dev *dev, unsigned int bar_idx,
+ uint64_t *size);
+
+void vfio_pci_read_config_data(struct vfio_pci_dev *dev, size_t offset,
+ size_t size, void *data);
+
+static inline uint16_t vfio_pci_config_read_u16(struct vfio_pci_dev *dev,
+ size_t offset)
+{
+ uint16_t val;
+
+ vfio_pci_read_config_data(dev, offset, sizeof(val), &val);
+ return le16toh(val);
+}
+
+static inline uint16_t vfio_pci_get_vendor_id(struct vfio_pci_dev *dev)
+{
+ return vfio_pci_config_read_u16(dev, PCI_VENDOR_ID);
+}
+
+static inline uint16_t vfio_pci_get_device_id(struct vfio_pci_dev *dev)
+{
+ return vfio_pci_config_read_u16(dev, PCI_DEVICE_ID);
+}
+
+#endif /* SELFTEST_KVM_VFIO_PCI_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -19,6 +19,27 @@
#include "kvm_util.h"
#include "ucall_common.h"
+
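+/*
+ * Minimal MMIO accessors.  Note, these are plain volatile accesses; unlike
+ * the kernel's {read,write}{l,q}(), they provide no memory barriers and no
+ * byte swapping, which suffices for the little-endian, strongly-ordered x86
+ * environments targeted by the selftests.
+ */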
+static inline void writel(uint32_t val, volatile void *addr)
+{
+ *(volatile uint32_t *)addr = val;
+}
+
+static inline uint32_t readl(volatile void *addr)
+{
+ return *(volatile uint32_t *)addr;
+}
+
+static inline void writeq(uint64_t val, volatile void *addr)
+{
+ *(volatile uint64_t *)addr = val;
+}
+
+static inline uint64_t readq(volatile void *addr)
+{
+ return *(volatile uint64_t *)addr;
+}
+
extern bool host_cpu_is_intel;
extern bool host_cpu_is_amd;
extern uint64_t guest_tsc_khz;
diff --git a/tools/testing/selftests/kvm/lib/vfio_pci_util.c b/tools/testing/selftests/kvm/lib/vfio_pci_util.c
new file mode 100644
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/vfio_pci_util.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <poll.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <endian.h>
+#include <sys/ioctl.h>
+#include <linux/mman.h>
+#include <asm/barrier.h>
+#include <sys/eventfd.h>
+#include <linux/limits.h>
+
+#include <linux/vfio.h>
+#include <linux/pci_regs.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "vfio_pci_util.h"
+
+#define VFIO_DEV_PATH "/dev/vfio/vfio"
+#define PCI_SYSFS_PATH "/sys/bus/pci/devices/"
+
+void *vfio_pci_map_bar(struct vfio_pci_dev *dev, unsigned int bar_idx,
+ uint64_t *size)
+{
+ struct vfio_region_info info = {
+ .argsz = sizeof(struct vfio_region_info),
+ .index = bar_idx,
+ };
+ int fd = dev->fd;
+ void *bar;
+ int prot;
+
+	TEST_ASSERT(bar_idx <= VFIO_PCI_BAR5_REGION_INDEX,
+		    "Invalid BAR index: %u", bar_idx);
+
+	/* Currently, only BARs that can be mmap()'d are supported. */
+	vfio_ioctl(fd, VFIO_DEVICE_GET_REGION_INFO, &info);
+	TEST_ASSERT(info.flags & VFIO_REGION_INFO_FLAG_MMAP,
+		    "BAR%u doesn't support mmap()", bar_idx);
+
+	TEST_ASSERT(info.flags & VFIO_REGION_INFO_FLAG_READ,
+		    "BAR%u doesn't support read", bar_idx);
+
+	prot = PROT_READ;
+	if (info.flags & VFIO_REGION_INFO_FLAG_WRITE)
+		prot |= PROT_WRITE;
+
+	bar = mmap(NULL, info.size, prot, MAP_FILE | MAP_SHARED, fd, info.offset);
+	TEST_ASSERT(bar != MAP_FAILED, "mmap(BAR%u) failed", bar_idx);
+
+ *size = info.size;
+ return bar;
+}
+
+/*
+ * Read data from the PCI config space.
+ *
+ * Input Args:
+ *   dev:    Pointer to struct vfio_pci_dev
+ *   offset: The config space offset to read from (e.g. PCI_VENDOR_ID)
+ *   size:   The number of bytes to read (can span one or more fields)
+ *   data:   Pointer to the buffer into which the data is copied
+ *
+ * The data returned is in little-endian format, per the PCI specification.
+ */
+void vfio_pci_read_config_data(struct vfio_pci_dev *dev, size_t offset,
+ size_t size, void *data)
+{
+ struct vfio_region_info info = {
+ .argsz = sizeof(struct vfio_region_info),
+ .index = VFIO_PCI_CONFIG_REGION_INDEX,
+ };
+ int ret;
+
+ vfio_ioctl(dev->fd, VFIO_DEVICE_GET_REGION_INFO, &info);
+
+	TEST_ASSERT(offset + size <= PCI_CFG_SPACE_EXP_SIZE,
+		    "Requested offset (0x%zx) + size (%zu) is out of bounds (%u)",
+		    offset, size, PCI_CFG_SPACE_EXP_SIZE);
+
+	ret = pread(dev->fd, data, size, info.offset + offset);
+	TEST_ASSERT(ret == size, "Failed to read the PCI config at offset: 0x%zx", offset);
+}
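+
+/*
+ * Example usage (illustrative sketch): read the 16-bit Vendor ID, which is
+ * returned in PCI's little-endian byte order:
+ *
+ *	uint16_t vendor;
+ *
+ *	vfio_pci_read_config_data(dev, PCI_VENDOR_ID, sizeof(vendor), &vendor);
+ *	vendor = le16toh(vendor);
+ *
+ * vfio_pci_config_read_u16() in vfio_pci_util.h wraps exactly this pattern.
+ */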
+
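+/*
+ * Derive the IOMMU group by resolving the device's iommu_group symlink in
+ * sysfs, e.g. /sys/bus/pci/devices/DDDD:BB:DD.F/iommu_group =>
+ * ../../../kernel/iommu_groups/<group>, and parsing the basename.
+ */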
+static unsigned int vfio_pci_get_group_from_dev(const char *bdf)
+{
+ char dev_iommu_group_path[PATH_MAX] = {0};
+ unsigned int pci_dev_sysfs_path_len;
+ char *pci_dev_sysfs_path;
+ unsigned int group;
+ int ret;
+
+ pci_dev_sysfs_path_len = strlen(PCI_SYSFS_PATH) + strlen("DDDD:BB:DD.F/iommu_group") + 1;
+
+ pci_dev_sysfs_path = calloc(1, pci_dev_sysfs_path_len);
+ TEST_ASSERT(pci_dev_sysfs_path, "Insufficient memory for pci dev sysfs path");
+
+ snprintf(pci_dev_sysfs_path, pci_dev_sysfs_path_len,
+ "%s%s/iommu_group", PCI_SYSFS_PATH, bdf);
+
+	/* Note, readlink() doesn't NUL-terminate; rely on the zeroed buffer. */
+	ret = readlink(pci_dev_sysfs_path, dev_iommu_group_path,
+		       sizeof(dev_iommu_group_path) - 1);
+	TEST_ASSERT(ret != -1, "Failed to get IOMMU group for device: %s", bdf);
+
+ ret = sscanf(basename(dev_iommu_group_path), "%u", &group);
+ TEST_ASSERT(ret == 1, "Failed to get IOMMU group for device: %s", bdf);
+
+ free(pci_dev_sysfs_path);
+ return group;
+}
+
+static void vfio_pci_setup_group(struct vfio_pci_dev *dev, const char *bdf)
+{
+ char group_path[32];
+ struct vfio_group_status group_status = {
+ .argsz = sizeof(group_status),
+ };
+ int group;
+
+ group = vfio_pci_get_group_from_dev(bdf);
+ snprintf(group_path, sizeof(group_path), "/dev/vfio/%d", group);
+
+ dev->group_fd = open(group_path, O_RDWR);
+	TEST_ASSERT(dev->group_fd >= 0,
+		    "Failed to open VFIO group %d for device: %s", group, bdf);
+
+	vfio_ioctl(dev->group_fd, VFIO_GROUP_GET_STATUS, &group_status);
+ TEST_ASSERT(group_status.flags & VFIO_GROUP_FLAGS_VIABLE,
+ "Group %d for device %s not viable. Ensure all devices are bound to vfio-pci",
+ group, bdf);
+
+ vfio_ioctl(dev->group_fd, VFIO_GROUP_SET_CONTAINER, &dev->container_fd);
+}
+
+static void vfio_pci_set_iommu(struct vfio_pci_dev *dev, unsigned long iommu_type)
+{
+	TEST_ASSERT_EQ(__vfio_ioctl(dev->container_fd, VFIO_CHECK_EXTENSION,
+				    (void *)iommu_type), 1);
+ vfio_ioctl(dev->container_fd, VFIO_SET_IOMMU, (void *)iommu_type);
+}
+
+static void vfio_pci_open_device(struct vfio_pci_dev *dev, const char *bdf)
+{
+ struct vfio_device_info dev_info = {
+ .argsz = sizeof(dev_info),
+ };
+
+	dev->fd = __vfio_ioctl(dev->group_fd, VFIO_GROUP_GET_DEVICE_FD, bdf);
+	TEST_ASSERT(dev->fd >= 0, "Failed to get the device fd for device: %s", bdf);
+
+ vfio_ioctl(dev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
+
+	TEST_ASSERT(!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET),
+		    "Unexpected reset support; this test expects a VF, which VFIO can't reset");
+
+	/* Require all BAR regions plus the config space, at a minimum. */
+	TEST_ASSERT(dev_info.num_regions > VFIO_PCI_CONFIG_REGION_INDEX,
+		    "Required number of regions not supported (%u) for device: %s",
+		    dev_info.num_regions, bdf);
+
+	/* The MSI-X IRQ index must be valid, i.e. strictly less than num_irqs. */
+	TEST_ASSERT(dev_info.num_irqs > VFIO_PCI_MSIX_IRQ_INDEX,
+		    "MSI-X IRQs (%u) not supported for device: %s",
+		    dev_info.num_irqs, bdf);
+}
+
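+/*
+ * Initialize the device via the legacy VFIO container flow: open the
+ * container, attach the device's IOMMU group to the container, enable the
+ * requested IOMMU model, and only then grab the device fd from the group.
+ */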
+/* bdf: PCI device's Domain:Bus:Device:Function in "DDDD:BB:DD.F" format */
+struct vfio_pci_dev *__vfio_pci_init(const char *bdf, unsigned long iommu_type)
+{
+ struct vfio_pci_dev *dev;
+ int vfio_version;
+
+	TEST_ASSERT(bdf, "PCI BDF not supplied");
+
+ dev = calloc(1, sizeof(*dev));
+ TEST_ASSERT(dev, "Insufficient memory for vfio_pci_dev");
+
+ dev->container_fd = open_path_or_exit(VFIO_DEV_PATH, O_RDWR);
+
+ vfio_version = __vfio_ioctl(dev->container_fd, VFIO_GET_API_VERSION, NULL);
+ TEST_REQUIRE(vfio_version == VFIO_API_VERSION);
+
+ vfio_pci_setup_group(dev, bdf);
+ vfio_pci_set_iommu(dev, iommu_type);
+ vfio_pci_open_device(dev, bdf);
+
+ return dev;
+}
+
+void vfio_pci_free(struct vfio_pci_dev *dev)
+{
+ close(dev->fd);
+ vfio_ioctl(dev->group_fd, VFIO_GROUP_UNSET_CONTAINER, NULL);
+
+ close(dev->group_fd);
+ close(dev->container_fd);
+
+ free(dev);
+}
diff --git a/tools/testing/selftests/kvm/mercury_device.h b/tools/testing/selftests/kvm/mercury_device.h
new file mode 100644
--- /dev/null
+++ b/tools/testing/selftests/kvm/mercury_device.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_MERCURY_DEVICE_H
+#define SELFTEST_KVM_MERCURY_DEVICE_H
+
+#include "processor.h"
+#include "test_util.h"
+
+#define MERCURY_VENDOR_ID 0x1ae0
+#define MERCURY_DEVICE_ID 0x0050
+
+/* The mercury device's registers start at the below offset within BAR0 */
+#define MERCURY_BASE_OFFSET (768 * 1024)
+
+#define MERCURY_MSIX_VECTOR 0
+#define MERCURY_MSIX_COUNT 1 /* Currently, only 1 vector is assigned to mercury */
+
+#define MERCURY_DMA_MAX_BUF_SIZE_BYTES SZ_8K
+#define MERCURY_DMA_MEMCPY_MAX_BUF_SIZE_BYTES SZ_1G
+
+/* Mercury device accepts the DMA size as double-word (4-bytes) */
+#define MERCURY_DMA_SIZE_STRIDE 4
+
+#define MERCURY_ABI_VERSION 0
+
+/* Register offsets relative to MERCURY_BASE_OFFSET. */
+/* Unless otherwise specified, all registers are 32 bits. */
+#define MERCURY_REG_VERSION 0x0 /* Read-only */
+#define MERCURY_REG_COMMAND 0x04 /* Write-only */
+#define MERCURY_REG_STATUS 0x08 /* Read-only, 64-bit register */
+#define MERCURY_REG_DMA_SRC_ADDR 0x10 /* Read/Write, 64-bit register */
+#define MERCURY_REG_DMA_DEST_ADDR 0x18 /* Read/Write, 64-bit register */
+#define MERCURY_REG_DMA_DW_LEN 0x20 /* Read/Write */
+#define MERCURY_REG_SCRATCH_REG0 0x24 /* Read/Write */
+#define MERCURY_REG_SCRATCH_REG1 0x1000 /* Read/Write */
+
+/* Bit positions of the STATUS register */
+enum mercury_status_bit {
+ MERCURY_STATUS_BIT_READY = 0,
+ MERCURY_STATUS_BIT_DMA_FROM_DEV_COMPLETE = 1,
+ MERCURY_STATUS_BIT_DMA_TO_DEV_COMPLETE = 2,
+ MERCURY_STATUS_BIT_DMA_MEMCPY_COMPLETE = 3,
+ MERCURY_STATUS_BIT_FORCE_INTERRUPT = 4,
+ MERCURY_STATUS_BIT_INVAL_DMA_SIZE = 5,
+ MERCURY_STATUS_BIT_DMA_ERROR = 6,
+ MERCURY_STATUS_BIT_CMD_ERR_INVAL_CMD = 7,
+ MERCURY_STATUS_BIT_CMD_ERR_DEV_NOT_READY = 8,
+};
+
+/* List of mercury commands that can be written into MERCURY_REG_COMMAND register */
+enum mercury_command {
+ MERCURY_COMMAND_RESET = 0,
+ MERCURY_COMMAND_TRIGGER_DMA_FROM_DEV = 1,
+ MERCURY_COMMAND_TRIGGER_DMA_TO_DEV = 2,
+ MERCURY_COMMAND_TRIGGER_DMA_MEMCPY = 3,
+ MERCURY_COMMAND_FORCE_INTERRUPT = 4,
+};
+
+static inline void mercury_write_reg64(void *bar0, uint32_t reg_off, uint64_t val)
+{
+ void *reg = bar0 + MERCURY_BASE_OFFSET + reg_off;
+
+ writeq(val, reg);
+}
+
+static inline void mercury_write_reg32(void *bar0, uint32_t reg_off, uint32_t val)
+{
+ void *reg = bar0 + MERCURY_BASE_OFFSET + reg_off;
+
+ writel(val, reg);
+}
+
+static inline uint32_t mercury_read_reg32(void *bar0, uint32_t reg_off)
+{
+ void *reg = bar0 + MERCURY_BASE_OFFSET + reg_off;
+
+ return readl(reg);
+}
+
+static inline uint64_t mercury_read_reg64(void *bar0, uint32_t reg_off)
+{
+ void *reg = bar0 + MERCURY_BASE_OFFSET + reg_off;
+
+ return readq(reg);
+}
+
+static inline uint64_t mercury_get_status(void *bar0)
+{
+ return mercury_read_reg64(bar0, MERCURY_REG_STATUS);
+}
+
+static inline void mercury_issue_command(void *bar0, enum mercury_command cmd)
+{
+ mercury_write_reg32(bar0, MERCURY_REG_COMMAND, cmd);
+}
+
+static inline void mercury_issue_reset(void *bar0)
+{
+ mercury_issue_command(bar0, MERCURY_COMMAND_RESET);
+}
+
+static inline void mercury_force_irq(void *bar0)
+{
+ mercury_issue_command(bar0, MERCURY_COMMAND_FORCE_INTERRUPT);
+}
+
+static inline void mercury_set_dma_size(void *bar0, size_t sz_bytes)
+{
+ /* Convert the DMA size from bytes to DWORDS, as accepted by the device */
+ size_t sz_dwords = sz_bytes / MERCURY_DMA_SIZE_STRIDE;
+
+ mercury_write_reg32(bar0, MERCURY_REG_DMA_DW_LEN, sz_dwords);
+}
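+
+/*
+ * Illustrative sketch (not used by any test): kick off a device-to-memory
+ * DMA using the registers defined above, assuming the device consumes the
+ * SRC/DEST/LEN registers when the trigger command is issued and that 'size'
+ * is a multiple of MERCURY_DMA_SIZE_STRIDE.
+ */
+static inline void mercury_start_dma_from_dev(void *bar0, uint64_t src_addr,
+					      uint64_t dest_addr, size_t size)
+{
+	mercury_write_reg64(bar0, MERCURY_REG_DMA_SRC_ADDR, src_addr);
+	mercury_write_reg64(bar0, MERCURY_REG_DMA_DEST_ADDR, dest_addr);
+	mercury_set_dma_size(bar0, size);
+	mercury_issue_command(bar0, MERCURY_COMMAND_TRIGGER_DMA_FROM_DEV);
+}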
+
+#endif /* SELFTEST_KVM_MERCURY_DEVICE_H */
diff --git a/tools/testing/selftests/kvm/vfio_irq_test.c b/tools/testing/selftests/kvm/vfio_irq_test.c
new file mode 100644
--- /dev/null
+++ b/tools/testing/selftests/kvm/vfio_irq_test.c
@@ -0,0 +1,429 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "apic.h"
+#include "processor.h"
+#include "test_util.h"
+#include "kvm_util.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <semaphore.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <stdint.h>
+#include <syscall.h>
+#include <sys/ioctl.h>
+#include <sys/sysinfo.h>
+#include <time.h>
+
+#include <sys/eventfd.h>
+
+#include "vfio_pci_util.h"
+#include "mercury_device.h"
+
+#define MERCURY_GSI 32
+#define MERCURY_IRQ_VECTOR 0x80
+
+#define MERCURY_BAR0_GPA 0xc0000000ul
+#define MERCURY_BAR0_SLOT 10
+
+/* Shared variables. */
+static bool do_guest_irq = true;
+
+/* Guest-only variables, shared across vCPUs. */
+static int irqs_received;
+static int irqs_sent;
+
+/* Host-only variables, shared across threads. */
+static cpu_set_t possible_mask;
+static int min_cpu, max_cpu;
+static bool done;
+static struct kvm_vcpu *target_vcpu;
+static sem_t do_irq;
+
+/* Use the x2APIC by default; the '-x' option falls back to xAPIC. */
+static bool x2apic = true;
+
+static void guest_irq_handler(struct ex_regs *regs)
+{
+ WRITE_ONCE(irqs_received, irqs_received + 1);
+
+ if (x2apic)
+ x2apic_write_reg(APIC_EOI, 0);
+ else
+ xapic_write_reg(APIC_EOI, 0);
+}
+
+static void guest_nmi_handler(struct ex_regs *regs)
+{
+ WRITE_ONCE(irqs_received, irqs_received + 1);
+}
+
+#define GUEST_VERIFY_IRQS() \
+do { \
+ int __received; \
+ \
+ __received = READ_ONCE(irqs_received); \
+ __GUEST_ASSERT(__received == irqs_sent, \
+		"Sent %u IRQs, received %u IRQs", irqs_sent, __received);\
+} while (0)
+
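+/*
+ * Wait for an IRQ via safe_halt(), which does sti;hlt (the STI shadow blocks
+ * IRQs until the HLT executes, avoiding a missed-wakeup race), then verify
+ * the IRQ count and return to the caller with IRQs disabled.
+ */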
+#define GUEST_WAIT_FOR_IRQ() \
+do { \
+ safe_halt(); \
+ GUEST_VERIFY_IRQS(); \
+ cli(); \
+} while (0)
+
+static void guest_code(uint32_t vcpu_id)
+{
+ /* GPA is identity mapped. */
+ void *mercury_bar0 = (void *)MERCURY_BAR0_GPA;
+ uint64_t status;
+ int i;
+
+ cli();
+
+ if (x2apic) {
+ x2apic_enable();
+ GUEST_ASSERT(x2apic_read_reg(APIC_ID) == vcpu_id);
+ } else {
+ xapic_enable();
+ GUEST_ASSERT(xapic_read_reg(APIC_ID) >> 24 == vcpu_id);
+ }
+
+ if (vcpu_id == 0) {
+ irqs_sent++;
+ GUEST_ASSERT(READ_ONCE(do_guest_irq));
+ mercury_issue_reset(mercury_bar0);
+ GUEST_WAIT_FOR_IRQ();
+
+ status = mercury_get_status(mercury_bar0);
+ __GUEST_ASSERT(status & BIT(MERCURY_STATUS_BIT_READY),
+ "Expected device ready after reset");
+ GUEST_SYNC(irqs_received);
+ }
+
+	while (!READ_ONCE(done)) {
+ irqs_sent++;
+ if (READ_ONCE(do_guest_irq))
+ mercury_force_irq(mercury_bar0);
+ GUEST_WAIT_FOR_IRQ();
+ GUEST_SYNC(irqs_received);
+ }
+
+ sti_nop();
+
+ for (i = 0; i < 1000; i++) {
+ mercury_force_irq(mercury_bar0);
+ cpu_relax();
+ }
+
+ GUEST_VERIFY_IRQS();
+ GUEST_SYNC(irqs_received);
+}
+
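+/*
+ * Inject IRQs from a host thread, waiting until the target vCPU is blocking
+ * (per KVM's "blocking" stat) so that IRQs arrive while the vCPU is halted
+ * in GUEST_WAIT_FOR_IRQ().
+ */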
+static void *irq_worker(void *mercury_bar0)
+{
+ struct kvm_vcpu *vcpu;
+
+ for (;;) {
+ sem_wait(&do_irq);
+
+ if (READ_ONCE(done))
+ break;
+
+ vcpu = READ_ONCE(target_vcpu);
+ while (!vcpu_get_stat(vcpu, blocking))
+ cpu_relax();
+
+ mercury_force_irq(mercury_bar0);
+ }
+ return NULL;
+}
+
+static int next_cpu(int cpu)
+{
+	/*
+	 * Advance to the next CPU, skipping those that weren't in the original
+	 * affinity set.  Sadly, there is no CPU_SET_FOR_EACH, and cpu_set_t's
+	 * data storage is considered opaque.  Note, if this task is pinned
+	 * to a small set of discontiguous CPUs, e.g. 2 and 1023, this loop
+	 * will burn a lot of cycles and the test will take longer than normal
+	 * to complete.
+	 */
+ do {
+ cpu++;
+ if (cpu > max_cpu) {
+ cpu = min_cpu;
+ TEST_ASSERT(CPU_ISSET(cpu, &possible_mask),
+ "Min CPU = %d must always be usable", cpu);
+ break;
+ }
+ } while (!CPU_ISSET(cpu, &possible_mask));
+
+ return cpu;
+}
+
+static void *migration_worker(void *__guest_tid)
+{
+ pid_t guest_tid = (pid_t)(unsigned long)__guest_tid;
+ cpu_set_t allowed_mask;
+ int r, i, cpu;
+
+ CPU_ZERO(&allowed_mask);
+
+ for (i = 0, cpu = min_cpu; !READ_ONCE(done); i++, cpu = next_cpu(cpu)) {
+ CPU_SET(cpu, &allowed_mask);
+
+ r = sched_setaffinity(guest_tid, sizeof(allowed_mask), &allowed_mask);
+ TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)",
+ errno, strerror(errno));
+
+ CPU_CLR(cpu, &allowed_mask);
+
+ usleep((i % 10) + 10);
+ }
+ return NULL;
+}
+
+static void calc_min_max_cpu(void)
+{
+ int i, cnt, nproc;
+
+ TEST_REQUIRE(CPU_COUNT(&possible_mask) >= 2);
+
+ /*
+ * CPU_SET doesn't provide a FOR_EACH helper, get the min/max CPU that
+ * this task is affined to in order to reduce the time spent querying
+ * unusable CPUs, e.g. if this task is pinned to a small percentage of
+ * total CPUs.
+ */
+ nproc = get_nprocs_conf();
+ min_cpu = -1;
+ max_cpu = -1;
+ cnt = 0;
+
+ for (i = 0; i < nproc; i++) {
+ if (!CPU_ISSET(i, &possible_mask))
+ continue;
+ if (min_cpu == -1)
+ min_cpu = i;
+ max_cpu = i;
+ cnt++;
+ }
+
+ __TEST_REQUIRE(cnt >= 2, "Only one usable CPU, task migration not possible");
+}
+
+static void sanity_check_mercury_device(struct vfio_pci_dev *dev, void *bar0)
+{
+ uint16_t vendor_id, device_id;
+ uint32_t version;
+
+ vendor_id = vfio_pci_get_vendor_id(dev);
+ device_id = vfio_pci_get_device_id(dev);
+
+ TEST_ASSERT(vendor_id == MERCURY_VENDOR_ID &&
+ device_id == MERCURY_DEVICE_ID,
+ "Mercury vendor-id/device-id mismatch. "
+ "Expected vendor: 0x%04x, device: 0x%04x. "
+ "Got vendor: 0x%04x, device: 0x%04x",
+ MERCURY_VENDOR_ID, MERCURY_DEVICE_ID,
+ vendor_id, device_id);
+
+ version = mercury_read_reg32(bar0, MERCURY_REG_VERSION);
+ TEST_ASSERT_EQ(version, MERCURY_ABI_VERSION);
+}
+
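+/*
+ * Set an empty GSI routing table.  Note, the entry below is ignored, as KVM
+ * consumes only routing->nr entries, i.e. zero entries in this case.
+ */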
+static void set_empty_routing(struct kvm_vm *vm, struct kvm_irq_routing *routing)
+{
+ routing->nr = 0;
+ routing->entries[0].gsi = MERCURY_GSI;
+ routing->entries[0].type = KVM_IRQ_ROUTING_IRQCHIP;
+ routing->entries[0].flags = 0;
+ routing->entries[0].u.msi.address_lo = 0;
+ routing->entries[0].u.msi.address_hi = 0;
+ routing->entries[0].u.msi.data = 0xfe;
+ vm_ioctl(vm, KVM_SET_GSI_ROUTING, routing);
+}
+
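+/*
+ * Route the test GSI as an MSI targeting the given vCPU; the xAPIC
+ * destination ID lives in address bits 19:12.  For NMIs, encode delivery
+ * mode NMI (100b) in bits 10:8 of the data, otherwise use fixed delivery
+ * (000b) with the test's vector.
+ */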
+static void set_gsi_dest(struct kvm_vcpu *vcpu, struct kvm_irq_routing *routing,
+ bool do_nmi)
+{
+ routing->nr = 1;
+ routing->entries[0].gsi = MERCURY_GSI;
+ routing->entries[0].type = KVM_IRQ_ROUTING_MSI;
+ routing->entries[0].flags = 0;
+ routing->entries[0].u.msi.address_lo = (vcpu->id << 12);
+ routing->entries[0].u.msi.address_hi = 0;
+ if (do_nmi)
+ routing->entries[0].u.msi.data = NMI_VECTOR | (4 << 8);
+ else
+ routing->entries[0].u.msi.data = MERCURY_IRQ_VECTOR;
+ vm_ioctl(vcpu->vm, KVM_SET_GSI_ROUTING, routing);
+}
+
+static void vcpu_run_and_verify(struct kvm_vcpu *vcpu, int nr_irqs)
+{
+ struct ucall uc;
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC);
+ TEST_ASSERT_EQ(uc.args[1], nr_irqs);
+}
+
+int main(int argc, char *argv[])
+{
+ bool migrate = false, nmi = false, async = false, empty = false;
+ pthread_t migration_thread, irq_thread;
+ struct kvm_irq_routing *routing;
+ struct vfio_pci_dev *dev;
+ struct kvm_vcpu *vcpus[2];
+ int opt, r, eventfd, i;
+ int nr_irqs = 10000;
+ struct kvm_vm *vm;
+ uint64_t bar_size;
+ char *bdf = NULL;
+ void *bar;
+
+ sem_init(&do_irq, 0, 0);
+
+ while ((opt = getopt(argc, argv, "had:ei:mnx")) != -1) {
+ switch (opt) {
+ case 'a':
+ async = true;
+ break;
+ case 'd':
+ bdf = strdup(optarg);
+ break;
+ case 'e':
+ empty = true;
+ break;
+ case 'i':
+ nr_irqs = atoi_positive("Number of IRQs", optarg);
+ break;
+ case 'm':
+ migrate = true;
+ break;
+ case 'n':
+ nmi = true;
+ break;
+ case 'x':
+ x2apic = false;
+ break;
+ case 'h':
+ default:
+			pr_info("Usage: %s [-haemnx] [-i nr_irqs] <-d pci-bdf>\n\n", argv[0]);
+			pr_info("\t-a: Inject IRQs asynchronously from a host thread\n");
+			pr_info("\t-d: PCI Domain, Bus, Device, Function in the format DDDD:BB:DD.F\n");
+			pr_info("\t-e: Periodically set an empty GSI routing table\n");
+			pr_info("\t-i: Number of IRQs to send (default: 10000)\n");
+			pr_info("\t-m: Migrate the vCPU task across pCPUs during the test\n");
+			pr_info("\t-n: Periodically inject NMIs instead of fixed IRQs\n");
+			pr_info("\t-x: Use xAPIC instead of x2APIC in the guest\n");
+			pr_info("\t-h: Print this help screen\n");
+			exit(KSFT_SKIP);
+ }
+ }
+
+ __TEST_REQUIRE(bdf, "Required argument -d <pci-bdf> missing");
+
+ dev = vfio_pci_init(bdf);
+ bar = vfio_pci_map_bar(dev, VFIO_PCI_BAR0_REGION_INDEX, &bar_size);
+ sanity_check_mercury_device(dev, bar);
+
+ vm = vm_create_with_vcpus(ARRAY_SIZE(vcpus), guest_code, vcpus);
+ vm_install_exception_handler(vm, MERCURY_IRQ_VECTOR, guest_irq_handler);
+	vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler);
+
+	/* The guest consumes 'x2apic'; propagate any '-x' override to it. */
+	sync_global_to_guest(vm, x2apic);
+
+ vcpu_args_set(vcpus[0], 1, 0);
+ vcpu_args_set(vcpus[1], 1, 1);
+
+ virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+
+ vm_set_user_memory_region(vm, MERCURY_BAR0_SLOT, 0, MERCURY_BAR0_GPA,
+ bar_size, bar);
+ virt_map(vm, MERCURY_BAR0_GPA, MERCURY_BAR0_GPA,
+ vm_calc_num_guest_pages(VM_MODE_DEFAULT, bar_size));
+
+ routing = kvm_gsi_routing_create();
+
+ eventfd = kvm_new_eventfd();
+ vfio_pci_assign_msix(dev, eventfd);
+ kvm_assign_irqfd(vm, MERCURY_GSI, eventfd);
+
+ r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
+ TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
+ strerror(errno));
+
+ if (migrate) {
+ calc_min_max_cpu();
+
+ pthread_create(&migration_thread, NULL, migration_worker,
+ (void *)(unsigned long)syscall(SYS_gettid));
+ }
+
+ if (nmi || async)
+ pthread_create(&irq_thread, NULL, irq_worker, bar);
+
+ set_gsi_dest(vcpus[0], routing, false);
+ vcpu_run_and_verify(vcpus[0], 1);
+
+#if 0
+ /*
+ * Hack if the user wants to manually mess with interrupt routing while
+ * the test is running, e.g. by modifying smp_affinity in the host.
+ */
+ for (i = 1; i < nr_irqs; i++) {
+ usleep(1000 * 1000);
+ vcpu_run_and_verify(vcpus[0], i + 1);
+ }
+#endif
+
+ for (i = 1; i < nr_irqs; i++) {
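+		/*
+		 * Interleave scenarios based on the iteration number: bit 1
+		 * selects the target vCPU, bit 2 periodically switches to
+		 * NMIs (with -n), and bit 3 periodically installs an empty
+		 * routing table (with -e) before setting the real route, to
+		 * exercise routing table updates.
+		 */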
+ struct kvm_vcpu *vcpu = vcpus[!!(i & BIT(1))];
+ const bool do_nmi = nmi && (i & BIT(2));
+ const bool do_empty = empty && (i & BIT(3));
+ const bool do_async = nmi || async;
+
+ if (do_empty)
+ set_empty_routing(vm, routing);
+
+ set_gsi_dest(vcpu, routing, do_nmi);
+
+ WRITE_ONCE(do_guest_irq, !do_async);
+ sync_global_to_guest(vm, do_guest_irq);
+
+ if (do_async) {
+ WRITE_ONCE(target_vcpu, vcpu);
+ sem_post(&do_irq);
+ }
+
+ vcpu_run_and_verify(vcpu, i + 1);
+ }
+
+ WRITE_ONCE(done, true);
+ sync_global_to_guest(vm, done);
+ sem_post(&do_irq);
+
+ for (i = 0; empty && i < ARRAY_SIZE(vcpus); i++) {
+ struct kvm_vcpu *vcpu = vcpus[i];
+
+ if (!i)
+ set_gsi_dest(vcpu, routing, false);
+ set_empty_routing(vm, routing);
+ vcpu_run_and_verify(vcpu, nr_irqs);
+ }
+
+ set_gsi_dest(vcpus[0], routing, false);
+
+ if (migrate)
+ pthread_join(migration_thread, NULL);
+
+ if (nmi || async)
+ pthread_join(irq_thread, NULL);
+
+ r = munmap(bar, bar_size);
+ TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("munmap()", r));
+
+ vfio_pci_free(dev);
+
+ return 0;
+}
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm      |   2 +
 .../selftests/kvm/include/vfio_pci_util.h     | 149 ++++++
 .../selftests/kvm/include/x86/processor.h     |  21 +
 .../testing/selftests/kvm/lib/vfio_pci_util.c | 201 ++++++++
 tools/testing/selftests/kvm/mercury_device.h  | 118 +++++
 tools/testing/selftests/kvm/vfio_irq_test.c   | 429 ++++++++++++++++++
 6 files changed, 920 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/include/vfio_pci_util.h
 create mode 100644 tools/testing/selftests/kvm/lib/vfio_pci_util.c
 create mode 100644 tools/testing/selftests/kvm/mercury_device.h
 create mode 100644 tools/testing/selftests/kvm/vfio_irq_test.c