@@ -409,6 +409,15 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
 		}
 	}
+	if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) {
+		ret = vfio_pci_zdev_io_init(vdev);
+		if (ret && ret != -ENODEV) {
+			/* -ENODEV just means no zdev backing; anything else is fatal */
+			pci_warn(pdev, "Failed to setup zPCI I/O region (rc=%d)\n", ret);
+			return ret;
+		}
+	}
+
 	vfio_pci_probe_mmaps(vdev);
 	return 0;
@@ -217,12 +217,19 @@ static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
 #ifdef CONFIG_VFIO_PCI_ZDEV
 extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
 				       struct vfio_info_cap *caps);
+/* Register the zPCI direct I/O region; -ENODEV when no zdev backs the device */
+extern int vfio_pci_zdev_io_init(struct vfio_pci_device *vdev);
 #else
 static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
 					      struct vfio_info_cap *caps)
 {
 	return -ENODEV;
 }
+
+static inline int vfio_pci_zdev_io_init(struct vfio_pci_device *vdev)
+{
+	return -ENODEV;
+}
 #endif
 #endif /* VFIO_PCI_PRIVATE_H */
@@ -18,6 +18,7 @@
#include <linux/vfio_zdev.h>
#include <asm/pci_clp.h>
#include <asm/pci_io.h>
+#include <asm/pci_insn.h>
#include "vfio_pci_private.h"
@@ -143,3 +144,160 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
 	return ret;
 }
+
+static size_t vfio_pci_zdev_io_rw(struct vfio_pci_device *vdev,
+				  char __user *buf, size_t count,
+				  loff_t *ppos, bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct vfio_region_zpci_io *region = vdev->region[i].data;
+	struct zpci_dev *zdev = to_zpci(vdev->pdev);
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+	void *base = region;
+	struct page *gpage;
+	void *gaddr;
+	u64 *data;
+	int ret;
+	u64 req;
+
+	if ((!vdev->pdev->bus) || (!zdev))
+		return -ENODEV;
+
+	if (pos >= vdev->region[i].size)
+		return -EINVAL;
+
+	count = min(count, (size_t)(vdev->region[i].size - pos));
+
+	if (!iswrite) {
+		/* Only allow reads to the _hdr area */
+		if (pos + count > offsetof(struct vfio_region_zpci_io, req))
+			return -EFAULT;
+		if (copy_to_user(buf, base + pos, count))
+			return -EFAULT;
+		return count;
+	}
+
+	/* Only allow writes to the _req area; a write triggers the operation */
+	if (pos < offsetof(struct vfio_region_zpci_io, req))
+		return -EFAULT;
+	if (copy_from_user(base + pos, buf, count))
+		return -EFAULT;
+
+	/*
+	 * Read operations (PCILG) are limited to 8B
+	 */
+	if ((region->req.flags & VFIO_ZPCI_IO_FLAG_READ) &&
+	    (region->req.len > 8)) {
+		return -EIO;
+	}
+
+	/*
+	 * Block write operations (PCISTB) are limited to hardware-reported max
+	 */
+	if ((region->req.flags & VFIO_ZPCI_IO_FLAG_BLOCKW) &&
+	    (region->req.len > zdev->maxstbl)) {
+		return -EIO;
+	}
+
+	/*
+	 * While some devices may allow relaxed alignment for the PCISTB
+	 * instruction, the VFIO region requires the input buffer to be on a
+	 * DWORD boundary for all operations for simplicity.
+	 */
+	if (!IS_ALIGNED(region->req.gaddr, sizeof(uint64_t)))
+		return -EIO;
+
+	/*
+	 * For now, the largest allowed block I/O is advertised as PAGE_SIZE,
+	 * and cannot exceed a page boundary - so a single page is enough. The
+	 * guest should have validated this but let's double-check that the
+	 * request will not cross a page boundary.
+	 */
+	if (((region->req.gaddr & ~PAGE_MASK)
+	    + region->req.len - 1) & PAGE_MASK) {
+		return -EIO;
+	}
+
+	mutex_lock(&zdev->lock);
+
+	ret = get_user_pages_fast(region->req.gaddr & PAGE_MASK, 1, 0, &gpage);
+	if (ret <= 0) {
+		count = -EIO;
+		goto out;
+	}
+	gaddr = page_address(gpage);
+	gaddr += (region->req.gaddr & ~PAGE_MASK);
+	data = (u64 *)gaddr;
+
+	req = ZPCI_CREATE_REQ(zdev->fh, region->req.pcias, region->req.len);
+
+	/* Perform the requested I/O operation (non-MIO instruction paths) */
+	if (region->req.flags & VFIO_ZPCI_IO_FLAG_READ) {
+		/* PCILG */
+		ret = __zpci_load(data, req,
+				  region->req.offset);
+	} else if (region->req.flags & VFIO_ZPCI_IO_FLAG_BLOCKW) {
+		/* PCISTB */
+		ret = __zpci_store_block(data, req,
+					 region->req.offset);
+	} else {
+		/* Undefined Operation or none provided */
+		count = -EIO;
+	}
+	if (ret < 0)
+		count = -EIO;
+
+	put_page(gpage);
+
+out:
+	mutex_unlock(&zdev->lock);
+	return count;
+}
+
+static void vfio_pci_zdev_io_release(struct vfio_pci_device *vdev,
+				     struct vfio_pci_region *region)
+{
+	kfree(region->data);
+}
+
+static const struct vfio_pci_regops vfio_pci_zdev_io_regops = {
+	.rw		= vfio_pci_zdev_io_rw,
+	.release	= vfio_pci_zdev_io_release,
+};
+
+int vfio_pci_zdev_io_init(struct vfio_pci_device *vdev)
+{
+	struct vfio_region_zpci_io *region;
+	struct zpci_dev *zdev;
+	int ret;
+
+	if (!vdev->pdev->bus)
+		return -ENODEV;
+
+	zdev = to_zpci(vdev->pdev);
+	if (!zdev)
+		return -ENODEV;
+
+	region = kzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	ret = vfio_pci_register_dev_region(vdev,
+			PCI_VENDOR_ID_IBM | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+			VFIO_REGION_SUBTYPE_IBM_ZPCI_IO,
+			&vfio_pci_zdev_io_regops,
+			sizeof(struct vfio_region_zpci_io),
+			VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE,
+			region);
+
+	if (ret) {
+		kfree(region);
+		return ret;
+	}
+
+	/* Setup the initial header information; rest is zeroed by kzalloc */
+	region->hdr.flags = 0;
+	region->hdr.max = zdev->maxstbl;
+
+	return ret;
+}
@@ -338,6 +338,10 @@ struct vfio_region_info_cap_type {
  * to do TLB invalidation on a GPU.
  */
 #define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD	(1)
+/*
+ * IBM zPCI I/O region: passes guest PCISTB/PCILG operations to the host
+ */
+#define VFIO_REGION_SUBTYPE_IBM_ZPCI_IO	(2)
 /* sub-types for VFIO_REGION_TYPE_GFX */
 #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
@@ -76,4 +76,36 @@ struct vfio_device_info_cap_zpci_pfip {
 	__u8 pfip[];
 };
+/**
+ * VFIO_REGION_SUBTYPE_IBM_ZPCI_IO - VFIO zPCI PCI Direct I/O Region
+ *
+ * This region is used to transfer I/O operations from the guest directly
+ * to the host zPCI I/O layer.
+ *
+ * The _hdr area is user-readable and is used to provide setup information.
+ * The _req area is user-writable; writing a request performs the operation.
+ */
+struct vfio_zpci_io_hdr {
+	__u64 flags;	/* No flags defined yet; returned as 0 */
+	__u16 max;	/* Max block operation size allowed */
+	__u16 reserved;
+	__u32 reserved2;
+};
+
+struct vfio_zpci_io_req {
+	__u64 flags;
+#define VFIO_ZPCI_IO_FLAG_READ (1 << 0) /* Read Operation Specified */
+#define VFIO_ZPCI_IO_FLAG_BLOCKW (1 << 1) /* Block Write Operation Specified */
+	__u64 gaddr;	/* Address of guest data */
+	__u64 offset;	/* Offset into target PCI Address Space */
+	__u32 reserved;
+	__u16 len;	/* Length of guest operation */
+	__u8 pcias;	/* Target PCI Address Space */
+	__u8 reserved2;
+};
+
+struct vfio_region_zpci_io {
+	struct vfio_zpci_io_hdr hdr;
+	struct vfio_zpci_io_req req;
+};
 #endif
Some s390 PCI devices (e.g. ISM) perform I/O operations that have very specific requirements in terms of alignment as well as the patterns in which the data is read/written. Allowing these to proceed through the typical vfio_pci_bar_rw path will cause them to be broken up in such a way that these requirements can't be guaranteed. In addition, ISM devices do not support the MIO codepaths that might be triggered on vfio I/O coming from userspace; we must be able to ensure that these devices use the non-MIO instructions. To facilitate this, provide a new vfio region by which non-MIO instructions can be passed directly to the host kernel s390 PCI layer, to be reliably issued as non-MIO instructions. This patch introduces the new vfio VFIO_REGION_SUBTYPE_IBM_ZPCI_IO region and implements the ability to pass PCISTB and PCILG instructions over it, as these are what is required for ISM devices. Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> --- drivers/vfio/pci/vfio_pci.c | 8 ++ drivers/vfio/pci/vfio_pci_private.h | 6 ++ drivers/vfio/pci/vfio_pci_zdev.c | 158 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 4 + include/uapi/linux/vfio_zdev.h | 32 ++++++++ 5 files changed, 208 insertions(+)