@@ -102,6 +102,13 @@ static int create_cxl_region(struct vfio_pci_core_device *core_dev)
cxl_accel_get_region_params(cxl->region.region, &start, &end);
cxl->region.addr = start;
+ cxl->region.vaddr = ioremap(start, end - start);
+ if (!cxl->region.vaddr) {
+ pci_err(pdev, "Failed to map CXL region\n");
+ cxl_region_detach(cxl->cxled);
+ cxl_dpa_free(cxl->cxled);
+ goto out;
+ }
out:
cxl_release_endpoint(cxl->cxlmd, cxl->endpoint);
return ret;
@@ -152,17 +159,135 @@ static void disable_cxl(struct vfio_pci_core_device *core_dev)
{
struct vfio_cxl *cxl = &core_dev->cxl;
- if (cxl->region.region)
+ if (cxl->region.region) {
+ iounmap(cxl->region.vaddr);
cxl_region_detach(cxl->cxled);
+ }
if (cxl->cxled)
cxl_dpa_free(cxl->cxled);
}
+static unsigned long vma_to_pfn(struct vm_area_struct *vma)
+{
+ struct vfio_pci_core_device *vdev = vma->vm_private_data;
+ struct vfio_cxl *cxl = &vdev->cxl;
+ u64 pgoff;
+
+ pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+ return (cxl->region.addr >> PAGE_SHIFT) + pgoff;
+}
+
+static vm_fault_t vfio_cxl_mmap_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct vfio_pci_core_device *vdev = vma->vm_private_data;
+ unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
+ unsigned long addr = vma->vm_start;
+ vm_fault_t ret = VM_FAULT_SIGBUS;
+
+ pfn = vma_to_pfn(vma);
+
+ down_read(&vdev->memory_lock);
+
+ if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
+ goto out_unlock;
+
+ ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
+ if (ret & VM_FAULT_ERROR)
+ goto out_unlock;
+
+ for (; addr < vma->vm_end; addr += PAGE_SIZE, pfn++) {
+ if (addr == vmf->address)
+ continue;
+
+ if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR)
+ break;
+ }
+
+out_unlock:
+ up_read(&vdev->memory_lock);
+
+ return ret;
+}
+
+static const struct vm_operations_struct vfio_cxl_mmap_ops = {
+ .fault = vfio_cxl_mmap_fault,
+};
+
+static int vfio_cxl_region_mmap(struct vfio_pci_core_device *core_dev,
+ struct vfio_pci_region *region,
+ struct vm_area_struct *vma)
+{
+ struct vfio_cxl *cxl = &core_dev->cxl;
+ u64 phys_len, req_len, pgoff, req_start;
+
+ if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+ return -EINVAL;
+
+ if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) &&
+ (vma->vm_flags & VM_READ))
+ return -EPERM;
+
+ if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) &&
+ (vma->vm_flags & VM_WRITE))
+ return -EPERM;
+
+ phys_len = cxl->region.size;
+ req_len = vma->vm_end - vma->vm_start;
+ pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+ req_start = pgoff << PAGE_SHIFT;
+
+ if (req_start + req_len > phys_len)
+ return -EINVAL;
+
+ vma->vm_private_data = core_dev;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
+ vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+ VM_DONTEXPAND | VM_DONTDUMP);
+ vma->vm_ops = &vfio_cxl_mmap_ops;
+
+ return 0;
+}
+
+static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
+ char __user *buf, size_t count, loff_t *ppos,
+ bool iswrite)
+{
+ unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+ struct vfio_cxl_region *cxl_region = core_dev->region[i].data;
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+ if (!count)
+ return 0;
+
+ return vfio_pci_core_do_io_rw(core_dev, false,
+ cxl_region->vaddr,
+ (char __user *)buf, pos, count,
+ 0, 0, iswrite);
+}
+
+static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_region *region)
+{
+}
+
+static const struct vfio_pci_regops vfio_cxl_regops = {
+ .rw = vfio_cxl_region_rw,
+ .mmap = vfio_cxl_region_mmap,
+ .release = vfio_cxl_region_release,
+};
+
int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
{
struct vfio_cxl *cxl = &core_dev->cxl;
struct pci_dev *pdev = core_dev->pdev;
+ u32 flags;
u16 dvsec;
int ret;
@@ -182,8 +307,21 @@ int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
if (ret)
goto err_enable_cxl_device;
+ flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_MMAP;
+
+ ret = vfio_pci_core_register_dev_region(core_dev,
+ PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+ VFIO_REGION_SUBTYPE_CXL, &vfio_cxl_regops,
+ cxl->region.size, flags, &cxl->region);
+ if (ret)
+ goto err_register_cxl_region;
+
return 0;
+err_register_cxl_region:
+ disable_cxl(core_dev);
err_enable_cxl_device:
vfio_pci_core_disable(core_dev);
return ret;
@@ -412,6 +412,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
return pdev->current_state < PCI_D3hot &&
(pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
}
+EXPORT_SYMBOL_GPL(__vfio_pci_memory_enabled);
/*
* Restore the *real* BARs after we detect a FLR or backdoor reset.
@@ -54,6 +54,7 @@ struct vfio_pci_region {
struct vfio_cxl_region {
u64 size;
u64 addr;
+ void __iomem *vaddr;
struct cxl_region *region;
};
@@ -372,6 +372,9 @@ struct vfio_region_info_cap_type {
/* sub-types for VFIO_REGION_TYPE_GFX */
#define VFIO_REGION_SUBTYPE_GFX_EDID (1)
+/* sub-types for VFIO CXL region */
+#define VFIO_REGION_SUBTYPE_CXL (1)
+
/**
* struct vfio_region_gfx_edid - EDID region layout.
*
To directly access the device memory, a CXL region is required. Creating a CXL region requires configuring the HDM decoders on the path, mapping the HPA access level by level until it eventually reaches the DPA in the CXL topology.

For userspace, e.g. QEMU, to access the CXL region, the region has to be exposed via the VFIO interfaces.

Introduce a new VFIO device region and region ops to expose the created CXL region when initializing the device in the vfio-cxl-core. Introduce a new sub-region type for userspace to identify a CXL region.

Signed-off-by: Zhi Wang <zhiw@nvidia.com>
---
 drivers/vfio/pci/vfio_cxl_core.c   | 140 ++++++++++++++++++++++++++++-
 drivers/vfio/pci/vfio_pci_config.c |   1 +
 include/linux/vfio_pci_core.h      |   1 +
 include/uapi/linux/vfio.h          |   3 +
 4 files changed, 144 insertions(+), 1 deletion(-)
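For reference, below is a hypothetical userspace sketch (not part of this patch) of how a VMM such as QEMU could locate and mmap() the region registered above, by walking the standard VFIO region-info capability chain for the vendor type/sub-type. It assumes an already-open VFIO device fd and the updated uapi header from this series; the PCI_VENDOR_ID_CXL fallback value mirrors include/linux/pci_ids.h and most error handling is trimmed for brevity.

/*
 * Hypothetical usage sketch: find the CXL device region exported by
 * vfio-cxl-core via the region-info capability chain and mmap() it.
 */
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

#ifndef PCI_VENDOR_ID_CXL
#define PCI_VENDOR_ID_CXL	0x1e98	/* matches include/linux/pci_ids.h */
#endif
#ifndef VFIO_REGION_SUBTYPE_CXL
#define VFIO_REGION_SUBTYPE_CXL	1	/* added by this series */
#endif

static void *map_cxl_region(int device_fd, uint64_t *len)
{
	struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
	unsigned int i;

	if (ioctl(device_fd, VFIO_DEVICE_GET_INFO, &dev_info))
		return NULL;

	/* Device-specific regions live past the fixed PCI region indexes. */
	for (i = VFIO_PCI_NUM_REGIONS; i < dev_info.num_regions; i++) {
		struct vfio_region_info hdr = { .argsz = sizeof(hdr), .index = i };
		struct vfio_info_cap_header *cap;
		struct vfio_region_info *info;

		if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &hdr) ||
		    !(hdr.flags & VFIO_REGION_INFO_FLAG_CAPS))
			continue;

		/* Re-query with room for the capability chain. */
		info = calloc(1, hdr.argsz);
		if (!info)
			return NULL;
		info->argsz = hdr.argsz;
		info->index = i;
		if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info))
			goto next;

		for (cap = (void *)info + info->cap_offset; ;
		     cap = (void *)info + cap->next) {
			if (cap->id == VFIO_REGION_INFO_CAP_TYPE) {
				struct vfio_region_info_cap_type *type = (void *)cap;

				if (type->type == (VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
						   PCI_VENDOR_ID_CXL) &&
				    type->subtype == VFIO_REGION_SUBTYPE_CXL) {
					void *p = mmap(NULL, info->size,
						       PROT_READ | PROT_WRITE,
						       MAP_SHARED, device_fd,
						       info->offset);

					*len = info->size;
					free(info);
					return p == MAP_FAILED ? NULL : p;
				}
			}
			if (!cap->next)
				break;
		}
next:
		free(info);
	}

	return NULL;
}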