[RFC,05/13] vfio/cxl: expose CXL region to the userspace via a new VFIO device region

Message ID 20240920223446.1908673-6-zhiw@nvidia.com (mailing list archive)
State New, archived
Series vfio: introduce vfio-cxl to support CXL type-2 accelerator passthrough

Commit Message

Zhi Wang Sept. 20, 2024, 10:34 p.m. UTC
To directly access the device memory, a CXL region is required. Creating
a CXL region requires configuring the HDM decoders along the path, so
that an HPA access is translated level by level and eventually reaches
the DPA in the CXL topology.

For userspace, e.g. QEMU, to access the CXL region, it must be exposed
via the VFIO interfaces.

Introduce a new VFIO device region and region ops to expose the created
CXL region when initializing the device in the vfio-cxl-core. Introduce
a new sub-region type for userspace to identify a CXL region.

Signed-off-by: Zhi Wang <zhiw@nvidia.com>
---
 drivers/vfio/pci/vfio_cxl_core.c   | 140 ++++++++++++++++++++++++++++-
 drivers/vfio/pci/vfio_pci_config.c |   1 +
 include/linux/vfio_pci_core.h      |   1 +
 include/uapi/linux/vfio.h          |   3 +
 4 files changed, 144 insertions(+), 1 deletion(-)
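
For illustration, a QEMU-like userspace would find this region by walking
the region-info capability chain and matching the vendor type/sub-type
registered below. A minimal sketch, assuming num_regions was obtained via
VFIO_DEVICE_GET_INFO, with error handling trimmed; PCI_VENDOR_ID_CXL is
defined locally here (0x1e98, the CXL consortium's PCI vendor ID, normally
in the kernel's pci_ids.h):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

#define PCI_VENDOR_ID_CXL	0x1e98	/* CXL consortium vendor ID */

static int find_cxl_region(int device_fd, unsigned int num_regions)
{
	for (unsigned int i = VFIO_PCI_NUM_REGIONS; i < num_regions; i++) {
		struct vfio_region_info probe = {
			.argsz = sizeof(probe),
			.index = i,
		};
		struct vfio_region_info *info;
		struct vfio_info_cap_header *cap;

		/* The first call reports the argsz needed for the caps. */
		if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &probe) ||
		    !(probe.flags & VFIO_REGION_INFO_FLAG_CAPS))
			continue;

		info = calloc(1, probe.argsz);
		info->argsz = probe.argsz;
		info->index = i;
		ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info);

		/* Cap chain offsets are relative to the start of info. */
		for (cap = (void *)info + info->cap_offset; ;
		     cap = (void *)info + cap->next) {
			struct vfio_region_info_cap_type *type = (void *)cap;

			if (cap->id == VFIO_REGION_INFO_CAP_TYPE &&
			    type->type == (VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
					   PCI_VENDOR_ID_CXL) &&
			    type->subtype == VFIO_REGION_SUBTYPE_CXL) {
				free(info);
				return i; /* read/write/mmap via this index */
			}
			if (!cap->next)
				break;
		}
		free(info);
	}
	return -1;
}

The returned index is then used like any other VFIO region: read()/write()
through the region offset for slow-path access, or mmap() for direct
access, matching the READ/WRITE/MMAP flags the patch sets below.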

Comments

Alex Williamson Oct. 11, 2024, 7:12 p.m. UTC | #1
On Fri, 20 Sep 2024 15:34:38 -0700
Zhi Wang <zhiw@nvidia.com> wrote:

> To directly access the device memory, a CXL region is required. Creating
> a CXL region requires configuring the HDM decoders along the path, so
> that an HPA access is translated level by level and eventually reaches
> the DPA in the CXL topology.
> 
> For userspace, e.g. QEMU, to access the CXL region, it must be exposed
> via the VFIO interfaces.
> 
> Introduce a new VFIO device region and region ops to expose the created
> CXL region when initializing the device in the vfio-cxl-core. Introduce
> a new sub-region type for userspace to identify a CXL region.
> 
> Signed-off-by: Zhi Wang <zhiw@nvidia.com>
> ---
>  drivers/vfio/pci/vfio_cxl_core.c   | 140 ++++++++++++++++++++++++++++-
>  drivers/vfio/pci/vfio_pci_config.c |   1 +
>  include/linux/vfio_pci_core.h      |   1 +
>  include/uapi/linux/vfio.h          |   3 +
>  4 files changed, 144 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vfio/pci/vfio_cxl_core.c b/drivers/vfio/pci/vfio_cxl_core.c
> index 6a7859333f67..ffc15fd94b22 100644
> --- a/drivers/vfio/pci/vfio_cxl_core.c
> +++ b/drivers/vfio/pci/vfio_cxl_core.c
> @@ -102,6 +102,13 @@ static int create_cxl_region(struct vfio_pci_core_device *core_dev)
>  	cxl_accel_get_region_params(cxl->region.region, &start, &end);
>  
>  	cxl->region.addr = start;
> +	cxl->region.vaddr = ioremap(start, end - start);
> +	if (!cxl->region.addr) {

This is testing addr, not the newly added vaddr.

> +		pci_err(pdev, "Fail to map CXL region\n");
> +		cxl_region_detach(cxl->cxled);
> +		cxl_dpa_free(cxl->cxled);
> +		goto out;
> +	}
>  out:
>  	cxl_release_endpoint(cxl->cxlmd, cxl->endpoint);
>  	return ret;
> @@ -152,17 +159,135 @@ static void disable_cxl(struct vfio_pci_core_device *core_dev)
>  {
>  	struct vfio_cxl *cxl = &core_dev->cxl;
>  
> -	if (cxl->region.region)
> +	if (cxl->region.region) {
> +		iounmap(cxl->region.vaddr);
>  		cxl_region_detach(cxl->cxled);
> +	}
>  
>  	if (cxl->cxled)
>  		cxl_dpa_free(cxl->cxled);
>  }
>  
> +static unsigned long vma_to_pfn(struct vm_area_struct *vma)
> +{
> +	struct vfio_pci_core_device *vdev = vma->vm_private_data;
> +	struct vfio_cxl *cxl = &vdev->cxl;
> +	u64 pgoff;
> +
> +	pgoff = vma->vm_pgoff &
> +		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> +
> +	return (cxl->region.addr >> PAGE_SHIFT) + pgoff;
> +}
> +
> +static vm_fault_t vfio_cxl_mmap_fault(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct vfio_pci_core_device *vdev = vma->vm_private_data;
> +	unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
> +	unsigned long addr = vma->vm_start;
> +	vm_fault_t ret = VM_FAULT_SIGBUS;
> +
> +	pfn = vma_to_pfn(vma);
> +
> +	down_read(&vdev->memory_lock);
> +
> +	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
> +		goto out_unlock;
> +
> +	ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
> +	if (ret & VM_FAULT_ERROR)
> +		goto out_unlock;
> +
> +	for (; addr < vma->vm_end; addr += PAGE_SIZE, pfn++) {
> +		if (addr == vmf->address)
> +			continue;
> +
> +		if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR)
> +			break;
> +	}
> +
> +out_unlock:
> +	up_read(&vdev->memory_lock);
> +
> +	return ret;
> +}

This is identical to vfio_pci_mmap_fault(), we should figure out how to
reuse it rather than duplicate it.

> +
> +static const struct vm_operations_struct vfio_cxl_mmap_ops = {
> +	.fault = vfio_cxl_mmap_fault,
> +};

This should make use of the huge_fault support similar to what we added
in vfio-pci too, right?
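
A rough sketch of what that could look like, modeled on vfio-pci's
vfio_pci_mmap_huge_fault() around v6.12 -- the vmf_insert_pfn_pmd()
signature, PMD_ORDER, and the CONFIG_ARCH_SUPPORTS_*_PFNMAP guards are
assumptions about the target kernel, not part of this patch. Exporting the
core handler instead would also resolve the duplication noted above:

static vm_fault_t vfio_cxl_mmap_huge_fault(struct vm_fault *vmf,
					   unsigned int order)
{
	struct vm_area_struct *vma = vmf->vma;
	struct vfio_pci_core_device *vdev = vma->vm_private_data;
	unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
	vm_fault_t ret = VM_FAULT_SIGBUS;

	/* Fall back if the faulting range is not aligned to the order. */
	if (order && ((vmf->address & ((PAGE_SIZE << order) - 1)) ||
		      vmf->address + (PAGE_SIZE << order) > vma->vm_end)) {
		ret = VM_FAULT_FALLBACK;
		goto out;
	}

	pfn = vma_to_pfn(vma) + pgoff;
	if (order && (pfn & ((1UL << order) - 1))) {
		ret = VM_FAULT_FALLBACK;
		goto out;
	}

	down_read(&vdev->memory_lock);
	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
		goto out_unlock;

	switch (order) {
	case 0:
		ret = vmf_insert_pfn(vma, vmf->address, pfn);
		break;
#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
	case PMD_ORDER:
		ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn, PFN_DEV),
					 false);
		break;
#endif
	default:
		ret = VM_FAULT_FALLBACK;
	}

out_unlock:
	up_read(&vdev->memory_lock);
out:
	return ret;
}

static vm_fault_t vfio_cxl_mmap_page_fault(struct vm_fault *vmf)
{
	return vfio_cxl_mmap_huge_fault(vmf, 0);
}

static const struct vm_operations_struct vfio_cxl_mmap_ops = {
	.fault		= vfio_cxl_mmap_page_fault,
#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
	.huge_fault	= vfio_cxl_mmap_huge_fault,
#endif
};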

> +
> +static int vfio_cxl_region_mmap(struct vfio_pci_core_device *core_dev,
> +				struct vfio_pci_region *region,
> +				struct vm_area_struct *vma)
> +{
> +	struct vfio_cxl *cxl = &core_dev->cxl;
> +	u64 phys_len, req_len, pgoff, req_start;
> +
> +	if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
> +		return -EINVAL;
> +
> +	if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) &&
> +	    (vma->vm_flags & VM_READ))
> +		return -EPERM;
> +
> +	if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) &&
> +	    (vma->vm_flags & VM_WRITE))
> +		return -EPERM;
> +
> +	phys_len = cxl->region.size;
> +	req_len = vma->vm_end - vma->vm_start;
> +	pgoff = vma->vm_pgoff &
> +		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> +	req_start = pgoff << PAGE_SHIFT;
> +
> +	if (req_start + req_len > phys_len)
> +		return -EINVAL;
> +
> +	vma->vm_private_data = core_dev;
> +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

I thought a large part of CXL was that the memory is coherent, maybe I
don't understand what we're providing access to here.

> +	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
> +
> +	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
> +			VM_DONTEXPAND | VM_DONTDUMP);
> +	vma->vm_ops = &vfio_cxl_mmap_ops;
> +
> +	return 0;
> +}
> +
> +static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
> +				  char __user *buf, size_t count, loff_t *ppos,
> +				  bool iswrite)
> +{
> +	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
> +	struct vfio_cxl_region *cxl_region = core_dev->region[i].data;
> +	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
> +
> +	if (!count)
> +		return 0;
> +
> +	return vfio_pci_core_do_io_rw(core_dev, false,
> +				      cxl_region->vaddr,
> +				      (char __user *)buf, pos, count,
> +				      0, 0, iswrite);
> +}
> +
> +static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev,
> +				    struct vfio_pci_region *region)
> +{
> +}
> +
> +static const struct vfio_pci_regops vfio_cxl_regops = {
> +	.rw		= vfio_cxl_region_rw,
> +	.mmap		= vfio_cxl_region_mmap,
> +	.release	= vfio_cxl_region_release,
> +};
> +
>  int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
>  {
>  	struct vfio_cxl *cxl = &core_dev->cxl;
>  	struct pci_dev *pdev = core_dev->pdev;
> +	u32 flags;
>  	u16 dvsec;
>  	int ret;
>  
> @@ -182,8 +307,21 @@ int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
>  	if (ret)
>  		goto err_enable_cxl_device;
>  
> +	flags = VFIO_REGION_INFO_FLAG_READ |
> +		VFIO_REGION_INFO_FLAG_WRITE |
> +		VFIO_REGION_INFO_FLAG_MMAP;
> +
> +	ret = vfio_pci_core_register_dev_region(core_dev,
> +		PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
> +		VFIO_REGION_SUBTYPE_CXL, &vfio_cxl_regops,
> +		cxl->region.size, flags, &cxl->region);
> +	if (ret)
> +		goto err_register_cxl_region;
> +
>  	return 0;
>  
> +err_register_cxl_region:
> +	disable_cxl(core_dev);
>  err_enable_cxl_device:
>  	vfio_pci_core_disable(core_dev);
>  	return ret;
> diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
> index 97422aafaa7b..98f3ac2d305c 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -412,6 +412,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
>  	return pdev->current_state < PCI_D3hot &&
>  	       (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
>  }
> +EXPORT_SYMBOL(__vfio_pci_memory_enabled);

_GPL

There should also be a lockdep assert added if this is exported.  With
that it could also drop the underscore prefix.
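
i.e., something along these lines (untested sketch; memory_lock is the
rwsem that callers of this helper already hold, and dropping the
underscore prefix would touch the existing call sites as well):

 bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
 {
 	struct pci_dev *pdev = vdev->pdev;
 	u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);
 
+	/* Callers must hold memory_lock, in read or write mode. */
+	lockdep_assert_held(&vdev->memory_lock);
+
 	return pdev->current_state < PCI_D3hot &&
 	       (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
 }
+EXPORT_SYMBOL_GPL(__vfio_pci_memory_enabled);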

>  
>  /*
>   * Restore the *real* BARs after we detect a FLR or backdoor reset.
> diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
> index 7762d4a3e825..6523d9d1bffe 100644
> --- a/include/linux/vfio_pci_core.h
> +++ b/include/linux/vfio_pci_core.h
> @@ -54,6 +54,7 @@ struct vfio_pci_region {
>  struct vfio_cxl_region {
>  	u64 size;
>  	u64 addr;
> +	void *vaddr;
>  	struct cxl_region *region;
>  };
>  
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 2b68e6cdf190..71f766c29060 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -372,6 +372,9 @@ struct vfio_region_info_cap_type {
>  /* sub-types for VFIO_REGION_TYPE_GFX */
>  #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
>  
> +/* sub-types for VFIO CXL region */
> +#define VFIO_REGION_SUBTYPE_CXL                 (1)

This is a PCI vendor sub-type with vendor ID 0x1e98.  The comment
should reflect that.  Thanks,

Alex

> +
>  /**
>   * struct vfio_region_gfx_edid - EDID region layout.
>   *

Patch

diff --git a/drivers/vfio/pci/vfio_cxl_core.c b/drivers/vfio/pci/vfio_cxl_core.c
index 6a7859333f67..ffc15fd94b22 100644
--- a/drivers/vfio/pci/vfio_cxl_core.c
+++ b/drivers/vfio/pci/vfio_cxl_core.c
@@ -102,6 +102,13 @@  static int create_cxl_region(struct vfio_pci_core_device *core_dev)
 	cxl_accel_get_region_params(cxl->region.region, &start, &end);
 
 	cxl->region.addr = start;
+	cxl->region.vaddr = ioremap(start, end - start);
+	if (!cxl->region.addr) {
+		pci_err(pdev, "Fail to map CXL region\n");
+		cxl_region_detach(cxl->cxled);
+		cxl_dpa_free(cxl->cxled);
+		goto out;
+	}
 out:
 	cxl_release_endpoint(cxl->cxlmd, cxl->endpoint);
 	return ret;
@@ -152,17 +159,135 @@  static void disable_cxl(struct vfio_pci_core_device *core_dev)
 {
 	struct vfio_cxl *cxl = &core_dev->cxl;
 
-	if (cxl->region.region)
+	if (cxl->region.region) {
+		iounmap(cxl->region.vaddr);
 		cxl_region_detach(cxl->cxled);
+	}
 
 	if (cxl->cxled)
 		cxl_dpa_free(cxl->cxled);
 }
 
+static unsigned long vma_to_pfn(struct vm_area_struct *vma)
+{
+	struct vfio_pci_core_device *vdev = vma->vm_private_data;
+	struct vfio_cxl *cxl = &vdev->cxl;
+	u64 pgoff;
+
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+	return (cxl->region.addr >> PAGE_SHIFT) + pgoff;
+}
+
+static vm_fault_t vfio_cxl_mmap_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct vfio_pci_core_device *vdev = vma->vm_private_data;
+	unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
+	unsigned long addr = vma->vm_start;
+	vm_fault_t ret = VM_FAULT_SIGBUS;
+
+	pfn = vma_to_pfn(vma);
+
+	down_read(&vdev->memory_lock);
+
+	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
+		goto out_unlock;
+
+	ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
+	if (ret & VM_FAULT_ERROR)
+		goto out_unlock;
+
+	for (; addr < vma->vm_end; addr += PAGE_SIZE, pfn++) {
+		if (addr == vmf->address)
+			continue;
+
+		if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR)
+			break;
+	}
+
+out_unlock:
+	up_read(&vdev->memory_lock);
+
+	return ret;
+}
+
+static const struct vm_operations_struct vfio_cxl_mmap_ops = {
+	.fault = vfio_cxl_mmap_fault,
+};
+
+static int vfio_cxl_region_mmap(struct vfio_pci_core_device *core_dev,
+				struct vfio_pci_region *region,
+				struct vm_area_struct *vma)
+{
+	struct vfio_cxl *cxl = &core_dev->cxl;
+	u64 phys_len, req_len, pgoff, req_start;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+		return -EINVAL;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) &&
+	    (vma->vm_flags & VM_READ))
+		return -EPERM;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) &&
+	    (vma->vm_flags & VM_WRITE))
+		return -EPERM;
+
+	phys_len = cxl->region.size;
+	req_len = vma->vm_end - vma->vm_start;
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+	req_start = pgoff << PAGE_SHIFT;
+
+	if (req_start + req_len > phys_len)
+		return -EINVAL;
+
+	vma->vm_private_data = core_dev;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
+	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+			VM_DONTEXPAND | VM_DONTDUMP);
+	vma->vm_ops = &vfio_cxl_mmap_ops;
+
+	return 0;
+}
+
+static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
+				  char __user *buf, size_t count, loff_t *ppos,
+				  bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct vfio_cxl_region *cxl_region = core_dev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+	if (!count)
+		return 0;
+
+	return vfio_pci_core_do_io_rw(core_dev, false,
+				      cxl_region->vaddr,
+				      (char __user *)buf, pos, count,
+				      0, 0, iswrite);
+}
+
+static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev,
+				    struct vfio_pci_region *region)
+{
+}
+
+static const struct vfio_pci_regops vfio_cxl_regops = {
+	.rw		= vfio_cxl_region_rw,
+	.mmap		= vfio_cxl_region_mmap,
+	.release	= vfio_cxl_region_release,
+};
+
 int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
 {
 	struct vfio_cxl *cxl = &core_dev->cxl;
 	struct pci_dev *pdev = core_dev->pdev;
+	u32 flags;
 	u16 dvsec;
 	int ret;
 
@@ -182,8 +307,21 @@  int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
 	if (ret)
 		goto err_enable_cxl_device;
 
+	flags = VFIO_REGION_INFO_FLAG_READ |
+		VFIO_REGION_INFO_FLAG_WRITE |
+		VFIO_REGION_INFO_FLAG_MMAP;
+
+	ret = vfio_pci_core_register_dev_region(core_dev,
+		PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+		VFIO_REGION_SUBTYPE_CXL, &vfio_cxl_regops,
+		cxl->region.size, flags, &cxl->region);
+	if (ret)
+		goto err_register_cxl_region;
+
 	return 0;
 
+err_register_cxl_region:
+	disable_cxl(core_dev);
 err_enable_cxl_device:
 	vfio_pci_core_disable(core_dev);
 	return ret;
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 97422aafaa7b..98f3ac2d305c 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -412,6 +412,7 @@  bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
 	return pdev->current_state < PCI_D3hot &&
 	       (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
 }
+EXPORT_SYMBOL(__vfio_pci_memory_enabled);
 
 /*
  * Restore the *real* BARs after we detect a FLR or backdoor reset.
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 7762d4a3e825..6523d9d1bffe 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -54,6 +54,7 @@  struct vfio_pci_region {
 struct vfio_cxl_region {
 	u64 size;
 	u64 addr;
+	void *vaddr;
 	struct cxl_region *region;
 };
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 2b68e6cdf190..71f766c29060 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -372,6 +372,9 @@  struct vfio_region_info_cap_type {
 /* sub-types for VFIO_REGION_TYPE_GFX */
 #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
 
+/* sub-types for VFIO CXL region */
+#define VFIO_REGION_SUBTYPE_CXL                 (1)
+
 /**
  * struct vfio_region_gfx_edid - EDID region layout.
  *