
[v2,2/3] vfio/nvgrace-gpu: Expose the blackwell device PF BAR1 to the VM

Message ID 20250105173615.28481-3-ankita@nvidia.com (mailing list archive)
State New, archived
Series vfio/nvgrace-gpu: Enable grace blackwell boards

Commit Message

Ankit Agrawal Jan. 5, 2025, 5:36 p.m. UTC
From: Ankit Agrawal <ankita@nvidia.com>

There is a HW defect on Grace Hopper (GH) related to supporting the
Multi-Instance GPU (MIG) feature [1] that necessitated carving out a
1G region from the device memory and mapping it as uncached. The 1G
region is exposed as a fake BAR (comprising regions 2 and 3) to work
around the issue.

The Grace Blackwell systems (GB) differ from GH systems in the following
aspects:
1. The aforementioned HW defect is fixed on GB systems.
2. There is a usable BAR1 (regions 2 and 3) on GB systems for the
GPUdirect RDMA feature [2].

This patch accommodates those GB changes by exposing the 64b physical
device BAR1 (regions 2 and 3) to the VM instead of the fake one, which
takes care of both differences.

Moreover, on GB the entire device memory is exposed to the VM as
cacheable, since no carveout is required.

Link: https://www.nvidia.com/en-in/technologies/multi-instance-gpu/ [1]
Link: https://docs.nvidia.com/cuda/gpudirect-rdma/ [2]

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 32 +++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)
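
For context: RESMEM_REGION_INDEX and USEMEM_REGION_INDEX in this driver map
to the VFIO BAR2 and BAR4 region indices, so the fake (GH) or physical (GB)
BAR1 is seen by userspace as regions 2 and 3. Below is a minimal, hedged
sketch of how a VFIO consumer could inspect region 2; the device fd setup
(VFIO group or iommufd cdev) is omitted and the helper name is illustrative,
not part of any existing tool.

#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <stdio.h>

/*
 * Query region 2 (the BAR2 index) of an already-open VFIO device fd.
 * On GH this region is the 1G fake BAR carved from device memory; on
 * GB with this patch it should instead report the physical BAR1.
 */
static int print_region2_info(int device_fd)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = VFIO_PCI_BAR2_REGION_INDEX,
	};

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info) < 0) {
		perror("VFIO_DEVICE_GET_REGION_INFO");
		return -1;
	}

	printf("region 2: size=0x%llx flags=0x%x\n",
	       (unsigned long long)info.size, info.flags);
	return 0;
}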

Comments

Alex Williamson Jan. 9, 2025, 7:21 p.m. UTC | #1
On Sun, 5 Jan 2025 17:36:14 +0000
<ankita@nvidia.com> wrote:

> From: Ankit Agrawal <ankita@nvidia.com>
> 
> There is a HW defect on Grace Hopper (GH) related to supporting the
> Multi-Instance GPU (MIG) feature [1] that necessitated carving out a
> 1G region from the device memory and mapping it as uncached. The 1G
> region is exposed as a fake BAR (comprising regions 2 and 3) to work
> around the issue.
> 
> The Grace Blackwell systems (GB) differ from GH systems in the following
> aspects:
> 1. The aforementioned HW defect is fixed on GB systems.
> 2. There is a usable BAR1 (regions 2 and 3) on GB systems for the
> GPUdirect RDMA feature [2].
> 
> This patch accommodates those GB changes by exposing the 64b physical
> device BAR1 (regions 2 and 3) to the VM instead of the fake one, which
> takes care of both differences.
> 
> Moreover, on GB the entire device memory is exposed to the VM as
> cacheable, since no carveout is required.
> 
> Link: https://www.nvidia.com/en-in/technologies/multi-instance-gpu/ [1]
> Link: https://docs.nvidia.com/cuda/gpudirect-rdma/ [2]
> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>  drivers/vfio/pci/nvgrace-gpu/main.c | 32 +++++++++++++++++++++--------
>  1 file changed, 24 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index 85eacafaffdf..44a276c886e1 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -72,7 +72,7 @@ nvgrace_gpu_memregion(int index,
>  	if (index == USEMEM_REGION_INDEX)
>  		return &nvdev->usemem;
>  
> -	if (index == RESMEM_REGION_INDEX)
> +	if (!nvdev->has_mig_hw_bug_fix && index == RESMEM_REGION_INDEX)
>  		return &nvdev->resmem;
>  
>  	return NULL;
> @@ -715,6 +715,16 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
>  	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
>  };
>  
> +static void
> +nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
> +			      struct nvgrace_gpu_pci_core_device *nvdev,
> +			      u64 memphys, u64 memlength)
> +{
> +	nvdev->usemem.memphys = memphys;
> +	nvdev->usemem.memlength = memlength;
> +	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
> +}
> +
>  static int
>  nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
>  				  u64 *pmemphys, u64 *pmemlength)
> @@ -752,9 +762,9 @@ nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
>  }
>  
>  static int
> -nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
> -			      struct nvgrace_gpu_pci_core_device *nvdev,
> -			      u64 memphys, u64 memlength)
> +nvgrace_gpu_nvdev_struct_workaround(struct pci_dev *pdev,
> +				    struct nvgrace_gpu_pci_core_device *nvdev,
> +				    u64 memphys, u64 memlength)
>  {
>  	int ret = 0;
>  
> @@ -864,10 +874,16 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
>  		 * Device memory properties are identified in the host ACPI
>  		 * table. Set the nvgrace_gpu_pci_core_device structure.
>  		 */
> -		ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
> -						    memphys, memlength);
> -		if (ret)
> -			goto out_put_vdev;
> +		if (nvdev->has_mig_hw_bug_fix) {
> +			nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
> +						      memphys, memlength);
> +		} else {
> +			ret = nvgrace_gpu_nvdev_struct_workaround(pdev, nvdev,
> +								  memphys,
> +								  memlength);
> +			if (ret)
> +				goto out_put_vdev;
> +		}

Doesn't this work out much more naturally if we just do something like:

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 85eacafaffdf..43a9457442ff 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -17,9 +17,6 @@
 #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
 #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX
 
-/* Memory size expected as non cached and reserved by the VM driver */
-#define RESMEM_SIZE SZ_1G
-
 /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
 #define MEMBLK_SIZE SZ_512M
 
@@ -72,7 +69,7 @@ nvgrace_gpu_memregion(int index,
 	if (index == USEMEM_REGION_INDEX)
 		return &nvdev->usemem;
 
-	if (index == RESMEM_REGION_INDEX)
+	if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
 		return &nvdev->resmem;
 
 	return NULL;
@@ -757,6 +754,13 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 			      u64 memphys, u64 memlength)
 {
 	int ret = 0;
+	u64 resmem_size = 0;
+
+	/*
+	 * Comment about the GH bug that requires this and fix in GB
+	 */
+	if (!nvdev->has_mig_hw_bug_fix)
+		resmem_size = SZ_1G;
 
 	/*
 	 * The VM GPU device driver needs a non-cacheable region to support
@@ -780,7 +784,7 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 	 * memory (usemem) is added to the kernel for usage by the VM
 	 * workloads. Make the usable memory size memblock aligned.
 	 */
-	if (check_sub_overflow(memlength, RESMEM_SIZE,
+	if (check_sub_overflow(memlength, resmem_size,
 			       &nvdev->usemem.memlength)) {
 		ret = -EOVERFLOW;
 		goto done;
@@ -813,7 +817,9 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 	 * the BAR size for them.
 	 */
 	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
-	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
+	if (nvdev->resmem.memlength)
+		nvdev->resmem.bar_size =
+			roundup_pow_of_two(nvdev->resmem.memlength);
 done:
 	return ret;
 }

Thanks,
Alex
Ankit Agrawal Jan. 9, 2025, 10:57 p.m. UTC | #2
> Doesn't this work out much more naturally if we just do something like:
> 
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index 85eacafaffdf..43a9457442ff 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -17,9 +17,6 @@
> #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
> #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX
> 
> -/* Memory size expected as non cached and reserved by the VM driver */
> -#define RESMEM_SIZE SZ_1G
> -
> /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
> #define MEMBLK_SIZE SZ_512M
> 
> @@ -72,7 +69,7 @@ nvgrace_gpu_memregion(int index,
>         if (index == USEMEM_REGION_INDEX)
>                 return &nvdev->usemem;
> 
> -       if (index == RESMEM_REGION_INDEX)
> +       if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
>                 return &nvdev->resmem;
> 
>         return NULL;
> @@ -757,6 +754,13 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
>                               u64 memphys, u64 memlength)
> {
>          int ret = 0;
> +       u64 resmem_size = 0;
> +
> +       /*
> +        * Comment about the GH bug that requires this and fix in GB
> +        */
> +       if (!nvdev->has_mig_hw_bug_fix)
> +               resmem_size = SZ_1G;
> 
>         /*
>          * The VM GPU device driver needs a non-cacheable region to support
> @@ -780,7 +784,7 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
>          * memory (usemem) is added to the kernel for usage by the VM
>          * workloads. Make the usable memory size memblock aligned.
>          */
> -       if (check_sub_overflow(memlength, RESMEM_SIZE,
> +       if (check_sub_overflow(memlength, resmem_size,
>                                &nvdev->usemem.memlength)) {
>                 ret = -EOVERFLOW;
>                 goto done;
> @@ -813,7 +817,9 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
>          * the BAR size for them.
>          */
>         nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
> -       nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
> +       if (nvdev->resmem.memlength)
> +               nvdev->resmem.bar_size =
> +                       roundup_pow_of_two(nvdev->resmem.memlength);
>  done:
>         return ret;
>  }
> 

Thanks Alex, your suggestion does look simpler and better.

I'll test that out and send out an updated version of the patch.
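
To illustrate the effect of the refactor Alex suggests, below is a small
standalone userspace model, not driver code: the field and helper names only
loosely mirror the driver, and the check_sub_overflow()/memblock-alignment
handling is elided. It shows how a single init path behaves once resmem_size
collapses to zero on hardware with the MIG HW bug fixed.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define SZ_1G (1ULL << 30)

/* Userspace stand-in for the kernel's roundup_pow_of_two(). */
static uint64_t roundup_pow_of_two(uint64_t x)
{
	uint64_t r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

struct mem_region {
	uint64_t memlength;
	uint64_t bar_size;
};

/*
 * Single init path: resmem_size is non-zero only on hardware that
 * still has the MIG HW bug (GH), so the same arithmetic covers GH
 * and GB. Overflow and alignment checks are intentionally omitted.
 */
static void init_model(bool has_mig_hw_bug_fix, uint64_t memlength,
		       struct mem_region *usemem, struct mem_region *resmem)
{
	uint64_t resmem_size = has_mig_hw_bug_fix ? 0 : SZ_1G;

	usemem->memlength = memlength - resmem_size;
	resmem->memlength = resmem_size;

	usemem->bar_size = roundup_pow_of_two(usemem->memlength);
	if (resmem->memlength)
		resmem->bar_size = roundup_pow_of_two(resmem->memlength);
}

int main(void)
{
	struct mem_region usemem = { 0 }, resmem = { 0 };

	init_model(false, 96 * SZ_1G, &usemem, &resmem);	/* GH-like */
	printf("GH: usemem=%llu GiB, resmem=%llu GiB\n",
	       (unsigned long long)(usemem.memlength / SZ_1G),
	       (unsigned long long)(resmem.memlength / SZ_1G));

	init_model(true, 96 * SZ_1G, &usemem, &resmem);		/* GB-like */
	printf("GB: usemem=%llu GiB, resmem=%llu GiB\n",
	       (unsigned long long)(usemem.memlength / SZ_1G),
	       (unsigned long long)(resmem.memlength / SZ_1G));
	return 0;
}

Run, the GH-like case ends up with a 1 GiB resmem carveout and the remainder
as usemem, while the GB-like case leaves resmem.memlength at zero, so
nvgrace_gpu_memregion() would never hand out the resmem region.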

Patch

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 85eacafaffdf..44a276c886e1 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -72,7 +72,7 @@  nvgrace_gpu_memregion(int index,
 	if (index == USEMEM_REGION_INDEX)
 		return &nvdev->usemem;
 
-	if (index == RESMEM_REGION_INDEX)
+	if (!nvdev->has_mig_hw_bug_fix && index == RESMEM_REGION_INDEX)
 		return &nvdev->resmem;
 
 	return NULL;
@@ -715,6 +715,16 @@  static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
 	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
 };
 
+static void
+nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
+			      struct nvgrace_gpu_pci_core_device *nvdev,
+			      u64 memphys, u64 memlength)
+{
+	nvdev->usemem.memphys = memphys;
+	nvdev->usemem.memlength = memlength;
+	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
+}
+
 static int
 nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
 				  u64 *pmemphys, u64 *pmemlength)
@@ -752,9 +762,9 @@  nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
 }
 
 static int
-nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
-			      struct nvgrace_gpu_pci_core_device *nvdev,
-			      u64 memphys, u64 memlength)
+nvgrace_gpu_nvdev_struct_workaround(struct pci_dev *pdev,
+				    struct nvgrace_gpu_pci_core_device *nvdev,
+				    u64 memphys, u64 memlength)
 {
 	int ret = 0;
 
@@ -864,10 +874,16 @@  static int nvgrace_gpu_probe(struct pci_dev *pdev,
 		 * Device memory properties are identified in the host ACPI
 		 * table. Set the nvgrace_gpu_pci_core_device structure.
 		 */
-		ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
-						    memphys, memlength);
-		if (ret)
-			goto out_put_vdev;
+		if (nvdev->has_mig_hw_bug_fix) {
+			nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
+						      memphys, memlength);
+		} else {
+			ret = nvgrace_gpu_nvdev_struct_workaround(pdev, nvdev,
+								  memphys,
+								  memlength);
+			if (ret)
+				goto out_put_vdev;
+		}
 	}
 
 	ret = vfio_pci_core_register_device(&nvdev->core_device);