
[v4,3/3] vfio/nvgrace-gpu: Check the HBM training and C2C link status

Message ID 20250117233704.3374-4-ankita@nvidia.com
State New
Series vfio/nvgrace-gpu: Enable Grace Blackwell boards

Commit Message

Ankit Agrawal Jan. 17, 2025, 11:37 p.m. UTC
From: Ankit Agrawal <ankita@nvidia.com>

In contrast to Grace Hopper systems, the HBM training has been moved
out of UEFI on Grace Blackwell systems. This reduces the system
bootup time significantly.

The onus of checking whether the HBM training has completed thus falls
on the module.

The HBM training status can be determined from a BAR0 register.
Similarly, another BAR0 register exposes the status of the CPU-GPU
chip-to-chip (C2C) cache coherent interconnect.

Based on testing, 30s is determined to be sufficient to ensure
initialization completion on all Grace-based systems. Thus poll
these registers for up to 30s. If the HBM training is not complete
or if the C2C link is not ready, fail the probe.

While the wait is not required on Grace Hopper systems, it is
beneficial to make the check to ensure the device is in an
expected state. Hence the check is kept generic across both generations.

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 64 +++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci_core.c    |  2 +
 2 files changed, 66 insertions(+)

Comments

Alex Williamson Jan. 18, 2025, 1:52 a.m. UTC | #1
On Fri, 17 Jan 2025 23:37:04 +0000
<ankita@nvidia.com> wrote:

> From: Ankit Agrawal <ankita@nvidia.com>
> 
> In contrast to Grace Hopper systems, the HBM training has been moved
> out of UEFI on Grace Blackwell systems. This reduces the system
> bootup time significantly.
> 
> The onus of checking whether the HBM training has completed thus falls
> on the module.
> 
> The HBM training status can be determined from a BAR0 register.
> Similarly, another BAR0 register exposes the status of the CPU-GPU
> chip-to-chip (C2C) cache coherent interconnect.
> 
> Based on testing, 30s is determined to be sufficient to ensure
> initialization completion on all Grace-based systems. Thus poll
> these registers for up to 30s. If the HBM training is not complete
> or if the C2C link is not ready, fail the probe.
> 
> While the wait is not required on Grace Hopper systems, it is
> beneficial to make the check to ensure the device is in an
> expected state. Hence the check is kept generic across both generations.
> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>  drivers/vfio/pci/nvgrace-gpu/main.c | 64 +++++++++++++++++++++++++++++
>  drivers/vfio/pci/vfio_pci_core.c    |  2 +
>  2 files changed, 66 insertions(+)
> 
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index e6fe5bc8940f..d3529d2cc3b0 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -5,6 +5,10 @@
>  
>  #include <linux/sizes.h>
>  #include <linux/vfio_pci_core.h>
> +#include <linux/delay.h>
> +#include <linux/jiffies.h>
> +
> +#include "../vfio_pci_priv.h"
>  
>  /*
>   * The device memory usable to the workloads running in the VM is cached
> @@ -25,6 +29,13 @@
>  
>  #define GPU_CAP_DVSEC_REGISTER 3
>  
> +#define C2C_LINK_BAR0_OFFSET 0x1498
> +#define HBM_TRAINING_BAR0_OFFSET 0x200BC
> +#define STATUS_READY 0xFF
> +
> +#define POLL_QUANTUM_MS 1000
> +#define POLL_TIMEOUT_MS (30 * 1000)
> +
>  /*
>   * The state of the two device memory region - resmem and usemem - is
>   * saved as struct mem_region.
> @@ -856,6 +867,55 @@ static bool nvgrace_gpu_has_mig_hw_bug_fix(struct pci_dev *pdev)
>  	return false;
>  }
>  
> +/*
> + * To reduce the system bootup time, the HBM training has
> + * been moved out of UEFI on the Grace Blackwell systems.
> + *
> + * The onus of checking whether the HBM training has completed
> + * thus falls on the module. The HBM training status can be
> + * determined from a BAR0 register.
> + *
> + * Similarly, another BAR0 register exposes the status of the
> + * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
> + *
> + * Poll these registers for up to 30s. If the HBM training is
> + * not complete or if the C2C link is not ready, fail the probe.
> + *
> + * While the wait is not required on Grace Hopper systems, it
> + * is beneficial to make the check to ensure the device is in an
> + * expected state.
> + */
> +static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev,
> +					 struct vfio_pci_core_device *vdev)
> +{
> +	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
> +	void __iomem *io;
> +	int ret = -ETIME;
> +	u16 cmd;
> +
> +	cmd = vfio_pci_memory_lock_and_enable(vdev);
> +	io = pci_iomap(pdev, 0, 0);
> +	if (!io) {
> +		ret = -ENOMEM;
> +		goto iomap_exit;
> +	}
> +
> +	do {
> +		if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
> +		    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
> +			ret = 0;
> +			goto reg_check_exit;
> +		}
> +		msleep(POLL_QUANTUM_MS);
> +	} while (!time_after(jiffies, timeout));
> +
> +reg_check_exit:
> +	pci_iounmap(pdev, io);
> +iomap_exit:
> +	vfio_pci_memory_unlock_and_restore(vdev, cmd);
> +	return ret;
> +}
> +
>  static int nvgrace_gpu_probe(struct pci_dev *pdev,
>  			     const struct pci_device_id *id)
>  {
> @@ -875,6 +935,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
>  
>  	dev_set_drvdata(&pdev->dev, &nvdev->core_device);
>  
> +	ret = nvgrace_gpu_wait_device_ready(pdev, &nvdev->core_device);
> +	if (ret)
> +		return ret;
> +
>  	if (ops == &nvgrace_gpu_pci_ops) {
>  		nvdev->has_mig_hw_bug_fix = nvgrace_gpu_has_mig_hw_bug_fix(pdev);
>  
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 90240c8d51aa..68f123d17c4b 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1634,12 +1634,14 @@ u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
>  
>  	return cmd;
>  }
> +EXPORT_SYMBOL_GPL(vfio_pci_memory_lock_and_enable);
>  
>  void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
>  {
>  	pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
>  	up_write(&vdev->memory_lock);
>  }
> +EXPORT_SYMBOL_GPL(vfio_pci_memory_unlock_and_restore);
>  
>  static unsigned long vma_to_pfn(struct vm_area_struct *vma)
>  {

The access is happening before the device is exposed to the user; the
above are for handling conditions where there may be races with user
access. This is totally unnecessary.

Does this delay even need to happen in the probe function, or could it
happen in the open_device callback?  That would still be before user
access, but if we expect it to generally work, it would allow the
training to happen in the background up until the user tries to open
the device.  Thanks,

Alex
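
A minimal sketch of the open_device placement suggested above, reusing
nvgrace_gpu_wait_device_ready() from this patch. The callback body and the
vfio_pci_core_enable()/vfio_pci_core_finish_enable() sequencing are assumed
from the existing nvgrace-gpu structure, not taken from a posted revision:

static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	int ret;

	/*
	 * Deferring the up-to-30s readiness poll to first open lets the
	 * HBM training run in the background after driver bind.
	 */
	ret = nvgrace_gpu_wait_device_ready(vdev->pdev, vdev);
	if (ret)
		return ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	vfio_pci_core_finish_enable(vdev);
	return 0;
}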
Ankit Agrawal Jan. 20, 2025, 2:24 a.m. UTC | #2
>> +EXPORT_SYMBOL_GPL(vfio_pci_memory_lock_and_enable);
>>
>>  void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
>>  {
>>       pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
>>       up_write(&vdev->memory_lock);
>>  }
>> +EXPORT_SYMBOL_GPL(vfio_pci_memory_unlock_and_restore);
>>
>>  static unsigned long vma_to_pfn(struct vm_area_struct *vma)
>>  {
>
> The access is happening before the device is exposed to the user; the
> above are for handling conditions where there may be races with user
> access. This is totally unnecessary.

Right. What I could do to reuse the code is to factor the parts not
related to locking/unlocking out into new functions and export those.
The current vfio_pci_memory_lock_and_enable() would take the lock
and call the new function; same for vfio_pci_memory_unlock_and_restore().
The nvgrace module could then call the new functions directly. Does that
sound reasonable?
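
A rough sketch of that split; vfio_pci_memory_enable() is a hypothetical
name, and its body mirrors what vfio_pci_memory_lock_and_enable() does
today (the next reply suggests plain PCI helpers instead):

/* Hypothetical lock-free helper: enable memory decode, return old command. */
u16 vfio_pci_memory_enable(struct vfio_pci_core_device *vdev)
{
	u16 cmd;

	pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
	if (!(cmd & PCI_COMMAND_MEMORY))
		pci_write_config_word(vdev->pdev, PCI_COMMAND,
				      cmd | PCI_COMMAND_MEMORY);
	return cmd;
}
EXPORT_SYMBOL_GPL(vfio_pci_memory_enable);

u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
{
	down_write(&vdev->memory_lock);
	return vfio_pci_memory_enable(vdev);
}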

> Does this delay even need to happen in the probe function, or could it
> happen in the open_device callback?  That would still be before user
> access, but if we expect it to generally work, it would allow the
> training to happen in the background up until the user tries to open
> the device.  Thanks,
>
> Alex

The thought process is that since this is purely the bare-metal hardware
coming to a proper state during boot, the nvgrace module should probably
wait for the startup to complete during probe() instead of delaying it
until open() time.

- Ankit Agrawal
Alex Williamson Jan. 20, 2025, 3:12 a.m. UTC | #3
On Mon, 20 Jan 2025 02:24:14 +0000
Ankit Agrawal <ankita@nvidia.com> wrote:

> >> +EXPORT_SYMBOL_GPL(vfio_pci_memory_lock_and_enable);
> >>
> >>  void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
> >>  {
> >>       pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
> >>       up_write(&vdev->memory_lock);
> >>  }
> >> +EXPORT_SYMBOL_GPL(vfio_pci_memory_unlock_and_restore);
> >>
> >>  static unsigned long vma_to_pfn(struct vm_area_struct *vma)
> >>  {  
> >
> > The access is happening before the device is exposed to the user; the
> > above are for handling conditions where there may be races with user
> > access. This is totally unnecessary.
> 
> Right. What I could do to reuse the code is to factor the parts not
> related to locking/unlocking out into new functions and export those.
> The current vfio_pci_memory_lock_and_enable() would take the lock
> and call the new function; same for vfio_pci_memory_unlock_and_restore().
> The nvgrace module could then call the new functions directly. Does that
> sound reasonable?

No, this is standard PCI driver stuff, everything you need is already
there.  Probably pci_enable_device() and some variant of
pci_request_regions().
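
A sketch of that standard-PCI shape for the probe-time wait, with no vfio
core exports needed; the helper layout and error labels are illustrative,
not taken from a posted revision:

static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
{
	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
	void __iomem *io;
	int ret;

	/* pci_enable_device() takes care of turning on memory decode. */
	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	/* Claim and map only BAR0, which holds both status registers. */
	ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
	if (ret)
		goto disable;

	io = pci_iomap(pdev, 0, 0);
	if (!io) {
		ret = -ENOMEM;
		goto release;
	}

	ret = -ETIME;
	do {
		if (ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY &&
		    ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY) {
			ret = 0;
			break;
		}
		msleep(POLL_QUANTUM_MS);
	} while (!time_after(jiffies, timeout));

	pci_iounmap(pdev, io);
release:
	pci_release_selected_regions(pdev, 1 << 0);
disable:
	pci_disable_device(pdev);
	return ret;
}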

> > Does this delay even need to happen in the probe function, or could it
> > happen in the open_device callback?  That would still be before user
> > access, but if we expect it to generally work, it would allow the
> > training to happen in the background up until the user tries to open
> > the device.  Thanks,
> >
> > Alex  
> 
> The thought process is that since this is purely the bare-metal hardware
> coming to a proper state during boot, the nvgrace module should probably
> wait for the startup to complete during probe() instead of delaying it
> until open() time.

If the driver is statically loaded, that might mean you're willing to
stall boot for up to 30s.  In practice is this ever actually going to
fail?  Thanks,

Alex
Alex Williamson Jan. 20, 2025, 3:22 a.m. UTC | #4
On Sun, 19 Jan 2025 20:12:32 -0700
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Mon, 20 Jan 2025 02:24:14 +0000
> Ankit Agrawal <ankita@nvidia.com> wrote:
> 
> > >> +EXPORT_SYMBOL_GPL(vfio_pci_memory_lock_and_enable);
> > >>
> > >>  void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
> > >>  {
> > >>       pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
> > >>       up_write(&vdev->memory_lock);
> > >>  }
> > >> +EXPORT_SYMBOL_GPL(vfio_pci_memory_unlock_and_restore);
> > >>
> > >>  static unsigned long vma_to_pfn(struct vm_area_struct *vma)
> > >>  {    
> > >
> > > The access is happening before the device is exposed to the user; the
> > > above are for handling conditions where there may be races with user
> > > access. This is totally unnecessary.
> > 
> > Right. What I could do to reuse the code is to factor the parts not
> > related to locking/unlocking out into new functions and export those.
> > The current vfio_pci_memory_lock_and_enable() would take the lock
> > and call the new function; same for vfio_pci_memory_unlock_and_restore().
> > The nvgrace module could then call the new functions directly. Does that
> > sound reasonable?
> 
> No, this is standard PCI driver stuff, everything you need is already
> there.  Probably pci_enable_device() and some variant of
> pci_request_regions().
> 
> > > Does this delay even need to happen in the probe function, or could it
> > > happen in the open_device callback?  That would still be before user
> > > access, but if we expect it to generally work, it would allow the
> > > training to happen in the background up until the user tries to open
> > > the device.  Thanks,
> > >
> > > Alex    
> > 
> > The thought process is that since this is purely the bare-metal hardware
> > coming to a proper state during boot, the nvgrace module should probably
> > wait for the startup to complete during probe() instead of delaying it
> > until open() time.
> 
> If the driver is statically loaded, that might mean you're willing to
> stall boot for up to 30s.  In practice is this ever actually going to
> fail?  Thanks,

On second thought, I guess a vfio-pci variant driver can't
automatically bind to a device, whether statically built or not, so
maybe this isn't a concern.  I'm not sure if there are other concerns
with busy waiting for up to 30s at driver probe.  Thanks,

Alex
Ankit Agrawal Jan. 20, 2025, 3:35 a.m. UTC | #5
> No, this is standard PCI driver stuff, everything you need is already
> there.  Probably pci_enable_device() and some variant of
> pci_request_regions().

Ok thanks, I'll take a look at that.

>> > > Does this delay even need to happen in the probe function, or could it
>> > > happen in the open_device callback?  That would still be before user
>> > > access, but if we expect it to generally work, it would allow the
>> > > training to happen in the background up until the user tries to open
>> > > the device.  Thanks,
>> > >
>> > > Alex
>> >
>> > The thought process is that since this is purely the bare-metal hardware
>> > coming to a proper state during boot, the nvgrace module should probably
>> > wait for the startup to complete during probe() instead of delaying it
>> > until open() time.
>>
>> If the driver is statically loaded, that might mean you're willing to
>> stall boot for up to 30s.  In practice is this ever actually going to
>> fail?  Thanks,

No, I have not seen this time out in my testing. 30s is considered
sufficient to be sure that the hardware is not in a bad state.

> On second thought, I guess a vfio-pci variant driver can't
> automatically bind to a device, whether statically built or not, so
> maybe this isn't a concern.  I'm not sure if there are other concerns
> with busy waiting for up to 30s at driver probe.  Thanks,
>
> Alex
Tian, Kevin Jan. 20, 2025, 7:04 a.m. UTC | #6
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Monday, January 20, 2025 11:23 AM
> 
> On Sun, 19 Jan 2025 20:12:32 -0700
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> > On Mon, 20 Jan 2025 02:24:14 +0000
> > Ankit Agrawal <ankita@nvidia.com> wrote:
> >
> > > >> +EXPORT_SYMBOL_GPL(vfio_pci_memory_lock_and_enable);
> > > >>
> > > >>  void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
> > > >>  {
> > > >>       pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
> > > >>       up_write(&vdev->memory_lock);
> > > >>  }
> > > >> +EXPORT_SYMBOL_GPL(vfio_pci_memory_unlock_and_restore);
> > > >>
> > > >>  static unsigned long vma_to_pfn(struct vm_area_struct *vma)
> > > >>  {
> > > >
> > > > The access is happening before the device is exposed to the user; the
> > > > above are for handling conditions where there may be races with user
> > > > access. This is totally unnecessary.
> > >
> > > Right. What I could do to reuse the code is to factor the parts not
> > > related to locking/unlocking out into new functions and export those.
> > > The current vfio_pci_memory_lock_and_enable() would take the lock
> > > and call the new function; same for vfio_pci_memory_unlock_and_restore().
> > > The nvgrace module could then call the new functions directly. Does that
> > > sound reasonable?
> >
> > No, this is standard PCI driver stuff, everything you need is already
> > there.  Probably pci_enable_device() and some variant of
> > pci_request_regions().
> >
> > > > Does this delay even need to happen in the probe function, or could it
> > > > happen in the open_device callback?  That would still be before user
> > > > access, but if we expect it to generally work, it would allow the
> > > > training to happen in the background up until the user tries to open
> > > > the device.  Thanks,
> > > >
> > > > Alex
> > >
> > > The thought process is that since this is purely the bare-metal hardware
> > > coming to a proper state during boot, the nvgrace module should probably
> > > wait for the startup to complete during probe() instead of delaying it
> > > until open() time.
> >
> > If the driver is statically loaded, that might mean you're willing to
> > stall boot for up to 30s.  In practice is this ever actually going to
> > fail?  Thanks,
> 
> On second thought, I guess a vfio-pci variant driver can't
> automatically bind to a device, whether statically built or not, so
> maybe this isn't a concern.  I'm not sure if there are other concerns
> with busy waiting for up to 30s at driver probe.  Thanks,
> 

Can this wait be left to userspace, i.e. the variant driver just does a
one-off check and fails the probe if the device is not ready? NVIDIA
can document the requirement that the administrator may need to
wait 30s and retry the driver probe if the 1st attempt fails...
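
A sketch of that one-shot variant; nvgrace_gpu_check_device_ready() and the
-EAGAIN return are illustrative, and it assumes memory decode has already
been enabled (e.g. via pci_enable_device() as in the sketch above):

static int nvgrace_gpu_check_device_ready(struct pci_dev *pdev)
{
	void __iomem *io;
	int ret = 0;

	io = pci_iomap(pdev, 0, 0);
	if (!io)
		return -ENOMEM;

	/* No polling loop: report not-ready and let the admin retry the bind. */
	if (ioread32(io + C2C_LINK_BAR0_OFFSET) != STATUS_READY ||
	    ioread32(io + HBM_TRAINING_BAR0_OFFSET) != STATUS_READY)
		ret = -EAGAIN;

	pci_iounmap(pdev, io);
	return ret;
}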
Jason Gunthorpe Jan. 20, 2025, 1:12 p.m. UTC | #7
On Sun, Jan 19, 2025 at 08:22:52PM -0700, Alex Williamson wrote:
> On second thought, I guess a vfio-pci variant driver can't
> automatically bind to a device, whether statically built or not, so
> maybe this isn't a concern.  I'm not sure if there are other concerns
> with busy waiting for up to 30s at driver probe.  Thanks,

It is not entirely abnormal. mlx5 has a timeout while it pushes
its first command, and on the failure path it is long, > 30s IIRC.

Drivers that take a couple of seconds to complete probe are not
unusual.

Jason

Patch

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index e6fe5bc8940f..d3529d2cc3b0 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -5,6 +5,10 @@ 
 
 #include <linux/sizes.h>
 #include <linux/vfio_pci_core.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+
+#include "../vfio_pci_priv.h"
 
 /*
  * The device memory usable to the workloads running in the VM is cached
@@ -25,6 +29,13 @@ 
 
 #define GPU_CAP_DVSEC_REGISTER 3
 
+#define C2C_LINK_BAR0_OFFSET 0x1498
+#define HBM_TRAINING_BAR0_OFFSET 0x200BC
+#define STATUS_READY 0xFF
+
+#define POLL_QUANTUM_MS 1000
+#define POLL_TIMEOUT_MS (30 * 1000)
+
 /*
  * The state of the two device memory region - resmem and usemem - is
  * saved as struct mem_region.
@@ -856,6 +867,55 @@  static bool nvgrace_gpu_has_mig_hw_bug_fix(struct pci_dev *pdev)
 	return false;
 }
 
+/*
+ * To reduce the system bootup time, the HBM training has
+ * been moved out of UEFI on the Grace Blackwell systems.
+ *
+ * The onus of checking whether the HBM training has completed
+ * thus falls on the module. The HBM training status can be
+ * determined from a BAR0 register.
+ *
+ * Similarly, another BAR0 register exposes the status of the
+ * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
+ *
+ * Poll these registers for up to 30s. If the HBM training is
+ * not complete or if the C2C link is not ready, fail the probe.
+ *
+ * While the wait is not required on Grace Hopper systems, it
+ * is beneficial to make the check to ensure the device is in an
+ * expected state.
+ */
+static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev,
+					 struct vfio_pci_core_device *vdev)
+{
+	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
+	void __iomem *io;
+	int ret = -ETIME;
+	u16 cmd;
+
+	cmd = vfio_pci_memory_lock_and_enable(vdev);
+	io = pci_iomap(pdev, 0, 0);
+	if (!io) {
+		ret = -ENOMEM;
+		goto iomap_exit;
+	}
+
+	do {
+		if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
+		    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
+			ret = 0;
+			goto reg_check_exit;
+		}
+		msleep(POLL_QUANTUM_MS);
+	} while (!time_after(jiffies, timeout));
+
+reg_check_exit:
+	pci_iounmap(pdev, io);
+iomap_exit:
+	vfio_pci_memory_unlock_and_restore(vdev, cmd);
+	return ret;
+}
+
 static int nvgrace_gpu_probe(struct pci_dev *pdev,
 			     const struct pci_device_id *id)
 {
@@ -875,6 +935,10 @@  static int nvgrace_gpu_probe(struct pci_dev *pdev,
 
 	dev_set_drvdata(&pdev->dev, &nvdev->core_device);
 
+	ret = nvgrace_gpu_wait_device_ready(pdev, &nvdev->core_device);
+	if (ret)
+		return ret;
+
 	if (ops == &nvgrace_gpu_pci_ops) {
 		nvdev->has_mig_hw_bug_fix = nvgrace_gpu_has_mig_hw_bug_fix(pdev);
 
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 90240c8d51aa..68f123d17c4b 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1634,12 +1634,14 @@  u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
 
 	return cmd;
 }
+EXPORT_SYMBOL_GPL(vfio_pci_memory_lock_and_enable);
 
 void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
 {
 	pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
 	up_write(&vdev->memory_lock);
 }
+EXPORT_SYMBOL_GPL(vfio_pci_memory_unlock_and_restore);
 
 static unsigned long vma_to_pfn(struct vm_area_struct *vma)
 {