diff mbox series

[v3,3/3] vfio/nvgrace-gpu: Check the HBM training and C2C link status

Message ID 20250117152334.2786-4-ankita@nvidia.com (mailing list archive)
State New
Headers show
Series vfio/nvgrace-gpu: Enable grace blackwell boards | expand

Commit Message

Ankit Agrawal Jan. 17, 2025, 3:23 p.m. UTC
From: Ankit Agrawal <ankita@nvidia.com>

In contrast to Grace Hopper systems, the HBM training has been moved
out of the UEFI on the Grace Blackwell systems. This reduces the system
bootup time significantly.

The onus of checking whether the HBM training has completed thus falls
on the module.

The HBM training status can be determined from a BAR0 register.
Similarly, another BAR0 register exposes the status of the CPU-GPU
chip-to-chip (C2C) cache coherent interconnect.

Based on testing, 30s is determined to be sufficient to ensure
initialization completion on all the Grace based systems. Thus poll
these registers for up to 30s. If the HBM training is not complete
or if the C2C link is not ready, fail the probe.

While the wait is not required on Grace Hopper systems, it is
beneficial to make the check to ensure the device is in an
expected state. Hence keeping it generalized to both generations.

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 55 +++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

Comments

Alex Williamson Jan. 17, 2025, 6:27 p.m. UTC | #1
On Fri, 17 Jan 2025 15:23:34 +0000
<ankita@nvidia.com> wrote:

> From: Ankit Agrawal <ankita@nvidia.com>
> 
> In contrast to Grace Hopper systems, the HBM training has been moved
> out of the UEFI on the Grace Blackwell systems. This reduces the system
> bootup time significantly.
> 
> The onus of checking whether the HBM training has completed thus falls
> on the module.
> 
> The HBM training status can be determined from a BAR0 register.
> Similarly, another BAR0 register exposes the status of the CPU-GPU
> chip-to-chip (C2C) cache coherent interconnect.
> 
> Based on testing, 30s is determined to be sufficient to ensure
> initialization completion on all the Grace based systems. Thus poll
> these register and check for 30s. If the HBM training is not complete
> or if the C2C link is not ready, fail the probe.
> 
> While the time is not required on Grace Hopper systems, it is
> beneficial to make the check to ensure the device is in an
> expected state. Hence keeping it generalized to both the generations.
> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>  drivers/vfio/pci/nvgrace-gpu/main.c | 55 +++++++++++++++++++++++++++++
>  1 file changed, 55 insertions(+)
> 
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index 89d38e3c0261..6298e7f0fe1a 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -5,6 +5,8 @@
>  
>  #include <linux/sizes.h>
>  #include <linux/vfio_pci_core.h>
> +#include <linux/delay.h>
> +#include <linux/jiffies.h>
>  
>  /*
>   * The device memory usable to the workloads running in the VM is cached
> @@ -25,6 +27,13 @@
>  
>  #define GPU_CAP_DVSEC_REGISTER 3
>  
> +#define C2C_LINK_BAR0_OFFSET 0x1498
> +#define HBM_TRAINING_BAR0_OFFSET 0x200BC
> +#define STATUS_READY 0xFF
> +
> +#define POLL_QUANTUM_MS 1000
> +#define POLL_TIMEOUT_MS (30 * 1000)
> +
>  /*
>   * The state of the two device memory region - resmem and usemem - is
>   * saved as struct mem_region.
> @@ -855,6 +864,48 @@ static bool nvgrace_gpu_has_mig_hw_bug_fix(struct pci_dev *pdev)
>  	return false;
>  }
>  
> +/*
> + * To reduce the system bootup time, the HBM training has
> + * been moved out of the UEFI on the Grace-Blackwell systems.
> + *
> + * The onus of checking whether the HBM training has completed
> + * thus falls on the module. The HBM training status can be
> + * determined from a BAR0 register.
> + *
> + * Similarly, another BAR0 register exposes the status of the
> + * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
> + *
> + * Poll these register and check for 30s. If the HBM training is
> + * not complete or if the C2C link is not ready, fail the probe.
> + *
> + * While the wait is not required on Grace Hopper systems, it
> + * is beneficial to make the check to ensure the device is in an
> + * expected state.
> + */
> +static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
> +{
> +	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
> +	void __iomem *io;
> +	int ret = -ETIME;
> +
> +	io = pci_iomap(pdev, 0, 0);
> +	if (!io)
> +		return -ENOMEM;
> +
> +	do {
> +		if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
> +		    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
> +			ret = 0;
> +			goto reg_check_exit;
> +		}
> +		msleep(POLL_QUANTUM_MS);
> +	} while (!time_after(jiffies, timeout));
> +
> +reg_check_exit:
> +	pci_iounmap(pdev, io);
> +	return ret;

We're accessing device memory here but afaict the memory enable bit of
the command register is in an indeterminate state.  What happens if you
use setpci to clear the memory enable bit or 'echo 0 > enable' before
binding the driver?  Thanks,

Alex

> +}
> +
>  static int nvgrace_gpu_probe(struct pci_dev *pdev,
>  			     const struct pci_device_id *id)
>  {
> @@ -863,6 +914,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
>  	u64 memphys, memlength;
>  	int ret;
>  
> +	ret = nvgrace_gpu_wait_device_ready(pdev);
> +	if (ret)
> +		return ret;
> +
>  	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
>  	if (!ret)
>  		ops = &nvgrace_gpu_pci_ops;
Ankit Agrawal Jan. 17, 2025, 7:19 p.m. UTC | #2
>> +/*
>> + * To reduce the system bootup time, the HBM training has
>> + * been moved out of the UEFI on the Grace-Blackwell systems.
>> + *
>> + * The onus of checking whether the HBM training has completed
>> + * thus falls on the module. The HBM training status can be
>> + * determined from a BAR0 register.
>> + *
>> + * Similarly, another BAR0 register exposes the status of the
>> + * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
>> + *
>> + * Poll these register and check for 30s. If the HBM training is
>> + * not complete or if the C2C link is not ready, fail the probe.
>> + *
>> + * While the wait is not required on Grace Hopper systems, it
>> + * is beneficial to make the check to ensure the device is in an
>> + * expected state.
>> + */
>> +static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
>> +{
>> +     unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
>> +     void __iomem *io;
>> +     int ret = -ETIME;
>> +
>> +     io = pci_iomap(pdev, 0, 0);
>> +     if (!io)
>> +             return -ENOMEM;
>> +
>> +     do {
>> +             if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
>> +                 (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
>> +                     ret = 0;
>> +                     goto reg_check_exit;
>> +             }
>> +             msleep(POLL_QUANTUM_MS);
>> +     } while (!time_after(jiffies, timeout));
>> +
>> +reg_check_exit:
>> +     pci_iounmap(pdev, io);
>> +     return ret;
>
> We're accessing device memory here but afaict the memory enable bit of
> the command register is in an indeterminate state.  What happens if you
> use setpci to clear the memory enable bit or 'echo 0 > enable' before
> binding the driver?  Thanks,
>
> Alex

Hi Alex, sorry I didn't understand how we are accessing device memory here if
the C2C_LINK_BAR0_OFFSET and HBM_TRAINING_BAR0_OFFSET are BAR0 regs.
But anyways, I tried 'echo 0 > <sysfs_path>/enable' before device bind. I am not
observing any issue and the bind goes through.

Or am I missing something?
Alex Williamson Jan. 17, 2025, 7:39 p.m. UTC | #3
On Fri, 17 Jan 2025 19:19:42 +0000
Ankit Agrawal <ankita@nvidia.com> wrote:

> >> +/*
> >> + * To reduce the system bootup time, the HBM training has
> >> + * been moved out of the UEFI on the Grace-Blackwell systems.
> >> + *
> >> + * The onus of checking whether the HBM training has completed
> >> + * thus falls on the module. The HBM training status can be
> >> + * determined from a BAR0 register.
> >> + *
> >> + * Similarly, another BAR0 register exposes the status of the
> >> + * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
> >> + *
> >> + * Poll these register and check for 30s. If the HBM training is
> >> + * not complete or if the C2C link is not ready, fail the probe.
> >> + *
> >> + * While the wait is not required on Grace Hopper systems, it
> >> + * is beneficial to make the check to ensure the device is in an
> >> + * expected state.
> >> + */
> >> +static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
> >> +{
> >> +     unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
> >> +     void __iomem *io;
> >> +     int ret = -ETIME;
> >> +
> >> +     io = pci_iomap(pdev, 0, 0);
> >> +     if (!io)
> >> +             return -ENOMEM;
> >> +
> >> +     do {
> >> +             if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
> >> +                 (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
> >> +                     ret = 0;
> >> +                     goto reg_check_exit;
> >> +             }
> >> +             msleep(POLL_QUANTUM_MS);
> >> +     } while (!time_after(jiffies, timeout));
> >> +
> >> +reg_check_exit:
> >> +     pci_iounmap(pdev, io);
> >> +     return ret;  
> >
> > We're accessing device memory here but afaict the memory enable bit of
> > the command register is in an indeterminate state.  What happens if you
> > use setpci to clear the memory enable bit or 'echo 0 > enable' before
> > binding the driver?  Thanks,
> >
> > Alex  
> 
> Hi Alex, sorry I didn't understand how we are accessing device memory here if
> the C2C_LINK_BAR0_OFFSET and HBM_TRAINING_BAR0_OFFSET are BAR0 regs.
> But anyways, I tried 'echo 0 > <sysfs_path>/enable' before device bind. I am not
> observing any issue and the bind goes through.
> 
> Or am I missing something? 

BAR0 is what I'm referring to as device memory.  We cannot access
registers in BAR0 unless the memory space enable bit of the command
register is set.  The nvgrace-gpu driver makes no effort to enable this
and I don't think the PCI core does before probe either.  Disabling
through sysfs will only disable if it was previously enabled, so
possibly that test was invalid.  Please try with setpci:

# Read command register
$ setpci -s xxxx:xx:xx.x COMMAND
# Clear memory enable
$ setpci -s xxxx:xx:xx.x COMMAND=0:2
# Re-read command register
$ setpci -s xxxx:xx:xx.x COMMAND

Probe the driver here now that the memory enable bit should read back
as unset.  Thanks,

Alex
Ankit Agrawal Jan. 17, 2025, 9:13 p.m. UTC | #4
>> > We're accessing device memory here but afaict the memory enable bit of
>> > the command register is in an indeterminate state.  What happens if you
>> > use setpci to clear the memory enable bit or 'echo 0 > enable' before
>> > binding the driver?  Thanks,
>> >
>> > Alex
>>
>> Hi Alex, sorry I didn't understand how we are accessing device memory here if
>> the C2C_LINK_BAR0_OFFSET and HBM_TRAINING_BAR0_OFFSET are BAR0 regs.
>> But anyways, I tried 'echo 0 > <sysfs_path>/enable' before device bind. I am not
>> observing any issue and the bind goes through.
>>
>> Or am I missing something?
>
> BAR0 is what I'm referring to as device memory.  We cannot access
> registers in BAR0 unless the memory space enable bit of the command
> register is set.  The nvgrace-gpu driver makes no effort to enable this
> and I don't think the PCI core does before probe either.  Disabling
> through sysfs will only disable if it was previously enabled, so
> possibly that test was invalid.  Please try with setpci:
>
> # Read command register
> $ setpci -s xxxx:xx:xx.x COMMAND
> # Clear memory enable
> $ setpci -s xxxx:xx:xx.x COMMAND=0:2
> # Re-read command register
> $ setpci -s xxxx:xx:xx.x COMMAND
>
> Probe driver here now that the memory enable bit should re--back as
> unset.  Thanks,
>
> Alex

Ok, yeah. I tried to disable through setpci, and the probe is failing with ETIME.
Should we check if disabled and return -EIO for such situation to differentiate
from timeout?
Alex Williamson Jan. 17, 2025, 9:31 p.m. UTC | #5
On Fri, 17 Jan 2025 21:13:52 +0000
Ankit Agrawal <ankita@nvidia.com> wrote:

> >> > We're accessing device memory here but afaict the memory enable bit of
> >> > the command register is in an indeterminate state.  What happens if you
> >> > use setpci to clear the memory enable bit or 'echo 0 > enable' before
> >> > binding the driver?  Thanks,
> >> >
> >> > Alex  
> >>
> >> Hi Alex, sorry I didn't understand how we are accessing device memory here if
> >> the C2C_LINK_BAR0_OFFSET and HBM_TRAINING_BAR0_OFFSET are BAR0 regs.
> >> But anyways, I tried 'echo 0 > <sysfs_path>/enable' before device bind. I am not
> >> observing any issue and the bind goes through.
> >>
> >> Or am I missing something?  
> >
> > BAR0 is what I'm referring to as device memory.  We cannot access
> > registers in BAR0 unless the memory space enable bit of the command
> > register is set.  The nvgrace-gpu driver makes no effort to enable this
> > and I don't think the PCI core does before probe either.  Disabling
> > through sysfs will only disable if it was previously enabled, so
> > possibly that test was invalid.  Please try with setpci:
> >
> > # Read command register
> > $ setpci -s xxxx:xx:xx.x COMMAND
> > # Clear memory enable
> > $ setpci -s xxxx:xx:xx.x COMMAND=0:2
> > # Re-read command register
> > $ setpci -s xxxx:xx:xx.x COMMAND
> >
> > Probe driver here now that the memory enable bit should re--back as
> > unset.  Thanks,
> >
> > Alex  
> 
> Ok, yeah. I tried to disable through setpci, and the probe is failing with ETIME.
> Should we check if disabled and return -EIO for such situation to differentiate
> from timeout?

No, the driver needs to enable memory on the device around the iomap
rather than assuming the initial state.  Thanks,

Alex
Ankit Agrawal Jan. 17, 2025, 9:45 p.m. UTC | #6
>>
>> Ok, yeah. I tried to disable through setpci, and the probe is failing with ETIME.
>> Should we check if disabled and return -EIO for such situation to differentiate
>> from timeout?
>
> No, the driver needs to enable memory on the device around the iomap
> rather than assuming the initial state.  Thanks,
>
> Alex

Ack, thanks for the suggestion.

I'll change nvgrace_gpu_wait_device_ready to read the PCI_COMMAND
through pci_read_config_word before pci_iomap. And if PCI_COMMAND_MEMORY
is not set, update through pci_write_config_word.
Ankit Agrawal Jan. 17, 2025, 9:56 p.m. UTC | #7
>> No, the driver needs to enable memory on the device around the iomap
>> rather than assuming the initial state.  Thanks,
>>
>> Alex
>
> Ack, thanks for the suggestion.
>
> I'll change nvgrace_gpu_wait_device_ready to read the PCI_COMMAND
> through pci_read_config_word before pci_iomap. And if PCI_COMMAND_MEMORY
> is not set, update through pci_write_config_word.

Or perhaps better to call vfio_pci_memory_lock_and_enable(). I'll make the change
and send out the next version.
diff mbox series

Patch

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 89d38e3c0261..6298e7f0fe1a 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -5,6 +5,8 @@ 
 
 #include <linux/sizes.h>
 #include <linux/vfio_pci_core.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
 
 /*
  * The device memory usable to the workloads running in the VM is cached
@@ -25,6 +27,13 @@ 
 
 #define GPU_CAP_DVSEC_REGISTER 3
 
+#define C2C_LINK_BAR0_OFFSET 0x1498
+#define HBM_TRAINING_BAR0_OFFSET 0x200BC
+#define STATUS_READY 0xFF
+
+#define POLL_QUANTUM_MS 1000
+#define POLL_TIMEOUT_MS (30 * 1000)
+
 /*
  * The state of the two device memory region - resmem and usemem - is
  * saved as struct mem_region.
@@ -855,6 +864,48 @@  static bool nvgrace_gpu_has_mig_hw_bug_fix(struct pci_dev *pdev)
 	return false;
 }
 
+/*
+ * To reduce the system bootup time, the HBM training has
+ * been moved out of the UEFI on the Grace-Blackwell systems.
+ *
+ * The onus of checking whether the HBM training has completed
+ * thus falls on the module. The HBM training status can be
+ * determined from a BAR0 register.
+ *
+ * Similarly, another BAR0 register exposes the status of the
+ * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
+ *
+ * Poll these registers for up to 30s. If the HBM training is
+ * not complete or if the C2C link is not ready, fail the probe.
+ *
+ * While the wait is not required on Grace Hopper systems, it
+ * is beneficial to make the check to ensure the device is in an
+ * expected state.
+ */
+static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
+{
+	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
+	void __iomem *io;
+	int ret = -ETIME;
+
+	io = pci_iomap(pdev, 0, 0);
+	if (!io)
+		return -ENOMEM;
+
+	do {
+		if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
+		    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
+			ret = 0;
+			goto reg_check_exit;
+		}
+		msleep(POLL_QUANTUM_MS);
+	} while (!time_after(jiffies, timeout));
+
+reg_check_exit:
+	pci_iounmap(pdev, io);
+	return ret;
+}
+
 static int nvgrace_gpu_probe(struct pci_dev *pdev,
 			     const struct pci_device_id *id)
 {
@@ -863,6 +914,10 @@  static int nvgrace_gpu_probe(struct pci_dev *pdev,
 	u64 memphys, memlength;
 	int ret;
 
+	ret = nvgrace_gpu_wait_device_ready(pdev);
+	if (ret)
+		return ret;
+
 	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
 	if (!ret)
 		ops = &nvgrace_gpu_pci_ops;