diff mbox series

[for-next,v2,1/4] RDMA/erdma: Make the device probe process more robust

Message ID 20240828060944.77829-2-chengyou@linux.alibaba.com (mailing list archive)
State Superseded
Headers show
Series RDMA/erdma: erdma updates | expand

Commit Message

Cheng Xu Aug. 28, 2024, 6:09 a.m. UTC
Driver may probe again while hardware is destroying the internal
resources allocated for previous probing, which will fail the
device probe. To make it more robust, we always issue a reset at the
beginning of the device probe process.

Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
---
 drivers/infiniband/hw/erdma/erdma.h      |  1 +
 drivers/infiniband/hw/erdma/erdma_main.c | 44 +++++++++++++++++++-----
 2 files changed, 36 insertions(+), 9 deletions(-)

Comments

Leon Romanovsky Aug. 29, 2024, 10:09 a.m. UTC | #1
On Wed, Aug 28, 2024 at 02:09:41PM +0800, Cheng Xu wrote:
> Driver may probe again while hardware is destroying the internal
> resources allocated for previous probing

How is it possible?


> which will fail the device probe. To make it more robust, we always issue a reset at the
> beginning of the device probe process.
> 
> Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
> ---
>  drivers/infiniband/hw/erdma/erdma.h      |  1 +
>  drivers/infiniband/hw/erdma/erdma_main.c | 44 +++++++++++++++++++-----
>  2 files changed, 36 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h
> index c8bd698e21b0..b5c258f77ca0 100644
> --- a/drivers/infiniband/hw/erdma/erdma.h
> +++ b/drivers/infiniband/hw/erdma/erdma.h
> @@ -94,6 +94,7 @@ enum {
>  
>  #define ERDMA_CMDQ_TIMEOUT_MS 15000
>  #define ERDMA_REG_ACCESS_WAIT_MS 20
> +#define ERDMA_WAIT_DEV_REST_CNT 50
>  #define ERDMA_WAIT_DEV_DONE_CNT 500
>  
>  struct erdma_cmdq {
> diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c
> index 7080f8a71ec4..9199058a0b29 100644
> --- a/drivers/infiniband/hw/erdma/erdma_main.c
> +++ b/drivers/infiniband/hw/erdma/erdma_main.c
> @@ -209,11 +209,30 @@ static void erdma_device_uninit(struct erdma_dev *dev)
>  	dma_pool_destroy(dev->resp_pool);
>  }
>  
> -static void erdma_hw_reset(struct erdma_dev *dev)
> +static int erdma_hw_reset(struct erdma_dev *dev, bool wait)
>  {
>  	u32 ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_RESET_MASK, 1);
> +	int i;
>  
>  	erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl);
> +
> +	if (!wait)
> +		return 0;
> +
> +	for (i = 0; i < ERDMA_WAIT_DEV_REST_CNT; i++) {
> +		if (erdma_reg_read32_filed(dev, ERDMA_REGS_DEV_ST_REG,
> +					   ERDMA_REG_DEV_ST_RESET_DONE_MASK))
> +			break;
> +
> +		msleep(ERDMA_REG_ACCESS_WAIT_MS);
> +	}
> +
> +	if (i == ERDMA_WAIT_DEV_REST_CNT) {
> +		dev_err(&dev->pdev->dev, "wait reset done timeout.\n");
> +		return -ETIME;
> +	}
> +
> +	return 0;
>  }
>  
>  static int erdma_wait_hw_init_done(struct erdma_dev *dev)
> @@ -239,6 +258,17 @@ static int erdma_wait_hw_init_done(struct erdma_dev *dev)
>  	return 0;
>  }
>  
> +static int erdma_preinit_check(struct erdma_dev *dev)
> +{
> +	u32 version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG);
> +
> +	/* we knows that it is a non-functional function. */
> +	if (version == 0)
> +		return -ENODEV;
> +
> +	return erdma_hw_reset(dev, true);
> +}
> +
>  static const struct pci_device_id erdma_pci_tbl[] = {
>  	{ PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x107f) },
>  	{}
> @@ -248,7 +278,6 @@ static int erdma_probe_dev(struct pci_dev *pdev)
>  {
>  	struct erdma_dev *dev;
>  	int bars, err;
> -	u32 version;
>  
>  	err = pci_enable_device(pdev);
>  	if (err) {
> @@ -287,12 +316,9 @@ static int erdma_probe_dev(struct pci_dev *pdev)
>  		goto err_release_bars;
>  	}
>  
> -	version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG);
> -	if (version == 0) {
> -		/* we knows that it is a non-functional function. */
> -		err = -ENODEV;
> +	err = erdma_preinit_check(dev);
> +	if (err)
>  		goto err_iounmap_func_bar;
> -	}
>  
>  	err = erdma_device_init(dev, pdev);
>  	if (err)
> @@ -327,7 +353,7 @@ static int erdma_probe_dev(struct pci_dev *pdev)
>  	return 0;
>  
>  err_reset_hw:
> -	erdma_hw_reset(dev);
> +	erdma_hw_reset(dev, false);
>  
>  err_uninit_cmdq:
>  	erdma_cmdq_destroy(dev);
> @@ -364,7 +390,7 @@ static void erdma_remove_dev(struct pci_dev *pdev)
>  	struct erdma_dev *dev = pci_get_drvdata(pdev);
>  
>  	erdma_ceqs_uninit(dev);
> -	erdma_hw_reset(dev);
> +	erdma_hw_reset(dev, false);
>  	erdma_cmdq_destroy(dev);
>  	erdma_aeq_destroy(dev);
>  	erdma_comm_irq_uninit(dev);
> -- 
> 2.31.1
> 
>
Cheng Xu Aug. 30, 2024, 2:34 a.m. UTC | #2
On 8/29/24 6:09 PM, Leon Romanovsky wrote:
> On Wed, Aug 28, 2024 at 02:09:41PM +0800, Cheng Xu wrote:
>> Driver may probe again while hardware is destroying the internal
>> resources allocated for previous probing
> 
> How is it possible?
> 

The resources I mentioned is totally unseen to driver, it's something related
to our device management part in hypervisor, so it won't cause host resources
leak, and the cleanup/reset process may take a long time. For these reason,
we don't wait the completion of the cleanup/reset in the remove routing.
Instead, the driver will wait the device status become ready in probe routing
(In most cases, the hardware will have enough time to finish the cleanup/reset
before the second probe), so that we can boost the remove process.

Thanks,
Cheng Xu
Leon Romanovsky Sept. 2, 2024, 7:21 a.m. UTC | #3
On Fri, Aug 30, 2024 at 10:34:42AM +0800, Cheng Xu wrote:
> 
> 
> On 8/29/24 6:09 PM, Leon Romanovsky wrote:
> > On Wed, Aug 28, 2024 at 02:09:41PM +0800, Cheng Xu wrote:
> >> Driver may probe again while hardware is destroying the internal
> >> resources allocated for previous probing
> > 
> > How is it possible?
> > 
> 
> The resources I mentioned is totally unseen to driver, it's something related
> to our device management part in hypervisor, so it won't cause host resources
> leak, and the cleanup/reset process may take a long time. For these reason,
> we don't wait the completion of the cleanup/reset in the remove routing.
> Instead, the driver will wait the device status become ready in probe routing
> (In most cases, the hardware will have enough time to finish the cleanup/reset
> before the second probe), so that we can boost the remove process.

And why don't hypervisor wait for the device to be ready before giving it to VM?
Why do you need to complicate the probe routine to overcome the hypervisor behavior?

Thanks

> 
> Thanks,
> Cheng Xu
>
Cheng Xu Sept. 2, 2024, 9:09 a.m. UTC | #4
On 9/2/24 3:21 PM, Leon Romanovsky wrote:
> On Fri, Aug 30, 2024 at 10:34:42AM +0800, Cheng Xu wrote:
>>
>>
>> On 8/29/24 6:09 PM, Leon Romanovsky wrote:
>>> On Wed, Aug 28, 2024 at 02:09:41PM +0800, Cheng Xu wrote:
>>>> Driver may probe again while hardware is destroying the internal
>>>> resources allocated for previous probing
>>>
>>> How is it possible?
>>>
>>
>> The resources I mentioned is totally unseen to driver, it's something related
>> to our device management part in hypervisor, so it won't cause host resources
>> leak, and the cleanup/reset process may take a long time. For these reason,
>> we don't wait the completion of the cleanup/reset in the remove routing.
>> Instead, the driver will wait the device status become ready in probe routing
>> (In most cases, the hardware will have enough time to finish the cleanup/reset
>> before the second probe), so that we can boost the remove process.
> 
> And why don't hypervisor wait for the device to be ready before giving it to VM?

Hypervisor actually does what you described during the first bootup. However, one
scenario is that the erdma driver is unloaded and loaded quickly while the device
always exists in the VM. In this case, there is no opportunity for the hypervisor
to perform that action.

> Why do you need to complicate the probe routine to overcome the hypervisor behavior?
> 

The hardware now requires that the former reset (issued in the remove routine) must be
completed before device init (issued in the probe routine). Waiting the reset completed
either in the remove routine or in the probe routine both can meet the requirement.
This patch chose to wait in the probe routine because it can speed up the remove process.

Actually this is a good question, and inspires me that maybe the requirement in the
hardware/backend may be eliminated, so that simplify the driver process.

I'd like to remove this patch in v3 and leave it for internal discussion.

Thanks very much
Cheng Xu


> Thanks
> 
>>
>> Thanks,
>> Cheng Xu
>>
Jason Gunthorpe Sept. 4, 2024, 4:06 p.m. UTC | #5
On Mon, Sep 02, 2024 at 05:09:09PM +0800, Cheng Xu wrote:

> The hardware now requires that the former reset (issued in the
> remove routine) must be completed before device init (issued in the
> probe routine). Waiting the reset completed either in the remove
> routine or in the probe routine both can meet the requirement.  This
> patch chose to wait in the probe routine because it can speed up the
> remove process.

But what happens if you attach VFIO or some other driver while this
background reset is occuring? Are you OK with that?

Jason
Cheng Xu Sept. 5, 2024, 3:39 a.m. UTC | #6
On 9/5/24 12:06 AM, Jason Gunthorpe wrote:
> On Mon, Sep 02, 2024 at 05:09:09PM +0800, Cheng Xu wrote:
> 
>> The hardware now requires that the former reset (issued in the
>> remove routine) must be completed before device init (issued in the
>> probe routine). Waiting the reset completed either in the remove
>> routine or in the probe routine both can meet the requirement.  This
>> patch chose to wait in the probe routine because it can speed up the
>> remove process.
> 
> But what happens if you attach VFIO or some other driver while this
> background reset is occuring? Are you OK with that?

Yes, it's OK.

To simplify the description, We have two relevant components in the
hardware: The device management engine and the RDMA engine. The PCIe
device initialization (erdma, vfio and other drivers will perform it)
is handled by device management engine without any issue. The issue described
in this patch pertains to the RDMA engine, which is only invoked by erdma
driver.

In fact, this low-probability issue is not very serious and can be resolved
by reloading the driver. After internal discussion, we have decided to eliminate
this constraint through appropriate firmware modifications.

Thanks,
Cheng Xu

> 
> Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h
index c8bd698e21b0..b5c258f77ca0 100644
--- a/drivers/infiniband/hw/erdma/erdma.h
+++ b/drivers/infiniband/hw/erdma/erdma.h
@@ -94,6 +94,7 @@  enum {
 
 #define ERDMA_CMDQ_TIMEOUT_MS 15000
 #define ERDMA_REG_ACCESS_WAIT_MS 20
+#define ERDMA_WAIT_DEV_REST_CNT 50
 #define ERDMA_WAIT_DEV_DONE_CNT 500
 
 struct erdma_cmdq {
diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c
index 7080f8a71ec4..9199058a0b29 100644
--- a/drivers/infiniband/hw/erdma/erdma_main.c
+++ b/drivers/infiniband/hw/erdma/erdma_main.c
@@ -209,11 +209,30 @@  static void erdma_device_uninit(struct erdma_dev *dev)
 	dma_pool_destroy(dev->resp_pool);
 }
 
-static void erdma_hw_reset(struct erdma_dev *dev)
+static int erdma_hw_reset(struct erdma_dev *dev, bool wait)
 {
 	u32 ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_RESET_MASK, 1);
+	int i;
 
 	erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl);
+
+	if (!wait)
+		return 0;
+
+	for (i = 0; i < ERDMA_WAIT_DEV_REST_CNT; i++) {
+		if (erdma_reg_read32_filed(dev, ERDMA_REGS_DEV_ST_REG,
+					   ERDMA_REG_DEV_ST_RESET_DONE_MASK))
+			break;
+
+		msleep(ERDMA_REG_ACCESS_WAIT_MS);
+	}
+
+	if (i == ERDMA_WAIT_DEV_REST_CNT) {
+		dev_err(&dev->pdev->dev, "wait reset done timeout.\n");
+		return -ETIME;
+	}
+
+	return 0;
 }
 
 static int erdma_wait_hw_init_done(struct erdma_dev *dev)
@@ -239,6 +258,17 @@  static int erdma_wait_hw_init_done(struct erdma_dev *dev)
 	return 0;
 }
 
+static int erdma_preinit_check(struct erdma_dev *dev)
+{
+	u32 version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG);
+
+	/* we knows that it is a non-functional function. */
+	if (version == 0)
+		return -ENODEV;
+
+	return erdma_hw_reset(dev, true);
+}
+
 static const struct pci_device_id erdma_pci_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x107f) },
 	{}
@@ -248,7 +278,6 @@  static int erdma_probe_dev(struct pci_dev *pdev)
 {
 	struct erdma_dev *dev;
 	int bars, err;
-	u32 version;
 
 	err = pci_enable_device(pdev);
 	if (err) {
@@ -287,12 +316,9 @@  static int erdma_probe_dev(struct pci_dev *pdev)
 		goto err_release_bars;
 	}
 
-	version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG);
-	if (version == 0) {
-		/* we knows that it is a non-functional function. */
-		err = -ENODEV;
+	err = erdma_preinit_check(dev);
+	if (err)
 		goto err_iounmap_func_bar;
-	}
 
 	err = erdma_device_init(dev, pdev);
 	if (err)
@@ -327,7 +353,7 @@  static int erdma_probe_dev(struct pci_dev *pdev)
 	return 0;
 
 err_reset_hw:
-	erdma_hw_reset(dev);
+	erdma_hw_reset(dev, false);
 
 err_uninit_cmdq:
 	erdma_cmdq_destroy(dev);
@@ -364,7 +390,7 @@  static void erdma_remove_dev(struct pci_dev *pdev)
 	struct erdma_dev *dev = pci_get_drvdata(pdev);
 
 	erdma_ceqs_uninit(dev);
-	erdma_hw_reset(dev);
+	erdma_hw_reset(dev, false);
 	erdma_cmdq_destroy(dev);
 	erdma_aeq_destroy(dev);
 	erdma_comm_irq_uninit(dev);