diff mbox series

[for-next,1/4] RDMA/erdma: Make the device probe process more robust

Message ID 20240823075058.89488-2-chengyou@linux.alibaba.com (mailing list archive)
State Superseded
Headers show
Series RDMA/erdma: erdma updates | expand

Commit Message

Cheng Xu Aug. 23, 2024, 7:50 a.m. UTC
Driver may probe again while hardware is destroying the internal
resources allocated for previous probing, which will fail the
device probe. To make it more robust, we always issue a reset at the
beginning of the device probe process.

Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
---
 drivers/infiniband/hw/erdma/erdma.h      |  1 +
 drivers/infiniband/hw/erdma/erdma_main.c | 44 +++++++++++++++++++-----
 2 files changed, 36 insertions(+), 9 deletions(-)
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h
index c8bd698e21b0..b5c258f77ca0 100644
--- a/drivers/infiniband/hw/erdma/erdma.h
+++ b/drivers/infiniband/hw/erdma/erdma.h
@@ -94,6 +94,7 @@  enum {
 
 #define ERDMA_CMDQ_TIMEOUT_MS 15000
 #define ERDMA_REG_ACCESS_WAIT_MS 20
+#define ERDMA_WAIT_DEV_REST_CNT 50
 #define ERDMA_WAIT_DEV_DONE_CNT 500
 
 struct erdma_cmdq {
diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c
index 7080f8a71ec4..9199058a0b29 100644
--- a/drivers/infiniband/hw/erdma/erdma_main.c
+++ b/drivers/infiniband/hw/erdma/erdma_main.c
@@ -209,11 +209,30 @@  static void erdma_device_uninit(struct erdma_dev *dev)
 	dma_pool_destroy(dev->resp_pool);
 }
 
-static void erdma_hw_reset(struct erdma_dev *dev)
+static int erdma_hw_reset(struct erdma_dev *dev, bool wait)
 {
 	u32 ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_RESET_MASK, 1);
+	int i;
 
 	erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl);
+
+	if (!wait)
+		return 0;
+
+	for (i = 0; i < ERDMA_WAIT_DEV_REST_CNT; i++) {
+		if (erdma_reg_read32_filed(dev, ERDMA_REGS_DEV_ST_REG,
+					   ERDMA_REG_DEV_ST_RESET_DONE_MASK))
+			break;
+
+		msleep(ERDMA_REG_ACCESS_WAIT_MS);
+	}
+
+	if (i == ERDMA_WAIT_DEV_REST_CNT) {
+		dev_err(&dev->pdev->dev, "wait reset done timeout.\n");
+		return -ETIME;
+	}
+
+	return 0;
 }
 
 static int erdma_wait_hw_init_done(struct erdma_dev *dev)
@@ -239,6 +258,17 @@  static int erdma_wait_hw_init_done(struct erdma_dev *dev)
 	return 0;
 }
 
+static int erdma_preinit_check(struct erdma_dev *dev)
+{
+	u32 version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG);
+
+	/* we knows that it is a non-functional function. */
+	if (version == 0)
+		return -ENODEV;
+
+	return erdma_hw_reset(dev, true);
+}
+
 static const struct pci_device_id erdma_pci_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x107f) },
 	{}
@@ -248,7 +278,6 @@  static int erdma_probe_dev(struct pci_dev *pdev)
 {
 	struct erdma_dev *dev;
 	int bars, err;
-	u32 version;
 
 	err = pci_enable_device(pdev);
 	if (err) {
@@ -287,12 +316,9 @@  static int erdma_probe_dev(struct pci_dev *pdev)
 		goto err_release_bars;
 	}
 
-	version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG);
-	if (version == 0) {
-		/* we knows that it is a non-functional function. */
-		err = -ENODEV;
+	err = erdma_preinit_check(dev);
+	if (err)
 		goto err_iounmap_func_bar;
-	}
 
 	err = erdma_device_init(dev, pdev);
 	if (err)
@@ -327,7 +353,7 @@  static int erdma_probe_dev(struct pci_dev *pdev)
 	return 0;
 
 err_reset_hw:
-	erdma_hw_reset(dev);
+	erdma_hw_reset(dev, false);
 
 err_uninit_cmdq:
 	erdma_cmdq_destroy(dev);
@@ -364,7 +390,7 @@  static void erdma_remove_dev(struct pci_dev *pdev)
 	struct erdma_dev *dev = pci_get_drvdata(pdev);
 
 	erdma_ceqs_uninit(dev);
-	erdma_hw_reset(dev);
+	erdma_hw_reset(dev, false);
 	erdma_cmdq_destroy(dev);
 	erdma_aeq_destroy(dev);
 	erdma_comm_irq_uninit(dev);