Message ID | 20240828060944.77829-2-chengyou@linux.alibaba.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | RDMA/erdma: erdma updates | expand |
On Wed, Aug 28, 2024 at 02:09:41PM +0800, Cheng Xu wrote: > Driver may probe again while hardware is destroying the internal > resources allocated for previous probing How is it possible? > which will fail the device probe. To make it more robust, we always issue a reset at the > beginning of the device probe process. > > Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com> > --- > drivers/infiniband/hw/erdma/erdma.h | 1 + > drivers/infiniband/hw/erdma/erdma_main.c | 44 +++++++++++++++++++----- > 2 files changed, 36 insertions(+), 9 deletions(-) > > diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h > index c8bd698e21b0..b5c258f77ca0 100644 > --- a/drivers/infiniband/hw/erdma/erdma.h > +++ b/drivers/infiniband/hw/erdma/erdma.h > @@ -94,6 +94,7 @@ enum { > > #define ERDMA_CMDQ_TIMEOUT_MS 15000 > #define ERDMA_REG_ACCESS_WAIT_MS 20 > +#define ERDMA_WAIT_DEV_REST_CNT 50 > #define ERDMA_WAIT_DEV_DONE_CNT 500 > > struct erdma_cmdq { > diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c > index 7080f8a71ec4..9199058a0b29 100644 > --- a/drivers/infiniband/hw/erdma/erdma_main.c > +++ b/drivers/infiniband/hw/erdma/erdma_main.c > @@ -209,11 +209,30 @@ static void erdma_device_uninit(struct erdma_dev *dev) > dma_pool_destroy(dev->resp_pool); > } > > -static void erdma_hw_reset(struct erdma_dev *dev) > +static int erdma_hw_reset(struct erdma_dev *dev, bool wait) > { > u32 ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_RESET_MASK, 1); > + int i; > > erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl); > + > + if (!wait) > + return 0; > + > + for (i = 0; i < ERDMA_WAIT_DEV_REST_CNT; i++) { > + if (erdma_reg_read32_filed(dev, ERDMA_REGS_DEV_ST_REG, > + ERDMA_REG_DEV_ST_RESET_DONE_MASK)) > + break; > + > + msleep(ERDMA_REG_ACCESS_WAIT_MS); > + } > + > + if (i == ERDMA_WAIT_DEV_REST_CNT) { > + dev_err(&dev->pdev->dev, "wait reset done timeout.\n"); > + return -ETIME; > + } > + > + return 0; > } > > static int erdma_wait_hw_init_done(struct erdma_dev *dev) > @@ -239,6 +258,17 @@ static int erdma_wait_hw_init_done(struct erdma_dev *dev) > return 0; > } > > +static int erdma_preinit_check(struct erdma_dev *dev) > +{ > + u32 version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG); > + > + /* we knows that it is a non-functional function. */ > + if (version == 0) > + return -ENODEV; > + > + return erdma_hw_reset(dev, true); > +} > + > static const struct pci_device_id erdma_pci_tbl[] = { > { PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x107f) }, > {} > @@ -248,7 +278,6 @@ static int erdma_probe_dev(struct pci_dev *pdev) > { > struct erdma_dev *dev; > int bars, err; > - u32 version; > > err = pci_enable_device(pdev); > if (err) { > @@ -287,12 +316,9 @@ static int erdma_probe_dev(struct pci_dev *pdev) > goto err_release_bars; > } > > - version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG); > - if (version == 0) { > - /* we knows that it is a non-functional function. */ > - err = -ENODEV; > + err = erdma_preinit_check(dev); > + if (err) > goto err_iounmap_func_bar; > - } > > err = erdma_device_init(dev, pdev); > if (err) > @@ -327,7 +353,7 @@ static int erdma_probe_dev(struct pci_dev *pdev) > return 0; > > err_reset_hw: > - erdma_hw_reset(dev); > + erdma_hw_reset(dev, false); > > err_uninit_cmdq: > erdma_cmdq_destroy(dev); > @@ -364,7 +390,7 @@ static void erdma_remove_dev(struct pci_dev *pdev) > struct erdma_dev *dev = pci_get_drvdata(pdev); > > erdma_ceqs_uninit(dev); > - erdma_hw_reset(dev); > + erdma_hw_reset(dev, false); > erdma_cmdq_destroy(dev); > erdma_aeq_destroy(dev); > erdma_comm_irq_uninit(dev); > -- > 2.31.1 > >
On 8/29/24 6:09 PM, Leon Romanovsky wrote: > On Wed, Aug 28, 2024 at 02:09:41PM +0800, Cheng Xu wrote: >> Driver may probe again while hardware is destroying the internal >> resources allocated for previous probing > > How is it possible? > The resources I mentioned is totally unseen to driver, it's something related to our device management part in hypervisor, so it won't cause host resources leak, and the cleanup/reset process may take a long time. For these reason, we don't wait the completion of the cleanup/reset in the remove routing. Instead, the driver will wait the device status become ready in probe routing (In most cases, the hardware will have enough time to finish the cleanup/reset before the second probe), so that we can boost the remove process. Thanks, Cheng Xu
On Fri, Aug 30, 2024 at 10:34:42AM +0800, Cheng Xu wrote: > > > On 8/29/24 6:09 PM, Leon Romanovsky wrote: > > On Wed, Aug 28, 2024 at 02:09:41PM +0800, Cheng Xu wrote: > >> Driver may probe again while hardware is destroying the internal > >> resources allocated for previous probing > > > > How is it possible? > > > > The resources I mentioned is totally unseen to driver, it's something related > to our device management part in hypervisor, so it won't cause host resources > leak, and the cleanup/reset process may take a long time. For these reason, > we don't wait the completion of the cleanup/reset in the remove routing. > Instead, the driver will wait the device status become ready in probe routing > (In most cases, the hardware will have enough time to finish the cleanup/reset > before the second probe), so that we can boost the remove process. And why don't hypervisor wait for the device to be ready before giving it to VM? Why do you need to complicate the probe routine to overcome the hypervisor behavior? Thanks > > Thanks, > Cheng Xu >
On 9/2/24 3:21 PM, Leon Romanovsky wrote: > On Fri, Aug 30, 2024 at 10:34:42AM +0800, Cheng Xu wrote: >> >> >> On 8/29/24 6:09 PM, Leon Romanovsky wrote: >>> On Wed, Aug 28, 2024 at 02:09:41PM +0800, Cheng Xu wrote: >>>> Driver may probe again while hardware is destroying the internal >>>> resources allocated for previous probing >>> >>> How is it possible? >>> >> >> The resources I mentioned is totally unseen to driver, it's something related >> to our device management part in hypervisor, so it won't cause host resources >> leak, and the cleanup/reset process may take a long time. For these reason, >> we don't wait the completion of the cleanup/reset in the remove routing. >> Instead, the driver will wait the device status become ready in probe routing >> (In most cases, the hardware will have enough time to finish the cleanup/reset >> before the second probe), so that we can boost the remove process. > > And why don't hypervisor wait for the device to be ready before giving it to VM? Hypervisor actually does what you described during the first bootup. However, one scenario is that the erdma driver is unloaded and loaded quickly while the device always exists in the VM. In this case, there is no opportunity for the hypervisor to perform that action. > Why do you need to complicate the probe routine to overcome the hypervisor behavior? > The hardware now requires that the former reset (issued in the remove routine) must be completed before device init (issued in the probe routine). Waiting the reset completed either in the remove routine or in the probe routine both can meet the requirement. This patch chose to wait in the probe routine because it can speed up the remove process. Actually this is a good question, and inspires me that maybe the requirement in the hardware/backend may be eliminated, so that simplify the driver process. I'd like to remove this patch in v3 and leave it for internal discussion. Thanks very much Cheng Xu > Thanks > >> >> Thanks, >> Cheng Xu >>
On Mon, Sep 02, 2024 at 05:09:09PM +0800, Cheng Xu wrote: > The hardware now requires that the former reset (issued in the > remove routine) must be completed before device init (issued in the > probe routine). Waiting the reset completed either in the remove > routine or in the probe routine both can meet the requirement. This > patch chose to wait in the probe routine because it can speed up the > remove process. But what happens if you attach VFIO or some other driver while this background reset is occuring? Are you OK with that? Jason
On 9/5/24 12:06 AM, Jason Gunthorpe wrote: > On Mon, Sep 02, 2024 at 05:09:09PM +0800, Cheng Xu wrote: > >> The hardware now requires that the former reset (issued in the >> remove routine) must be completed before device init (issued in the >> probe routine). Waiting the reset completed either in the remove >> routine or in the probe routine both can meet the requirement. This >> patch chose to wait in the probe routine because it can speed up the >> remove process. > > But what happens if you attach VFIO or some other driver while this > background reset is occuring? Are you OK with that? Yes, it's OK. To simplify the description, We have two relevant components in the hardware: The device management engine and the RDMA engine. The PCIe device initialization (erdma, vfio and other drivers will perform it) is handled by device management engine without any issue. The issue described in this patch pertains to the RDMA engine, which is only invoked by erdma driver. In fact, this low-probability issue is not very serious and can be resolved by reloading the driver. After internal discussion, we have decided to eliminate this constraint through appropriate firmware modifications. Thanks, Cheng Xu > > Jason
diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index c8bd698e21b0..b5c258f77ca0 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -94,6 +94,7 @@ enum { #define ERDMA_CMDQ_TIMEOUT_MS 15000 #define ERDMA_REG_ACCESS_WAIT_MS 20 +#define ERDMA_WAIT_DEV_REST_CNT 50 #define ERDMA_WAIT_DEV_DONE_CNT 500 struct erdma_cmdq { diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 7080f8a71ec4..9199058a0b29 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -209,11 +209,30 @@ static void erdma_device_uninit(struct erdma_dev *dev) dma_pool_destroy(dev->resp_pool); } -static void erdma_hw_reset(struct erdma_dev *dev) +static int erdma_hw_reset(struct erdma_dev *dev, bool wait) { u32 ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_RESET_MASK, 1); + int i; erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl); + + if (!wait) + return 0; + + for (i = 0; i < ERDMA_WAIT_DEV_REST_CNT; i++) { + if (erdma_reg_read32_filed(dev, ERDMA_REGS_DEV_ST_REG, + ERDMA_REG_DEV_ST_RESET_DONE_MASK)) + break; + + msleep(ERDMA_REG_ACCESS_WAIT_MS); + } + + if (i == ERDMA_WAIT_DEV_REST_CNT) { + dev_err(&dev->pdev->dev, "wait reset done timeout.\n"); + return -ETIME; + } + + return 0; } static int erdma_wait_hw_init_done(struct erdma_dev *dev) @@ -239,6 +258,17 @@ static int erdma_wait_hw_init_done(struct erdma_dev *dev) return 0; } +static int erdma_preinit_check(struct erdma_dev *dev) +{ + u32 version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG); + + /* we knows that it is a non-functional function. */ + if (version == 0) + return -ENODEV; + + return erdma_hw_reset(dev, true); +} + static const struct pci_device_id erdma_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x107f) }, {} @@ -248,7 +278,6 @@ static int erdma_probe_dev(struct pci_dev *pdev) { struct erdma_dev *dev; int bars, err; - u32 version; err = pci_enable_device(pdev); if (err) { @@ -287,12 +316,9 @@ static int erdma_probe_dev(struct pci_dev *pdev) goto err_release_bars; } - version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG); - if (version == 0) { - /* we knows that it is a non-functional function. */ - err = -ENODEV; + err = erdma_preinit_check(dev); + if (err) goto err_iounmap_func_bar; - } err = erdma_device_init(dev, pdev); if (err) @@ -327,7 +353,7 @@ static int erdma_probe_dev(struct pci_dev *pdev) return 0; err_reset_hw: - erdma_hw_reset(dev); + erdma_hw_reset(dev, false); err_uninit_cmdq: erdma_cmdq_destroy(dev); @@ -364,7 +390,7 @@ static void erdma_remove_dev(struct pci_dev *pdev) struct erdma_dev *dev = pci_get_drvdata(pdev); erdma_ceqs_uninit(dev); - erdma_hw_reset(dev); + erdma_hw_reset(dev, false); erdma_cmdq_destroy(dev); erdma_aeq_destroy(dev); erdma_comm_irq_uninit(dev);
Driver may probe again while hardware is destroying the internal resources allocated for previous probing, which will fail the device probe. To make it more robust, we always issue a reset at the beginning of the device probe process. Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com> --- drivers/infiniband/hw/erdma/erdma.h | 1 + drivers/infiniband/hw/erdma/erdma_main.c | 44 +++++++++++++++++++----- 2 files changed, 36 insertions(+), 9 deletions(-)