diff mbox series

[for-rc,1/6] RDMA/bnxt_re: Remove the qp from list only if the qp destroy succeeds

Message ID 1598292876-26529-2-git-send-email-selvin.xavier@broadcom.com (mailing list archive)
State Accepted
Delegated to: Jason Gunthorpe
Headers show
Series RDMA/bnxt_re: Bug fixes | expand

Commit Message

Selvin Xavier Aug. 24, 2020, 6:14 p.m. UTC
Driver crashes when destroy_qp is re-tried because of an
error returned. This is because the qp entry was  removed
from the qp list during the first call.

Remove qp from the list only if destroy_qp returns success.

Fixes: 8dae419f9ec7 ("RDMA/bnxt_re: Refactor queue pair creation code")
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

Comments

Leon Romanovsky Aug. 24, 2020, 7:01 p.m. UTC | #1
On Mon, Aug 24, 2020 at 11:14:31AM -0700, Selvin Xavier wrote:
> Driver crashes when destroy_qp is re-tried because of an
> error returned. This is because the qp entry was  removed
> from the qp list during the first call.

How is it possible that destroy_qp fail?

>
> Remove qp from the list only if destroy_qp returns success.
>
> Fixes: 8dae419f9ec7 ("RDMA/bnxt_re: Refactor queue pair creation code")
> Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
> ---
>  drivers/infiniband/hw/bnxt_re/ib_verbs.c | 22 +++++++++++-----------
>  1 file changed, 11 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
> index 3f18efc..2f5aac0 100644
> --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
> +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
> @@ -752,12 +752,6 @@ static int bnxt_re_destroy_gsi_sqp(struct bnxt_re_qp *qp)
>  	gsi_sqp = rdev->gsi_ctx.gsi_sqp;
>  	gsi_sah = rdev->gsi_ctx.gsi_sah;
>
> -	/* remove from active qp list */
> -	mutex_lock(&rdev->qp_lock);
> -	list_del(&gsi_sqp->list);
> -	mutex_unlock(&rdev->qp_lock);
> -	atomic_dec(&rdev->qp_count);
> -
>  	ibdev_dbg(&rdev->ibdev, "Destroy the shadow AH\n");
>  	bnxt_qplib_destroy_ah(&rdev->qplib_res,
>  			      &gsi_sah->qplib_ah,
> @@ -772,6 +766,12 @@ static int bnxt_re_destroy_gsi_sqp(struct bnxt_re_qp *qp)
>  	}
>  	bnxt_qplib_free_qp_res(&rdev->qplib_res, &gsi_sqp->qplib_qp);
>
> +	/* remove from active qp list */
> +	mutex_lock(&rdev->qp_lock);
> +	list_del(&gsi_sqp->list);
> +	mutex_unlock(&rdev->qp_lock);
> +	atomic_dec(&rdev->qp_count);
> +
>  	kfree(rdev->gsi_ctx.sqp_tbl);
>  	kfree(gsi_sah);
>  	kfree(gsi_sqp);
> @@ -792,11 +792,6 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
>  	unsigned int flags;
>  	int rc;
>
> -	mutex_lock(&rdev->qp_lock);
> -	list_del(&qp->list);
> -	mutex_unlock(&rdev->qp_lock);
> -	atomic_dec(&rdev->qp_count);
> -
>  	bnxt_qplib_flush_cqn_wq(&qp->qplib_qp);
>
>  	rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
> @@ -819,6 +814,11 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
>  			goto sh_fail;
>  	}
>
> +	mutex_lock(&rdev->qp_lock);
> +	list_del(&qp->list);
> +	mutex_unlock(&rdev->qp_lock);
> +	atomic_dec(&rdev->qp_count);
> +
>  	ib_umem_release(qp->rumem);
>  	ib_umem_release(qp->sumem);
>
> --
> 2.5.5
>
Selvin Xavier Aug. 24, 2020, 7:36 p.m. UTC | #2
On Tue, Aug 25, 2020 at 12:31 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Mon, Aug 24, 2020 at 11:14:31AM -0700, Selvin Xavier wrote:
> > Driver crashes when destroy_qp is re-tried because of an
> > error returned. This is because the qp entry was  removed
> > from the qp list during the first call.
>
> How is it possible that destroy_qp fail?
>
One possibility is when the FW is in a crash state.   Driver commands
to FW  fails and it reports an error status for destroy_qp verb.
Even Though the chances of this failure is less,  wanted to avoid a
host crash seen in this scenario.
> >
> > Remove qp from the list only if destroy_qp returns success.
> >
> > Fixes: 8dae419f9ec7 ("RDMA/bnxt_re: Refactor queue pair creation code")
> > Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
> > ---
> >  drivers/infiniband/hw/bnxt_re/ib_verbs.c | 22 +++++++++++-----------
> >  1 file changed, 11 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
> > index 3f18efc..2f5aac0 100644
> > --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
> > +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
> > @@ -752,12 +752,6 @@ static int bnxt_re_destroy_gsi_sqp(struct bnxt_re_qp *qp)
> >       gsi_sqp = rdev->gsi_ctx.gsi_sqp;
> >       gsi_sah = rdev->gsi_ctx.gsi_sah;
> >
> > -     /* remove from active qp list */
> > -     mutex_lock(&rdev->qp_lock);
> > -     list_del(&gsi_sqp->list);
> > -     mutex_unlock(&rdev->qp_lock);
> > -     atomic_dec(&rdev->qp_count);
> > -
> >       ibdev_dbg(&rdev->ibdev, "Destroy the shadow AH\n");
> >       bnxt_qplib_destroy_ah(&rdev->qplib_res,
> >                             &gsi_sah->qplib_ah,
> > @@ -772,6 +766,12 @@ static int bnxt_re_destroy_gsi_sqp(struct bnxt_re_qp *qp)
> >       }
> >       bnxt_qplib_free_qp_res(&rdev->qplib_res, &gsi_sqp->qplib_qp);
> >
> > +     /* remove from active qp list */
> > +     mutex_lock(&rdev->qp_lock);
> > +     list_del(&gsi_sqp->list);
> > +     mutex_unlock(&rdev->qp_lock);
> > +     atomic_dec(&rdev->qp_count);
> > +
> >       kfree(rdev->gsi_ctx.sqp_tbl);
> >       kfree(gsi_sah);
> >       kfree(gsi_sqp);
> > @@ -792,11 +792,6 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
> >       unsigned int flags;
> >       int rc;
> >
> > -     mutex_lock(&rdev->qp_lock);
> > -     list_del(&qp->list);
> > -     mutex_unlock(&rdev->qp_lock);
> > -     atomic_dec(&rdev->qp_count);
> > -
> >       bnxt_qplib_flush_cqn_wq(&qp->qplib_qp);
> >
> >       rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
> > @@ -819,6 +814,11 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
> >                       goto sh_fail;
> >       }
> >
> > +     mutex_lock(&rdev->qp_lock);
> > +     list_del(&qp->list);
> > +     mutex_unlock(&rdev->qp_lock);
> > +     atomic_dec(&rdev->qp_count);
> > +
> >       ib_umem_release(qp->rumem);
> >       ib_umem_release(qp->sumem);
> >
> > --
> > 2.5.5
> >
Jason Gunthorpe Aug. 24, 2020, 10 p.m. UTC | #3
On Tue, Aug 25, 2020 at 01:06:23AM +0530, Selvin Xavier wrote:
> On Tue, Aug 25, 2020 at 12:31 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Mon, Aug 24, 2020 at 11:14:31AM -0700, Selvin Xavier wrote:
> > > Driver crashes when destroy_qp is re-tried because of an
> > > error returned. This is because the qp entry was  removed
> > > from the qp list during the first call.
> >
> > How is it possible that destroy_qp fail?
> >
> One possibility is when the FW is in a crash state.   Driver commands
> to FW  fails and it reports an error status for destroy_qp verb.
> Even Though the chances of this failure is less,  wanted to avoid a
> host crash seen in this scenario.

Drivers are not allowed to fail destroy - the only exception is if a
future destroy would succeed for some reason.

This patch should ignore the return code from FW and clean up all the
host memory. If the FW is not responding then the device should be
killed and the DMA allowed bit turned off in the PCI config space.

Jason
Gal Pressman Aug. 25, 2020, 11:44 a.m. UTC | #4
On 25/08/2020 1:00, Jason Gunthorpe wrote:
> On Tue, Aug 25, 2020 at 01:06:23AM +0530, Selvin Xavier wrote:
>> On Tue, Aug 25, 2020 at 12:31 AM Leon Romanovsky <leon@kernel.org> wrote:
>>>
>>> On Mon, Aug 24, 2020 at 11:14:31AM -0700, Selvin Xavier wrote:
>>>> Driver crashes when destroy_qp is re-tried because of an
>>>> error returned. This is because the qp entry was  removed
>>>> from the qp list during the first call.
>>>
>>> How is it possible that destroy_qp fail?
>>>
>> One possibility is when the FW is in a crash state.   Driver commands
>> to FW  fails and it reports an error status for destroy_qp verb.
>> Even Though the chances of this failure is less,  wanted to avoid a
>> host crash seen in this scenario.
> 
> Drivers are not allowed to fail destroy - the only exception is if a
> future destroy would succeed for some reason.

Why?
We already have the iterative cleanup in uverbs_destroy_ufile_hw, and combined
with Leon's changes it makes sense that the actual return value is returned
instead of ignored.

If the subsystem handles it for DEVX, why shouldn't it handle it for other drivers?
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 3f18efc..2f5aac0 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -752,12 +752,6 @@  static int bnxt_re_destroy_gsi_sqp(struct bnxt_re_qp *qp)
 	gsi_sqp = rdev->gsi_ctx.gsi_sqp;
 	gsi_sah = rdev->gsi_ctx.gsi_sah;
 
-	/* remove from active qp list */
-	mutex_lock(&rdev->qp_lock);
-	list_del(&gsi_sqp->list);
-	mutex_unlock(&rdev->qp_lock);
-	atomic_dec(&rdev->qp_count);
-
 	ibdev_dbg(&rdev->ibdev, "Destroy the shadow AH\n");
 	bnxt_qplib_destroy_ah(&rdev->qplib_res,
 			      &gsi_sah->qplib_ah,
@@ -772,6 +766,12 @@  static int bnxt_re_destroy_gsi_sqp(struct bnxt_re_qp *qp)
 	}
 	bnxt_qplib_free_qp_res(&rdev->qplib_res, &gsi_sqp->qplib_qp);
 
+	/* remove from active qp list */
+	mutex_lock(&rdev->qp_lock);
+	list_del(&gsi_sqp->list);
+	mutex_unlock(&rdev->qp_lock);
+	atomic_dec(&rdev->qp_count);
+
 	kfree(rdev->gsi_ctx.sqp_tbl);
 	kfree(gsi_sah);
 	kfree(gsi_sqp);
@@ -792,11 +792,6 @@  int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 	unsigned int flags;
 	int rc;
 
-	mutex_lock(&rdev->qp_lock);
-	list_del(&qp->list);
-	mutex_unlock(&rdev->qp_lock);
-	atomic_dec(&rdev->qp_count);
-
 	bnxt_qplib_flush_cqn_wq(&qp->qplib_qp);
 
 	rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
@@ -819,6 +814,11 @@  int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 			goto sh_fail;
 	}
 
+	mutex_lock(&rdev->qp_lock);
+	list_del(&qp->list);
+	mutex_unlock(&rdev->qp_lock);
+	atomic_dec(&rdev->qp_count);
+
 	ib_umem_release(qp->rumem);
 	ib_umem_release(qp->sumem);