diff mbox

[for-4.1,10/11] RDMA/ocrdma: Prevent returning failures in the resource destroy path

Message ID 1431762709-20740-11-git-send-email-selvin.xavier@avagotech.com (mailing list archive)
State Superseded
Headers show

Commit Message

Selvin Xavier May 16, 2015, 7:51 a.m. UTC
Avoid returning FW failures in the resource destroy verbs.

Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@avagotech.com>
Signed-off-by: Devesh Sharma <devesh.sharma@avagotech.com>
Signed-off-by: Selvin Xavier <selvin.xavier@avagotech.com>
---
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

Comments

Doug Ledford May 15, 2015, 3:56 p.m. UTC | #1
On Sat, 2015-05-16 at 13:21 +0530, Selvin Xavier wrote:
> Avoid returning FW failures in the resource destroy verbs.

This patch looks extremely unsafe to someone that doesn't have the
benefit of being able to look at your firmware code.  What firmware
errors are you ignoring?  What state is the card in whenever these
errors are returned?  Is there even a remote possibility that the
firmware being in this state and returning these errors will either A)
cause erroneous attempts to write to user memory or B) interfere with
future attempts to create PDs or other contexts on the card?

> Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@avagotech.com>
> Signed-off-by: Devesh Sharma <devesh.sharma@avagotech.com>
> Signed-off-by: Selvin Xavier <selvin.xavier@avagotech.com>
> ---
>  drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 17 +++++++----------
>  1 file changed, 7 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
> index 48ec56f..b1f9d7d 100644
> --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
> +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
> @@ -531,13 +531,12 @@ map_err:
>  
>  int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
>  {
> -	int status = 0;
>  	struct ocrdma_mm *mm, *tmp;
>  	struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx);
>  	struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device);
>  	struct pci_dev *pdev = dev->nic_info.pdev;
>  
> -	status = ocrdma_dealloc_ucontext_pd(uctx);
> +	(void)ocrdma_dealloc_ucontext_pd(uctx);
>  
>  	ocrdma_del_mmap(uctx, uctx->ah_tbl.pa, uctx->ah_tbl.len);
>  	dma_free_coherent(&pdev->dev, uctx->ah_tbl.len, uctx->ah_tbl.va,
> @@ -548,7 +547,7 @@ int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
>  		kfree(mm);
>  	}
>  	kfree(uctx);
> -	return status;
> +	return 0;
>  }
>  
>  int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
> @@ -690,7 +689,6 @@ int ocrdma_dealloc_pd(struct ib_pd *ibpd)
>  	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
>  	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
>  	struct ocrdma_ucontext *uctx = NULL;
> -	int status = 0;
>  	u64 usr_db;
>  
>  	uctx = pd->uctx;
> @@ -704,11 +702,11 @@ int ocrdma_dealloc_pd(struct ib_pd *ibpd)
>  
>  		if (is_ucontext_pd(uctx, pd)) {
>  			ocrdma_release_ucontext_pd(uctx);
> -			return status;
> +			return 0;
>  		}
>  	}
> -	status = _ocrdma_dealloc_pd(dev, pd);
> -	return status;
> +	(void)_ocrdma_dealloc_pd(dev, pd);
> +	return 0;
>  }
>  
>  static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
> @@ -1905,13 +1903,12 @@ int ocrdma_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
>  
>  int ocrdma_destroy_srq(struct ib_srq *ibsrq)
>  {
> -	int status;
>  	struct ocrdma_srq *srq;
>  	struct ocrdma_dev *dev = get_ocrdma_dev(ibsrq->device);
>  
>  	srq = get_ocrdma_srq(ibsrq);
>  
> -	status = ocrdma_mbx_destroy_srq(dev, srq);
> +	(void)ocrdma_mbx_destroy_srq(dev, srq);
>  
>  	if (srq->pd->uctx)
>  		ocrdma_del_mmap(srq->pd->uctx, (u64) srq->rq.pa,
> @@ -1920,7 +1917,7 @@ int ocrdma_destroy_srq(struct ib_srq *ibsrq)
>  	kfree(srq->idx_bit_fields);
>  	kfree(srq->rqe_wr_id_tbl);
>  	kfree(srq);
> -	return status;
> +	return 0;
>  }
>  
>  /* unprivileged verbs and their support functions. */
Selvin Xavier May 15, 2015, 6:16 p.m. UTC | #2
> -----Original Message-----
> From: Doug Ledford [mailto:dledford@redhat.com]
> Sent: Friday, May 15, 2015 9:26 PM
> To: Selvin Xavier
> Cc: linux-rdma@vger.kernel.org
> Subject: Re: [PATCH for-4.1 10/11] RDMA/ocrdma: Prevent returning failures
> in the resource destroy path
>
> On Sat, 2015-05-16 at 13:21 +0530, Selvin Xavier wrote:
> > Avoid returning FW failures in the resource destroy verbs.
>
> This patch looks extremely unsafe to someone that doesn't have the benefit
> of being able to look at your firmware code.  What firmware errors are you
> ignoring?  What state is the card in whenever these errors are returned?
> Is
> there even a remote possibility that the firmware being in this state and
> returning these errors will either A) cause erroneous attempts to write to
> user memory or B) interfere with future attempts to create PDs or other
> contexts on the card?
>


One of the main reasons for these destroy commands to fail is when the
adapter is in  an unrecoverable error, which is handled in this patch.  Case
A will not be applicable in case of UE since adapter is down.   But Case A
is applicable if any rogue application passes wrong parameters like PDID for
destroy and the failure is a genuine error from FW .   I will rework on the
patch to handle these conditions also.

Case B is handled in both conditions. If the adapter is in UE, successive
create requests also will fail and failure is returned to the caller. If
adapter is in proper state,  successive create requests return success with
new resource IDs.

Thanks,
Selvin Xavier
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Selvin Xavier May 18, 2015, 11:46 a.m. UTC | #3
> -----Original Message-----
> From: Selvin Xavier [mailto:selvin.xavier@avagotech.com]
> Sent: Friday, May 15, 2015 11:47 PM
> To: 'Doug Ledford'
> Cc: 'linux-rdma@vger.kernel.org'
> Subject: RE: [PATCH for-4.1 10/11] RDMA/ocrdma: Prevent returning failures
> in the resource destroy path
>
>
> > -----Original Message-----
> > From: Doug Ledford [mailto:dledford@redhat.com]
> > Sent: Friday, May 15, 2015 9:26 PM
> > To: Selvin Xavier
> > Cc: linux-rdma@vger.kernel.org
> > Subject: Re: [PATCH for-4.1 10/11] RDMA/ocrdma: Prevent returning
> > failures in the resource destroy path
> >
> > On Sat, 2015-05-16 at 13:21 +0530, Selvin Xavier wrote:
> > > Avoid returning FW failures in the resource destroy verbs.
> >
> > This patch looks extremely unsafe to someone that doesn't have the
> > benefit of being able to look at your firmware code.  What firmware
> > errors are you ignoring?  What state is the card in whenever these
> > errors are returned?  Is there even a remote possibility that the
> > firmware being in this state and returning these errors will either A)
> > cause erroneous attempts to write to user memory or B) interfere with
> > future attempts to create PDs or other contexts on the card?
> >
>
>
> One of the main reasons for these destroy commands to fail is when the
> adapter is in  an unrecoverable error, which is handled in this patch.
> Case A
> will not be applicable in case of UE since adapter is down.   But Case A
> is
> applicable if any rogue application passes wrong parameters like PDID for
> destroy and the failure is a genuine error from FW .   I will rework on
> the
> patch to handle these conditions also.
>

> Case B is handled in both conditions. If the adapter is in UE, successive
> create
> requests also will fail and failure is returned to the caller. If adapter
> is in
> proper state,  successive create requests return success with new resource
> IDs.
>

Doug,

The driver is returning success for other destroy routines also and free the
corresponding driver resources.  As part of the changes suggested by you, we
will change all the destroy routines to return error status for all the FW
command failures and avoid freeing the driver resources  in case of failure.
Since this change need more testing, I will drop this  patch from the series
for 4.1 RC.  I will cut a patch for 4.2 after our testing.

Thanks,
Selvin
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe May 19, 2015, 6:22 p.m. UTC | #4
On Mon, May 18, 2015 at 05:16:14PM +0530, Selvin Xavier wrote:

> The driver is returning success for other destroy routines also and
> free the corresponding driver resources.  As part of the changes
> suggested by you, we will change all the destroy routines to return
> error status for all the FW command failures and avoid freeing the
> driver resources in case of failure.  Since this change need more
> testing, I will drop this patch from the series for 4.1 RC.  I will
> cut a patch for 4.2 after our testing.

Didn't we discuss this already? Destroy routines cannot fail.

If the card's firmware fails during destroy then that is an
unrecoverable error and your driver and firmware are no longer
synchronized. Panic the kernel or do some kind of (synchronous)
card-reset recovery.

Don't expect the caller to handle this, and don't leak IB core memory
when it happens.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Selvin Xavier May 28, 2015, 3:58 a.m. UTC | #5
Hi Jason
 My reply is delayed since  I was on vacation.

> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgunthorpe@obsidianresearch.com]
> Sent: Tuesday, May 19, 2015 11:53 PM
> To: Selvin Xavier
> Cc: Doug Ledford; linux-rdma@vger.kernel.org
> Subject: Re: [PATCH for-4.1 10/11] RDMA/ocrdma: Prevent returning
failures
> in the resource destroy path
>
> On Mon, May 18, 2015 at 05:16:14PM +0530, Selvin Xavier wrote:
>
> > The driver is returning success for other destroy routines also and
> > free the corresponding driver resources.  As part of the changes
> > suggested by you, we will change all the destroy routines to return
> > error status for all the FW command failures and avoid freeing the
> > driver resources in case of failure.  Since this change need more
> > testing, I will drop this patch from the series for 4.1 RC.  I will
> > cut a patch for 4.2 after our testing.
>
> Didn't we discuss this already? Destroy routines cannot fail.
>
> If the card's firmware fails during destroy then that is an
unrecoverable error
> and your driver and firmware are no longer synchronized. Panic the
kernel or
> do some kind of (synchronous) card-reset recovery.
>
> Don't expect the caller to handle this, and don't leak IB core memory
when it
> happens.
>
Got it .   Will do the same in all destroy APIs.

> Jason
Thanks,
Selvin
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 48ec56f..b1f9d7d 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -531,13 +531,12 @@  map_err:
 
 int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
 {
-	int status = 0;
 	struct ocrdma_mm *mm, *tmp;
 	struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device);
 	struct pci_dev *pdev = dev->nic_info.pdev;
 
-	status = ocrdma_dealloc_ucontext_pd(uctx);
+	(void)ocrdma_dealloc_ucontext_pd(uctx);
 
 	ocrdma_del_mmap(uctx, uctx->ah_tbl.pa, uctx->ah_tbl.len);
 	dma_free_coherent(&pdev->dev, uctx->ah_tbl.len, uctx->ah_tbl.va,
@@ -548,7 +547,7 @@  int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
 		kfree(mm);
 	}
 	kfree(uctx);
-	return status;
+	return 0;
 }
 
 int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
@@ -690,7 +689,6 @@  int ocrdma_dealloc_pd(struct ib_pd *ibpd)
 	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
 	struct ocrdma_ucontext *uctx = NULL;
-	int status = 0;
 	u64 usr_db;
 
 	uctx = pd->uctx;
@@ -704,11 +702,11 @@  int ocrdma_dealloc_pd(struct ib_pd *ibpd)
 
 		if (is_ucontext_pd(uctx, pd)) {
 			ocrdma_release_ucontext_pd(uctx);
-			return status;
+			return 0;
 		}
 	}
-	status = _ocrdma_dealloc_pd(dev, pd);
-	return status;
+	(void)_ocrdma_dealloc_pd(dev, pd);
+	return 0;
 }
 
 static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
@@ -1905,13 +1903,12 @@  int ocrdma_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
 
 int ocrdma_destroy_srq(struct ib_srq *ibsrq)
 {
-	int status;
 	struct ocrdma_srq *srq;
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibsrq->device);
 
 	srq = get_ocrdma_srq(ibsrq);
 
-	status = ocrdma_mbx_destroy_srq(dev, srq);
+	(void)ocrdma_mbx_destroy_srq(dev, srq);
 
 	if (srq->pd->uctx)
 		ocrdma_del_mmap(srq->pd->uctx, (u64) srq->rq.pa,
@@ -1920,7 +1917,7 @@  int ocrdma_destroy_srq(struct ib_srq *ibsrq)
 	kfree(srq->idx_bit_fields);
 	kfree(srq->rqe_wr_id_tbl);
 	kfree(srq);
-	return status;
+	return 0;
 }
 
 /* unprivileged verbs and their support functions. */