diff mbox series

[for-next,3/9] RDMA/rxe: Fix freeing busy objects

Message ID 20230721205021.5394-4-rpearsonhpe@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Jason Gunthorpe
Headers show
Series RDMA/rxe: Misc fixes and cleanups | expand

Commit Message

Bob Pearson July 21, 2023, 8:50 p.m. UTC
Currently the rxe driver calls wait_for_completion_timeout() in
rxe_complete() to wait for the references to the object to be
freed before final cleanup and returning to the rdma-core
destroy verb. If this does not happen within the timeout interval
it prints a WARN_ON and returns to the 'destroy' verb caller
without any indication that there was an error. This is incorrect.

A heavily loaded system can take an arbitrarily long time to
complete the work needed before freeing all the references with no
guarantees of performance within a specific time.

Another frequent cause of these timeouts is due to ref counting bugs
introduced by changes in the driver so it is helpful to report the
timeouts if they occur.

This patch changes the type of the rxe_cleanup() subroutine to void
and fixes calls to reflect this API change in rxe_verbs.c. This is
better aligned with the code in rdma-core which sometimes fails to
check the return value of destroy verb calls assuming they will always
succeed. Specifically this is the case for kernel qp's.

Not able to return an error, this patch puts the completion timeout or
busy wait code into a subroutine called wait_until_done(). And places
the call in a loop with a 10 second timeout that issues a WARN_ON
each pass through the loop. This is slow enough to not overload the
logs.

Fixes: 215d0a755e1b ("RDMA/rxe: Stop lookup of partially built objects")
Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_pool.c  | 39 ++++++-------
 drivers/infiniband/sw/rxe/rxe_pool.h  |  2 +-
 drivers/infiniband/sw/rxe/rxe_verbs.c | 79 +++++++--------------------
 3 files changed, 38 insertions(+), 82 deletions(-)

Comments

Jason Gunthorpe July 31, 2023, 6:11 p.m. UTC | #1
On Fri, Jul 21, 2023 at 03:50:16PM -0500, Bob Pearson wrote:
> @@ -175,16 +175,17 @@ static void rxe_elem_release(struct kref *kref)
>  {
>  	struct rxe_pool_elem *elem = container_of(kref, typeof(*elem), ref_cnt);
>  
> -	complete(&elem->complete);
> +	complete_all(&elem->complete);
>  }
>  
> -int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
> +void __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
>  {

You should definately put this one change in its own patch doing just
that.

Jason
Bob Pearson July 31, 2023, 6:16 p.m. UTC | #2
On 7/31/23 13:11, Jason Gunthorpe wrote:
> On Fri, Jul 21, 2023 at 03:50:16PM -0500, Bob Pearson wrote:
>> @@ -175,16 +175,17 @@ static void rxe_elem_release(struct kref *kref)
>>  {
>>  	struct rxe_pool_elem *elem = container_of(kref, typeof(*elem), ref_cnt);
>>  
>> -	complete(&elem->complete);
>> +	complete_all(&elem->complete);
>>  }
>>  
>> -int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
>> +void __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
>>  {
> 
> You should definately put this one change in its own patch doing just
> that.
> 
> Jason

Please explain a little more. I only found this change because the
change in rxe_complete to repeat the wait_for_completion call didn't work.
This seemed to fix it but the man page and comments didn't explain
complete_all very well.

Bob
Jason Gunthorpe July 31, 2023, 6:22 p.m. UTC | #3
On Mon, Jul 31, 2023 at 01:16:48PM -0500, Bob Pearson wrote:
> On 7/31/23 13:11, Jason Gunthorpe wrote:
> > On Fri, Jul 21, 2023 at 03:50:16PM -0500, Bob Pearson wrote:
> >> @@ -175,16 +175,17 @@ static void rxe_elem_release(struct kref *kref)
> >>  {
> >>  	struct rxe_pool_elem *elem = container_of(kref, typeof(*elem), ref_cnt);
> >>  
> >> -	complete(&elem->complete);
> >> +	complete_all(&elem->complete);
> >>  }
> >>  
> >> -int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
> >> +void __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
> >>  {
> > 
> > You should definately put this one change in its own patch doing just
> > that.
> > 
> > Jason
> 
> Please explain a little more. 

I mean just remove the function return in one patch, these functions
are not supposed to fail anyhow.

> I only found this change because the change in rxe_complete to
> repeat the wait_for_completion call didn't work.  This seemed to fix
> it but the man page and comments didn't explain complete_all very
> well.

I didn't mean the complete_all hunk, I ment the __rxe_cleanup hunk :)

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index b88492f5f300..9877a798258a 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -6,7 +6,7 @@ 
 
 #include "rxe.h"
 
-#define RXE_POOL_TIMEOUT	(200)
+#define RXE_POOL_TIMEOUT	(10000)	/* 10 seconds */
 #define RXE_POOL_ALIGN		(16)
 
 static const struct rxe_type_info {
@@ -175,16 +175,17 @@  static void rxe_elem_release(struct kref *kref)
 {
 	struct rxe_pool_elem *elem = container_of(kref, typeof(*elem), ref_cnt);
 
-	complete(&elem->complete);
+	complete_all(&elem->complete);
 }
 
-int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
+void __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
 {
 	struct rxe_pool *pool = elem->pool;
 	struct xarray *xa = &pool->xa;
-	static int timeout = RXE_POOL_TIMEOUT;
+	int timeout = RXE_POOL_TIMEOUT;
+	unsigned long until;
 	unsigned long flags;
-	int ret, err = 0;
+	int ret;
 	void *xa_ret;
 
 	if (sleepable)
@@ -209,39 +210,31 @@  int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
 	 * return to rdma-core
 	 */
 	if (sleepable) {
-		if (!completion_done(&elem->complete) && timeout) {
+		while (!completion_done(&elem->complete) && timeout) {
 			ret = wait_for_completion_timeout(&elem->complete,
 					timeout);
-
-			/* Shouldn't happen. There are still references to
-			 * the object but, rather than deadlock, free the
-			 * object or pass back to rdma-core.
-			 */
-			if (WARN_ON(!ret))
-				err = -EINVAL;
+			WARN_ON(!ret);
 		}
 	} else {
-		unsigned long until = jiffies + timeout;
-
 		/* AH objects are unique in that the destroy_ah verb
 		 * can be called in atomic context. This delay
 		 * replaces the wait_for_completion call above
 		 * when the destroy_ah call is not sleepable
 		 */
-		while (!completion_done(&elem->complete) &&
-				time_before(jiffies, until))
-			mdelay(1);
-
-		if (WARN_ON(!completion_done(&elem->complete)))
-			err = -EINVAL;
+		while (!completion_done(&elem->complete) && timeout) {
+			until = jiffies + timeout;
+			while (!completion_done(&elem->complete) &&
+			       time_before(jiffies, until)) {
+				mdelay(10);
+			}
+			WARN_ON(!completion_done(&elem->complete));
+		}
 	}
 
 	if (pool->cleanup)
 		pool->cleanup(elem);
 
 	atomic_dec(&pool->num_elem);
-
-	return err;
 }
 
 int __rxe_get(struct rxe_pool_elem *elem)
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index d764c51ed6f7..efef4b05d1ed 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -71,7 +71,7 @@  int __rxe_get(struct rxe_pool_elem *elem);
 int __rxe_put(struct rxe_pool_elem *elem);
 #define rxe_put(obj) __rxe_put(&(obj)->elem)
 
-int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable);
+void __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable);
 #define rxe_cleanup(obj) __rxe_cleanup(&(obj)->elem, true)
 #define rxe_cleanup_ah(obj, sleepable) __rxe_cleanup(&(obj)->elem, sleepable)
 
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 5e93dbac17b3..67995c259916 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -218,11 +218,8 @@  static int rxe_alloc_ucontext(struct ib_ucontext *ibuc, struct ib_udata *udata)
 static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
 {
 	struct rxe_ucontext *uc = to_ruc(ibuc);
-	int err;
 
-	err = rxe_cleanup(uc);
-	if (err)
-		rxe_err_uc(uc, "cleanup failed, err = %d", err);
+	rxe_cleanup(uc);
 }
 
 /* pd */
@@ -248,11 +245,8 @@  static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct rxe_pd *pd = to_rpd(ibpd);
-	int err;
 
-	err = rxe_cleanup(pd);
-	if (err)
-		rxe_err_pd(pd, "cleanup failed, err = %d", err);
+	rxe_cleanup(pd);
 
 	return 0;
 }
@@ -266,7 +260,7 @@  static int rxe_create_ah(struct ib_ah *ibah,
 	struct rxe_ah *ah = to_rah(ibah);
 	struct rxe_create_ah_resp __user *uresp = NULL;
 	bool sleepable = init_attr->flags & RDMA_CREATE_AH_SLEEPABLE;
-	int err, cleanup_err;
+	int err;
 
 	if (udata) {
 		/* test if new user provider */
@@ -312,9 +306,7 @@  static int rxe_create_ah(struct ib_ah *ibah,
 	return 0;
 
 err_cleanup:
-	cleanup_err = rxe_cleanup_ah(ah, sleepable);
-	if (cleanup_err)
-		rxe_err_ah(ah, "cleanup failed, err = %d", cleanup_err);
+	rxe_cleanup_ah(ah, sleepable);
 err_out:
 	rxe_err_ah(ah, "returned err = %d", err);
 	return err;
@@ -355,11 +347,8 @@  static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags)
 {
 	struct rxe_ah *ah = to_rah(ibah);
 	bool sleepable = flags & RDMA_DESTROY_AH_SLEEPABLE;
-	int err;
 
-	err = rxe_cleanup_ah(ah, sleepable);
-	if (err)
-		rxe_err_ah(ah, "cleanup failed, err = %d", err);
+	rxe_cleanup_ah(ah, sleepable);
 
 	return 0;
 }
@@ -372,7 +361,7 @@  static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init,
 	struct rxe_pd *pd = to_rpd(ibsrq->pd);
 	struct rxe_srq *srq = to_rsrq(ibsrq);
 	struct rxe_create_srq_resp __user *uresp = NULL;
-	int err, cleanup_err;
+	int err;
 
 	if (udata) {
 		if (udata->outlen < sizeof(*uresp)) {
@@ -414,9 +403,7 @@  static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init,
 	return 0;
 
 err_cleanup:
-	cleanup_err = rxe_cleanup(srq);
-	if (cleanup_err)
-		rxe_err_srq(srq, "cleanup failed, err = %d", cleanup_err);
+	rxe_cleanup(srq);
 err_out:
 	rxe_err_dev(rxe, "returned err = %d", err);
 	return err;
@@ -515,11 +502,8 @@  static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
 	struct rxe_srq *srq = to_rsrq(ibsrq);
-	int err;
 
-	err = rxe_cleanup(srq);
-	if (err)
-		rxe_err_srq(srq, "cleanup failed, err = %d", err);
+	rxe_cleanup(srq);
 
 	return 0;
 }
@@ -532,7 +516,7 @@  static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init,
 	struct rxe_pd *pd = to_rpd(ibqp->pd);
 	struct rxe_qp *qp = to_rqp(ibqp);
 	struct rxe_create_qp_resp __user *uresp = NULL;
-	int err, cleanup_err;
+	int err;
 
 	if (udata) {
 		if (udata->inlen) {
@@ -581,9 +565,7 @@  static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init,
 	return 0;
 
 err_cleanup:
-	cleanup_err = rxe_cleanup(qp);
-	if (cleanup_err)
-		rxe_err_qp(qp, "cleanup failed, err = %d", cleanup_err);
+	rxe_cleanup(qp);
 err_out:
 	rxe_err_dev(rxe, "returned err = %d", err);
 	return err;
@@ -649,9 +631,7 @@  static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 		goto err_out;
 	}
 
-	err = rxe_cleanup(qp);
-	if (err)
-		rxe_err_qp(qp, "cleanup failed, err = %d", err);
+	rxe_cleanup(qp);
 
 	return 0;
 
@@ -1062,7 +1042,7 @@  static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	struct rxe_dev *rxe = to_rdev(dev);
 	struct rxe_cq *cq = to_rcq(ibcq);
 	struct rxe_create_cq_resp __user *uresp = NULL;
-	int err, cleanup_err;
+	int err;
 
 	if (udata) {
 		if (udata->outlen < sizeof(*uresp)) {
@@ -1101,9 +1081,7 @@  static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	return 0;
 
 err_cleanup:
-	cleanup_err = rxe_cleanup(cq);
-	if (cleanup_err)
-		rxe_err_cq(cq, "cleanup failed, err = %d", cleanup_err);
+	rxe_cleanup(cq);
 err_out:
 	rxe_err_dev(rxe, "returned err = %d", err);
 	return err;
@@ -1208,9 +1186,7 @@  static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 		goto err_out;
 	}
 
-	err = rxe_cleanup(cq);
-	if (err)
-		rxe_err_cq(cq, "cleanup failed, err = %d", err);
+	rxe_cleanup(cq);
 
 	return 0;
 
@@ -1258,7 +1234,7 @@  static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start,
 	struct rxe_dev *rxe = to_rdev(ibpd->device);
 	struct rxe_pd *pd = to_rpd(ibpd);
 	struct rxe_mr *mr;
-	int err, cleanup_err;
+	int err;
 
 	if (access & ~RXE_ACCESS_SUPPORTED_MR) {
 		rxe_err_pd(pd, "access = %#x not supported (%#x)", access,
@@ -1290,9 +1266,7 @@  static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start,
 	return &mr->ibmr;
 
 err_cleanup:
-	cleanup_err = rxe_cleanup(mr);
-	if (cleanup_err)
-		rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err);
+	rxe_cleanup(mr);
 err_free:
 	kfree(mr);
 	rxe_err_pd(pd, "returned err = %d", err);
@@ -1339,7 +1313,7 @@  static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
 	struct rxe_dev *rxe = to_rdev(ibpd->device);
 	struct rxe_pd *pd = to_rpd(ibpd);
 	struct rxe_mr *mr;
-	int err, cleanup_err;
+	int err;
 
 	if (mr_type != IB_MR_TYPE_MEM_REG) {
 		err = -EINVAL;
@@ -1370,9 +1344,7 @@  static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
 	return &mr->ibmr;
 
 err_cleanup:
-	cleanup_err = rxe_cleanup(mr);
-	if (cleanup_err)
-		rxe_err_mr(mr, "cleanup failed, err = %d", err);
+	rxe_cleanup(mr);
 err_free:
 	kfree(mr);
 err_out:
@@ -1383,25 +1355,16 @@  static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
 static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
 	struct rxe_mr *mr = to_rmr(ibmr);
-	int err, cleanup_err;
 
 	/* See IBA 10.6.7.2.6 */
 	if (atomic_read(&mr->num_mw) > 0) {
-		err = -EINVAL;
-		rxe_dbg_mr(mr, "mr has mw's bound");
-		goto err_out;
+		rxe_err_mr(mr, "mr has mw's bound");
+		return -EINVAL;
 	}
 
-	cleanup_err = rxe_cleanup(mr);
-	if (cleanup_err)
-		rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err);
-
+	rxe_cleanup(mr);
 	kfree_rcu(mr, elem.rcu);
 	return 0;
-
-err_out:
-	rxe_err_mr(mr, "returned err = %d", err);
-	return err;
 }
 
 static ssize_t parent_show(struct device *device,