diff mbox series

[[PATCH,v3,for-next] ] RDMA/siw: Fix SQ/RQ drain logic

Message ID 20191004125356.20673-1-bmt@zurich.ibm.com (mailing list archive)
State Mainlined
Commit cf049bb31f7101d9672eaf97ade4fdd5171ddf26
Delegated to: Jason Gunthorpe
Headers show
Series [[PATCH,v3,for-next] ] RDMA/siw: Fix SQ/RQ drain logic | expand

Commit Message

Bernard Metzler Oct. 4, 2019, 12:53 p.m. UTC
Storage ULPs (e.g. iSER & NVMeOF) use ib_drain_qp() to
drain QP/CQ. Current SIW's own drain routines do not properly
wait until all SQ/RQ elements are completed and reaped
from the CQ. This may cause touch after free issues.
New logic relies on generic __ib_drain_sq()/__ib_drain_rq()
posting a final work request, which SIW immediately flushes
to CQ.

Fixes: 303ae1cdfdf7 ("rdma/siw: application interface")
Signed-off-by: Krishnamraju Eraparaju <krishna2@chelsio.com>
Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
---
v2 -> v3:
- Handle ib_drain_sq()/ib_drain_rq() calls when QP's
  state is currently locked.

v1 -> v2:
- Accept SQ and RQ work requests, if QP is in ERROR
  state. In that case, immediately flush WR's to CQ.
  This already provides needed functionality to
  support ib_drain_sq()/ib_drain_rq() without extra
  state checking in the fast path.

 drivers/infiniband/sw/siw/siw_main.c  |  20 ----
 drivers/infiniband/sw/siw/siw_verbs.c | 144 ++++++++++++++++++++++----
 2 files changed, 122 insertions(+), 42 deletions(-)

Comments

Bernard Metzler Oct. 18, 2019, 1:22 p.m. UTC | #1
-----"Bernard Metzler" <bmt@zurich.ibm.com> wrote: -----

>To: linux-rdma@vger.kernel.org
>From: "Bernard Metzler" <bmt@zurich.ibm.com>
>Date: 10/04/2019 02:54PM
>Cc: bharat@chelsio.com, jgg@ziepe.ca, nirranjan@chelsio.com,
>krishna2@chelsio.com, bvanassche@acm.org, leon@kernel.org, "Bernard
>Metzler" <bmt@zurich.ibm.com>
>Subject: [[PATCH v3 for-next]] RDMA/siw: Fix SQ/RQ drain logic
>
>Storage ULPs (e.g. iSER & NVMeOF) use ib_drain_qp() to
>drain QP/CQ. Current SIW's own drain routines do not properly
>wait until all SQ/RQ elements are completed and reaped
>from the CQ. This may cause touch after free issues.
>New logic relies on generic __ib_drain_sq()/__ib_drain_rq()
>posting a final work request, which SIW immediately flushes
>to CQ.
>
>Fixes: 303ae1cdfdf7 ("rdma/siw: application interface")
>Signed-off-by: Krishnamraju Eraparaju <krishna2@chelsio.com>
>Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
>---
>v2 -> v3:
>- Handle ib_drain_sq()/ib_drain_rq() calls when QP's
>  state is currently locked.
>
>v1 -> v2:
>- Accept SQ and RQ work requests, if QP is in ERROR
>  state. In that case, immediately flush WR's to CQ.
>  This already provides needed functionality to
>  support ib_drain_sq()/ib_drain_rq() without extra
>  state checking in the fast path.
>
> drivers/infiniband/sw/siw/siw_main.c  |  20 ----
> drivers/infiniband/sw/siw/siw_verbs.c | 144
>++++++++++++++++++++++----
> 2 files changed, 122 insertions(+), 42 deletions(-)
>

Is there any more comment on that one? I think it has been
sufficiently discussed and it is well understood, and it fixes
the issue at hand.

Thanks very much,
Bernard.

>diff --git a/drivers/infiniband/sw/siw/siw_main.c
>b/drivers/infiniband/sw/siw/siw_main.c
>index 05a92f997f60..fb01407a310f 100644
>--- a/drivers/infiniband/sw/siw/siw_main.c
>+++ b/drivers/infiniband/sw/siw/siw_main.c
>@@ -248,24 +248,6 @@ static struct ib_qp *siw_get_base_qp(struct
>ib_device *base_dev, int id)
> 	return NULL;
> }
> 
>-static void siw_verbs_sq_flush(struct ib_qp *base_qp)
>-{
>-	struct siw_qp *qp = to_siw_qp(base_qp);
>-
>-	down_write(&qp->state_lock);
>-	siw_sq_flush(qp);
>-	up_write(&qp->state_lock);
>-}
>-
>-static void siw_verbs_rq_flush(struct ib_qp *base_qp)
>-{
>-	struct siw_qp *qp = to_siw_qp(base_qp);
>-
>-	down_write(&qp->state_lock);
>-	siw_rq_flush(qp);
>-	up_write(&qp->state_lock);
>-}
>-
> static const struct ib_device_ops siw_device_ops = {
> 	.owner = THIS_MODULE,
> 	.uverbs_abi_ver = SIW_ABI_VERSION,
>@@ -284,8 +266,6 @@ static const struct ib_device_ops siw_device_ops
>= {
> 	.destroy_cq = siw_destroy_cq,
> 	.destroy_qp = siw_destroy_qp,
> 	.destroy_srq = siw_destroy_srq,
>-	.drain_rq = siw_verbs_rq_flush,
>-	.drain_sq = siw_verbs_sq_flush,
> 	.get_dma_mr = siw_get_dma_mr,
> 	.get_port_immutable = siw_get_port_immutable,
> 	.iw_accept = siw_accept,
>diff --git a/drivers/infiniband/sw/siw/siw_verbs.c
>b/drivers/infiniband/sw/siw/siw_verbs.c
>index 869e02b69a01..c0574ddc98fa 100644
>--- a/drivers/infiniband/sw/siw/siw_verbs.c
>+++ b/drivers/infiniband/sw/siw/siw_verbs.c
>@@ -687,6 +687,47 @@ static int siw_copy_inline_sgl(const struct
>ib_send_wr *core_wr,
> 	return bytes;
> }
> 
>+/* Complete SQ WR's without processing */
>+static int siw_sq_flush_wr(struct siw_qp *qp, const struct
>ib_send_wr *wr,
>+			   const struct ib_send_wr **bad_wr)
>+{
>+	struct siw_sqe sqe = {};
>+	int rv = 0;
>+
>+	while (wr) {
>+		sqe.id = wr->wr_id;
>+		sqe.opcode = wr->opcode;
>+		rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR);
>+		if (rv) {
>+			if (bad_wr)
>+				*bad_wr = wr;
>+			break;
>+		}
>+		wr = wr->next;
>+	}
>+	return rv;
>+}
>+
>+/* Complete RQ WR's without processing */
>+static int siw_rq_flush_wr(struct siw_qp *qp, const struct
>ib_recv_wr *wr,
>+			   const struct ib_recv_wr **bad_wr)
>+{
>+	struct siw_rqe rqe = {};
>+	int rv = 0;
>+
>+	while (wr) {
>+		rqe.id = wr->wr_id;
>+		rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR);
>+		if (rv) {
>+			if (bad_wr)
>+				*bad_wr = wr;
>+			break;
>+		}
>+		wr = wr->next;
>+	}
>+	return rv;
>+}
>+
> /*
>  * siw_post_send()
>  *
>@@ -705,26 +746,54 @@ int siw_post_send(struct ib_qp *base_qp, const
>struct ib_send_wr *wr,
> 	unsigned long flags;
> 	int rv = 0;
> 
>+	if (wr && !qp->kernel_verbs) {
>+		siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
>+		*bad_wr = wr;
>+		return -EINVAL;
>+	}
>+
> 	/*
> 	 * Try to acquire QP state lock. Must be non-blocking
> 	 * to accommodate kernel clients needs.
> 	 */
> 	if (!down_read_trylock(&qp->state_lock)) {
>-		*bad_wr = wr;
>-		siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state);
>-		return -ENOTCONN;
>+		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
>+			/*
>+			 * ERROR state is final, so we can be sure
>+			 * this state will not change as long as the QP
>+			 * exists.
>+			 *
>+			 * This handles an ib_drain_sq() call with
>+			 * a concurrent request to set the QP state
>+			 * to ERROR.
>+			 */
>+			rv = siw_sq_flush_wr(qp, wr, bad_wr);
>+		} else {
>+			siw_dbg_qp(qp, "QP locked, state %d\n",
>+				   qp->attrs.state);
>+			*bad_wr = wr;
>+			rv = -ENOTCONN;
>+		}
>+		return rv;
> 	}
> 	if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
>+		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
>+			/*
>+			 * Immediately flush this WR to CQ, if QP
>+			 * is in ERROR state. SQ is guaranteed to
>+			 * be empty, so WR complets in-order.
>+			 *
>+			 * Typically triggered by ib_drain_sq().
>+			 */
>+			rv = siw_sq_flush_wr(qp, wr, bad_wr);
>+		} else {
>+			siw_dbg_qp(qp, "QP out of state %d\n",
>+				   qp->attrs.state);
>+			*bad_wr = wr;
>+			rv = -ENOTCONN;
>+		}
> 		up_read(&qp->state_lock);
>-		*bad_wr = wr;
>-		siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state);
>-		return -ENOTCONN;
>-	}
>-	if (wr && !qp->kernel_verbs) {
>-		siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
>-		up_read(&qp->state_lock);
>-		*bad_wr = wr;
>-		return -EINVAL;
>+		return rv;
> 	}
> 	spin_lock_irqsave(&qp->sq_lock, flags);
> 
>@@ -919,24 +988,55 @@ int siw_post_receive(struct ib_qp *base_qp,
>const struct ib_recv_wr *wr,
> 		*bad_wr = wr;
> 		return -EOPNOTSUPP; /* what else from errno.h? */
> 	}
>-	/*
>-	 * Try to acquire QP state lock. Must be non-blocking
>-	 * to accommodate kernel clients needs.
>-	 */
>-	if (!down_read_trylock(&qp->state_lock)) {
>-		*bad_wr = wr;
>-		return -ENOTCONN;
>-	}
> 	if (!qp->kernel_verbs) {
> 		siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n");
> 		up_read(&qp->state_lock);
> 		*bad_wr = wr;
> 		return -EINVAL;
> 	}
>+
>+	/*
>+	 * Try to acquire QP state lock. Must be non-blocking
>+	 * to accommodate kernel clients needs.
>+	 */
>+	if (!down_read_trylock(&qp->state_lock)) {
>+		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
>+			/*
>+			 * ERROR state is final, so we can be sure
>+			 * this state will not change as long as the QP
>+			 * exists.
>+			 *
>+			 * This handles an ib_drain_rq() call with
>+			 * a concurrent request to set the QP state
>+			 * to ERROR.
>+			 */
>+			rv = siw_rq_flush_wr(qp, wr, bad_wr);
>+		} else {
>+			siw_dbg_qp(qp, "QP locked, state %d\n",
>+				   qp->attrs.state);
>+			*bad_wr = wr;
>+			rv = -ENOTCONN;
>+		}
>+		return rv;
>+	}
> 	if (qp->attrs.state > SIW_QP_STATE_RTS) {
>+		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
>+			/*
>+			 * Immediately flush this WR to CQ, if QP
>+			 * is in ERROR state. RQ is guaranteed to
>+			 * be empty, so WR complets in-order.
>+			 *
>+			 * Typically triggered by ib_drain_rq().
>+			 */
>+			rv = siw_rq_flush_wr(qp, wr, bad_wr);
>+		} else {
>+			siw_dbg_qp(qp, "QP out of state %d\n",
>+				   qp->attrs.state);
>+			*bad_wr = wr;
>+			rv = -ENOTCONN;
>+		}
> 		up_read(&qp->state_lock);
>-		*bad_wr = wr;
>-		return -EINVAL;
>+		return rv;
> 	}
> 	/*
> 	 * Serialize potentially multiple producers.
>-- 
>2.17.2
>
>
Jason Gunthorpe Oct. 22, 2019, 5:10 p.m. UTC | #2
On Fri, Oct 04, 2019 at 02:53:56PM +0200, Bernard Metzler wrote:
> Storage ULPs (e.g. iSER & NVMeOF) use ib_drain_qp() to
> drain QP/CQ. Current SIW's own drain routines do not properly
> wait until all SQ/RQ elements are completed and reaped
> from the CQ. This may cause touch after free issues.
> New logic relies on generic __ib_drain_sq()/__ib_drain_rq()
> posting a final work request, which SIW immediately flushes
> to CQ.
> 
> Fixes: 303ae1cdfdf7 ("rdma/siw: application interface")
> Signed-off-by: Krishnamraju Eraparaju <krishna2@chelsio.com>
> Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
> ---
> v2 -> v3:
> - Handle ib_drain_sq()/ib_drain_rq() calls when QP's
>   state is currently locked.
> 
> v1 -> v2:
> - Accept SQ and RQ work requests, if QP is in ERROR
>   state. In that case, immediately flush WR's to CQ.
>   This already provides needed functionality to
>   support ib_drain_sq()/ib_drain_rq() without extra
>   state checking in the fast path.
> 
>  drivers/infiniband/sw/siw/siw_main.c  |  20 ----
>  drivers/infiniband/sw/siw/siw_verbs.c | 144 ++++++++++++++++++++++----
>  2 files changed, 122 insertions(+), 42 deletions(-)

Applied to for-next, thanks

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c
index 05a92f997f60..fb01407a310f 100644
--- a/drivers/infiniband/sw/siw/siw_main.c
+++ b/drivers/infiniband/sw/siw/siw_main.c
@@ -248,24 +248,6 @@  static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
 	return NULL;
 }
 
-static void siw_verbs_sq_flush(struct ib_qp *base_qp)
-{
-	struct siw_qp *qp = to_siw_qp(base_qp);
-
-	down_write(&qp->state_lock);
-	siw_sq_flush(qp);
-	up_write(&qp->state_lock);
-}
-
-static void siw_verbs_rq_flush(struct ib_qp *base_qp)
-{
-	struct siw_qp *qp = to_siw_qp(base_qp);
-
-	down_write(&qp->state_lock);
-	siw_rq_flush(qp);
-	up_write(&qp->state_lock);
-}
-
 static const struct ib_device_ops siw_device_ops = {
 	.owner = THIS_MODULE,
 	.uverbs_abi_ver = SIW_ABI_VERSION,
@@ -284,8 +266,6 @@  static const struct ib_device_ops siw_device_ops = {
 	.destroy_cq = siw_destroy_cq,
 	.destroy_qp = siw_destroy_qp,
 	.destroy_srq = siw_destroy_srq,
-	.drain_rq = siw_verbs_rq_flush,
-	.drain_sq = siw_verbs_sq_flush,
 	.get_dma_mr = siw_get_dma_mr,
 	.get_port_immutable = siw_get_port_immutable,
 	.iw_accept = siw_accept,
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 869e02b69a01..c0574ddc98fa 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -687,6 +687,47 @@  static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
 	return bytes;
 }
 
+/* Complete SQ WR's without processing */
+static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr,
+			   const struct ib_send_wr **bad_wr)
+{
+	struct siw_sqe sqe = {};
+	int rv = 0;
+
+	while (wr) {
+		sqe.id = wr->wr_id;
+		sqe.opcode = wr->opcode;
+		rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR);
+		if (rv) {
+			if (bad_wr)
+				*bad_wr = wr;
+			break;
+		}
+		wr = wr->next;
+	}
+	return rv;
+}
+
+/* Complete RQ WR's without processing */
+static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr,
+			   const struct ib_recv_wr **bad_wr)
+{
+	struct siw_rqe rqe = {};
+	int rv = 0;
+
+	while (wr) {
+		rqe.id = wr->wr_id;
+		rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR);
+		if (rv) {
+			if (bad_wr)
+				*bad_wr = wr;
+			break;
+		}
+		wr = wr->next;
+	}
+	return rv;
+}
+
 /*
  * siw_post_send()
  *
@@ -705,26 +746,54 @@  int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
 	unsigned long flags;
 	int rv = 0;
 
+	if (wr && !qp->kernel_verbs) {
+		siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
+		*bad_wr = wr;
+		return -EINVAL;
+	}
+
 	/*
 	 * Try to acquire QP state lock. Must be non-blocking
 	 * to accommodate kernel clients needs.
 	 */
 	if (!down_read_trylock(&qp->state_lock)) {
-		*bad_wr = wr;
-		siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state);
-		return -ENOTCONN;
+		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
+			/*
+			 * ERROR state is final, so we can be sure
+			 * this state will not change as long as the QP
+			 * exists.
+			 *
+			 * This handles an ib_drain_sq() call with
+			 * a concurrent request to set the QP state
+			 * to ERROR.
+			 */
+			rv = siw_sq_flush_wr(qp, wr, bad_wr);
+		} else {
+			siw_dbg_qp(qp, "QP locked, state %d\n",
+				   qp->attrs.state);
+			*bad_wr = wr;
+			rv = -ENOTCONN;
+		}
+		return rv;
 	}
 	if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
+		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
+			/*
+			 * Immediately flush this WR to CQ, if QP
+			 * is in ERROR state. SQ is guaranteed to
+			 * be empty, so WR complets in-order.
+			 *
+			 * Typically triggered by ib_drain_sq().
+			 */
+			rv = siw_sq_flush_wr(qp, wr, bad_wr);
+		} else {
+			siw_dbg_qp(qp, "QP out of state %d\n",
+				   qp->attrs.state);
+			*bad_wr = wr;
+			rv = -ENOTCONN;
+		}
 		up_read(&qp->state_lock);
-		*bad_wr = wr;
-		siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state);
-		return -ENOTCONN;
-	}
-	if (wr && !qp->kernel_verbs) {
-		siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
-		up_read(&qp->state_lock);
-		*bad_wr = wr;
-		return -EINVAL;
+		return rv;
 	}
 	spin_lock_irqsave(&qp->sq_lock, flags);
 
@@ -919,24 +988,55 @@  int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
 		*bad_wr = wr;
 		return -EOPNOTSUPP; /* what else from errno.h? */
 	}
-	/*
-	 * Try to acquire QP state lock. Must be non-blocking
-	 * to accommodate kernel clients needs.
-	 */
-	if (!down_read_trylock(&qp->state_lock)) {
-		*bad_wr = wr;
-		return -ENOTCONN;
-	}
 	if (!qp->kernel_verbs) {
 		siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n");
 		up_read(&qp->state_lock);
 		*bad_wr = wr;
 		return -EINVAL;
 	}
+
+	/*
+	 * Try to acquire QP state lock. Must be non-blocking
+	 * to accommodate kernel clients needs.
+	 */
+	if (!down_read_trylock(&qp->state_lock)) {
+		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
+			/*
+			 * ERROR state is final, so we can be sure
+			 * this state will not change as long as the QP
+			 * exists.
+			 *
+			 * This handles an ib_drain_rq() call with
+			 * a concurrent request to set the QP state
+			 * to ERROR.
+			 */
+			rv = siw_rq_flush_wr(qp, wr, bad_wr);
+		} else {
+			siw_dbg_qp(qp, "QP locked, state %d\n",
+				   qp->attrs.state);
+			*bad_wr = wr;
+			rv = -ENOTCONN;
+		}
+		return rv;
+	}
 	if (qp->attrs.state > SIW_QP_STATE_RTS) {
+		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
+			/*
+			 * Immediately flush this WR to CQ, if QP
+			 * is in ERROR state. RQ is guaranteed to
+			 * be empty, so WR complets in-order.
+			 *
+			 * Typically triggered by ib_drain_rq().
+			 */
+			rv = siw_rq_flush_wr(qp, wr, bad_wr);
+		} else {
+			siw_dbg_qp(qp, "QP out of state %d\n",
+				   qp->attrs.state);
+			*bad_wr = wr;
+			rv = -ENOTCONN;
+		}
 		up_read(&qp->state_lock);
-		*bad_wr = wr;
-		return -EINVAL;
+		return rv;
 	}
 	/*
 	 * Serialize potentially multiple producers.