Message ID | 20191004125356.20673-1-bmt@zurich.ibm.com (mailing list archive) |
---|---|
State | Mainlined |
Commit | cf049bb31f7101d9672eaf97ade4fdd5171ddf26 |
Delegated to: | Jason Gunthorpe |
Headers | show |
Series | [PATCH v3 for-next] RDMA/siw: Fix SQ/RQ drain logic | expand
-----"Bernard Metzler" <bmt@zurich.ibm.com> wrote: ----- >To: linux-rdma@vger.kernel.org >From: "Bernard Metzler" <bmt@zurich.ibm.com> >Date: 10/04/2019 02:54PM >Cc: bharat@chelsio.com, jgg@ziepe.ca, nirranjan@chelsio.com, >krishna2@chelsio.com, bvanassche@acm.org, leon@kernel.org, "Bernard >Metzler" <bmt@zurich.ibm.com> >Subject: [[PATCH v3 for-next]] RDMA/siw: Fix SQ/RQ drain logic > >Storage ULPs (e.g. iSER & NVMeOF) use ib_drain_qp() to >drain QP/CQ. Current SIW's own drain routines do not properly >wait until all SQ/RQ elements are completed and reaped >from the CQ. This may cause touch after free issues. >New logic relies on generic __ib_drain_sq()/__ib_drain_rq() >posting a final work request, which SIW immediately flushes >to CQ. > >Fixes: 303ae1cdfdf7 ("rdma/siw: application interface") >Signed-off-by: Krishnamraju Eraparaju <krishna2@chelsio.com> >Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com> >--- >v2 -> v3: >- Handle ib_drain_sq()/ib_drain_rq() calls when QP's > state is currently locked. > >v1 -> v2: >- Accept SQ and RQ work requests, if QP is in ERROR > state. In that case, immediately flush WR's to CQ. > This already provides needed functionality to > support ib_drain_sq()/ib_drain_rq() without extra > state checking in the fast path. > > drivers/infiniband/sw/siw/siw_main.c | 20 ---- > drivers/infiniband/sw/siw/siw_verbs.c | 144 >++++++++++++++++++++++---- > 2 files changed, 122 insertions(+), 42 deletions(-) > Is there any more comment on that one? I think it has been sufficiently discussed and it is well understood, and it fixes the issue at hand. Thanks very much, Bernard. 
>diff --git a/drivers/infiniband/sw/siw/siw_main.c >b/drivers/infiniband/sw/siw/siw_main.c >index 05a92f997f60..fb01407a310f 100644 >--- a/drivers/infiniband/sw/siw/siw_main.c >+++ b/drivers/infiniband/sw/siw/siw_main.c >@@ -248,24 +248,6 @@ static struct ib_qp *siw_get_base_qp(struct >ib_device *base_dev, int id) > return NULL; > } > >-static void siw_verbs_sq_flush(struct ib_qp *base_qp) >-{ >- struct siw_qp *qp = to_siw_qp(base_qp); >- >- down_write(&qp->state_lock); >- siw_sq_flush(qp); >- up_write(&qp->state_lock); >-} >- >-static void siw_verbs_rq_flush(struct ib_qp *base_qp) >-{ >- struct siw_qp *qp = to_siw_qp(base_qp); >- >- down_write(&qp->state_lock); >- siw_rq_flush(qp); >- up_write(&qp->state_lock); >-} >- > static const struct ib_device_ops siw_device_ops = { > .owner = THIS_MODULE, > .uverbs_abi_ver = SIW_ABI_VERSION, >@@ -284,8 +266,6 @@ static const struct ib_device_ops siw_device_ops >= { > .destroy_cq = siw_destroy_cq, > .destroy_qp = siw_destroy_qp, > .destroy_srq = siw_destroy_srq, >- .drain_rq = siw_verbs_rq_flush, >- .drain_sq = siw_verbs_sq_flush, > .get_dma_mr = siw_get_dma_mr, > .get_port_immutable = siw_get_port_immutable, > .iw_accept = siw_accept, >diff --git a/drivers/infiniband/sw/siw/siw_verbs.c >b/drivers/infiniband/sw/siw/siw_verbs.c >index 869e02b69a01..c0574ddc98fa 100644 >--- a/drivers/infiniband/sw/siw/siw_verbs.c >+++ b/drivers/infiniband/sw/siw/siw_verbs.c >@@ -687,6 +687,47 @@ static int siw_copy_inline_sgl(const struct >ib_send_wr *core_wr, > return bytes; > } > >+/* Complete SQ WR's without processing */ >+static int siw_sq_flush_wr(struct siw_qp *qp, const struct >ib_send_wr *wr, >+ const struct ib_send_wr **bad_wr) >+{ >+ struct siw_sqe sqe = {}; >+ int rv = 0; >+ >+ while (wr) { >+ sqe.id = wr->wr_id; >+ sqe.opcode = wr->opcode; >+ rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR); >+ if (rv) { >+ if (bad_wr) >+ *bad_wr = wr; >+ break; >+ } >+ wr = wr->next; >+ } >+ return rv; >+} >+ >+/* Complete RQ WR's without 
processing */ >+static int siw_rq_flush_wr(struct siw_qp *qp, const struct >ib_recv_wr *wr, >+ const struct ib_recv_wr **bad_wr) >+{ >+ struct siw_rqe rqe = {}; >+ int rv = 0; >+ >+ while (wr) { >+ rqe.id = wr->wr_id; >+ rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR); >+ if (rv) { >+ if (bad_wr) >+ *bad_wr = wr; >+ break; >+ } >+ wr = wr->next; >+ } >+ return rv; >+} >+ > /* > * siw_post_send() > * >@@ -705,26 +746,54 @@ int siw_post_send(struct ib_qp *base_qp, const >struct ib_send_wr *wr, > unsigned long flags; > int rv = 0; > >+ if (wr && !qp->kernel_verbs) { >+ siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); >+ *bad_wr = wr; >+ return -EINVAL; >+ } >+ > /* > * Try to acquire QP state lock. Must be non-blocking > * to accommodate kernel clients needs. > */ > if (!down_read_trylock(&qp->state_lock)) { >- *bad_wr = wr; >- siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state); >- return -ENOTCONN; >+ if (qp->attrs.state == SIW_QP_STATE_ERROR) { >+ /* >+ * ERROR state is final, so we can be sure >+ * this state will not change as long as the QP >+ * exists. >+ * >+ * This handles an ib_drain_sq() call with >+ * a concurrent request to set the QP state >+ * to ERROR. >+ */ >+ rv = siw_sq_flush_wr(qp, wr, bad_wr); >+ } else { >+ siw_dbg_qp(qp, "QP locked, state %d\n", >+ qp->attrs.state); >+ *bad_wr = wr; >+ rv = -ENOTCONN; >+ } >+ return rv; > } > if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) { >+ if (qp->attrs.state == SIW_QP_STATE_ERROR) { >+ /* >+ * Immediately flush this WR to CQ, if QP >+ * is in ERROR state. SQ is guaranteed to >+ * be empty, so WR complets in-order. >+ * >+ * Typically triggered by ib_drain_sq(). 
>+ */ >+ rv = siw_sq_flush_wr(qp, wr, bad_wr); >+ } else { >+ siw_dbg_qp(qp, "QP out of state %d\n", >+ qp->attrs.state); >+ *bad_wr = wr; >+ rv = -ENOTCONN; >+ } > up_read(&qp->state_lock); >- *bad_wr = wr; >- siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state); >- return -ENOTCONN; >- } >- if (wr && !qp->kernel_verbs) { >- siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); >- up_read(&qp->state_lock); >- *bad_wr = wr; >- return -EINVAL; >+ return rv; > } > spin_lock_irqsave(&qp->sq_lock, flags); > >@@ -919,24 +988,55 @@ int siw_post_receive(struct ib_qp *base_qp, >const struct ib_recv_wr *wr, > *bad_wr = wr; > return -EOPNOTSUPP; /* what else from errno.h? */ > } >- /* >- * Try to acquire QP state lock. Must be non-blocking >- * to accommodate kernel clients needs. >- */ >- if (!down_read_trylock(&qp->state_lock)) { >- *bad_wr = wr; >- return -ENOTCONN; >- } > if (!qp->kernel_verbs) { > siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n"); > up_read(&qp->state_lock); > *bad_wr = wr; > return -EINVAL; > } >+ >+ /* >+ * Try to acquire QP state lock. Must be non-blocking >+ * to accommodate kernel clients needs. >+ */ >+ if (!down_read_trylock(&qp->state_lock)) { >+ if (qp->attrs.state == SIW_QP_STATE_ERROR) { >+ /* >+ * ERROR state is final, so we can be sure >+ * this state will not change as long as the QP >+ * exists. >+ * >+ * This handles an ib_drain_rq() call with >+ * a concurrent request to set the QP state >+ * to ERROR. >+ */ >+ rv = siw_rq_flush_wr(qp, wr, bad_wr); >+ } else { >+ siw_dbg_qp(qp, "QP locked, state %d\n", >+ qp->attrs.state); >+ *bad_wr = wr; >+ rv = -ENOTCONN; >+ } >+ return rv; >+ } > if (qp->attrs.state > SIW_QP_STATE_RTS) { >+ if (qp->attrs.state == SIW_QP_STATE_ERROR) { >+ /* >+ * Immediately flush this WR to CQ, if QP >+ * is in ERROR state. RQ is guaranteed to >+ * be empty, so WR complets in-order. >+ * >+ * Typically triggered by ib_drain_rq(). 
>+ */ >+ rv = siw_rq_flush_wr(qp, wr, bad_wr); >+ } else { >+ siw_dbg_qp(qp, "QP out of state %d\n", >+ qp->attrs.state); >+ *bad_wr = wr; >+ rv = -ENOTCONN; >+ } > up_read(&qp->state_lock); >- *bad_wr = wr; >- return -EINVAL; >+ return rv; > } > /* > * Serialize potentially multiple producers. >-- >2.17.2 > >
On Fri, Oct 04, 2019 at 02:53:56PM +0200, Bernard Metzler wrote: > Storage ULPs (e.g. iSER & NVMeOF) use ib_drain_qp() to > drain QP/CQ. Current SIW's own drain routines do not properly > wait until all SQ/RQ elements are completed and reaped > from the CQ. This may cause touch after free issues. > New logic relies on generic __ib_drain_sq()/__ib_drain_rq() > posting a final work request, which SIW immediately flushes > to CQ. > > Fixes: 303ae1cdfdf7 ("rdma/siw: application interface") > Signed-off-by: Krishnamraju Eraparaju <krishna2@chelsio.com> > Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com> > --- > v2 -> v3: > - Handle ib_drain_sq()/ib_drain_rq() calls when QP's > state is currently locked. > > v1 -> v2: > - Accept SQ and RQ work requests, if QP is in ERROR > state. In that case, immediately flush WR's to CQ. > This already provides needed functionality to > support ib_drain_sq()/ib_drain_rq() without extra > state checking in the fast path. > > drivers/infiniband/sw/siw/siw_main.c | 20 ---- > drivers/infiniband/sw/siw/siw_verbs.c | 144 ++++++++++++++++++++++---- > 2 files changed, 122 insertions(+), 42 deletions(-) Applied to for-next, thanks Jason
diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 05a92f997f60..fb01407a310f 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -248,24 +248,6 @@ static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) return NULL; } -static void siw_verbs_sq_flush(struct ib_qp *base_qp) -{ - struct siw_qp *qp = to_siw_qp(base_qp); - - down_write(&qp->state_lock); - siw_sq_flush(qp); - up_write(&qp->state_lock); -} - -static void siw_verbs_rq_flush(struct ib_qp *base_qp) -{ - struct siw_qp *qp = to_siw_qp(base_qp); - - down_write(&qp->state_lock); - siw_rq_flush(qp); - up_write(&qp->state_lock); -} - static const struct ib_device_ops siw_device_ops = { .owner = THIS_MODULE, .uverbs_abi_ver = SIW_ABI_VERSION, @@ -284,8 +266,6 @@ static const struct ib_device_ops siw_device_ops = { .destroy_cq = siw_destroy_cq, .destroy_qp = siw_destroy_qp, .destroy_srq = siw_destroy_srq, - .drain_rq = siw_verbs_rq_flush, - .drain_sq = siw_verbs_sq_flush, .get_dma_mr = siw_get_dma_mr, .get_port_immutable = siw_get_port_immutable, .iw_accept = siw_accept, diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 869e02b69a01..c0574ddc98fa 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -687,6 +687,47 @@ static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, return bytes; } +/* Complete SQ WR's without processing */ +static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + struct siw_sqe sqe = {}; + int rv = 0; + + while (wr) { + sqe.id = wr->wr_id; + sqe.opcode = wr->opcode; + rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR); + if (rv) { + if (bad_wr) + *bad_wr = wr; + break; + } + wr = wr->next; + } + return rv; +} + +/* Complete RQ WR's without processing */ +static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr, 
+ const struct ib_recv_wr **bad_wr) +{ + struct siw_rqe rqe = {}; + int rv = 0; + + while (wr) { + rqe.id = wr->wr_id; + rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR); + if (rv) { + if (bad_wr) + *bad_wr = wr; + break; + } + wr = wr->next; + } + return rv; +} + /* * siw_post_send() * @@ -705,26 +746,54 @@ int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, unsigned long flags; int rv = 0; + if (wr && !qp->kernel_verbs) { + siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); + *bad_wr = wr; + return -EINVAL; + } + /* * Try to acquire QP state lock. Must be non-blocking * to accommodate kernel clients needs. */ if (!down_read_trylock(&qp->state_lock)) { - *bad_wr = wr; - siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state); - return -ENOTCONN; + if (qp->attrs.state == SIW_QP_STATE_ERROR) { + /* + * ERROR state is final, so we can be sure + * this state will not change as long as the QP + * exists. + * + * This handles an ib_drain_sq() call with + * a concurrent request to set the QP state + * to ERROR. + */ + rv = siw_sq_flush_wr(qp, wr, bad_wr); + } else { + siw_dbg_qp(qp, "QP locked, state %d\n", + qp->attrs.state); + *bad_wr = wr; + rv = -ENOTCONN; + } + return rv; } if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) { + if (qp->attrs.state == SIW_QP_STATE_ERROR) { + /* + * Immediately flush this WR to CQ, if QP + * is in ERROR state. SQ is guaranteed to + * be empty, so WR complets in-order. + * + * Typically triggered by ib_drain_sq(). 
+ */ + rv = siw_sq_flush_wr(qp, wr, bad_wr); + } else { + siw_dbg_qp(qp, "QP out of state %d\n", + qp->attrs.state); + *bad_wr = wr; + rv = -ENOTCONN; + } up_read(&qp->state_lock); - *bad_wr = wr; - siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state); - return -ENOTCONN; - } - if (wr && !qp->kernel_verbs) { - siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); - up_read(&qp->state_lock); - *bad_wr = wr; - return -EINVAL; + return rv; } spin_lock_irqsave(&qp->sq_lock, flags); @@ -919,24 +988,55 @@ int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, *bad_wr = wr; return -EOPNOTSUPP; /* what else from errno.h? */ } - /* - * Try to acquire QP state lock. Must be non-blocking - * to accommodate kernel clients needs. - */ - if (!down_read_trylock(&qp->state_lock)) { - *bad_wr = wr; - return -ENOTCONN; - } if (!qp->kernel_verbs) { siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n"); up_read(&qp->state_lock); *bad_wr = wr; return -EINVAL; } + + /* + * Try to acquire QP state lock. Must be non-blocking + * to accommodate kernel clients needs. + */ + if (!down_read_trylock(&qp->state_lock)) { + if (qp->attrs.state == SIW_QP_STATE_ERROR) { + /* + * ERROR state is final, so we can be sure + * this state will not change as long as the QP + * exists. + * + * This handles an ib_drain_rq() call with + * a concurrent request to set the QP state + * to ERROR. + */ + rv = siw_rq_flush_wr(qp, wr, bad_wr); + } else { + siw_dbg_qp(qp, "QP locked, state %d\n", + qp->attrs.state); + *bad_wr = wr; + rv = -ENOTCONN; + } + return rv; + } if (qp->attrs.state > SIW_QP_STATE_RTS) { + if (qp->attrs.state == SIW_QP_STATE_ERROR) { + /* + * Immediately flush this WR to CQ, if QP + * is in ERROR state. RQ is guaranteed to + * be empty, so WR complets in-order. + * + * Typically triggered by ib_drain_rq(). 
+ */ + rv = siw_rq_flush_wr(qp, wr, bad_wr); + } else { + siw_dbg_qp(qp, "QP out of state %d\n", + qp->attrs.state); + *bad_wr = wr; + rv = -ENOTCONN; + } up_read(&qp->state_lock); - *bad_wr = wr; - return -EINVAL; + return rv; } /* * Serialize potentially multiple producers.