diff mbox series

RDMA/siw: Fix immediate work request flush to completion queue.

Message ID 20221101153705.626906-1-bmt@zurich.ibm.com (mailing list archive)
State Superseded
Headers show
Series RDMA/siw: Fix immediate work request flush to completion queue. | expand

Commit Message

Bernard Metzler Nov. 1, 2022, 3:37 p.m. UTC
Correctly set send queue element opcode during immediate work request
flushing in post sendqueue operation, if the QP is in ERROR state.
An undefined opcode value results in out-of-bounds access to an array
for mapping the opcode between siw internal and RDMA core representation
in work completion generation. It resulted in a KASAN BUG report
of type 'global-out-of-bounds' during NFSoRDMA testing.
This patch further fixes a potential case of a malicious user who may
write undefined values for completion queue elements' status or opcode,
if the CQ is memory mapped to user land. It avoids the same out-of-bounds
access to arrays for status and opcode mapping as described above.

Fixes: 303ae1cdfdf7 ("rdma/siw: application interface")
Fixes: b0fff7317bb4 ("rdma/siw: completion queue methods")

Reported-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
---
 drivers/infiniband/sw/siw/siw_cq.c    | 24 ++++++++++++++--
 drivers/infiniband/sw/siw/siw_verbs.c | 40 ++++++++++++++++++++++++---
 2 files changed, 58 insertions(+), 6 deletions(-)

Comments

Tom Talpey Nov. 1, 2022, 9:05 p.m. UTC | #1
On 11/1/2022 11:37 AM, Bernard Metzler wrote:
> Correctly set send queue element opcode during immediate work request
> flushing in post sendqueue operation, if the QP is in ERROR state.
> An undefined ocode value results in out-of-bounds access to an array
> for mapping the opcode between siw internal and RDMA core representation
> in work completion generation. It resulted in a KASAN BUG report
> of type 'global-out-of-bounds' during NFSoRDMA testing.
> This patch further fixes a potential case of a malicious user which may
> write undefined values for completion queue elements status or opcode,
> if the CQ is memory mapped to user land. It avoids the same out-of-bounds
> access to arrays for status and opcode mapping as described above.
> 
> Fixes: 303ae1cdfdf7 ("rdma/siw: application interface")
> Fixes: b0fff7317bb4 ("rdma/siw: completion queue methods")
> 
> Reported-by: Olga Kornievskaia <kolga@netapp.com>
> Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
> ---
>   drivers/infiniband/sw/siw/siw_cq.c    | 24 ++++++++++++++--
>   drivers/infiniband/sw/siw/siw_verbs.c | 40 ++++++++++++++++++++++++---
>   2 files changed, 58 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c
> index d68e37859e73..acc7bcd538b5 100644
> --- a/drivers/infiniband/sw/siw/siw_cq.c
> +++ b/drivers/infiniband/sw/siw/siw_cq.c
> @@ -56,8 +56,6 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc)
>   	if (READ_ONCE(cqe->flags) & SIW_WQE_VALID) {
>   		memset(wc, 0, sizeof(*wc));
>   		wc->wr_id = cqe->id;
> -		wc->status = map_cqe_status[cqe->status].ib;
> -		wc->opcode = map_wc_opcode[cqe->opcode];
>   		wc->byte_len = cqe->bytes;
>   
>   		/*
> @@ -71,10 +69,32 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc)
>   				wc->wc_flags = IB_WC_WITH_INVALIDATE;
>   			}
>   			wc->qp = cqe->base_qp;
> +			wc->opcode = map_wc_opcode[cqe->opcode];
> +			wc->status = map_cqe_status[cqe->status].ib;
>   			siw_dbg_cq(cq,
>   				   "idx %u, type %d, flags %2x, id 0x%pK\n",
>   				   cq->cq_get % cq->num_cqe, cqe->opcode,
>   				   cqe->flags, (void *)(uintptr_t)cqe->id);
> +		} else {
> +			/*
> +			 * A malicious user may set invalid opcode or
> +			 * status in the user mmapped CQE array.
> +			 * Sanity check and correct values in that case
> +			 * to avoid out-of-bounds access to global arrays
> +			 * for opcode and status mapping.
> +			 */
> +			u8 opcode = cqe->opcode;
> +			u16 status = cqe->status;
> +
> +			if (opcode >= SIW_NUM_OPCODES) {
> +				opcode = 0;
> +				status = IB_WC_GENERAL_ERR;
> +			} else if (status >= SIW_NUM_WC_STATUS) {
> +				status = IB_WC_GENERAL_ERR;
> +			}
> +			wc->opcode = map_wc_opcode[opcode];
> +			wc->status = map_cqe_status[status].ib;
> +
>   		}
>   		WRITE_ONCE(cqe->flags, 0);
>   		cq->cq_get++;
> diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
> index 3e814cfb298c..8021fbd004b0 100644
> --- a/drivers/infiniband/sw/siw/siw_verbs.c
> +++ b/drivers/infiniband/sw/siw/siw_verbs.c
> @@ -676,13 +676,45 @@ static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
>   static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr,
>   			   const struct ib_send_wr **bad_wr)
>   {
> -	struct siw_sqe sqe = {};
>   	int rv = 0;
>   
>   	while (wr) {
> -		sqe.id = wr->wr_id;
> -		sqe.opcode = wr->opcode;
> -		rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR);
> +		struct siw_sqe sqe = {};
> +
> +		switch (wr->opcode) {
> +		case IB_WR_RDMA_WRITE:
> +			sqe.opcode = SIW_OP_WRITE;
> +			break;
> +		case IB_WR_RDMA_READ:
> +			sqe.opcode = SIW_OP_READ;
> +			break;
> +		case IB_WR_RDMA_READ_WITH_INV:
> +			sqe.opcode = SIW_OP_READ_LOCAL_INV;
> +			break;
> +		case IB_WR_SEND:
> +			sqe.opcode = SIW_OP_SEND;
> +			break;
> +		case IB_WR_SEND_WITH_IMM:
> +			sqe.opcode = SIW_OP_SEND_WITH_IMM;
> +			break;
> +		case IB_WR_SEND_WITH_INV:
> +			sqe.opcode = SIW_OP_SEND_REMOTE_INV;
> +			break;
> +		case IB_WR_LOCAL_INV:
> +			sqe.opcode = SIW_OP_INVAL_STAG;
> +			break;
> +		case IB_WR_REG_MR:
> +			sqe.opcode = SIW_OP_REG_MR;
> +			break;
> +		default:
> +			rv = -EOPNOTSUPP;

I think -EINVAL would be more appropriate here. That error is
returned from siw_post_send() in a similar situation, and
this routine is actually called from there also, and its rc
is returned, so it's best to be consistent.

Other than that, looks good to me:
Reviewed-by: Tom Talpey <tom@talpey.com>

Tom.


> +			break;
> +		}
> +		if (!rv) {
> +			sqe.id = wr->wr_id;
> +			rv = siw_sqe_complete(qp, &sqe, 0,
> +					      SIW_WC_WR_FLUSH_ERR);
> +		}
>   		if (rv) {
>   			if (bad_wr)
>   				*bad_wr = wr;
Bernard Metzler Nov. 2, 2022, 9:17 a.m. UTC | #2
> -----Original Message-----
> From: Tom Talpey <tom@talpey.com>
> Sent: Tuesday, 1 November 2022 22:05
> To: Bernard Metzler <BMT@zurich.ibm.com>; linux-rdma@vger.kernel.org
> Cc: jgg@nvidia.com; leonro@nvidia.com; Olga Kornievskaia
> <kolga@netapp.com>
> Subject: [EXTERNAL] Re: [PATCH] RDMA/siw: Fix immediate work request
> flush to completion queue.
> 
> On 11/1/2022 11:37 AM, Bernard Metzler wrote:
> > Correctly set send queue element opcode during immediate work request
> > flushing in post sendqueue operation, if the QP is in ERROR state.
> > An undefined ocode value results in out-of-bounds access to an array
> > for mapping the opcode between siw internal and RDMA core
> representation
> > in work completion generation. It resulted in a KASAN BUG report
> > of type 'global-out-of-bounds' during NFSoRDMA testing.
> > This patch further fixes a potential case of a malicious user which
> may
> > write undefined values for completion queue elements status or opcode,
> > if the CQ is memory mapped to user land. It avoids the same out-of-
> bounds
> > access to arrays for status and opcode mapping as described above.
> >
> > Fixes: 303ae1cdfdf7 ("rdma/siw: application interface")
> > Fixes: b0fff7317bb4 ("rdma/siw: completion queue methods")
> >
> > Reported-by: Olga Kornievskaia <kolga@netapp.com>
> > Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
> > ---
> >   drivers/infiniband/sw/siw/siw_cq.c    | 24 ++++++++++++++--
> >   drivers/infiniband/sw/siw/siw_verbs.c | 40 ++++++++++++++++++++++++-
> --
> >   2 files changed, 58 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/infiniband/sw/siw/siw_cq.c
> b/drivers/infiniband/sw/siw/siw_cq.c
> > index d68e37859e73..acc7bcd538b5 100644
> > --- a/drivers/infiniband/sw/siw/siw_cq.c
> > +++ b/drivers/infiniband/sw/siw/siw_cq.c
> > @@ -56,8 +56,6 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc
> *wc)
> >   	if (READ_ONCE(cqe->flags) & SIW_WQE_VALID) {
> >   		memset(wc, 0, sizeof(*wc));
> >   		wc->wr_id = cqe->id;
> > -		wc->status = map_cqe_status[cqe->status].ib;
> > -		wc->opcode = map_wc_opcode[cqe->opcode];
> >   		wc->byte_len = cqe->bytes;
> >
> >   		/*
> > @@ -71,10 +69,32 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc
> *wc)
> >   				wc->wc_flags = IB_WC_WITH_INVALIDATE;
> >   			}
> >   			wc->qp = cqe->base_qp;
> > +			wc->opcode = map_wc_opcode[cqe->opcode];
> > +			wc->status = map_cqe_status[cqe->status].ib;
> >   			siw_dbg_cq(cq,
> >   				   "idx %u, type %d, flags %2x, id 0x%pK\n",
> >   				   cq->cq_get % cq->num_cqe, cqe->opcode,
> >   				   cqe->flags, (void *)(uintptr_t)cqe->id);
> > +		} else {
> > +			/*
> > +			 * A malicious user may set invalid opcode or
> > +			 * status in the user mmapped CQE array.
> > +			 * Sanity check and correct values in that case
> > +			 * to avoid out-of-bounds access to global arrays
> > +			 * for opcode and status mapping.
> > +			 */
> > +			u8 opcode = cqe->opcode;
> > +			u16 status = cqe->status;
> > +
> > +			if (opcode >= SIW_NUM_OPCODES) {
> > +				opcode = 0;
> > +				status = IB_WC_GENERAL_ERR;
> > +			} else if (status >= SIW_NUM_WC_STATUS) {
> > +				status = IB_WC_GENERAL_ERR;
> > +			}
> > +			wc->opcode = map_wc_opcode[opcode];
> > +			wc->status = map_cqe_status[status].ib;
> > +
> >   		}
> >   		WRITE_ONCE(cqe->flags, 0);
> >   		cq->cq_get++;
> > diff --git a/drivers/infiniband/sw/siw/siw_verbs.c
> b/drivers/infiniband/sw/siw/siw_verbs.c
> > index 3e814cfb298c..8021fbd004b0 100644
> > --- a/drivers/infiniband/sw/siw/siw_verbs.c
> > +++ b/drivers/infiniband/sw/siw/siw_verbs.c
> > @@ -676,13 +676,45 @@ static int siw_copy_inline_sgl(const struct
> ib_send_wr *core_wr,
> >   static int siw_sq_flush_wr(struct siw_qp *qp, const struct
> ib_send_wr *wr,
> >   			   const struct ib_send_wr **bad_wr)
> >   {
> > -	struct siw_sqe sqe = {};
> >   	int rv = 0;
> >
> >   	while (wr) {
> > -		sqe.id = wr->wr_id;
> > -		sqe.opcode = wr->opcode;
> > -		rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR);
> > +		struct siw_sqe sqe = {};
> > +
> > +		switch (wr->opcode) {
> > +		case IB_WR_RDMA_WRITE:
> > +			sqe.opcode = SIW_OP_WRITE;
> > +			break;
> > +		case IB_WR_RDMA_READ:
> > +			sqe.opcode = SIW_OP_READ;
> > +			break;
> > +		case IB_WR_RDMA_READ_WITH_INV:
> > +			sqe.opcode = SIW_OP_READ_LOCAL_INV;
> > +			break;
> > +		case IB_WR_SEND:
> > +			sqe.opcode = SIW_OP_SEND;
> > +			break;
> > +		case IB_WR_SEND_WITH_IMM:
> > +			sqe.opcode = SIW_OP_SEND_WITH_IMM;
> > +			break;
> > +		case IB_WR_SEND_WITH_INV:
> > +			sqe.opcode = SIW_OP_SEND_REMOTE_INV;
> > +			break;
> > +		case IB_WR_LOCAL_INV:
> > +			sqe.opcode = SIW_OP_INVAL_STAG;
> > +			break;
> > +		case IB_WR_REG_MR:
> > +			sqe.opcode = SIW_OP_REG_MR;
> > +			break;
> > +		default:
> > +			rv = -EOPNOTSUPP;
> 
> I think -EINVAL would be more appropriate here. That error is
> returned from siw_post_send() in a similar situation, and
> this routine is actually called from there also, and its rc
> is returned, so it's best to be consistent.
> 
I agree, it would make it more consistent. I'll send an
update.

Thanks Tom!


> Other than that, looks good to me:
> Reviewed-by: Tom Talpey <tom@talpey.com>
> 
> Tom.
> 
> 
> > +			break;
> > +		}
> > +		if (!rv) {
> > +			sqe.id = wr->wr_id;
> > +			rv = siw_sqe_complete(qp, &sqe, 0,
> > +					      SIW_WC_WR_FLUSH_ERR);
> > +		}
> >   		if (rv) {
> >   			if (bad_wr)
> >   				*bad_wr = wr;
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c
index d68e37859e73..acc7bcd538b5 100644
--- a/drivers/infiniband/sw/siw/siw_cq.c
+++ b/drivers/infiniband/sw/siw/siw_cq.c
@@ -56,8 +56,6 @@  int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc)
 	if (READ_ONCE(cqe->flags) & SIW_WQE_VALID) {
 		memset(wc, 0, sizeof(*wc));
 		wc->wr_id = cqe->id;
-		wc->status = map_cqe_status[cqe->status].ib;
-		wc->opcode = map_wc_opcode[cqe->opcode];
 		wc->byte_len = cqe->bytes;
 
 		/*
@@ -71,10 +69,32 @@  int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc)
 				wc->wc_flags = IB_WC_WITH_INVALIDATE;
 			}
 			wc->qp = cqe->base_qp;
+			wc->opcode = map_wc_opcode[cqe->opcode];
+			wc->status = map_cqe_status[cqe->status].ib;
 			siw_dbg_cq(cq,
 				   "idx %u, type %d, flags %2x, id 0x%pK\n",
 				   cq->cq_get % cq->num_cqe, cqe->opcode,
 				   cqe->flags, (void *)(uintptr_t)cqe->id);
+		} else {
+			/*
+			 * A malicious user may set invalid opcode or
+			 * status in the user mmapped CQE array.
+			 * Sanity check and correct values in that case
+			 * to avoid out-of-bounds access to global arrays
+			 * for opcode and status mapping.
+			 */
+			u8 opcode = cqe->opcode;
+			u16 status = cqe->status;
+
+			if (opcode >= SIW_NUM_OPCODES) {
+				opcode = 0;
+				status = IB_WC_GENERAL_ERR;
+			} else if (status >= SIW_NUM_WC_STATUS) {
+				status = IB_WC_GENERAL_ERR;
+			}
+			wc->opcode = map_wc_opcode[opcode];
+			wc->status = map_cqe_status[status].ib;
+
 		}
 		WRITE_ONCE(cqe->flags, 0);
 		cq->cq_get++;
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 3e814cfb298c..8021fbd004b0 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -676,13 +676,45 @@  static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
 static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr,
 			   const struct ib_send_wr **bad_wr)
 {
-	struct siw_sqe sqe = {};
 	int rv = 0;
 
 	while (wr) {
-		sqe.id = wr->wr_id;
-		sqe.opcode = wr->opcode;
-		rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR);
+		struct siw_sqe sqe = {};
+
+		switch (wr->opcode) {
+		case IB_WR_RDMA_WRITE:
+			sqe.opcode = SIW_OP_WRITE;
+			break;
+		case IB_WR_RDMA_READ:
+			sqe.opcode = SIW_OP_READ;
+			break;
+		case IB_WR_RDMA_READ_WITH_INV:
+			sqe.opcode = SIW_OP_READ_LOCAL_INV;
+			break;
+		case IB_WR_SEND:
+			sqe.opcode = SIW_OP_SEND;
+			break;
+		case IB_WR_SEND_WITH_IMM:
+			sqe.opcode = SIW_OP_SEND_WITH_IMM;
+			break;
+		case IB_WR_SEND_WITH_INV:
+			sqe.opcode = SIW_OP_SEND_REMOTE_INV;
+			break;
+		case IB_WR_LOCAL_INV:
+			sqe.opcode = SIW_OP_INVAL_STAG;
+			break;
+		case IB_WR_REG_MR:
+			sqe.opcode = SIW_OP_REG_MR;
+			break;
+		default:
+			rv = -EOPNOTSUPP;
+			break;
+		}
+		if (!rv) {
+			sqe.id = wr->wr_id;
+			rv = siw_sqe_complete(qp, &sqe, 0,
+					      SIW_WC_WR_FLUSH_ERR);
+		}
 		if (rv) {
 			if (bad_wr)
 				*bad_wr = wr;