Message ID | 20230127210938.30051-1-rpearsonhpe@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | [for-next,v3] Subject: RDMA/rxe: Handle zero length rdma | expand |
On Sat, Jan 28, 2023 6:10 AM Bob Pearson wrote: > > Currently the rxe driver does not handle all cases of zero length > rdma operations correctly. The client does not have to provide an > rkey for zero length RDMA operations so the rkey provided may be > invalid and should not be used to lookup an mr. > > This patch corrects the driver to ignore the provided rkey if the > reth length is zero and make sure to set the mr to NULL. In read_reply() > if length is zero rxe_recheck_mr() is not called. Warnings are added in > the routines in rxe_mr.c to catch NULL MRs when the length is non-zero. > > Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> > --- When I applied this change, a testcase in rdma-core failed as shown below: ====================================================================== ERROR: test_qp_ex_rc_flush (tests.test_qpex.QpExTestCase) ---------------------------------------------------------------------- Traceback (most recent call last): File "/root/rdma-core/tests/test_qpex.py", line 258, in test_qp_ex_rc_flush raise PyverbsError(f'Unexpected {wc_status_to_str(wcs[0].status)}') pyverbs.pyverbs_error.PyverbsError: Unexpected Remote access error ---------------------------------------------------------------------- In my opinion, your change makes sense within the range of traditional RDMA operations, but conflicts with the new RDMA FLUSH operation. Responder cannot access the target MR because of invalid rkey. The root cause is written in IBA Annex A19, especially 'oA19-2'. We thus cannot set qp->resp.rkey to 0 in qp_resp_from_reth(). Do you have anything to say about this? > Li Zhijian Thanks, Daisuke Matsuda > v3: > Fixed my fat finger typing on v2. Mangled the patch. > > v2: > Rebased to current for-next. > Cleaned up description to be a little more accurate. > --- > drivers/infiniband/sw/rxe/rxe_mr.c | 6 +++ > drivers/infiniband/sw/rxe/rxe_resp.c | 55 +++++++++++++++++++++------- > 2 files changed, 47 insertions(+), 14 deletions(-) > > diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c > index c80458634962..5b7ede1d2b08 100644 > --- a/drivers/infiniband/sw/rxe/rxe_mr.c > +++ b/drivers/infiniband/sw/rxe/rxe_mr.c > @@ -314,6 +314,9 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, > if (length == 0) > return 0; > > + if (WARN_ON(!mr)) > + return -EINVAL; > + > if (mr->ibmr.type == IB_MR_TYPE_DMA) { > rxe_mr_copy_dma(mr, iova, addr, length, dir); > return 0; > @@ -435,6 +438,9 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) > if (length == 0) > return 0; > > + if (WARN_ON(!mr)) > + return -EINVAL; > + > if (mr->ibmr.type == IB_MR_TYPE_DMA) > return -EFAULT; > > diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c > index cd2d88de287c..b13ae98400c1 100644 > --- a/drivers/infiniband/sw/rxe/rxe_resp.c > +++ b/drivers/infiniband/sw/rxe/rxe_resp.c > @@ -420,13 +420,23 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, > return RESPST_CHK_RKEY; > } > > +/* if the reth length field is zero we can assume nothing > + * about the rkey value and should not validate or use it. > + * Instead set qp->resp.rkey to 0 which is an invalid rkey > + * value since the minimum index part is 1. > + */ > static void qp_resp_from_reth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) > { > + unsigned int length = reth_len(pkt); > + > qp->resp.va = reth_va(pkt); > qp->resp.offset = 0; > - qp->resp.rkey = reth_rkey(pkt); > - qp->resp.resid = reth_len(pkt); > - qp->resp.length = reth_len(pkt); > + qp->resp.resid = length; > + qp->resp.length = length; > + if (length) > + qp->resp.rkey = reth_rkey(pkt); > + else > + qp->resp.rkey = 0; > } > > static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) > @@ -437,6 +447,10 @@ static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) > qp->resp.resid = sizeof(u64); > } > > +/* resolve the packet rkey to qp->resp.mr or set qp->resp.mr to NULL > + * if an invalid rkey is received or the rdma length is zero. For middle > + * or last packets use the stored value of mr. > + */ > static enum resp_states check_rkey(struct rxe_qp *qp, > struct rxe_pkt_info *pkt) > { > @@ -475,8 +489,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp, > > /* A zero-byte op is not required to set an addr or rkey. See C9-88 */ > if ((pkt->mask & RXE_READ_OR_WRITE_MASK) && > - (pkt->mask & RXE_RETH_MASK) && > - reth_len(pkt) == 0) { > + (pkt->mask & RXE_RETH_MASK) && reth_len(pkt) == 0) { > + qp->resp.mr = NULL; > return RESPST_EXECUTE; > } > > @@ -555,6 +569,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, > return RESPST_EXECUTE; > > err: > + qp->resp.mr = NULL; > if (mr) > rxe_put(mr); > if (mw) > @@ -885,7 +900,11 @@ static enum resp_states read_reply(struct rxe_qp *qp, > } > > if (res->state == rdatm_res_state_new) { > - if (!res->replay) { > + if (!res->replay || qp->resp.length == 0) { > + /* if length == 0 mr will be NULL (is ok) > + * otherwise qp->resp.mr holds a ref on mr > + * which we transfer to mr and drop below. > + */ > mr = qp->resp.mr; > qp->resp.mr = NULL; > } else { > @@ -899,6 +918,10 @@ static enum resp_states read_reply(struct rxe_qp *qp, > else > opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST; > } else { > + /* re-lookup mr from rkey on all later packets. > + * length will be non-zero. This can fail if someone > + * modifies or destroys the mr since the first packet. > + */ > mr = rxe_recheck_mr(qp, res->read.rkey); > if (!mr) > return RESPST_ERR_RKEY_VIOLATION; > @@ -916,18 +939,16 @@ static enum resp_states read_reply(struct rxe_qp *qp, > skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload, > res->cur_psn, AETH_ACK_UNLIMITED); > if (!skb) { > - if (mr) > - rxe_put(mr); > - return RESPST_ERR_RNR; > + state = RESPST_ERR_RNR; > + goto err_out; > } > > err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), > payload, RXE_FROM_MR_OBJ); > - if (mr) > - rxe_put(mr); > if (err) { > kfree_skb(skb); > - return RESPST_ERR_RKEY_VIOLATION; > + state = RESPST_ERR_RKEY_VIOLATION; > + goto err_out; > } > > if (bth_pad(&ack_pkt)) { > @@ -936,9 +957,12 @@ static enum resp_states read_reply(struct rxe_qp *qp, > memset(pad, 0, bth_pad(&ack_pkt)); > } > > + /* rxe_xmit_packet always consumes the skb */ > err = rxe_xmit_packet(qp, &ack_pkt, skb); > - if (err) > - return RESPST_ERR_RNR; > + if (err) { > + state = RESPST_ERR_RNR; > + goto err_out; > + } > > res->read.va += payload; > res->read.resid -= payload; > @@ -955,6 +979,9 @@ static enum resp_states read_reply(struct rxe_qp *qp, > state = RESPST_CLEANUP; > } > > +err_out: > + if (mr) > + rxe_put(mr); > return state; > } > > -- > 2.37.2
On 2/1/2023 6:06 AM, Daisuke Matsuda (Fujitsu) wrote: > On Sat, Jan 28, 2023 6:10 AM Bob Pearson wrote: >> >> Currently the rxe driver does not handle all cases of zero length >> rdma operations correctly. The client does not have to provide an >> rkey for zero length RDMA operations so the rkey provided may be >> invalid and should not be used to lookup an mr. >> >> This patch corrects the driver to ignore the provided rkey if the >> reth length is zero and make sure to set the mr to NULL. In read_reply() >> if length is zero rxe_recheck_mr() is not called. Warnings are added in >> the routines in rxe_mr.c to catch NULL MRs when the length is non-zero. >> >> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> >> --- > > When I applied this change, a testcase in rdma-core failed as shown below: > ====================================================================== > ERROR: test_qp_ex_rc_flush (tests.test_qpex.QpExTestCase) > ---------------------------------------------------------------------- > Traceback (most recent call last): > File "/root/rdma-core/tests/test_qpex.py", line 258, in test_qp_ex_rc_flush > raise PyverbsError(f'Unexpected {wc_status_to_str(wcs[0].status)}') > pyverbs.pyverbs_error.PyverbsError: Unexpected Remote access error > > ---------------------------------------------------------------------- > > In my opinion, your change makes sense within the range of traditional > RDMA operations, but conflicts with the new RDMA FLUSH operation. > Responder cannot access the target MR because of invalid rkey. The > root cause is written in IBA Annex A19, especially 'oA19-2'. > We thus cannot set qp->resp.rkey to 0 in qp_resp_from_reth(). > > Do you have anything to say about this? > Li Zhijian > > Thanks, > Daisuke Matsuda I'm confused too, Bob can you point to the section of the spec that allows the rkey to be zero? It's my understanding that a zero-length RDMA Read must always check for access, even though no data is actually fetched. That would not be possible without an rkey. Tom. >> v3: >> Fixed my fat finger typing on v2. Mangled the patch. >> >> v2: >> Rebased to current for-next. >> Cleaned up description to be a little more accurate. >> --- >> drivers/infiniband/sw/rxe/rxe_mr.c | 6 +++ >> drivers/infiniband/sw/rxe/rxe_resp.c | 55 +++++++++++++++++++++------- >> 2 files changed, 47 insertions(+), 14 deletions(-) >> >> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c >> index c80458634962..5b7ede1d2b08 100644 >> --- a/drivers/infiniband/sw/rxe/rxe_mr.c >> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c >> @@ -314,6 +314,9 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, >> if (length == 0) >> return 0; >> >> + if (WARN_ON(!mr)) >> + return -EINVAL; >> + >> if (mr->ibmr.type == IB_MR_TYPE_DMA) { >> rxe_mr_copy_dma(mr, iova, addr, length, dir); >> return 0; >> @@ -435,6 +438,9 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) >> if (length == 0) >> return 0; >> >> + if (WARN_ON(!mr)) >> + return -EINVAL; >> + >> if (mr->ibmr.type == IB_MR_TYPE_DMA) >> return -EFAULT; >> >> diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c >> index cd2d88de287c..b13ae98400c1 100644 >> --- a/drivers/infiniband/sw/rxe/rxe_resp.c >> +++ b/drivers/infiniband/sw/rxe/rxe_resp.c >> @@ -420,13 +420,23 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, >> return RESPST_CHK_RKEY; >> } >> >> +/* if the reth length field is zero we can assume nothing >> + * about the rkey value and should not validate or use it. >> + * Instead set qp->resp.rkey to 0 which is an invalid rkey >> + * value since the minimum index part is 1. >> + */ >> static void qp_resp_from_reth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) >> { >> + unsigned int length = reth_len(pkt); >> + >> qp->resp.va = reth_va(pkt); >> qp->resp.offset = 0; >> - qp->resp.rkey = reth_rkey(pkt); >> - qp->resp.resid = reth_len(pkt); >> - qp->resp.length = reth_len(pkt); >> + qp->resp.resid = length; >> + qp->resp.length = length; >> + if (length) >> + qp->resp.rkey = reth_rkey(pkt); >> + else >> + qp->resp.rkey = 0; >> } >> >> static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) >> @@ -437,6 +447,10 @@ static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) >> qp->resp.resid = sizeof(u64); >> } >> >> +/* resolve the packet rkey to qp->resp.mr or set qp->resp.mr to NULL >> + * if an invalid rkey is received or the rdma length is zero. For middle >> + * or last packets use the stored value of mr. >> + */ >> static enum resp_states check_rkey(struct rxe_qp *qp, >> struct rxe_pkt_info *pkt) >> { >> @@ -475,8 +489,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp, >> >> /* A zero-byte op is not required to set an addr or rkey. See C9-88 */ >> if ((pkt->mask & RXE_READ_OR_WRITE_MASK) && >> - (pkt->mask & RXE_RETH_MASK) && >> - reth_len(pkt) == 0) { >> + (pkt->mask & RXE_RETH_MASK) && reth_len(pkt) == 0) { >> + qp->resp.mr = NULL; >> return RESPST_EXECUTE; >> } >> >> @@ -555,6 +569,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, >> return RESPST_EXECUTE; >> >> err: >> + qp->resp.mr = NULL; >> if (mr) >> rxe_put(mr); >> if (mw) >> @@ -885,7 +900,11 @@ static enum resp_states read_reply(struct rxe_qp *qp, >> } >> >> if (res->state == rdatm_res_state_new) { >> - if (!res->replay) { >> + if (!res->replay || qp->resp.length == 0) { >> + /* if length == 0 mr will be NULL (is ok) >> + * otherwise qp->resp.mr holds a ref on mr >> + * which we transfer to mr and drop below. >> + */ >> mr = qp->resp.mr; >> qp->resp.mr = NULL; >> } else { >> @@ -899,6 +918,10 @@ static enum resp_states read_reply(struct rxe_qp *qp, >> else >> opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST; >> } else { >> + /* re-lookup mr from rkey on all later packets. >> + * length will be non-zero. This can fail if someone >> + * modifies or destroys the mr since the first packet. >> + */ >> mr = rxe_recheck_mr(qp, res->read.rkey); >> if (!mr) >> return RESPST_ERR_RKEY_VIOLATION; >> @@ -916,18 +939,16 @@ static enum resp_states read_reply(struct rxe_qp *qp, >> skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload, >> res->cur_psn, AETH_ACK_UNLIMITED); >> if (!skb) { >> - if (mr) >> - rxe_put(mr); >> - return RESPST_ERR_RNR; >> + state = RESPST_ERR_RNR; >> + goto err_out; >> } >> >> err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), >> payload, RXE_FROM_MR_OBJ); >> - if (mr) >> - rxe_put(mr); >> if (err) { >> kfree_skb(skb); >> - return RESPST_ERR_RKEY_VIOLATION; >> + state = RESPST_ERR_RKEY_VIOLATION; >> + goto err_out; >> } >> >> if (bth_pad(&ack_pkt)) { >> @@ -936,9 +957,12 @@ static enum resp_states read_reply(struct rxe_qp *qp, >> memset(pad, 0, bth_pad(&ack_pkt)); >> } >> >> + /* rxe_xmit_packet always consumes the skb */ >> err = rxe_xmit_packet(qp, &ack_pkt, skb); >> - if (err) >> - return RESPST_ERR_RNR; >> + if (err) { >> + state = RESPST_ERR_RNR; >> + goto err_out; >> + } >> >> res->read.va += payload; >> res->read.resid -= payload; >> @@ -955,6 +979,9 @@ static enum resp_states read_reply(struct rxe_qp *qp, >> state = RESPST_CLEANUP; >> } >> >> +err_out: >> + if (mr) >> + rxe_put(mr); >> return state; >> } >> >> -- >> 2.37.2 >
On 2/1/23 09:38, Tom Talpey wrote: > On 2/1/2023 6:06 AM, Daisuke Matsuda (Fujitsu) wrote: >> On Sat, Jan 28, 2023 6:10 AM Bob Pearson wrote: >>> >>> Currently the rxe driver does not handle all cases of zero length >>> rdma operations correctly. The client does not have to provide an >>> rkey for zero length RDMA operations so the rkey provided may be >>> invalid and should not be used to lookup an mr. >>> >>> This patch corrects the driver to ignore the provided rkey if the >>> reth length is zero and make sure to set the mr to NULL. In read_reply() >>> if length is zero rxe_recheck_mr() is not called. Warnings are added in >>> the routines in rxe_mr.c to catch NULL MRs when the length is non-zero. >>> >>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> >>> --- >> >> When I applied this change, a testcase in rdma-core failed as shown below: >> ====================================================================== >> ERROR: test_qp_ex_rc_flush (tests.test_qpex.QpExTestCase) >> ---------------------------------------------------------------------- >> Traceback (most recent call last): >> File "/root/rdma-core/tests/test_qpex.py", line 258, in test_qp_ex_rc_flush >> raise PyverbsError(f'Unexpected {wc_status_to_str(wcs[0].status)}') >> pyverbs.pyverbs_error.PyverbsError: Unexpected Remote access error >> >> ---------------------------------------------------------------------- >> >> In my opinion, your change makes sense within the range of traditional >> RDMA operations, but conflicts with the new RDMA FLUSH operation. >> Responder cannot access the target MR because of invalid rkey. The >> root cause is written in IBA Annex A19, especially 'oA19-2'. >> We thus cannot set qp->resp.rkey to 0 in qp_resp_from_reth(). >> >> Do you have anything to say about this? > Li Zhijian >> >> Thanks, >> Daisuke Matsuda > > I'm confused too, Bob can you point to the section of the spec > that allows the rkey to be zero? It's my understanding that > a zero-length RDMA Read must always check for access, even > though no data is actually fetched. That would not be possible > without an rkey. > > Tom. > Tom, Daisuke, C9-88: For an HCA responder using Reliable Connection service, for each zero-length RDMA READ or WRITE request, the R_Key shall not be validated, even if the request includes Immediate data. Further I have seen the pyverbs test suite sending a totally bogus rkey on a zero length rdma read. That was the impetus for me looking at this. Daisuke has a different issue since flush is a different operation than read or write. I need to look into what a zero length flush means. Bob
On 02/02/2023 11:45, Bob Pearson wrote: > On 2/1/23 09:38, Tom Talpey wrote: >> On 2/1/2023 6:06 AM, Daisuke Matsuda (Fujitsu) wrote: >>> On Sat, Jan 28, 2023 6:10 AM Bob Pearson wrote: >>>> >>>> Currently the rxe driver does not handle all cases of zero length >>>> rdma operations correctly. The client does not have to provide an >>>> rkey for zero length RDMA operations so the rkey provided may be >>>> invalid and should not be used to lookup an mr. >>>> >>>> This patch corrects the driver to ignore the provided rkey if the >>>> reth length is zero and make sure to set the mr to NULL. In read_reply() >>>> if length is zero rxe_recheck_mr() is not called. Warnings are added in >>>> the routines in rxe_mr.c to catch NULL MRs when the length is non-zero. >>>> >>>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> >>>> --- >>> >>> When I applied this change, a testcase in rdma-core failed as shown below: >>> ====================================================================== >>> ERROR: test_qp_ex_rc_flush (tests.test_qpex.QpExTestCase) >>> ---------------------------------------------------------------------- >>> Traceback (most recent call last): >>> File "/root/rdma-core/tests/test_qpex.py", line 258, in test_qp_ex_rc_flush >>> raise PyverbsError(f'Unexpected {wc_status_to_str(wcs[0].status)}') >>> pyverbs.pyverbs_error.PyverbsError: Unexpected Remote access error >>> >>> ---------------------------------------------------------------------- >>> >>> In my opinion, your change makes sense within the range of traditional >>> RDMA operations, but conflicts with the new RDMA FLUSH operation. >>> Responder cannot access the target MR because of invalid rkey. The >>> root cause is written in IBA Annex A19, especially 'oA19-2'. >>> We thus cannot set qp->resp.rkey to 0 in qp_resp_from_reth(). >>> >>> Do you have anything to say about this? > Li Zhijian >>> >>> Thanks, >>> Daisuke Matsuda >> >> I'm confused too, Bob can you point to the section of the spec >> that allows the rkey to be zero? It's my understanding that >> a zero-length RDMA Read must always check for access, even >> though no data is actually fetched. That would not be possible >> without an rkey. >> >> Tom. >> > Tom, Daisuke, > > C9-88: For an HCA responder using Reliable Connection service, for > each zero-length RDMA READ or WRITE request, the R_Key shall not be > validated, even if the request includes Immediate data. > > Further I have seen the pyverbs test suite sending a totally bogus rkey on a zero length rdma read. That was the impetus for me looking at this. > > Daisuke has a different issue since flush is a different operation than read or write. > I need to look into what a zero length flush means. > Just took a look at the above FLUSH problem. It also exposed another bug in my flush code in rdma-core, PR: https://github.com/linux-rdma/rdma-core/pull/1307 when 'Selectivity Level (SEL)' is 'Memory Region', 0 length will be set in FETH, in this case, rkey should be valid and length should be ignored. Thanks Zhijian > Bob > >
On 2/1/2023 10:45 PM, Bob Pearson wrote: > On 2/1/23 09:38, Tom Talpey wrote: >> On 2/1/2023 6:06 AM, Daisuke Matsuda (Fujitsu) wrote: >>> On Sat, Jan 28, 2023 6:10 AM Bob Pearson wrote: >>>> >>>> Currently the rxe driver does not handle all cases of zero length >>>> rdma operations correctly. The client does not have to provide an >>>> rkey for zero length RDMA operations so the rkey provided may be >>>> invalid and should not be used to lookup an mr. >>>> >>>> This patch corrects the driver to ignore the provided rkey if the >>>> reth length is zero and make sure to set the mr to NULL. In read_reply() >>>> if length is zero rxe_recheck_mr() is not called. Warnings are added in >>>> the routines in rxe_mr.c to catch NULL MRs when the length is non-zero. >>>> >>>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> >>>> --- >>> >>> When I applied this change, a testcase in rdma-core failed as shown below: >>> ====================================================================== >>> ERROR: test_qp_ex_rc_flush (tests.test_qpex.QpExTestCase) >>> ---------------------------------------------------------------------- >>> Traceback (most recent call last): >>> File "/root/rdma-core/tests/test_qpex.py", line 258, in test_qp_ex_rc_flush >>> raise PyverbsError(f'Unexpected {wc_status_to_str(wcs[0].status)}') >>> pyverbs.pyverbs_error.PyverbsError: Unexpected Remote access error >>> >>> ---------------------------------------------------------------------- >>> >>> In my opinion, your change makes sense within the range of traditional >>> RDMA operations, but conflicts with the new RDMA FLUSH operation. >>> Responder cannot access the target MR because of invalid rkey. The >>> root cause is written in IBA Annex A19, especially 'oA19-2'. >>> We thus cannot set qp->resp.rkey to 0 in qp_resp_from_reth(). >>> >>> Do you have anything to say about this? > Li Zhijian >>> >>> Thanks, >>> Daisuke Matsuda >> >> I'm confused too, Bob can you point to the section of the spec >> that allows the rkey to be zero? It's my understanding that >> a zero-length RDMA Read must always check for access, even >> though no data is actually fetched. That would not be possible >> without an rkey. >> >> Tom. >> > Tom, Daisuke, > > C9-88: For an HCA responder using Reliable Connection service, for > each zero-length RDMA READ or WRITE request, the R_Key shall not be > validated, even if the request includes Immediate data. Interesting, thanks. I'm pretty certain I recall HCAs not allowing it in the past. Also note that the iWARP protocol does not have this exception, and always requires validation. > Further I have seen the pyverbs test suite sending a totally bogus rkey on a zero length rdma read. That was the impetus for me looking at this. > > Daisuke has a different issue since flush is a different operation than read or write. > I need to look into what a zero length flush means. Yes, RDMA Flush has a flag which allows flushing the entire region, and the length is ignored in that case. oA19-13 always requires validation of the region. The IBTA Link Working Group may need to clarify this language. I suspect the existing text in 9.7.4.1.5 may be correct strictly speaking, but the RDMA Flush and RDMA Verify operations aren't mentioned there. And the Annex A19 processing text is different. I'll bring it up for discussion in an upcoming LWG meeting. Thanks! Tom.
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index c80458634962..5b7ede1d2b08 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -314,6 +314,9 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, if (length == 0) return 0; + if (WARN_ON(!mr)) + return -EINVAL; + if (mr->ibmr.type == IB_MR_TYPE_DMA) { rxe_mr_copy_dma(mr, iova, addr, length, dir); return 0; @@ -435,6 +438,9 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) if (length == 0) return 0; + if (WARN_ON(!mr)) + return -EINVAL; + if (mr->ibmr.type == IB_MR_TYPE_DMA) return -EFAULT; diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index cd2d88de287c..b13ae98400c1 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -420,13 +420,23 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, return RESPST_CHK_RKEY; } +/* if the reth length field is zero we can assume nothing + * about the rkey value and should not validate or use it. + * Instead set qp->resp.rkey to 0 which is an invalid rkey + * value since the minimum index part is 1. + */ static void qp_resp_from_reth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { + unsigned int length = reth_len(pkt); + qp->resp.va = reth_va(pkt); qp->resp.offset = 0; - qp->resp.rkey = reth_rkey(pkt); - qp->resp.resid = reth_len(pkt); - qp->resp.length = reth_len(pkt); + qp->resp.resid = length; + qp->resp.length = length; + if (length) + qp->resp.rkey = reth_rkey(pkt); + else + qp->resp.rkey = 0; } static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) @@ -437,6 +447,10 @@ static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) qp->resp.resid = sizeof(u64); } +/* resolve the packet rkey to qp->resp.mr or set qp->resp.mr to NULL + * if an invalid rkey is received or the rdma length is zero. For middle + * or last packets use the stored value of mr. + */ static enum resp_states check_rkey(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { @@ -475,8 +489,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp, /* A zero-byte op is not required to set an addr or rkey. See C9-88 */ if ((pkt->mask & RXE_READ_OR_WRITE_MASK) && - (pkt->mask & RXE_RETH_MASK) && - reth_len(pkt) == 0) { + (pkt->mask & RXE_RETH_MASK) && reth_len(pkt) == 0) { + qp->resp.mr = NULL; return RESPST_EXECUTE; } @@ -555,6 +569,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, return RESPST_EXECUTE; err: + qp->resp.mr = NULL; if (mr) rxe_put(mr); if (mw) @@ -885,7 +900,11 @@ static enum resp_states read_reply(struct rxe_qp *qp, } if (res->state == rdatm_res_state_new) { - if (!res->replay) { + if (!res->replay || qp->resp.length == 0) { + /* if length == 0 mr will be NULL (is ok) + * otherwise qp->resp.mr holds a ref on mr + * which we transfer to mr and drop below. + */ mr = qp->resp.mr; qp->resp.mr = NULL; } else { @@ -899,6 +918,10 @@ static enum resp_states read_reply(struct rxe_qp *qp, else opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST; } else { + /* re-lookup mr from rkey on all later packets. + * length will be non-zero. This can fail if someone + * modifies or destroys the mr since the first packet. + */ mr = rxe_recheck_mr(qp, res->read.rkey); if (!mr) return RESPST_ERR_RKEY_VIOLATION; @@ -916,18 +939,16 @@ static enum resp_states read_reply(struct rxe_qp *qp, skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload, res->cur_psn, AETH_ACK_UNLIMITED); if (!skb) { - if (mr) - rxe_put(mr); - return RESPST_ERR_RNR; + state = RESPST_ERR_RNR; + goto err_out; } err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), payload, RXE_FROM_MR_OBJ); - if (mr) - rxe_put(mr); if (err) { kfree_skb(skb); - return RESPST_ERR_RKEY_VIOLATION; + state = RESPST_ERR_RKEY_VIOLATION; + goto err_out; } if (bth_pad(&ack_pkt)) { @@ -936,9 +957,12 @@ static enum resp_states read_reply(struct rxe_qp *qp, memset(pad, 0, bth_pad(&ack_pkt)); } + /* rxe_xmit_packet always consumes the skb */ err = rxe_xmit_packet(qp, &ack_pkt, skb); - if (err) - return RESPST_ERR_RNR; + if (err) { + state = RESPST_ERR_RNR; + goto err_out; + } res->read.va += payload; res->read.resid -= payload; @@ -955,6 +979,9 @@ static enum resp_states read_reply(struct rxe_qp *qp, state = RESPST_CLEANUP; } +err_out: + if (mr) + rxe_put(mr); return state; }
Currently the rxe driver does not handle all cases of zero length rdma operations correctly. The client does not have to provide an rkey for zero length RDMA operations so the rkey provided may be invalid and should not be used to lookup an mr. This patch corrects the driver to ignore the provided rkey if the reth length is zero and make sure to set the mr to NULL. In read_reply() if length is zero rxe_recheck_mr() is not called. Warnings are added in the routines in rxe_mr.c to catch NULL MRs when the length is non-zero. Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> --- v3: Fixed my fat finger typing on v2. Mangled the patch. v2: Rebased to current for-next. Cleaned up description to be a little more accurate. --- drivers/infiniband/sw/rxe/rxe_mr.c | 6 +++ drivers/infiniband/sw/rxe/rxe_resp.c | 55 +++++++++++++++++++++------- 2 files changed, 47 insertions(+), 14 deletions(-)