diff mbox series

xprtrdma: Remap Receive buffers after a reconnect

Message ID 168934757954.2781502.4228890662497418497.stgit@morisot.1015granger.net (mailing list archive)
State New, archived
Headers show
Series xprtrdma: Remap Receive buffers after a reconnect | expand

Commit Message

Chuck Lever July 14, 2023, 3:13 p.m. UTC
From: Chuck Lever <chuck.lever@oracle.com>

On server-initiated disconnect, rpcrdma_xprt_disconnect() DMA-unmaps
the transport's Receive buffers, but rpcrdma_post_recvs() neglected
to remap them after a new connection had been established. The
result is immediate failure of the new connection with the Receives
flushing with LOCAL_PROT_ERR.

Fixes: 671c450b6fe0 ("xprtrdma: Fix oops in Receive handler after device removal")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/verbs.c |    9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

Hi Anna-

Can you apply this for 6.5-rc ?

Comments

Tom Talpey July 15, 2023, 5:01 p.m. UTC | #1
On 7/14/2023 11:13 AM, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
> 
> On server-initiated disconnect, rpcrdma_xprt_disconnect() DMA-unmaps
> the transport's Receive buffers, but rpcrdma_post_recvs() neglected
> to remap them after a new connection had been established. The
> result is immediate failure of the new connection with the Receives
> flushing with LOCAL_PROT_ERR.

The fix is correct, my only comment is that the failure occurs when the
first message is received, and only the first CQE is a LOCAL_PROT_ERR.
The remainder of the posted receives are simply flushed.

Same result of course! The summary is ok as-is. Important fix.

Reviewed-by: Tom Talpey <tom@talpey.com>

> Fixes: 671c450b6fe0 ("xprtrdma: Fix oops in Receive handler after device removal")
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>   net/sunrpc/xprtrdma/verbs.c |    9 ++++-----
>   1 file changed, 4 insertions(+), 5 deletions(-)
> 
> Hi Anna-
> 
> Can you apply this for 6.5-rc ?
> 
> 
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index b098fde373ab..28c0771c4e8c 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -935,9 +935,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
>   	if (!rep->rr_rdmabuf)
>   		goto out_free;
>   
> -	if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
> -		goto out_free_regbuf;
> -
>   	rep->rr_cid.ci_completion_id =
>   		atomic_inc_return(&r_xprt->rx_ep->re_completion_ids);
>   
> @@ -956,8 +953,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
>   	spin_unlock(&buf->rb_lock);
>   	return rep;
>   
> -out_free_regbuf:
> -	rpcrdma_regbuf_free(rep->rr_rdmabuf);
>   out_free:
>   	kfree(rep);
>   out:
> @@ -1363,6 +1358,10 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
>   			rep = rpcrdma_rep_create(r_xprt, temp);
>   		if (!rep)
>   			break;
> +		if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) {
> +			rpcrdma_rep_put(buf, rep);
> +			break;
> +		}
>   
>   		rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id;
>   		trace_xprtrdma_post_recv(rep);
> 
> 
>
Chuck Lever July 15, 2023, 6:56 p.m. UTC | #2
> On Jul 15, 2023, at 1:01 PM, Tom Talpey <tom@talpey.com> wrote:
> 
> On 7/14/2023 11:13 AM, Chuck Lever wrote:
>> From: Chuck Lever <chuck.lever@oracle.com>
>> On server-initiated disconnect, rpcrdma_xprt_disconnect() DMA-unmaps
>> the transport's Receive buffers, but rpcrdma_post_recvs() neglected
>> to remap them after a new connection had been established. The
>> result is immediate failure of the new connection with the Receives
>> flushing with LOCAL_PROT_ERR.
> 
> The fix is correct, my only comment is that the failure occurs when the
> first message is received, and only the first CQE is a LOCAL_PROT_ERR.
> The remainder of the posted receives are simply flushed.
> 
> Same result of course! The summary is ok as-is. Important fix.
> 
> Reviewed-by: Tom Talpey <tom@talpey.com>

Thanks for the look!

Anna, when applying this patch, can you replace:

"The result is immediate failure of the new connection with the Receives
flushing with LOCAL_PROT_ERR."

with

"The result is immediate failure of the new connection with the first
unmapped Receive completing with LOCAL_PROT_ERR."


>> Fixes: 671c450b6fe0 ("xprtrdma: Fix oops in Receive handler after device removal")
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>>  net/sunrpc/xprtrdma/verbs.c |    9 ++++-----
>>  1 file changed, 4 insertions(+), 5 deletions(-)
>> Hi Anna-
>> Can you apply this for 6.5-rc ?
>> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
>> index b098fde373ab..28c0771c4e8c 100644
>> --- a/net/sunrpc/xprtrdma/verbs.c
>> +++ b/net/sunrpc/xprtrdma/verbs.c
>> @@ -935,9 +935,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
>>   if (!rep->rr_rdmabuf)
>>   goto out_free;
>>  - if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
>> - goto out_free_regbuf;
>> -
>>   rep->rr_cid.ci_completion_id =
>>   atomic_inc_return(&r_xprt->rx_ep->re_completion_ids);
>>  @@ -956,8 +953,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
>>   spin_unlock(&buf->rb_lock);
>>   return rep;
>>  -out_free_regbuf:
>> - rpcrdma_regbuf_free(rep->rr_rdmabuf);
>>  out_free:
>>   kfree(rep);
>>  out:
>> @@ -1363,6 +1358,10 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
>>   rep = rpcrdma_rep_create(r_xprt, temp);
>>   if (!rep)
>>   break;
>> + if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) {
>> + rpcrdma_rep_put(buf, rep);
>> + break;
>> + }
>>     rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id;
>>   trace_xprtrdma_post_recv(rep);

--
Chuck Lever
Chuck Lever July 16, 2023, 5:11 p.m. UTC | #3
> On Jul 15, 2023, at 2:56 PM, Chuck Lever III <chuck.lever@oracle.com> wrote:
> 
> 
> 
>> On Jul 15, 2023, at 1:01 PM, Tom Talpey <tom@talpey.com> wrote:
>> 
>> On 7/14/2023 11:13 AM, Chuck Lever wrote:
>>> From: Chuck Lever <chuck.lever@oracle.com>
>>> On server-initiated disconnect, rpcrdma_xprt_disconnect() DMA-unmaps
>>> the transport's Receive buffers, but rpcrdma_post_recvs() neglected
>>> to remap them after a new connection had been established. The
>>> result is immediate failure of the new connection with the Receives
>>> flushing with LOCAL_PROT_ERR.
>> 
>> The fix is correct, my only comment is that the failure occurs when the
>> first message is received, and only the first CQE is a LOCAL_PROT_ERR.
>> The remainder of the posted receives are simply flushed.
>> 
>> Same result of course! The summary is ok as-is. Important fix.
>> 
>> Reviewed-by: Tom Talpey <tom@talpey.com>
> 
> Thanks for the look!
> 
> Anna, when applying this patch, can you replace:
> 
> "The result is immediate failure of the new connection with the Receives
> flushing with LOCAL_PROT_ERR."
> 
> with
> 
> "The result is immediate failure of the new connection with the first
> unmapped Receive completing with LOCAL_PROT_ERR."

Also, please add:

Reported-by: Olga Kornievskaia <kolga@netapp.com>


>>> Fixes: 671c450b6fe0 ("xprtrdma: Fix oops in Receive handler after device removal")
>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>> ---
>>> net/sunrpc/xprtrdma/verbs.c |    9 ++++-----
>>> 1 file changed, 4 insertions(+), 5 deletions(-)
>>> Hi Anna-
>>> Can you apply this for 6.5-rc ?
>>> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
>>> index b098fde373ab..28c0771c4e8c 100644
>>> --- a/net/sunrpc/xprtrdma/verbs.c
>>> +++ b/net/sunrpc/xprtrdma/verbs.c
>>> @@ -935,9 +935,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
>>>  if (!rep->rr_rdmabuf)
>>>  goto out_free;
>>> - if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
>>> - goto out_free_regbuf;
>>> -
>>>  rep->rr_cid.ci_completion_id =
>>>  atomic_inc_return(&r_xprt->rx_ep->re_completion_ids);
>>> @@ -956,8 +953,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
>>>  spin_unlock(&buf->rb_lock);
>>>  return rep;
>>> -out_free_regbuf:
>>> - rpcrdma_regbuf_free(rep->rr_rdmabuf);
>>> out_free:
>>>  kfree(rep);
>>> out:
>>> @@ -1363,6 +1358,10 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
>>>  rep = rpcrdma_rep_create(r_xprt, temp);
>>>  if (!rep)
>>>  break;
>>> + if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) {
>>> + rpcrdma_rep_put(buf, rep);
>>> + break;
>>> + }
>>>    rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id;
>>>  trace_xprtrdma_post_recv(rep);
> 
> --
> Chuck Lever


--
Chuck Lever
diff mbox series

Patch

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index b098fde373ab..28c0771c4e8c 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -935,9 +935,6 @@  struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
 	if (!rep->rr_rdmabuf)
 		goto out_free;
 
-	if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
-		goto out_free_regbuf;
-
 	rep->rr_cid.ci_completion_id =
 		atomic_inc_return(&r_xprt->rx_ep->re_completion_ids);
 
@@ -956,8 +953,6 @@  struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
 	spin_unlock(&buf->rb_lock);
 	return rep;
 
-out_free_regbuf:
-	rpcrdma_regbuf_free(rep->rr_rdmabuf);
 out_free:
 	kfree(rep);
 out:
@@ -1363,6 +1358,10 @@  void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
 			rep = rpcrdma_rep_create(r_xprt, temp);
 		if (!rep)
 			break;
+		if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) {
+			rpcrdma_rep_put(buf, rep);
+			break;
+		}
 
 		rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id;
 		trace_xprtrdma_post_recv(rep);