diff mbox series

[RFC,02/13] NFSD: Reschedule CB operations when backchannel rpc_clnt is shut down

Message ID 170620013252.2833.10156142379669175540.stgit@manet.1015granger.net (mailing list archive)
State New
Headers show
Series NFSD backchannel fixes | expand

Commit Message

Chuck Lever Jan. 25, 2024, 4:28 p.m. UTC
From: Chuck Lever <chuck.lever@oracle.com>

As part of managing a client disconnect, NFSD closes down and
replaces the backchannel rpc_clnt.

If a callback operation is pending when the backchannel rpc_clnt is
shut down, currently nfsd4_run_cb_work() just discards that
callback. But there are multiple cases to deal with here:

 o The client's lease is getting destroyed. Throw the CB away.

 o The client disconnected. It might be forcing a retransmit of
   CB operations, or it could have disconnected for other reasons.
   Reschedule the CB so it is retransmitted when the client
   reconnects.

Since callback operations can now be rescheduled, ensure that
cb_ops->prepare can be called only once by moving the
cb_ops->prepare paragraph down to just before the rpc_call_async()
call.

Fixes: 2bbfed98a4d8 ("nfsd: Fix races between nfsd4_cb_release() and nfsd4_shutdown_callback()")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c |   26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

Comments

Jeffrey Layton Jan. 25, 2024, 8:19 p.m. UTC | #1
On Thu, 2024-01-25 at 11:28 -0500, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
> 
> As part of managing a client disconnect, NFSD closes down and
> replaces the backchannel rpc_clnt.
> 
> If a callback operation is pending when the backchannel rpc_clnt is
> shut down, currently nfsd4_run_cb_work() just discards that
> callback. But there are multiple cases to deal with here:
> 
>  o The client's lease is getting destroyed. Throw the CB away.
> 
>  o The client disconnected. It might be forcing a retransmit of
>    CB operations, or it could have disconnected for other reasons.
>    Reschedule the CB so it is retransmitted when the client
>    reconnects.
> 
> Since callback operations can now be rescheduled, ensure that
> cb_ops->prepare can be called only once by moving the
> cb_ops->prepare paragraph down to just before the rpc_call_async()
> call.
> 
> Fixes: 2bbfed98a4d8 ("nfsd: Fix races between nfsd4_cb_release() and nfsd4_shutdown_callback()")
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  fs/nfsd/nfs4callback.c |   26 +++++++++++++++++---------
>  1 file changed, 17 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index 43b0a34a5d5b..b2844abcb51f 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c
> @@ -1375,20 +1375,22 @@ nfsd4_run_cb_work(struct work_struct *work)
>  	struct rpc_clnt *clnt;
>  	int flags;
>  
> -	if (cb->cb_need_restart) {
> -		cb->cb_need_restart = false;
> -	} else {
> -		if (cb->cb_ops && cb->cb_ops->prepare)
> -			cb->cb_ops->prepare(cb);
> -	}
> -
>  	if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
>  		nfsd4_process_cb_update(cb);
>  
>  	clnt = clp->cl_cb_client;
>  	if (!clnt) {
> -		/* Callback channel broken, or client killed; give up: */
> -		nfsd41_destroy_cb(cb);
> +		if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
> +			nfsd41_destroy_cb(cb);
> +		else {
> +			/*
> +			 * XXX: Ideally, we would wait for the client to
> +			 *	reconnect, but I haven't figured out how
> +			 *	to do that yet.
> +			 */
> +			msleep(30);
> +			nfsd4_queue_cb(cb);

It would probably be better to just queue the cb as delayed_work here,
so you don't squat on the workqueue thread. That'll mean changing
cb_work to struct delayed_work, but that should be NBD.

> +		}
>  		return;
>  	}
>  
> @@ -1401,6 +1403,12 @@ nfsd4_run_cb_work(struct work_struct *work)
>  		return;
>  	}
>  
> +	if (cb->cb_need_restart) {
> +		cb->cb_need_restart = false;
> +	} else {
> +		if (cb->cb_ops && cb->cb_ops->prepare)
> +			cb->cb_ops->prepare(cb);
> +	}
>  	cb->cb_msg.rpc_cred = clp->cl_cb_cred;
>  	flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
>  	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,
> 
> 
>
Chuck Lever Jan. 25, 2024, 8:29 p.m. UTC | #2
On Thu, Jan 25, 2024 at 03:19:41PM -0500, Jeff Layton wrote:
> On Thu, 2024-01-25 at 11:28 -0500, Chuck Lever wrote:
> > From: Chuck Lever <chuck.lever@oracle.com>
> > 
> > As part of managing a client disconnect, NFSD closes down and
> > replaces the backchannel rpc_clnt.
> > 
> > If a callback operation is pending when the backchannel rpc_clnt is
> > shut down, currently nfsd4_run_cb_work() just discards that
> > callback. But there are multiple cases to deal with here:
> > 
> >  o The client's lease is getting destroyed. Throw the CB away.
> > 
> >  o The client disconnected. It might be forcing a retransmit of
> >    CB operations, or it could have disconnected for other reasons.
> >    Reschedule the CB so it is retransmitted when the client
> >    reconnects.
> > 
> > Since callback operations can now be rescheduled, ensure that
> > cb_ops->prepare can be called only once by moving the
> > cb_ops->prepare paragraph down to just before the rpc_call_async()
> > call.
> > 
> > Fixes: 2bbfed98a4d8 ("nfsd: Fix races between nfsd4_cb_release() and nfsd4_shutdown_callback()")
> > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > ---
> >  fs/nfsd/nfs4callback.c |   26 +++++++++++++++++---------
> >  1 file changed, 17 insertions(+), 9 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > index 43b0a34a5d5b..b2844abcb51f 100644
> > --- a/fs/nfsd/nfs4callback.c
> > +++ b/fs/nfsd/nfs4callback.c
> > @@ -1375,20 +1375,22 @@ nfsd4_run_cb_work(struct work_struct *work)
> >  	struct rpc_clnt *clnt;
> >  	int flags;
> >  
> > -	if (cb->cb_need_restart) {
> > -		cb->cb_need_restart = false;
> > -	} else {
> > -		if (cb->cb_ops && cb->cb_ops->prepare)
> > -			cb->cb_ops->prepare(cb);
> > -	}
> > -
> >  	if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
> >  		nfsd4_process_cb_update(cb);
> >  
> >  	clnt = clp->cl_cb_client;
> >  	if (!clnt) {
> > -		/* Callback channel broken, or client killed; give up: */
> > -		nfsd41_destroy_cb(cb);
> > +		if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
> > +			nfsd41_destroy_cb(cb);
> > +		else {
> > +			/*
> > +			 * XXX: Ideally, we would wait for the client to
> > +			 *	reconnect, but I haven't figured out how
> > +			 *	to do that yet.
> > +			 */
> > +			msleep(30);
> > +			nfsd4_queue_cb(cb);
> 
> It would probably be better to just queue the cb as delayed_work here,
> so you don't squat on the workqueue thread.

You found my placeholder :-)


> That'll mean changing
> cb_work to struct delayed_work, but that should be NBD.

I've looked at that. I wanted to be sure, before going that route,
that there is no obvious way to implement a "wait for client to
reconnect". msleep (or delayed_work) is basically a slow busy wait.


> > +		}
> >  		return;
> >  	}
> >  
> > @@ -1401,6 +1403,12 @@ nfsd4_run_cb_work(struct work_struct *work)
> >  		return;
> >  	}
> >  
> > +	if (cb->cb_need_restart) {
> > +		cb->cb_need_restart = false;
> > +	} else {
> > +		if (cb->cb_ops && cb->cb_ops->prepare)
> > +			cb->cb_ops->prepare(cb);
> > +	}
> >  	cb->cb_msg.rpc_cred = clp->cl_cb_cred;
> >  	flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
> >  	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,
> > 
> > 
> > 
> 
> -- 
> Jeff Layton <jlayton@kernel.org>
diff mbox series

Patch

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 43b0a34a5d5b..b2844abcb51f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1375,20 +1375,22 @@  nfsd4_run_cb_work(struct work_struct *work)
 	struct rpc_clnt *clnt;
 	int flags;
 
-	if (cb->cb_need_restart) {
-		cb->cb_need_restart = false;
-	} else {
-		if (cb->cb_ops && cb->cb_ops->prepare)
-			cb->cb_ops->prepare(cb);
-	}
-
 	if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
 		nfsd4_process_cb_update(cb);
 
 	clnt = clp->cl_cb_client;
 	if (!clnt) {
-		/* Callback channel broken, or client killed; give up: */
-		nfsd41_destroy_cb(cb);
+		if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
+			nfsd41_destroy_cb(cb);
+		else {
+			/*
+			 * XXX: Ideally, we would wait for the client to
+			 *	reconnect, but I haven't figured out how
+			 *	to do that yet.
+			 */
+			msleep(30);
+			nfsd4_queue_cb(cb);
+		}
 		return;
 	}
 
@@ -1401,6 +1403,12 @@  nfsd4_run_cb_work(struct work_struct *work)
 		return;
 	}
 
+	if (cb->cb_need_restart) {
+		cb->cb_need_restart = false;
+	} else {
+		if (cb->cb_ops && cb->cb_ops->prepare)
+			cb->cb_ops->prepare(cb);
+	}
 	cb->cb_msg.rpc_cred = clp->cl_cb_cred;
 	flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
 	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,