
[1/1] NFSD: cancel CB_RECALL_ANY call when nfs4_client is about to be destroyed

Message ID 1711476809-26248-1-git-send-email-dai.ngo@oracle.com (mailing list archive)
State New

Commit Message

Dai Ngo March 26, 2024, 6:13 p.m. UTC
Currently when an nfs4_client is destroyed we wait for the cb_recall_any
callback to complete before proceeding. This adds unnecessary delay to the
__destroy_client call if there is a problem communicating with the client.

This patch addresses this issue by cancelling the CB_RECALL_ANY call from
the workqueue when the nfs4_client is about to be destroyed.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
 fs/nfsd/nfs4callback.c | 10 ++++++++++
 fs/nfsd/nfs4state.c    | 10 +++++++++-
 fs/nfsd/state.h        |  1 +
 3 files changed, 20 insertions(+), 1 deletion(-)

Comments

Chuck Lever March 26, 2024, 6:27 p.m. UTC | #1
On Tue, Mar 26, 2024 at 11:13:29AM -0700, Dai Ngo wrote:
> Currently when a nfs4_client is destroyed we wait for the cb_recall_any
> callback to complete before proceed. This adds unnecessary delay to the
> __destroy_client call if there is problem communicating with the client.

By "unnecessary delay" do you mean only the seven-second RPC
retransmit timeout, or is there something else?

I can see that a server shutdown might want to cancel these, but why
is this a problem when destroying an nfs4_client?


> This patch addresses this issue by cancelling the CB_RECALL_ANY call from
> the workqueue when the nfs4_client is about to be destroyed.

Does CB_OFFLOAD need similar treatment?


> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> ---
>  fs/nfsd/nfs4callback.c | 10 ++++++++++
>  fs/nfsd/nfs4state.c    | 10 +++++++++-
>  fs/nfsd/state.h        |  1 +
>  3 files changed, 20 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index 87c9547989f6..e5b50c96be6a 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c
> @@ -1568,3 +1568,13 @@ bool nfsd4_run_cb(struct nfsd4_callback *cb)
>  		nfsd41_cb_inflight_end(clp);
>  	return queued;
>  }
> +
> +void nfsd41_cb_recall_any_cancel(struct nfs4_client *clp)
> +{
> +	if (test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) &&
> +			cancel_delayed_work(&clp->cl_ra->ra_cb.cb_work)) {
> +		clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
> +		atomic_add_unless(&clp->cl_rpc_users, -1, 0);
> +		nfsd41_cb_inflight_end(clp);
> +	}
> +}
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 1a93c7fcf76c..0e1db57c9a19 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2402,6 +2402,7 @@ __destroy_client(struct nfs4_client *clp)
>  	}
>  	nfsd4_return_all_client_layouts(clp);
>  	nfsd4_shutdown_copy(clp);
> +	nfsd41_cb_recall_any_cancel(clp);
>  	nfsd4_shutdown_callback(clp);
>  	if (clp->cl_cb_conn.cb_xprt)
>  		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
> @@ -2980,6 +2981,12 @@ static void force_expire_client(struct nfs4_client *clp)
>  	clp->cl_time = 0;
>  	spin_unlock(&nn->client_lock);
>  
> +	/*
> +	 * no need to send and wait for CB_RECALL_ANY
> +	 * when client is about to be destroyed
> +	 */
> +	nfsd41_cb_recall_any_cancel(clp);
> +
>  	wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0);
>  	spin_lock(&nn->client_lock);
>  	already_expired = list_empty(&clp->cl_lru);
> @@ -6617,7 +6624,8 @@ deleg_reaper(struct nfsd_net *nn)
>  		clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) |
>  						BIT(RCA4_TYPE_MASK_WDATA_DLG);
>  		trace_nfsd_cb_recall_any(clp->cl_ra);
> -		nfsd4_run_cb(&clp->cl_ra->ra_cb);
> +		if (!nfsd4_run_cb(&clp->cl_ra->ra_cb))
> +			clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
>  	}
>  }
>  
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 01c6f3445646..259b4af7d226 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -735,6 +735,7 @@ extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *
>  extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
>  		const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
>  extern bool nfsd4_run_cb(struct nfsd4_callback *cb);
> +extern void nfsd41_cb_recall_any_cancel(struct nfs4_client *clp);
>  extern int nfsd4_create_callback_queue(void);
>  extern void nfsd4_destroy_callback_queue(void);
>  extern void nfsd4_shutdown_callback(struct nfs4_client *);
> -- 
> 2.39.3
>
Dai Ngo March 28, 2024, 1:09 a.m. UTC | #2
On 3/26/24 11:27 AM, Chuck Lever wrote:
> On Tue, Mar 26, 2024 at 11:13:29AM -0700, Dai Ngo wrote:
>> Currently when a nfs4_client is destroyed we wait for the cb_recall_any
>> callback to complete before proceed. This adds unnecessary delay to the
>> __destroy_client call if there is problem communicating with the client.
> By "unnecessary delay" do you mean only the seven-second RPC
> retransmit timeout, or is there something else?

When the client network interface is down, the RPC task takes ~9s to
send the callback, waits for the reply and gets ETIMEDOUT. This process
repeats in a loop with the same RPC task until it is stopped by
rpc_shutdown_client after the client lease expires.

It takes a total of about 1m20s before the CB_RECALL is terminated.
For CB_RECALL_ANY and CB_OFFLOAD, this process gets into an infinite
loop since there is no delegation conflict and the client is allowed
to stay in courtesy state.

The loop happens because in nfsd4_cb_sequence_done, if cb_seq_status
is 1 (an RPC Reply was never received), it calls nfsd4_mark_cb_fault
to set the NFSD4_CB_FAULT bit and then sets cb_need_restart to true.
When nfsd4_cb_release is called, it checks the cb_need_restart bit and
re-queues the work again.
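
In rough outline, the cycle looks like this (a simplified sketch only;
the real nfsd4_cb_sequence_done and nfsd4_cb_release in
fs/nfsd/nfs4callback.c handle many more cases):

      /* Simplified sketch of the restart cycle described above; not the
       * actual upstream code. */
      static void cb_sequence_done_sketch(struct nfsd4_callback *cb)
      {
              switch (cb->cb_seq_status) {
              case 1:
                      /* An RPC Reply was never received: the backchannel
                       * is marked faulty (NFSD4_CB_FAULT) and the callback
                       * is flagged to be retried. */
                      cb->cb_need_restart = true;
                      break;
              default:
                      break;
              }
      }

      static void cb_release_sketch(struct nfsd4_callback *cb)
      {
              if (cb->cb_need_restart)
                      nfsd4_run_cb(cb);       /* re-queues the work: the loop */
              else
                      nfsd41_destroy_cb(cb);  /* normal completion */
      }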

> I can see that a server shutdown might want to cancel these, but why
> is this a problem when destroying an nfs4_client?

Destroying an nfs4_client happens when the export is unmounted.
Cancelling these calls just makes the process a bit quicker when there
is a problem with the client connection, and prevents the unmount from
hanging if there is a problem at the workqueue and a callback work item
is pending there.

For CB_RECALL, even if we wait for the call to complete, the client
won't be able to return any delegations since the nfs4_client has
already been destroyed. It just serves as a notice to the client that
there is a delegation conflict so it can take appropriate action.

>> This patch addresses this issue by cancelling the CB_RECALL_ANY call from
>> the workqueue when the nfs4_client is about to be destroyed.
> Does CB_OFFLOAD need similar treatment?

Probably. The copy is already done anyway; this is just a notification.

-Dai

Chuck Lever March 28, 2024, 2:08 p.m. UTC | #3
On Wed, Mar 27, 2024 at 06:09:28PM -0700, Dai Ngo wrote:
> 
> On 3/26/24 11:27 AM, Chuck Lever wrote:
> > On Tue, Mar 26, 2024 at 11:13:29AM -0700, Dai Ngo wrote:
> > > Currently when a nfs4_client is destroyed we wait for the cb_recall_any
> > > callback to complete before proceed. This adds unnecessary delay to the
> > > __destroy_client call if there is problem communicating with the client.
> > By "unnecessary delay" do you mean only the seven-second RPC
> > retransmit timeout, or is there something else?
> 
> when the client network interface is down, the RPC task takes ~9s to
> send the callback, waits for the reply and gets ETIMEDOUT. This process
> repeats in a loop with the same RPC task before being stopped by
> rpc_shutdown_client after client lease expires.

I'll have to review this code again, but rpc_shutdown_client
should cause these RPCs to terminate immediately and safely. Can't
we use that?


> It takes a total of about 1m20s before the CB_RECALL is terminated.
> For CB_RECALL_ANY and CB_OFFLOAD, this process gets in to a infinite
> loop since there is no delegation conflict and the client is allowed
> to stay in courtesy state.
> 
> The loop happens because in nfsd4_cb_sequence_done if cb_seq_status
> is 1 (an RPC Reply was never received) it calls nfsd4_mark_cb_fault
> to set the NFSD4_CB_FAULT bit. It then sets cb_need_restart to true.
> When nfsd4_cb_release is called, it checks cb_need_restart bit and
> re-queues the work again.

Something in the sequence_done path should check if the server is
tearing down this callback connection. If it doesn't, that is a bug
IMO.

Btw, have you checked NFSv4.0 behavior?


> > I can see that a server shutdown might want to cancel these, but why
> > is this a problem when destroying an nfs4_client?
> 
> Destroying an nfs4_client is called when the export is unmounted.

Ah, agreed. Thanks for reminding me.


> Cancelling these calls just make the process a bit quicker when there
> is problem with the client connection, or preventing the unmount to
> hang if there is problem at the workqueue and a callback work is
> pending there.
> 
> For CB_RECALL, even if we wait for the call to complete the client
> won't be able to return any delegations since the nfs4_client is
> already been destroyed. It just serves as a notice to the client that
> there is a delegation conflict so it can take appropriate actions.
> 
> > > This patch addresses this issue by cancelling the CB_RECALL_ANY call from
> > > the workqueue when the nfs4_client is about to be destroyed.
> > Does CB_OFFLOAD need similar treatment?
> 
> Probably. The copy is already done anyway, this is just a notification.

It would be a nicer design if all outstanding callback RPCs could
be handled with one mechanism instead of building a separate
shutdown method for each operation type.


Dai Ngo March 28, 2024, 6:14 p.m. UTC | #4
On 3/28/24 7:08 AM, Chuck Lever wrote:
> On Wed, Mar 27, 2024 at 06:09:28PM -0700, Dai Ngo wrote:
>> On 3/26/24 11:27 AM, Chuck Lever wrote:
>>> On Tue, Mar 26, 2024 at 11:13:29AM -0700, Dai Ngo wrote:
>>>> Currently when a nfs4_client is destroyed we wait for the cb_recall_any
>>>> callback to complete before proceed. This adds unnecessary delay to the
>>>> __destroy_client call if there is problem communicating with the client.
>>> By "unnecessary delay" do you mean only the seven-second RPC
>>> retransmit timeout, or is there something else?
>> when the client network interface is down, the RPC task takes ~9s to
>> send the callback, waits for the reply and gets ETIMEDOUT. This process
>> repeats in a loop with the same RPC task before being stopped by
>> rpc_shutdown_client after client lease expires.
> I'll have to review this code again, but rpc_shutdown_client
> should cause these RPCs to terminate immediately and safely. Can't
> we use that?

rpc_shutdown_client works; it terminated the RPC call and stopped the loop.

>
>
>> It takes a total of about 1m20s before the CB_RECALL is terminated.
>> For CB_RECALL_ANY and CB_OFFLOAD, this process gets in to a infinite
>> loop since there is no delegation conflict and the client is allowed
>> to stay in courtesy state.
>>
>> The loop happens because in nfsd4_cb_sequence_done if cb_seq_status
>> is 1 (an RPC Reply was never received) it calls nfsd4_mark_cb_fault
>> to set the NFSD4_CB_FAULT bit. It then sets cb_need_restart to true.
>> When nfsd4_cb_release is called, it checks cb_need_restart bit and
>> re-queues the work again.
> Something in the sequence_done path should check if the server is
> tearing down this callback connection. If it doesn't, that is a bug
> IMO.

I will check to see if TCP eventually closes the connection and
notifies the RPC layer. From network traces, I see TCP stopped
retrying after about 7 minutes. But even 7 minutes is a long
time; we should not be hanging around waiting for it.

>
> Btw, have you checked NFSv4.0 behavior?

Not yet.

>
>
>>> I can see that a server shutdown might want to cancel these, but why
>>> is this a problem when destroying an nfs4_client?
>> Destroying an nfs4_client is called when the export is unmounted.
> Ah, agreed. Thanks for reminding me.
>
>
>> Cancelling these calls just make the process a bit quicker when there
>> is problem with the client connection, or preventing the unmount to
>> hang if there is problem at the workqueue and a callback work is
>> pending there.
>>
>> For CB_RECALL, even if we wait for the call to complete the client
>> won't be able to return any delegations since the nfs4_client is
>> already been destroyed. It just serves as a notice to the client that
>> there is a delegation conflict so it can take appropriate actions.
>>
>>>> This patch addresses this issue by cancelling the CB_RECALL_ANY call from
>>>> the workqueue when the nfs4_client is about to be destroyed.
>>> Does CB_OFFLOAD need similar treatment?
>> Probably. The copy is already done anyway, this is just a notification.
> It would be a nicer design if all outstanding callback RPCs could
> be handled with one mechanism instead of building a separate
> shutdown method for each operation type.

cb_recall ties to the individual delegation and cb_recall_any ties
to the nfs4_client. We can check the delegation and the client to
see if there are pending callbacks. Currently cb_offload is stand-alone
and not tied to anything; we kzalloc the callback on the fly and send
it out, so there is no way to find out whether a callback is pending.
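
As a purely hypothetical sketch, tying CB_OFFLOAD to the client might
look something like this (the list field and helper below do not exist
upstream):

      /* Hypothetical only: keep in-flight CB_OFFLOAD callbacks on a
       * per-client list so a shutdown path can find and cancel them. */
      struct nfsd4_cb_offload_sketch {
              struct nfsd4_callback   co_cb;
              struct list_head        co_client_node; /* hypothetical linkage */
      };

      static void queue_cb_offload_sketch(struct nfs4_client *clp,
                                          struct nfsd4_cb_offload_sketch *co)
      {
              spin_lock(&clp->cl_lock);
              /* cl_cb_offload_list is a hypothetical list head on nfs4_client */
              list_add(&co->co_client_node, &clp->cl_cb_offload_list);
              spin_unlock(&clp->cl_lock);
              nfsd4_run_cb(&co->co_cb);
      }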

-Dai

Dai Ngo March 29, 2024, 12:31 a.m. UTC | #5
On 3/28/24 11:14 AM, Dai Ngo wrote:
>
> On 3/28/24 7:08 AM, Chuck Lever wrote:
>> On Wed, Mar 27, 2024 at 06:09:28PM -0700, Dai Ngo wrote:
>>> On 3/26/24 11:27 AM, Chuck Lever wrote:
>>>> On Tue, Mar 26, 2024 at 11:13:29AM -0700, Dai Ngo wrote:
>>>>> Currently when a nfs4_client is destroyed we wait for the 
>>>>> cb_recall_any
>>>>> callback to complete before proceed. This adds unnecessary delay 
>>>>> to the
>>>>> __destroy_client call if there is problem communicating with the 
>>>>> client.
>>>> By "unnecessary delay" do you mean only the seven-second RPC
>>>> retransmit timeout, or is there something else?
>>> when the client network interface is down, the RPC task takes ~9s to
>>> send the callback, waits for the reply and gets ETIMEDOUT. This process
>>> repeats in a loop with the same RPC task before being stopped by
>>> rpc_shutdown_client after client lease expires.
>> I'll have to review this code again, but rpc_shutdown_client
>> should cause these RPCs to terminate immediately and safely. Can't
>> we use that?
>
> rpc_shutdown_client works, it terminated the RPC call to stop the loop.
>
>>
>>
>>> It takes a total of about 1m20s before the CB_RECALL is terminated.
>>> For CB_RECALL_ANY and CB_OFFLOAD, this process gets in to a infinite
>>> loop since there is no delegation conflict and the client is allowed
>>> to stay in courtesy state.
>>>
>>> The loop happens because in nfsd4_cb_sequence_done if cb_seq_status
>>> is 1 (an RPC Reply was never received) it calls nfsd4_mark_cb_fault
>>> to set the NFSD4_CB_FAULT bit. It then sets cb_need_restart to true.
>>> When nfsd4_cb_release is called, it checks cb_need_restart bit and
>>> re-queues the work again.
>> Something in the sequence_done path should check if the server is
>> tearing down this callback connection. If it doesn't, that is a bug
>> IMO.

TCP terminated the connection after retrying for 16 minutes and
notified the RPC layer, which deleted the nfsd4_conn.

But when nfsd4_run_cb_work ran again, it got into the infinite
loop caused by:
      /*
       * XXX: Ideally, we could wait for the client to
       *      reconnect, but I haven't figured out how
       *      to do that yet.
       */
       nfsd4_queue_cb_delayed(cb, 25);

which was introduced by c1ccfcf1a9bf. Note that I'm using 6.9-rc1.


-Dai

>
> I will check to see if TCP eventually closes the connection and
> notifies the RPC layer. From network traces, I see TCP stopped
> retrying after about 7 minutes. But even 7 minutes it's a long
> time we should not be hanging around waiting for it.
>
>>
>> Btw, have you checked NFSv4.0 behavior?
>
> Not yet.
>
>>
>>
>>>> I can see that a server shutdown might want to cancel these, but why
>>>> is this a problem when destroying an nfs4_client?
>>> Destroying an nfs4_client is called when the export is unmounted.
>> Ah, agreed. Thanks for reminding me.
>>
>>
>>> Cancelling these calls just make the process a bit quicker when there
>>> is problem with the client connection, or preventing the unmount to
>>> hang if there is problem at the workqueue and a callback work is
>>> pending there.
>>>
>>> For CB_RECALL, even if we wait for the call to complete the client
>>> won't be able to return any delegations since the nfs4_client is
>>> already been destroyed. It just serves as a notice to the client that
>>> there is a delegation conflict so it can take appropriate actions.
>>>
>>>>> This patch addresses this issue by cancelling the CB_RECALL_ANY 
>>>>> call from
>>>>> the workqueue when the nfs4_client is about to be destroyed.
>>>> Does CB_OFFLOAD need similar treatment?
>>> Probably. The copy is already done anyway, this is just a notification.
>> It would be a nicer design if all outstanding callback RPCs could
>> be handled with one mechanism instead of building a separate
>> shutdown method for each operation type.
>
> cb_recall ties to the individual delegation and cb_recall_any ties
> to the nfs4_client. We can check the delegation and the client to
> see if there are pending callbacks. Currently cb_offload is stand-alone
> and not tied to anything, kzalloc the callback on the fly and send
> it out so there is no way to find out if there is pending callback.
>
> -Dai
Chuck Lever March 29, 2024, 2:55 p.m. UTC | #6
On Thu, Mar 28, 2024 at 05:31:02PM -0700, Dai Ngo wrote:
> 
> On 3/28/24 11:14 AM, Dai Ngo wrote:
> > 
> > On 3/28/24 7:08 AM, Chuck Lever wrote:
> > > On Wed, Mar 27, 2024 at 06:09:28PM -0700, Dai Ngo wrote:
> > > > On 3/26/24 11:27 AM, Chuck Lever wrote:
> > > > > On Tue, Mar 26, 2024 at 11:13:29AM -0700, Dai Ngo wrote:
> > > > > > Currently when a nfs4_client is destroyed we wait for
> > > > > > the cb_recall_any
> > > > > > callback to complete before proceed. This adds
> > > > > > unnecessary delay to the
> > > > > > __destroy_client call if there is problem communicating
> > > > > > with the client.
> > > > > By "unnecessary delay" do you mean only the seven-second RPC
> > > > > retransmit timeout, or is there something else?
> > > > when the client network interface is down, the RPC task takes ~9s to
> > > > send the callback, waits for the reply and gets ETIMEDOUT. This process
> > > > repeats in a loop with the same RPC task before being stopped by
> > > > rpc_shutdown_client after client lease expires.
> > > I'll have to review this code again, but rpc_shutdown_client
> > > should cause these RPCs to terminate immediately and safely. Can't
> > > we use that?
> > 
> > rpc_shutdown_client works, it terminated the RPC call to stop the loop.
> > 
> > > 
> > > 
> > > > It takes a total of about 1m20s before the CB_RECALL is terminated.
> > > > For CB_RECALL_ANY and CB_OFFLOAD, this process gets in to a infinite
> > > > loop since there is no delegation conflict and the client is allowed
> > > > to stay in courtesy state.
> > > > 
> > > > The loop happens because in nfsd4_cb_sequence_done if cb_seq_status
> > > > is 1 (an RPC Reply was never received) it calls nfsd4_mark_cb_fault
> > > > to set the NFSD4_CB_FAULT bit. It then sets cb_need_restart to true.
> > > > When nfsd4_cb_release is called, it checks cb_need_restart bit and
> > > > re-queues the work again.
> > > Something in the sequence_done path should check if the server is
> > > tearing down this callback connection. If it doesn't, that is a bug
> > > IMO.
> 
> TCP terminated the connection after retrying for 16 minutes and
> notified the RPC layer which deleted the nfsd4_conn.

The server should have closed this connection already. Is it stuck
waiting for the client to respond to a FIN or something?


> But when nfsd4_run_cb_work ran again, it got into the infinite
> loop caused by:
>      /*
>       * XXX: Ideally, we could wait for the client to
>       *      reconnect, but I haven't figured out how
>       *      to do that yet.
>       */
>       nfsd4_queue_cb_delayed(cb, 25);
> 
> which was introduced by c1ccfcf1a9bf. Note that I'm using 6.9-rc1.

The whole paragraph is:

1503         clnt = clp->cl_cb_client;
1504         if (!clnt) {
1505                 if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
1506                         nfsd41_destroy_cb(cb);
1507                 else {
1508                         /*
1509                          * XXX: Ideally, we could wait for the client to
1510                          *      reconnect, but I haven't figured out how
1511                          *      to do that yet.
1512                          */
1513                         nfsd4_queue_cb_delayed(cb, 25);
1514                 }
1515                 return;
1516         }

When there's no rpc_clnt and CB_KILL is set, the callback
operation should be destroyed immediately. CB_KILL is set by
nfsd4_shutdown_callback. Its only caller is
__destroy_client().

Why isn't NFSD4_CLIENT_CB_KILL set?
Dai Ngo March 29, 2024, 5:57 p.m. UTC | #7
On 3/29/24 7:55 AM, Chuck Lever wrote:
> On Thu, Mar 28, 2024 at 05:31:02PM -0700, Dai Ngo wrote:
>> On 3/28/24 11:14 AM, Dai Ngo wrote:
>>> On 3/28/24 7:08 AM, Chuck Lever wrote:
>>>> On Wed, Mar 27, 2024 at 06:09:28PM -0700, Dai Ngo wrote:
>>>>> On 3/26/24 11:27 AM, Chuck Lever wrote:
>>>>>> On Tue, Mar 26, 2024 at 11:13:29AM -0700, Dai Ngo wrote:
>>>>>>> Currently when a nfs4_client is destroyed we wait for
>>>>>>> the cb_recall_any
>>>>>>> callback to complete before proceed. This adds
>>>>>>> unnecessary delay to the
>>>>>>> __destroy_client call if there is problem communicating
>>>>>>> with the client.
>>>>>> By "unnecessary delay" do you mean only the seven-second RPC
>>>>>> retransmit timeout, or is there something else?
>>>>> when the client network interface is down, the RPC task takes ~9s to
>>>>> send the callback, waits for the reply and gets ETIMEDOUT. This process
>>>>> repeats in a loop with the same RPC task before being stopped by
>>>>> rpc_shutdown_client after client lease expires.
>>>> I'll have to review this code again, but rpc_shutdown_client
>>>> should cause these RPCs to terminate immediately and safely. Can't
>>>> we use that?
>>> rpc_shutdown_client works, it terminated the RPC call to stop the loop.
>>>
>>>>
>>>>> It takes a total of about 1m20s before the CB_RECALL is terminated.
>>>>> For CB_RECALL_ANY and CB_OFFLOAD, this process gets in to a infinite
>>>>> loop since there is no delegation conflict and the client is allowed
>>>>> to stay in courtesy state.
>>>>>
>>>>> The loop happens because in nfsd4_cb_sequence_done if cb_seq_status
>>>>> is 1 (an RPC Reply was never received) it calls nfsd4_mark_cb_fault
>>>>> to set the NFSD4_CB_FAULT bit. It then sets cb_need_restart to true.
>>>>> When nfsd4_cb_release is called, it checks cb_need_restart bit and
>>>>> re-queues the work again.
>>>> Something in the sequence_done path should check if the server is
>>>> tearing down this callback connection. If it doesn't, that is a bug
>>>> IMO.
>> TCP terminated the connection after retrying for 16 minutes and
>> notified the RPC layer which deleted the nfsd4_conn.
> The server should have closed this connection already.

Since there is no delegation conflict, the client remains in courtesy
state.

>   Is it stuck
> waiting for the client to respond to a FIN or something?

No, in this case, since the client network interface was down, the
server TCP did not even receive an ACK for its transmits, so it kept
retrying.

>
>
>> But when nfsd4_run_cb_work ran again, it got into the infinite
>> loop caused by:
>>       /*
>>        * XXX: Ideally, we could wait for the client to
>>        *      reconnect, but I haven't figured out how
>>        *      to do that yet.
>>        */
>>        nfsd4_queue_cb_delayed(cb, 25);
>>
>> which was introduced by c1ccfcf1a9bf. Note that I'm using 6.9-rc1.
> The whole paragraph is:
>
> 1503         clnt = clp->cl_cb_client;
> 1504         if (!clnt) {
> 1505                 if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
> 1506                         nfsd41_destroy_cb(cb);
> 1507                 else {
> 1508                         /*
> 1509                          * XXX: Ideally, we could wait for the client to
> 1510                          *      reconnect, but I haven't figured out how
> 1511                          *      to do that yet.
> 1512                          */
> 1513                         nfsd4_queue_cb_delayed(cb, 25);
> 1514                 }
> 1515                 return;
> 1516         }
>
> When there's no rpc_clnt and CB_KILL is set, the callback
> operation should be destroyed immediately. CB_KILL is set by
> nfsd4_shutdown_callback. It's only caller is
> __destroy_client().
>
> Why isn't NFSD4_CLIENT_CB_KILL set?

Since __destroy_client was not called, for the reason mentioned above,
nfsd4_shutdown_callback was not called so NFSD4_CLIENT_CB_KILL was not
set.

Since the nfsd callback_wq was created with alloc_ordered_workqueue,
once this loop happens the nfsd callback_wq is stuck since this workqueue
executes at most one work item at any given time.

This means that if an NFS client is shut down and the server is in this
state then all other clients are affected; all callbacks to other
clients are stuck in the workqueue. And if a callback for a client is
stuck in the workqueue then that client cannot unmount its shares.

This is the symptom that was reported recently on this list. I think
the nfsd callback_wq should be created as a normal workqueue that allows
multiple work items to be executed at the same time, so a problem with
one client does not affect other clients.
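
For illustration, the suggested change is roughly the following (a
sketch only; upstream nfsd4_create_callback_queue() currently uses
alloc_ordered_workqueue()):

      /* Sketch: create the nfsd callback workqueue as a regular,
       * multi-threaded workqueue so a stuck callback for one client
       * cannot serialize callbacks for all other clients.  A max_active
       * of 0 selects the workqueue default. */
      int nfsd4_create_callback_queue(void)
      {
              callback_wq = alloc_workqueue("nfsd4_callbacks", WQ_UNBOUND, 0);
              if (!callback_wq)
                      return -ENOMEM;
              return 0;
      }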

-Dai

>
>
Chuck Lever March 29, 2024, 11:42 p.m. UTC | #8
On Fri, Mar 29, 2024 at 10:57:22AM -0700, Dai Ngo wrote:
> 
> On 3/29/24 7:55 AM, Chuck Lever wrote:
> > On Thu, Mar 28, 2024 at 05:31:02PM -0700, Dai Ngo wrote:
> > > On 3/28/24 11:14 AM, Dai Ngo wrote:
> > > > On 3/28/24 7:08 AM, Chuck Lever wrote:
> > > > > On Wed, Mar 27, 2024 at 06:09:28PM -0700, Dai Ngo wrote:
> > > > > > On 3/26/24 11:27 AM, Chuck Lever wrote:
> > > > > > > On Tue, Mar 26, 2024 at 11:13:29AM -0700, Dai Ngo wrote:
> > > > > > > > Currently when a nfs4_client is destroyed we wait for
> > > > > > > > the cb_recall_any
> > > > > > > > callback to complete before proceed. This adds
> > > > > > > > unnecessary delay to the
> > > > > > > > __destroy_client call if there is problem communicating
> > > > > > > > with the client.
> > > > > > > By "unnecessary delay" do you mean only the seven-second RPC
> > > > > > > retransmit timeout, or is there something else?
> > > > > > when the client network interface is down, the RPC task takes ~9s to
> > > > > > send the callback, waits for the reply and gets ETIMEDOUT. This process
> > > > > > repeats in a loop with the same RPC task before being stopped by
> > > > > > rpc_shutdown_client after client lease expires.
> > > > > I'll have to review this code again, but rpc_shutdown_client
> > > > > should cause these RPCs to terminate immediately and safely. Can't
> > > > > we use that?
> > > > rpc_shutdown_client works, it terminated the RPC call to stop the loop.
> > > > 
> > > > > 
> > > > > > It takes a total of about 1m20s before the CB_RECALL is terminated.
> > > > > > For CB_RECALL_ANY and CB_OFFLOAD, this process gets in to a infinite
> > > > > > loop since there is no delegation conflict and the client is allowed
> > > > > > to stay in courtesy state.
> > > > > > 
> > > > > > The loop happens because in nfsd4_cb_sequence_done if cb_seq_status
> > > > > > is 1 (an RPC Reply was never received) it calls nfsd4_mark_cb_fault
> > > > > > to set the NFSD4_CB_FAULT bit. It then sets cb_need_restart to true.
> > > > > > When nfsd4_cb_release is called, it checks cb_need_restart bit and
> > > > > > re-queues the work again.
> > > > > Something in the sequence_done path should check if the server is
> > > > > tearing down this callback connection. If it doesn't, that is a bug
> > > > > IMO.
> > > TCP terminated the connection after retrying for 16 minutes and
> > > notified the RPC layer which deleted the nfsd4_conn.
> > The server should have closed this connection already.
> 
> Since there is no delegation conflict, the client remains in courtesy
> state.
> 
> >   Is it stuck
> > waiting for the client to respond to a FIN or something?
> 
> No, in this case since the client network interface was down server
> TCP did not even receive ACK for the transmit so the server TCP
> kept retrying.

It sounds like the server attempts to maintain the client's
transport while the client is in courtesy state? I thought the
purpose of the courteous server was to more gracefully handle network
partitions, in which case, the transport is not reliable.

If a courtesy client hasn't re-established a connection with a
backchannel by the time a conflicting open/lock request arrives,
the server has no choice but to expire that client's courtesy
state immediately. Right?

But maybe that's a side-bar.


> > > But when nfsd4_run_cb_work ran again, it got into the infinite
> > > loop caused by:
> > >       /*
> > >        * XXX: Ideally, we could wait for the client to
> > >        *      reconnect, but I haven't figured out how
> > >        *      to do that yet.
> > >        */
> > >        nfsd4_queue_cb_delayed(cb, 25);
> > > 
> > > which was introduced by c1ccfcf1a9bf. Note that I'm using 6.9-rc1.
> > The whole paragraph is:
> > 
> > 1503         clnt = clp->cl_cb_client;
> > 1504         if (!clnt) {
> > 1505                 if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
> > 1506                         nfsd41_destroy_cb(cb);
> > 1507                 else {
> > 1508                         /*
> > 1509                          * XXX: Ideally, we could wait for the client to
> > 1510                          *      reconnect, but I haven't figured out how
> > 1511                          *      to do that yet.
> > 1512                          */
> > 1513                         nfsd4_queue_cb_delayed(cb, 25);
> > 1514                 }
> > 1515                 return;
> > 1516         }
> > 
> > When there's no rpc_clnt and CB_KILL is set, the callback
> > operation should be destroyed immediately. CB_KILL is set by
> > nfsd4_shutdown_callback. It's only caller is
> > __destroy_client().
> > 
> > Why isn't NFSD4_CLIENT_CB_KILL set?
> 
> Since __destroy_client was not called, for the reason mentioned above,
> nfsd4_shutdown_callback was not called so NFSD4_CLIENT_CB_KILL was not
> set.

Well, then I'm missing something. You said, above:

> Currently when a nfs4_client is destroyed we wait for

And __destroy_client() invokes nfsd4_shutdown_callback(). Can you
explain the usage scenario(s) to me again?


> Since the nfsd callback_wq was created with alloc_ordered_workqueue,
> once this loop happens the nfsd callback_wq is stuck since this workqueue
> executes at most one work item at any given time.
> 
> This means that if a nfs client is shutdown and the server is in this
> state then all other clients are effected; all callbacks to other
> clients are stuck in the workqueue. And if a callback for a client is
> stuck in the workqueue then that client can not unmount its shares.
> 
> This is the symptom that was reported recently on this list. I think
> the nfsd callback_wq should be created as a normal workqueue that allows
> multiple work items to be executed at the same time so a problem with
> one client does not effect other clients.

My impression is that the callback_wq is single-threaded to avoid
locking complexity. I'm not yet convinced it would be safe to simply
change that workqueue to permit multiple concurrent work items.

It could be straightforward, however, to move the callback_wq into
struct nfs4_client so that each client can have its own workqueue.
Then we can take some time and design something less brittle and
more scalable (and maybe come up with some test infrastructure so
this stuff doesn't break as often).
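
A rough sketch of that idea (the cl_callback_wq field and the helpers
below are hypothetical, not upstream code):

      /* Hypothetical per-client callback workqueue.  Each client gets
       * its own ordered queue, so its callbacks stay serialized, but a
       * stuck client no longer blocks callbacks for every other client. */
      static int nfsd4_client_create_callback_wq(struct nfs4_client *clp)
      {
              clp->cl_callback_wq = alloc_ordered_workqueue("nfsd4_cb", 0);
              return clp->cl_callback_wq ? 0 : -ENOMEM;
      }

      static void nfsd4_client_destroy_callback_wq(struct nfs4_client *clp)
      {
              if (clp->cl_callback_wq)
                      destroy_workqueue(clp->cl_callback_wq);
      }
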
Dai Ngo March 30, 2024, 5:46 p.m. UTC | #9
On 3/29/24 4:42 PM, Chuck Lever wrote:
> On Fri, Mar 29, 2024 at 10:57:22AM -0700, Dai Ngo wrote:
>> On 3/29/24 7:55 AM, Chuck Lever wrote:
>>> On Thu, Mar 28, 2024 at 05:31:02PM -0700, Dai Ngo wrote:
>>>> On 3/28/24 11:14 AM, Dai Ngo wrote:
>>>>> On 3/28/24 7:08 AM, Chuck Lever wrote:
>>>>>> On Wed, Mar 27, 2024 at 06:09:28PM -0700, Dai Ngo wrote:
>>>>>>> On 3/26/24 11:27 AM, Chuck Lever wrote:
>>>>>>>> On Tue, Mar 26, 2024 at 11:13:29AM -0700, Dai Ngo wrote:
>>>>>>>>> Currently when a nfs4_client is destroyed we wait for
>>>>>>>>> the cb_recall_any
>>>>>>>>> callback to complete before proceed. This adds
>>>>>>>>> unnecessary delay to the
>>>>>>>>> __destroy_client call if there is problem communicating
>>>>>>>>> with the client.
>>>>>>>> By "unnecessary delay" do you mean only the seven-second RPC
>>>>>>>> retransmit timeout, or is there something else?
>>>>>>> when the client network interface is down, the RPC task takes ~9s to
>>>>>>> send the callback, waits for the reply and gets ETIMEDOUT. This process
>>>>>>> repeats in a loop with the same RPC task before being stopped by
>>>>>>> rpc_shutdown_client after client lease expires.
>>>>>> I'll have to review this code again, but rpc_shutdown_client
>>>>>> should cause these RPCs to terminate immediately and safely. Can't
>>>>>> we use that?
>>>>> rpc_shutdown_client works, it terminated the RPC call to stop the loop.
>>>>>
>>>>>>> It takes a total of about 1m20s before the CB_RECALL is terminated.
>>>>>>> For CB_RECALL_ANY and CB_OFFLOAD, this process gets in to a infinite
>>>>>>> loop since there is no delegation conflict and the client is allowed
>>>>>>> to stay in courtesy state.
>>>>>>>
>>>>>>> The loop happens because in nfsd4_cb_sequence_done if cb_seq_status
>>>>>>> is 1 (an RPC Reply was never received) it calls nfsd4_mark_cb_fault
>>>>>>> to set the NFSD4_CB_FAULT bit. It then sets cb_need_restart to true.
>>>>>>> When nfsd4_cb_release is called, it checks cb_need_restart bit and
>>>>>>> re-queues the work again.
>>>>>> Something in the sequence_done path should check if the server is
>>>>>> tearing down this callback connection. If it doesn't, that is a bug
>>>>>> IMO.
>>>> TCP terminated the connection after retrying for 16 minutes and
>>>> notified the RPC layer which deleted the nfsd4_conn.
>>> The server should have closed this connection already.
>> Since there is no delegation conflict, the client remains in courtesy
>> state.
>>
>>>    Is it stuck
>>> waiting for the client to respond to a FIN or something?
>> No, in this case since the client network interface was down server
>> TCP did not even receive ACK for the transmit so the server TCP
>> kept retrying.
> It sounds like the server attempts to maintain the client's
> transport while the client is in courtesy state?

Yes, TCP keeps retrying while the client is in courtesy state.

>   I thought the
> purpose of courteous server was to more gracefully handle network
> partitions, in which case, the transport is not reliable.
>
> If a courtesy client hasn't re-established a connection with a
> backchannel by the time a conflicting open/lock request arrives,
> the server has no choice but to expire that client's courtesy
> state immediately. Right?

Yes, that is the case for CB_RECALL but not for CB_RECALL_ANY.

>
> But maybe that's a side-bar.
>
>
>>>> But when nfsd4_run_cb_work ran again, it got into the infinite
>>>> loop caused by:
>>>>        /*
>>>>         * XXX: Ideally, we could wait for the client to
>>>>         *      reconnect, but I haven't figured out how
>>>>         *      to do that yet.
>>>>         */
>>>>         nfsd4_queue_cb_delayed(cb, 25);
>>>>
>>>> which was introduced by c1ccfcf1a9bf. Note that I'm using 6.9-rc1.
>>> The whole paragraph is:
>>>
>>> 1503         clnt = clp->cl_cb_client;
>>> 1504         if (!clnt) {
>>> 1505                 if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
>>> 1506                         nfsd41_destroy_cb(cb);
>>> 1507                 else {
>>> 1508                         /*
>>> 1509                          * XXX: Ideally, we could wait for the client to
>>> 1510                          *      reconnect, but I haven't figured out how
>>> 1511                          *      to do that yet.
>>> 1512                          */
>>> 1513                         nfsd4_queue_cb_delayed(cb, 25);
>>> 1514                 }
>>> 1515                 return;
>>> 1516         }
>>>
>>> When there's no rpc_clnt and CB_KILL is set, the callback
>>> operation should be destroyed immediately. CB_KILL is set by
>>> nfsd4_shutdown_callback. It's only caller is
>>> __destroy_client().
>>>
>>> Why isn't NFSD4_CLIENT_CB_KILL set?
>> Since __destroy_client was not called, for the reason mentioned above,
>> nfsd4_shutdown_callback was not called so NFSD4_CLIENT_CB_KILL was not
>> set.
> Well, then I'm missing something. You said, above:
>
>> Currently when a nfs4_client is destroyed we wait for
> And __destroy_client() invokes nfsd4_shutdown_callback(). Can you
> explain the usage scenario(s) to me again?

__destroy_client is not called in the case of CB_RECALL_ANY since
there is no open/lock conflict associated with the client.

>
>
>> Since the nfsd callback_wq was created with alloc_ordered_workqueue,
>> once this loop happens the nfsd callback_wq is stuck since this workqueue
>> executes at most one work item at any given time.
>>
>> This means that if a nfs client is shutdown and the server is in this
>> state then all other clients are effected; all callbacks to other
>> clients are stuck in the workqueue. And if a callback for a client is
>> stuck in the workqueue then that client can not unmount its shares.
>>
>> This is the symptom that was reported recently on this list. I think
>> the nfsd callback_wq should be created as a normal workqueue that allows
>> multiple work items to be executed at the same time so a problem with
>> one client does not effect other clients.
> My impression is that the callback_wq is single-threaded to avoid
> locking complexity. I'm not yet convinced it would be safe to simply
> change that workqueue to permit multiple concurrent work items.

Do you have any specific concern about locking complexity related to
the callback operations?

>
> It could be straightforward, however, to move the callback_wq into
> struct nfs4_client so that each client can have its own workqueue.
> Then we can take some time and design something less brittle and
> more scalable (and maybe come up with some test infrastructure so
> this stuff doesn't break as often).

IMHO I don't see why the callback workqueue has to be different
from the laundry_wq or nfsd_filecache_wq used by nfsd.

-Dai

>
>
Chuck Lever March 30, 2024, 6:28 p.m. UTC | #10
On Sat, Mar 30, 2024 at 10:46:08AM -0700, Dai Ngo wrote:
> 
> On 3/29/24 4:42 PM, Chuck Lever wrote:
> > On Fri, Mar 29, 2024 at 10:57:22AM -0700, Dai Ngo wrote:
> > > On 3/29/24 7:55 AM, Chuck Lever wrote:
> > > > On Thu, Mar 28, 2024 at 05:31:02PM -0700, Dai Ngo wrote:
> > > > > On 3/28/24 11:14 AM, Dai Ngo wrote:
> > > > > > On 3/28/24 7:08 AM, Chuck Lever wrote:
> > > > > > > On Wed, Mar 27, 2024 at 06:09:28PM -0700, Dai Ngo wrote:
> > > > > > > > On 3/26/24 11:27 AM, Chuck Lever wrote:
> > > > > > > > > On Tue, Mar 26, 2024 at 11:13:29AM -0700, Dai Ngo wrote:
> > > > > > > > > > Currently when a nfs4_client is destroyed we wait for
> > > > > > > > > > the cb_recall_any
> > > > > > > > > > callback to complete before proceed. This adds
> > > > > > > > > > unnecessary delay to the
> > > > > > > > > > __destroy_client call if there is problem communicating
> > > > > > > > > > with the client.
> > > > > > > > > By "unnecessary delay" do you mean only the seven-second RPC
> > > > > > > > > retransmit timeout, or is there something else?
> > > > > > > > when the client network interface is down, the RPC task takes ~9s to
> > > > > > > > send the callback, waits for the reply and gets ETIMEDOUT. This process
> > > > > > > > repeats in a loop with the same RPC task before being stopped by
> > > > > > > > rpc_shutdown_client after client lease expires.
> > > > > > > I'll have to review this code again, but rpc_shutdown_client
> > > > > > > should cause these RPCs to terminate immediately and safely. Can't
> > > > > > > we use that?
> > > > > > rpc_shutdown_client works, it terminated the RPC call to stop the loop.
> > > > > > 
> > > > > > > > It takes a total of about 1m20s before the CB_RECALL is terminated.
> > > > > > > > For CB_RECALL_ANY and CB_OFFLOAD, this process gets in to a infinite
> > > > > > > > loop since there is no delegation conflict and the client is allowed
> > > > > > > > to stay in courtesy state.
> > > > > > > > 
> > > > > > > > The loop happens because in nfsd4_cb_sequence_done if cb_seq_status
> > > > > > > > is 1 (an RPC Reply was never received) it calls nfsd4_mark_cb_fault
> > > > > > > > to set the NFSD4_CB_FAULT bit. It then sets cb_need_restart to true.
> > > > > > > > When nfsd4_cb_release is called, it checks cb_need_restart bit and
> > > > > > > > re-queues the work again.
> > > > > > > Something in the sequence_done path should check if the server is
> > > > > > > tearing down this callback connection. If it doesn't, that is a bug
> > > > > > > IMO.
> > > > > TCP terminated the connection after retrying for 16 minutes and
> > > > > notified the RPC layer which deleted the nfsd4_conn.
> > > > The server should have closed this connection already.
> > > Since there is no delegation conflict, the client remains in courtesy
> > > state.
> > > 
> > > >    Is it stuck
> > > > waiting for the client to respond to a FIN or something?
> > > No, in this case since the client network interface was down server
> > > TCP did not even receive ACK for the transmit so the server TCP
> > > kept retrying.
> > It sounds like the server attempts to maintain the client's
> > transport while the client is in courtesy state?
> 
> Yes, TCP keeps retryingwhile the client is in courtesy state.

So the client hasn't sent a lease-renewing operation recently, but
the connection is still there. It then makes sense for the server to
forcibly close the connection when a client transitions to the
courtesy state, since that connection instance is suspect.

The server would then recognize immediately that it shouldn't post
any new backchannel operations to that client until it gets a fresh
connection attempt from it.


> > I thought the
> > purpose of courteous server was to more gracefully handle network
> > partitions, in which case, the transport is not reliable.
> > 
> > If a courtesy client hasn't re-established a connection with a
> > backchannel by the time a conflicting open/lock request arrives,
> > the server has no choice but to expire that client's courtesy
> > state immediately. Right?
> 
> Yes, that is the case for CB_RECALL but not for CB_RECALL_ANY.

CB_RECALL_ANY is done by a shrinker, yes?

Instead of attempting to send a CB_RECALL_ANY to a courtesy client
which is likely to be unresponsive, why not just expire the oldest
courtesy client the server has? Or... if NFSD /already/ expires the
oldest courtesy client as a result of memory pressure, then just
don't ever send CB_RECALL_ANY to courtesy clients.

As soon as a courtesy client reconnects, it will send a lease-
renewing operation, transition back to an active state, and at that
point it re-qualifies for getting CB_RECALL_ANY.


> > But maybe that's a side-bar.
> > 
> > 
> > > > > But when nfsd4_run_cb_work ran again, it got into the infinite
> > > > > loop caused by:
> > > > >        /*
> > > > >         * XXX: Ideally, we could wait for the client to
> > > > >         *      reconnect, but I haven't figured out how
> > > > >         *      to do that yet.
> > > > >         */
> > > > >         nfsd4_queue_cb_delayed(cb, 25);
> > > > > 
> > > > > which was introduced by c1ccfcf1a9bf. Note that I'm using 6.9-rc1.
> > > > The whole paragraph is:
> > > > 
> > > > 1503         clnt = clp->cl_cb_client;
> > > > 1504         if (!clnt) {
> > > > 1505                 if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
> > > > 1506                         nfsd41_destroy_cb(cb);
> > > > 1507                 else {
> > > > 1508                         /*
> > > > 1509                          * XXX: Ideally, we could wait for the client to
> > > > 1510                          *      reconnect, but I haven't figured out how
> > > > 1511                          *      to do that yet.
> > > > 1512                          */
> > > > 1513                         nfsd4_queue_cb_delayed(cb, 25);
> > > > 1514                 }
> > > > 1515                 return;
> > > > 1516         }
> > > > 
> > > > When there's no rpc_clnt and CB_KILL is set, the callback
> > > > operation should be destroyed immediately. CB_KILL is set by
> > > > nfsd4_shutdown_callback. It's only caller is
> > > > __destroy_client().
> > > > 
> > > > Why isn't NFSD4_CLIENT_CB_KILL set?
> > > Since __destroy_client was not called, for the reason mentioned above,
> > > nfsd4_shutdown_callback was not called so NFSD4_CLIENT_CB_KILL was not
> > > set.
> > Well, then I'm missing something. You said, above:
> > 
> > > Currently when a nfs4_client is destroyed we wait for
> > And __destroy_client() invokes nfsd4_shutdown_callback(). Can you
> > explain the usage scenario(s) to me again?
> 
> __destroy_client is not called in the case of CB_RECALL_ANY since
> there is no open/lock conflict associated with the client.

> > > Since the nfsd callback_wq was created with alloc_ordered_workqueue,
> > > once this loop happens the nfsd callback_wq is stuck since this workqueue
> > > executes at most one work item at any given time.
> > > 
> > > This means that if a nfs client is shutdown and the server is in this
> > > state then all other clients are effected; all callbacks to other
> > > clients are stuck in the workqueue. And if a callback for a client is
> > > stuck in the workqueue then that client can not unmount its shares.
> > > 
> > > This is the symptom that was reported recently on this list. I think
> > > the nfsd callback_wq should be created as a normal workqueue that allows
> > > multiple work items to be executed at the same time so a problem with
> > > one client does not effect other clients.
> > My impression is that the callback_wq is single-threaded to avoid
> > locking complexity. I'm not yet convinced it would be safe to simply
> > change that workqueue to permit multiple concurrent work items.
> 
> Do you have any specific concern about lock complexity related to
> callback operation?

If there needs to be a fix, I'd like something for v6.9-rc, so it
needs to be a narrow change. Larger infrastructural changes have to
go via a full merge window.


> > It could be straightforward, however, to move the callback_wq into
> > struct nfs4_client so that each client can have its own workqueue.
> > Then we can take some time and design something less brittle and
> > more scalable (and maybe come up with some test infrastructure so
> > this stuff doesn't break as often).
> 
> IMHO I don't see why the callback workqueue has to be different
> from the laundry_wq or nfsd_filecache_wq used by nfsd.

You mean, you don't see why callback_wq has to be ordered, while
the others are not so constrained? Quite possibly it does not have
to be ordered.

But we might have lost the bit of history that explains why, so
let's be cautious about making broad changes here until we have a
good operational understanding of the code and some robust test
cases to check any changes we make.
Dai Ngo March 30, 2024, 11:30 p.m. UTC | #11
On 3/30/24 11:28 AM, Chuck Lever wrote:
> On Sat, Mar 30, 2024 at 10:46:08AM -0700, Dai Ngo wrote:
>> On 3/29/24 4:42 PM, Chuck Lever wrote:
>>> On Fri, Mar 29, 2024 at 10:57:22AM -0700, Dai Ngo wrote:
>>>> On 3/29/24 7:55 AM, Chuck Lever wrote:
>>>>> On Thu, Mar 28, 2024 at 05:31:02PM -0700, Dai Ngo wrote:
>>>>>> On 3/28/24 11:14 AM, Dai Ngo wrote:
>>>>>>> On 3/28/24 7:08 AM, Chuck Lever wrote:
>>>>>>>> On Wed, Mar 27, 2024 at 06:09:28PM -0700, Dai Ngo wrote:
>>>>>>>>> On 3/26/24 11:27 AM, Chuck Lever wrote:
>>>>>>>>>> On Tue, Mar 26, 2024 at 11:13:29AM -0700, Dai Ngo wrote:
>>>>>>>>>>> Currently when a nfs4_client is destroyed we wait for
>>>>>>>>>>> the cb_recall_any
>>>>>>>>>>> callback to complete before proceed. This adds
>>>>>>>>>>> unnecessary delay to the
>>>>>>>>>>> __destroy_client call if there is problem communicating
>>>>>>>>>>> with the client.
>>>>>>>>>> By "unnecessary delay" do you mean only the seven-second RPC
>>>>>>>>>> retransmit timeout, or is there something else?
>>>>>>>>> when the client network interface is down, the RPC task takes ~9s to
>>>>>>>>> send the callback, waits for the reply and gets ETIMEDOUT. This process
>>>>>>>>> repeats in a loop with the same RPC task before being stopped by
>>>>>>>>> rpc_shutdown_client after client lease expires.
>>>>>>>> I'll have to review this code again, but rpc_shutdown_client
>>>>>>>> should cause these RPCs to terminate immediately and safely. Can't
>>>>>>>> we use that?
>>>>>>> rpc_shutdown_client works, it terminated the RPC call to stop the loop.
>>>>>>>
>>>>>>>>> It takes a total of about 1m20s before the CB_RECALL is terminated.
>>>>>>>>> For CB_RECALL_ANY and CB_OFFLOAD, this process gets in to a infinite
>>>>>>>>> loop since there is no delegation conflict and the client is allowed
>>>>>>>>> to stay in courtesy state.
>>>>>>>>>
>>>>>>>>> The loop happens because in nfsd4_cb_sequence_done if cb_seq_status
>>>>>>>>> is 1 (an RPC Reply was never received) it calls nfsd4_mark_cb_fault
>>>>>>>>> to set the NFSD4_CB_FAULT bit. It then sets cb_need_restart to true.
>>>>>>>>> When nfsd4_cb_release is called, it checks cb_need_restart bit and
>>>>>>>>> re-queues the work again.
>>>>>>>> Something in the sequence_done path should check if the server is
>>>>>>>> tearing down this callback connection. If it doesn't, that is a bug
>>>>>>>> IMO.
>>>>>> TCP terminated the connection after retrying for 16 minutes and
>>>>>> notified the RPC layer which deleted the nfsd4_conn.
>>>>> The server should have closed this connection already.
>>>> Since there is no delegation conflict, the client remains in courtesy
>>>> state.
>>>>
>>>>>     Is it stuck
>>>>> waiting for the client to respond to a FIN or something?
>>>> No, in this case since the client network interface was down server
>>>> TCP did not even receive ACK for the transmit so the server TCP
>>>> kept retrying.
>>> It sounds like the server attempts to maintain the client's
>>> transport while the client is in courtesy state?
>> Yes, TCP keeps retryingwhile the client is in courtesy state.
> So the client hasn't sent a lease-renewing operation recently, but
> the connection is still there. It then makes sense for the server to
> forcibly close the connection when a client transitions to the
> courtesy state, since that connection instance is suspect.

Yes, this makes sense. This would make the RPC stop within a
lease period.

I'll submit a patch to kill the back channel connection if there
are RPCs pending and the client is about to enter courtesy state.

>
> The server would then recognize immediately that it shouldn't post
> any new backchannel operations to that client until it gets a fresh
> connection attempt from it.

Currently deleg_reaper does not issue CB_RECALL_ANY for courtesy clients.

>
>
>>> I thought the
>>> purpose of courteous server was to more gracefully handle network
>>> partitions, in which case, the transport is not reliable.
>>>
>>> If a courtesy client hasn't re-established a connection with a
>>> backchannel by the time a conflicting open/lock request arrives,
>>> the server has no choice but to expire that client's courtesy
>>> state immediately. Right?
>> Yes, that is the case for CB_RECALL but not for CB_RECALL_ANY.
> CB_RECALL_ANY is done by a shrinker, yes?

CB_RECALL_ANY is done by the memory shrinker or when
num_delegations >= max_delegations.
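
For reference, the selection logic in deleg_reaper() is roughly the
following (paraphrased sketch, not an exact copy of the upstream code;
the helper name is invented):

    /*
     * Paraphrased from the checks deleg_reaper() makes while walking
     * nn->client_lru; the real code has a few additional conditions.
     */
    static bool deleg_reaper_should_recall(struct nfs4_client *clp)
    {
        if (clp->cl_state != NFSD4_ACTIVE)
            return false;   /* courtesy/expirable clients are skipped */
        if (test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags))
            return false;   /* a CB_RECALL_ANY is already outstanding */
        return true;
    }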

>
> Instead of attempting to send a CB_RECALL_ANY to a courtesy client
> which is likely to be unresponsive, why not just expire the oldest
> courtesy client the server has? Or... if NFSD /already/ expires the
> oldest courtesy client as a result of memory pressure, then just
> don't ever send CB_RECALL_ANY to courtesy clients.

Currently deleg_reaper does not issue CB_RECALL_ANY for courtesy clients.

>
> As soon as a courtesy client reconnects, it will send a lease-
> renewing operation, transition back to an active state, and at that
> point it re-qualifies for getting CB_RECALL_ANY.
>
>
>>> But maybe that's a side-bar.
>>>
>>>
>>>>>> But when nfsd4_run_cb_work ran again, it got into the infinite
>>>>>> loop caused by:
>>>>>>         /*
>>>>>>          * XXX: Ideally, we could wait for the client to
>>>>>>          *      reconnect, but I haven't figured out how
>>>>>>          *      to do that yet.
>>>>>>          */
>>>>>>          nfsd4_queue_cb_delayed(cb, 25);
>>>>>>
>>>>>> which was introduced by c1ccfcf1a9bf. Note that I'm using 6.9-rc1.
>>>>> The whole paragraph is:
>>>>>
>>>>> 1503         clnt = clp->cl_cb_client;
>>>>> 1504         if (!clnt) {
>>>>> 1505                 if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
>>>>> 1506                         nfsd41_destroy_cb(cb);
>>>>> 1507                 else {
>>>>> 1508                         /*
>>>>> 1509                          * XXX: Ideally, we could wait for the client to
>>>>> 1510                          *      reconnect, but I haven't figured out how
>>>>> 1511                          *      to do that yet.
>>>>> 1512                          */
>>>>> 1513                         nfsd4_queue_cb_delayed(cb, 25);
>>>>> 1514                 }
>>>>> 1515                 return;
>>>>> 1516         }
>>>>>
>>>>> When there's no rpc_clnt and CB_KILL is set, the callback
>>>>> operation should be destroyed immediately. CB_KILL is set by
>>>>> nfsd4_shutdown_callback. It's only caller is
>>>>> __destroy_client().
>>>>>
>>>>> Why isn't NFSD4_CLIENT_CB_KILL set?
>>>> Since __destroy_client was not called, for the reason mentioned above,
>>>> nfsd4_shutdown_callback was not called so NFSD4_CLIENT_CB_KILL was not
>>>> set.
>>> Well, then I'm missing something. You said, above:
>>>
>>>> Currently when a nfs4_client is destroyed we wait for
>>> And __destroy_client() invokes nfsd4_shutdown_callback(). Can you
>>> explain the usage scenario(s) to me again?
>> __destroy_client is not called in the case of CB_RECALL_ANY since
>> there is no open/lock conflict associated the the client.
>>>> Since the nfsd callback_wq was created with alloc_ordered_workqueue,
>>>> once this loop happens the nfsd callback_wq is stuck since this workqueue
>>>> executes at most one work item at any given time.
>>>>
>>>> This means that if a nfs client is shutdown and the server is in this
>>>> state then all other clients are effected; all callbacks to other
>>>> clients are stuck in the workqueue. And if a callback for a client is
>>>> stuck in the workqueue then that client can not unmount its shares.
>>>>
>>>> This is the symptom that was reported recently on this list. I think
>>>> the nfsd callback_wq should be created as a normal workqueue that allows
>>>> multiple work items to be executed at the same time so a problem with
>>>> one client does not effect other clients.
>>> My impression is that the callback_wq is single-threaded to avoid
>>> locking complexity. I'm not yet convinced it would be safe to simply
>>> change that workqueue to permit multiple concurrent work items.
>> Do you have any specific concern about lock complexity related to
>> callback operation?
> If there needs to be a fix, I'd like something for v6.9-rc, so it
> needs to be a narrow change. Larger infrastructural changes have to
> go via a full merge window.
>
>
>>> It could be straightforward, however, to move the callback_wq into
>>> struct nfs4_client so that each client can have its own workqueue.
>>> Then we can take some time and design something less brittle and
>>> more scalable (and maybe come up with some test infrastructure so
>>> this stuff doesn't break as often).
>> IMHO I don't see why the callback workqueue has to be different
>> from the laundry_wq or nfsd_filecache_wq used by nfsd.
> You mean, you don't see why callback_wq has to be ordered, while
> the others are not so constrained? Quite possibly it does not have
> to be ordered.

Yes, I looked at all the nfsd4_callback_ops in nfsd and they
seem to take concurrency into account and use locks appropriately.
For each type of work I don't see why one work item has to wait for
the previous one to complete before proceeding.

>
> But we might have lost the bit of history that explains why, so
> let's be cautious about making broad changes here until we have a
> good operational understanding of the code and some robust test
> cases to check any changes we make.

Understood, you make the call.

-Dai

>
>
Jeffrey Layton April 1, 2024, 12:49 p.m. UTC | #12
On Sat, 2024-03-30 at 16:30 -0700, Dai Ngo wrote:
> On 3/30/24 11:28 AM, Chuck Lever wrote:
> > On Sat, Mar 30, 2024 at 10:46:08AM -0700, Dai Ngo wrote:
> > > On 3/29/24 4:42 PM, Chuck Lever wrote:
> > > > On Fri, Mar 29, 2024 at 10:57:22AM -0700, Dai Ngo wrote:
> > > > > On 3/29/24 7:55 AM, Chuck Lever wrote:
> > > > > > On Thu, Mar 28, 2024 at 05:31:02PM -0700, Dai Ngo wrote:
> > > > > > > On 3/28/24 11:14 AM, Dai Ngo wrote:
> > > > > > > > On 3/28/24 7:08 AM, Chuck Lever wrote:
> > > > > > > > > On Wed, Mar 27, 2024 at 06:09:28PM -0700, Dai Ngo wrote:
> > > > > > > > > > On 3/26/24 11:27 AM, Chuck Lever wrote:
> > > > > > > > > > > On Tue, Mar 26, 2024 at 11:13:29AM -0700, Dai Ngo wrote:
> > > > > > > > > > > > Currently when a nfs4_client is destroyed we wait for
> > > > > > > > > > > > the cb_recall_any
> > > > > > > > > > > > callback to complete before proceed. This adds
> > > > > > > > > > > > unnecessary delay to the
> > > > > > > > > > > > __destroy_client call if there is problem communicating
> > > > > > > > > > > > with the client.
> > > > > > > > > > > By "unnecessary delay" do you mean only the seven-second RPC
> > > > > > > > > > > retransmit timeout, or is there something else?
> > > > > > > > > > when the client network interface is down, the RPC task takes ~9s to
> > > > > > > > > > send the callback, waits for the reply and gets ETIMEDOUT. This process
> > > > > > > > > > repeats in a loop with the same RPC task before being stopped by
> > > > > > > > > > rpc_shutdown_client after client lease expires.
> > > > > > > > > I'll have to review this code again, but rpc_shutdown_client
> > > > > > > > > should cause these RPCs to terminate immediately and safely. Can't
> > > > > > > > > we use that?
> > > > > > > > rpc_shutdown_client works, it terminated the RPC call to stop the loop.
> > > > > > > > 
> > > > > > > > > > It takes a total of about 1m20s before the CB_RECALL is terminated.
> > > > > > > > > > For CB_RECALL_ANY and CB_OFFLOAD, this process gets in to a infinite
> > > > > > > > > > loop since there is no delegation conflict and the client is allowed
> > > > > > > > > > to stay in courtesy state.
> > > > > > > > > > 
> > > > > > > > > > The loop happens because in nfsd4_cb_sequence_done if cb_seq_status
> > > > > > > > > > is 1 (an RPC Reply was never received) it calls nfsd4_mark_cb_fault
> > > > > > > > > > to set the NFSD4_CB_FAULT bit. It then sets cb_need_restart to true.
> > > > > > > > > > When nfsd4_cb_release is called, it checks cb_need_restart bit and
> > > > > > > > > > re-queues the work again.
> > > > > > > > > Something in the sequence_done path should check if the server is
> > > > > > > > > tearing down this callback connection. If it doesn't, that is a bug
> > > > > > > > > IMO.
> > > > > > > TCP terminated the connection after retrying for 16 minutes and
> > > > > > > notified the RPC layer which deleted the nfsd4_conn.
> > > > > > The server should have closed this connection already.
> > > > > Since there is no delegation conflict, the client remains in courtesy
> > > > > state.
> > > > > 
> > > > > >     Is it stuck
> > > > > > waiting for the client to respond to a FIN or something?
> > > > > No, in this case since the client network interface was down server
> > > > > TCP did not even receive ACK for the transmit so the server TCP
> > > > > kept retrying.
> > > > It sounds like the server attempts to maintain the client's
> > > > transport while the client is in courtesy state?
> > > Yes, TCP keeps retryingwhile the client is in courtesy state.
> > So the client hasn't sent a lease-renewing operation recently, but
> > the connection is still there. It then makes sense for the server to
> > forcibly close the connection when a client transitions to the
> > courtesy state, since that connection instance is suspect.
> 
> yes, this makes sense. This would make the RPC to stop within a
> lease period.
> 
> I'll submit a patch to kill the back channel connection if there
> is RPC pending and the client is about to enter courtesy state.
> 
> > 
> > The server would then recognize immediately that it shouldn't post
> > any new backchannel operations to that client until it gets a fresh
> > connection attempt from it.
> 
> Currently deleg_reaper does not issue CB_RECALL_ANY for courtesy clients.
> 
> > 
> > 
> > > > I thought the
> > > > purpose of courteous server was to more gracefully handle network
> > > > partitions, in which case, the transport is not reliable.
> > > > 
> > > > If a courtesy client hasn't re-established a connection with a
> > > > backchannel by the time a conflicting open/lock request arrives,
> > > > the server has no choice but to expire that client's courtesy
> > > > state immediately. Right?
> > > Yes, that is the case for CB_RECALL but not for CB_RECALL_ANY.
> > CB_RECALL_ANY is done by a shrinker, yes?
> 
> CB_RECALL_ANY is done by the memory shrinker or when
> num_delegations >= max_delegations.
> 
> > 
> > Instead of attempting to send a CB_RECALL_ANY to a courtesy client
> > which is likely to be unresponsive, why not just expire the oldest
> > courtesy client the server has? Or... if NFSD /already/ expires the
> > oldest courtesy client as a result of memory pressure, then just
> > don't ever send CB_RECALL_ANY to courtesy clients.
> 
> Currently deleg_reaper does not issue CB_RECALL_ANY for courtesy clients.
> 
> > 
> > As soon as a courtesy client reconnects, it will send a lease-
> > renewing operation, transition back to an active state, and at that
> > point it re-qualifies for getting CB_RECALL_ANY.
> > 
> > 
> > > > But maybe that's a side-bar.
> > > > 
> > > > 
> > > > > > > But when nfsd4_run_cb_work ran again, it got into the infinite
> > > > > > > loop caused by:
> > > > > > >         /*
> > > > > > >          * XXX: Ideally, we could wait for the client to
> > > > > > >          *      reconnect, but I haven't figured out how
> > > > > > >          *      to do that yet.
> > > > > > >          */
> > > > > > >          nfsd4_queue_cb_delayed(cb, 25);
> > > > > > > 
> > > > > > > which was introduced by c1ccfcf1a9bf. Note that I'm using 6.9-rc1.
> > > > > > The whole paragraph is:
> > > > > > 
> > > > > > 1503         clnt = clp->cl_cb_client;
> > > > > > 1504         if (!clnt) {
> > > > > > 1505                 if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
> > > > > > 1506                         nfsd41_destroy_cb(cb);
> > > > > > 1507                 else {
> > > > > > 1508                         /*
> > > > > > 1509                          * XXX: Ideally, we could wait for the client to
> > > > > > 1510                          *      reconnect, but I haven't figured out how
> > > > > > 1511                          *      to do that yet.
> > > > > > 1512                          */
> > > > > > 1513                         nfsd4_queue_cb_delayed(cb, 25);
> > > > > > 1514                 }
> > > > > > 1515                 return;
> > > > > > 1516         }
> > > > > > 
> > > > > > When there's no rpc_clnt and CB_KILL is set, the callback
> > > > > > operation should be destroyed immediately. CB_KILL is set by
> > > > > > nfsd4_shutdown_callback. It's only caller is
> > > > > > __destroy_client().
> > > > > > 
> > > > > > Why isn't NFSD4_CLIENT_CB_KILL set?
> > > > > Since __destroy_client was not called, for the reason mentioned above,
> > > > > nfsd4_shutdown_callback was not called so NFSD4_CLIENT_CB_KILL was not
> > > > > set.
> > > > Well, then I'm missing something. You said, above:
> > > > 
> > > > > Currently when a nfs4_client is destroyed we wait for
> > > > And __destroy_client() invokes nfsd4_shutdown_callback(). Can you
> > > > explain the usage scenario(s) to me again?
> > > __destroy_client is not called in the case of CB_RECALL_ANY since
> > > there is no open/lock conflict associated the the client.
> > > > > Since the nfsd callback_wq was created with alloc_ordered_workqueue,
> > > > > once this loop happens the nfsd callback_wq is stuck since this workqueue
> > > > > executes at most one work item at any given time.
> > > > > 
> > > > > This means that if a nfs client is shutdown and the server is in this
> > > > > state then all other clients are effected; all callbacks to other
> > > > > clients are stuck in the workqueue. And if a callback for a client is
> > > > > stuck in the workqueue then that client can not unmount its shares.
> > > > > 
> > > > > This is the symptom that was reported recently on this list. I think
> > > > > the nfsd callback_wq should be created as a normal workqueue that allows
> > > > > multiple work items to be executed at the same time so a problem with
> > > > > one client does not effect other clients.
> > > > My impression is that the callback_wq is single-threaded to avoid
> > > > locking complexity. I'm not yet convinced it would be safe to simply
> > > > change that workqueue to permit multiple concurrent work items.
> > > Do you have any specific concern about lock complexity related to
> > > callback operation?
> > If there needs to be a fix, I'd like something for v6.9-rc, so it
> > needs to be a narrow change. Larger infrastructural changes have to
> > go via a full merge window.
> > 
> > 
> > > > It could be straightforward, however, to move the callback_wq into
> > > > struct nfs4_client so that each client can have its own workqueue.
> > > > Then we can take some time and design something less brittle and
> > > > more scalable (and maybe come up with some test infrastructure so
> > > > this stuff doesn't break as often).
> > > IMHO I don't see why the callback workqueue has to be different
> > > from the laundry_wq or nfsd_filecache_wq used by nfsd.
> > You mean, you don't see why callback_wq has to be ordered, while
> > the others are not so constrained? Quite possibly it does not have
> > to be ordered.
> 
> Yes, I looked at the all the nfsd4_callback_ops on nfsd and they
> seems to take into account of concurrency and use locks appropriately.
> For each type of work I don't see why one work has to wait for
> the previous work to complete before proceed.
> 
> > 
> > But we might have lost the bit of history that explains why, so
> > let's be cautious about making broad changes here until we have a
> > good operational understanding of the code and some robust test
> > cases to check any changes we make.
> 
> Understand, you make the call.
> 

commit 88382036674770173128417e4c09e9e549f82d54
Author: J. Bruce Fields <bfields@fieldses.org>
Date:   Mon Nov 14 11:13:43 2016 -0500

    nfsd: update workqueue creation
    
    No real change in functionality, but the old interface seems to be
    deprecated.
    
    We don't actually care about ordering necessarily, but we do depend on
    running at most one work item at a time: nfsd4_process_cb_update()
    assumes that no other thread is running it, and that no new callbacks
    are starting while it's running.
    
    Reviewed-by: Jeff Layton <jlayton@redhat.com>
    Signed-off-by: J. Bruce Fields <bfields@redhat.com>


...so it may be as simple as just fixing up nfsd4_process_cb_update().
Allowing parallel recalls would certainly be a good thing.

That said, a workqueue per client would be a great place to start. I
don't see any reason to serialize callbacks to different clients.
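
A very rough sketch of that shape (untested; cl_callback_wq is a
hypothetical new field, and error handling is elided):

    /* fs/nfsd/state.h, in struct nfs4_client: */
    struct workqueue_struct *cl_callback_wq;

    /* at client creation in fs/nfsd/nfs4state.c: */
    clp->cl_callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0);
    if (!clp->cl_callback_wq)
        goto err_no_wq;             /* hypothetical error label */

    /* at client destruction: */
    destroy_workqueue(clp->cl_callback_wq);

    /* queueing would then target clp->cl_callback_wq instead of the
     * global callback_wq. */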
--
Jeff Layton <jlayton@kernel.org>
Chuck Lever April 1, 2024, 1:34 p.m. UTC | #13
On Mon, Apr 01, 2024 at 08:49:49AM -0400, Jeff Layton wrote:
> On Sat, 2024-03-30 at 16:30 -0700, Dai Ngo wrote:
> > On 3/30/24 11:28 AM, Chuck Lever wrote:
> > > On Sat, Mar 30, 2024 at 10:46:08AM -0700, Dai Ngo wrote:
> > > > On 3/29/24 4:42 PM, Chuck Lever wrote:
> > > > > On Fri, Mar 29, 2024 at 10:57:22AM -0700, Dai Ngo wrote:
> > > > > > On 3/29/24 7:55 AM, Chuck Lever wrote:
> > > > > It could be straightforward, however, to move the callback_wq into
> > > > > struct nfs4_client so that each client can have its own workqueue.
> > > > > Then we can take some time and design something less brittle and
> > > > > more scalable (and maybe come up with some test infrastructure so
> > > > > this stuff doesn't break as often).
> > > > IMHO I don't see why the callback workqueue has to be different
> > > > from the laundry_wq or nfsd_filecache_wq used by nfsd.
> > > You mean, you don't see why callback_wq has to be ordered, while
> > > the others are not so constrained? Quite possibly it does not have
> > > to be ordered.
> > 
> > Yes, I looked at the all the nfsd4_callback_ops on nfsd and they
> > seems to take into account of concurrency and use locks appropriately.
> > For each type of work I don't see why one work has to wait for
> > the previous work to complete before proceed.
> > 
> > > But we might have lost the bit of history that explains why, so
> > > let's be cautious about making broad changes here until we have a
> > > good operational understanding of the code and some robust test
> > > cases to check any changes we make.
> > 
> > Understand, you make the call.
> 
> commit 88382036674770173128417e4c09e9e549f82d54
> Author: J. Bruce Fields <bfields@fieldses.org>
> Date:   Mon Nov 14 11:13:43 2016 -0500
> 
>     nfsd: update workqueue creation
>     
>     No real change in functionality, but the old interface seems to be
>     deprecated.
>     
>     We don't actually care about ordering necessarily, but we do depend on
>     running at most one work item at a time: nfsd4_process_cb_update()
>     assumes that no other thread is running it, and that no new callbacks
>     are starting while it's running.
>     
>     Reviewed-by: Jeff Layton <jlayton@redhat.com>
>     Signed-off-by: J. Bruce Fields <bfields@redhat.com>
> 
> 
> ...so it may be as simple as just fixing up nfsd4_process_cb_update().
> Allowing parallel recalls would certainly be a good thing.

Thanks for the research. I was about to do the same.

I think we do allow parallel recalls -- IIUC, callback_wq
single-threads only the dispatch of RPC calls, not their
processing. Note the use of rpc_call_async().

So nfsd4_process_cb_update() is protecting modifications of
cl_cb_client and the backchannel transport. We might wrap that in
a mutex, for example. But I don't see strong evidence (yet) that
this design is a bottleneck when it is working properly.
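
For illustration only, that could look something like this
(cl_cb_mutex would be a new field in struct nfs4_client; this is a
sketch, not a proposal):

    static void nfsd4_cb_update_serialized(struct nfsd4_callback *cb)
    {
        struct nfs4_client *clp = cb->cb_clp;

        /* Serialize only backchannel setup/teardown rather than
         * relying on an ordered workqueue for the whole dispatch. */
        mutex_lock(&clp->cl_cb_mutex);
        nfsd4_process_cb_update(cb);
        mutex_unlock(&clp->cl_cb_mutex);
    }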

However, if for some reason, a work item sleeps, that would
block forward progress of the work queue, and would be Bad (tm).


> That said, a workqueue per client would be a great place to start. I
> don't see any reason to serialize callbacks to different clients.

I volunteer to take care of that for v6.10.
Dai Ngo April 1, 2024, 4 p.m. UTC | #14
On 4/1/24 6:34 AM, Chuck Lever wrote:
> On Mon, Apr 01, 2024 at 08:49:49AM -0400, Jeff Layton wrote:
>> On Sat, 2024-03-30 at 16:30 -0700, Dai Ngo wrote:
>>> On 3/30/24 11:28 AM, Chuck Lever wrote:
>>>> On Sat, Mar 30, 2024 at 10:46:08AM -0700, Dai Ngo wrote:
>>>>> On 3/29/24 4:42 PM, Chuck Lever wrote:
>>>>>> On Fri, Mar 29, 2024 at 10:57:22AM -0700, Dai Ngo wrote:
>>>>>>> On 3/29/24 7:55 AM, Chuck Lever wrote:
>>>>>> It could be straightforward, however, to move the callback_wq into
>>>>>> struct nfs4_client so that each client can have its own workqueue.
>>>>>> Then we can take some time and design something less brittle and
>>>>>> more scalable (and maybe come up with some test infrastructure so
>>>>>> this stuff doesn't break as often).
>>>>> IMHO I don't see why the callback workqueue has to be different
>>>>> from the laundry_wq or nfsd_filecache_wq used by nfsd.
>>>> You mean, you don't see why callback_wq has to be ordered, while
>>>> the others are not so constrained? Quite possibly it does not have
>>>> to be ordered.
>>> Yes, I looked at the all the nfsd4_callback_ops on nfsd and they
>>> seems to take into account of concurrency and use locks appropriately.
>>> For each type of work I don't see why one work has to wait for
>>> the previous work to complete before proceed.
>>>
>>>> But we might have lost the bit of history that explains why, so
>>>> let's be cautious about making broad changes here until we have a
>>>> good operational understanding of the code and some robust test
>>>> cases to check any changes we make.
>>> Understand, you make the call.
>> commit 88382036674770173128417e4c09e9e549f82d54
>> Author: J. Bruce Fields <bfields@fieldses.org>
>> Date:   Mon Nov 14 11:13:43 2016 -0500
>>
>>      nfsd: update workqueue creation
>>      
>>      No real change in functionality, but the old interface seems to be
>>      deprecated.
>>      
>>      We don't actually care about ordering necessarily, but we do depend on
>>      running at most one work item at a time: nfsd4_process_cb_update()
>>      assumes that no other thread is running it, and that no new callbacks
>>      are starting while it's running.
>>      
>>      Reviewed-by: Jeff Layton <jlayton@redhat.com>
>>      Signed-off-by: J. Bruce Fields <bfields@redhat.com>
>>
>>
>> ...so it may be as simple as just fixing up nfsd4_process_cb_update().
>> Allowing parallel recalls would certainly be a good thing.

Thank you Jeff for pointing this out.

> Thanks for the research. I was about to do the same.
>
> I think we do allow parallel recalls -- IIUC, callback_wq
> single-threads only the dispatch of RPC calls, not their
> processing. Note the use of rpc_call_async().
>
> So nfsd4_process_cb_update() is protecting modifications of
> cl_cb_client and the backchannel transport. We might wrap that in
> a mutex, for example. But I don't see strong evidence (yet) that
> this design is a bottleneck when it is working properly.
>
> However, if for some reason, a work item sleeps, that would
> block forward progress of the work queue, and would be Bad (tm).
>
>
>> That said, a workqueue per client would be a great place to start. I
>> don't see any reason to serialize callbacks to different clients.
> I volunteer to take care of that for v6.10.

Thank you Chuck!

-Dai

>
>
Jeffrey Layton April 1, 2024, 4:11 p.m. UTC | #15
On Mon, 2024-04-01 at 09:34 -0400, Chuck Lever wrote:
> On Mon, Apr 01, 2024 at 08:49:49AM -0400, Jeff Layton wrote:
> > On Sat, 2024-03-30 at 16:30 -0700, Dai Ngo wrote:
> > > On 3/30/24 11:28 AM, Chuck Lever wrote:
> > > > On Sat, Mar 30, 2024 at 10:46:08AM -0700, Dai Ngo wrote:
> > > > > On 3/29/24 4:42 PM, Chuck Lever wrote:
> > > > > > On Fri, Mar 29, 2024 at 10:57:22AM -0700, Dai Ngo wrote:
> > > > > > > On 3/29/24 7:55 AM, Chuck Lever wrote:
> > > > > > It could be straightforward, however, to move the callback_wq into
> > > > > > struct nfs4_client so that each client can have its own workqueue.
> > > > > > Then we can take some time and design something less brittle and
> > > > > > more scalable (and maybe come up with some test infrastructure so
> > > > > > this stuff doesn't break as often).
> > > > > IMHO I don't see why the callback workqueue has to be different
> > > > > from the laundry_wq or nfsd_filecache_wq used by nfsd.
> > > > You mean, you don't see why callback_wq has to be ordered, while
> > > > the others are not so constrained? Quite possibly it does not have
> > > > to be ordered.
> > > 
> > > Yes, I looked at the all the nfsd4_callback_ops on nfsd and they
> > > seems to take into account of concurrency and use locks appropriately.
> > > For each type of work I don't see why one work has to wait for
> > > the previous work to complete before proceed.
> > > 
> > > > But we might have lost the bit of history that explains why, so
> > > > let's be cautious about making broad changes here until we have a
> > > > good operational understanding of the code and some robust test
> > > > cases to check any changes we make.
> > > 
> > > Understand, you make the call.
> > 
> > commit 88382036674770173128417e4c09e9e549f82d54
> > Author: J. Bruce Fields <bfields@fieldses.org>
> > Date:   Mon Nov 14 11:13:43 2016 -0500
> > 
> >     nfsd: update workqueue creation
> >     
> >     No real change in functionality, but the old interface seems to be
> >     deprecated.
> >     
> >     We don't actually care about ordering necessarily, but we do depend on
> >     running at most one work item at a time: nfsd4_process_cb_update()
> >     assumes that no other thread is running it, and that no new callbacks
> >     are starting while it's running.
> >     
> >     Reviewed-by: Jeff Layton <jlayton@redhat.com>
> >     Signed-off-by: J. Bruce Fields <bfields@redhat.com>
> > 
> > 
> > ...so it may be as simple as just fixing up nfsd4_process_cb_update().
> > Allowing parallel recalls would certainly be a good thing.
> 
> Thanks for the research. I was about to do the same.
> 
> I think we do allow parallel recalls -- IIUC, callback_wq
> single-threads only the dispatch of RPC calls, not their
> processing. Note the use of rpc_call_async().
> 
> So nfsd4_process_cb_update() is protecting modifications of
> cl_cb_client and the backchannel transport. We might wrap that in
> a mutex, for example. But I don't see strong evidence (yet) that
> this design is a bottleneck when it is working properly.
> 
> However, if for some reason, a work item sleeps, that would
> block forward progress of the work queue, and would be Bad (tm).
> 

Right. That's my main concern with the current single-threaded
workqueue. A stuck or sleeping job will block all callbacks, even to
clients that aren't related to the stuck one. The RPC layer is generally
good at not blocking when we ask it not to, but there are places where
the task can sleep (memory allocation comes to mind).

Also, the single threaded workqueue is just an artificial bottleneck. If
the server has a lot of callbacks to send, there is no need to serialize
that activity.
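
For comparison, letting the callback workqueue run items concurrently
would be a one-line change at creation time (sketch only; it is NOT
known to be safe while nfsd4_process_cb_update() assumes
single-threading):

    callback_wq = alloc_workqueue("nfsd4_callbacks", WQ_UNBOUND, 0);
    if (!callback_wq)
        return -ENOMEM;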

> 
> > That said, a workqueue per client would be a great place to start. I
> > don't see any reason to serialize callbacks to different clients.
> 
> I volunteer to take care of that for v6.10.
> 
> 

Excellent!
Dai Ngo April 1, 2024, 4:46 p.m. UTC | #16
On 4/1/24 9:00 AM, Dai Ngo wrote:
>
> On 4/1/24 6:34 AM, Chuck Lever wrote:
>> On Mon, Apr 01, 2024 at 08:49:49AM -0400, Jeff Layton wrote:
>>> On Sat, 2024-03-30 at 16:30 -0700, Dai Ngo wrote:
>>>> On 3/30/24 11:28 AM, Chuck Lever wrote:
>>>>> On Sat, Mar 30, 2024 at 10:46:08AM -0700, Dai Ngo wrote:
>>>>>> On 3/29/24 4:42 PM, Chuck Lever wrote:
>>>>>>> On Fri, Mar 29, 2024 at 10:57:22AM -0700, Dai Ngo wrote:
>>>>>>>> On 3/29/24 7:55 AM, Chuck Lever wrote:
>>>>>>> It could be straightforward, however, to move the callback_wq into
>>>>>>> struct nfs4_client so that each client can have its own workqueue.
>>>>>>> Then we can take some time and design something less brittle and
>>>>>>> more scalable (and maybe come up with some test infrastructure so
>>>>>>> this stuff doesn't break as often).
>>>>>> IMHO I don't see why the callback workqueue has to be different
>>>>>> from the laundry_wq or nfsd_filecache_wq used by nfsd.
>>>>> You mean, you don't see why callback_wq has to be ordered, while
>>>>> the others are not so constrained? Quite possibly it does not have
>>>>> to be ordered.
>>>> Yes, I looked at the all the nfsd4_callback_ops on nfsd and they
>>>> seems to take into account of concurrency and use locks appropriately.
>>>> For each type of work I don't see why one work has to wait for
>>>> the previous work to complete before proceed.
>>>>
>>>>> But we might have lost the bit of history that explains why, so
>>>>> let's be cautious about making broad changes here until we have a
>>>>> good operational understanding of the code and some robust test
>>>>> cases to check any changes we make.
>>>> Understand, you make the call.
>>> commit 88382036674770173128417e4c09e9e549f82d54
>>> Author: J. Bruce Fields <bfields@fieldses.org>
>>> Date:   Mon Nov 14 11:13:43 2016 -0500
>>>
>>>      nfsd: update workqueue creation
>>>           No real change in functionality, but the old interface 
>>> seems to be
>>>      deprecated.
>>>           We don't actually care about ordering necessarily, but we 
>>> do depend on
>>>      running at most one work item at a time: nfsd4_process_cb_update()
>>>      assumes that no other thread is running it, and that no new 
>>> callbacks
>>>      are starting while it's running.
>>>           Reviewed-by: Jeff Layton <jlayton@redhat.com>
>>>      Signed-off-by: J. Bruce Fields <bfields@redhat.com>
>>>
>>>
>>> ...so it may be as simple as just fixing up nfsd4_process_cb_update().
>>> Allowing parallel recalls would certainly be a good thing.
>
> Thank you Jeff for pointing this out.
>
>> Thanks for the research. I was about to do the same.
>>
>> I think we do allow parallel recalls -- IIUC, callback_wq
>> single-threads only the dispatch of RPC calls, not their
>> processing. Note the use of rpc_call_async().
>>
>> So nfsd4_process_cb_update() is protecting modifications of
>> cl_cb_client and the backchannel transport. We might wrap that in
>> a mutex, for example. But I don't see strong evidence (yet) that
>> this design is a bottleneck when it is working properly.
>>
>> However, if for some reason, a work item sleeps, that would
>> block forward progress of the work queue, and would be Bad (tm).
>>
>>
>>> That said, a workqueue per client would be a great place to start. I
>>> don't see any reason to serialize callbacks to different clients.
>> I volunteer to take care of that for v6.10.

Since you're going to make the callback workqueue per-client, do we still
need a fix in nfsd to shut down the callback when a client is about to
enter courtesy state and there are pending RPC calls?

With a per-client callback workqueue, the problem of all callbacks
hanging when a job gets stuck in the workqueue is fixed. The fix in nfsd
prevents a stuck job from looping until the client reconnects, which
might be a very long time, or never if that client is no longer used.

-Dai

>
> Thank you Chuck!
>
> -Dai
>
>>
>>
>
Chuck Lever April 1, 2024, 5:49 p.m. UTC | #17
On Mon, Apr 01, 2024 at 09:46:25AM -0700, Dai Ngo wrote:
> 
> On 4/1/24 9:00 AM, Dai Ngo wrote:
> > 
> > On 4/1/24 6:34 AM, Chuck Lever wrote:
> > > On Mon, Apr 01, 2024 at 08:49:49AM -0400, Jeff Layton wrote:
> > > > On Sat, 2024-03-30 at 16:30 -0700, Dai Ngo wrote:
> > > > > On 3/30/24 11:28 AM, Chuck Lever wrote:
> > > > > > On Sat, Mar 30, 2024 at 10:46:08AM -0700, Dai Ngo wrote:
> > > > > > > On 3/29/24 4:42 PM, Chuck Lever wrote:
> > > > > > > > On Fri, Mar 29, 2024 at 10:57:22AM -0700, Dai Ngo wrote:
> > > > > > > > > On 3/29/24 7:55 AM, Chuck Lever wrote:
> > > > > > > > It could be straightforward, however, to move the callback_wq into
> > > > > > > > struct nfs4_client so that each client can have its own workqueue.
> > > > > > > > Then we can take some time and design something less brittle and
> > > > > > > > more scalable (and maybe come up with some test infrastructure so
> > > > > > > > this stuff doesn't break as often).
> > > > > > > IMHO I don't see why the callback workqueue has to be different
> > > > > > > from the laundry_wq or nfsd_filecache_wq used by nfsd.
> > > > > > You mean, you don't see why callback_wq has to be ordered, while
> > > > > > the others are not so constrained? Quite possibly it does not have
> > > > > > to be ordered.
> > > > > Yes, I looked at the all the nfsd4_callback_ops on nfsd and they
> > > > > seems to take into account of concurrency and use locks appropriately.
> > > > > For each type of work I don't see why one work has to wait for
> > > > > the previous work to complete before proceed.
> > > > > 
> > > > > > But we might have lost the bit of history that explains why, so
> > > > > > let's be cautious about making broad changes here until we have a
> > > > > > good operational understanding of the code and some robust test
> > > > > > cases to check any changes we make.
> > > > > Understand, you make the call.
> > > > commit 88382036674770173128417e4c09e9e549f82d54
> > > > Author: J. Bruce Fields <bfields@fieldses.org>
> > > > Date:   Mon Nov 14 11:13:43 2016 -0500
> > > > 
> > > >      nfsd: update workqueue creation
> > > >           No real change in functionality, but the old interface
> > > > seems to be
> > > >      deprecated.
> > > >           We don't actually care about ordering necessarily, but
> > > > we do depend on
> > > >      running at most one work item at a time: nfsd4_process_cb_update()
> > > >      assumes that no other thread is running it, and that no new
> > > > callbacks
> > > >      are starting while it's running.
> > > >           Reviewed-by: Jeff Layton <jlayton@redhat.com>
> > > >      Signed-off-by: J. Bruce Fields <bfields@redhat.com>
> > > > 
> > > > 
> > > > ...so it may be as simple as just fixing up nfsd4_process_cb_update().
> > > > Allowing parallel recalls would certainly be a good thing.
> > 
> > Thank you Jeff for pointing this out.
> > 
> > > Thanks for the research. I was about to do the same.
> > > 
> > > I think we do allow parallel recalls -- IIUC, callback_wq
> > > single-threads only the dispatch of RPC calls, not their
> > > processing. Note the use of rpc_call_async().
> > > 
> > > So nfsd4_process_cb_update() is protecting modifications of
> > > cl_cb_client and the backchannel transport. We might wrap that in
> > > a mutex, for example. But I don't see strong evidence (yet) that
> > > this design is a bottleneck when it is working properly.
> > > 
> > > However, if for some reason, a work item sleeps, that would
> > > block forward progress of the work queue, and would be Bad (tm).
> > > 
> > > 
> > > > That said, a workqueue per client would be a great place to start. I
> > > > don't see any reason to serialize callbacks to different clients.
> > > I volunteer to take care of that for v6.10.
> 
> Since you're going to make callback workqueue per client, do we still need
> a fix in nfsd to shut down the callback when a client is about to enter
> courtesy state and there is pending RPC calls.

I would rather just close down the transports for courtesy clients.
But that doesn't seem to be the root cause, so let's put this aside
for a bit.


> With callback workqueue per client, it fixes the problem of all callbacks
> hang when a job get stuck in the workqueue. The fix in nfsd prevents a
> stuck job to loop until the client reconnects which might be a very long
> time or never if that client is no longer used.

The question I have is will this unresponsive client cause other
issues, such as:

 - a hang when the server tries to unexport or shutdown
 - CPU or memory consumption for each retried callback

That is an ongoing concern.
Dai Ngo April 1, 2024, 7:55 p.m. UTC | #18
On 4/1/24 10:49 AM, Chuck Lever wrote:
> On Mon, Apr 01, 2024 at 09:46:25AM -0700, Dai Ngo wrote:
>> On 4/1/24 9:00 AM, Dai Ngo wrote:
>>> On 4/1/24 6:34 AM, Chuck Lever wrote:
>>>> On Mon, Apr 01, 2024 at 08:49:49AM -0400, Jeff Layton wrote:
>>>>> On Sat, 2024-03-30 at 16:30 -0700, Dai Ngo wrote:
>>>>>> On 3/30/24 11:28 AM, Chuck Lever wrote:
>>>>>>> On Sat, Mar 30, 2024 at 10:46:08AM -0700, Dai Ngo wrote:
>>>>>>>> On 3/29/24 4:42 PM, Chuck Lever wrote:
>>>>>>>>> On Fri, Mar 29, 2024 at 10:57:22AM -0700, Dai Ngo wrote:
>>>>>>>>>> On 3/29/24 7:55 AM, Chuck Lever wrote:
>>>>>>>>> It could be straightforward, however, to move the callback_wq into
>>>>>>>>> struct nfs4_client so that each client can have its own workqueue.
>>>>>>>>> Then we can take some time and design something less brittle and
>>>>>>>>> more scalable (and maybe come up with some test infrastructure so
>>>>>>>>> this stuff doesn't break as often).
>>>>>>>> IMHO I don't see why the callback workqueue has to be different
>>>>>>>> from the laundry_wq or nfsd_filecache_wq used by nfsd.
>>>>>>> You mean, you don't see why callback_wq has to be ordered, while
>>>>>>> the others are not so constrained? Quite possibly it does not have
>>>>>>> to be ordered.
>>>>>> Yes, I looked at the all the nfsd4_callback_ops on nfsd and they
>>>>>> seems to take into account of concurrency and use locks appropriately.
>>>>>> For each type of work I don't see why one work has to wait for
>>>>>> the previous work to complete before proceed.
>>>>>>
>>>>>>> But we might have lost the bit of history that explains why, so
>>>>>>> let's be cautious about making broad changes here until we have a
>>>>>>> good operational understanding of the code and some robust test
>>>>>>> cases to check any changes we make.
>>>>>> Understand, you make the call.
>>>>> commit 88382036674770173128417e4c09e9e549f82d54
>>>>> Author: J. Bruce Fields <bfields@fieldses.org>
>>>>> Date:   Mon Nov 14 11:13:43 2016 -0500
>>>>>
>>>>>       nfsd: update workqueue creation
>>>>>            No real change in functionality, but the old interface
>>>>> seems to be
>>>>>       deprecated.
>>>>>            We don't actually care about ordering necessarily, but
>>>>> we do depend on
>>>>>       running at most one work item at a time: nfsd4_process_cb_update()
>>>>>       assumes that no other thread is running it, and that no new
>>>>> callbacks
>>>>>       are starting while it's running.
>>>>>            Reviewed-by: Jeff Layton <jlayton@redhat.com>
>>>>>       Signed-off-by: J. Bruce Fields <bfields@redhat.com>
>>>>>
>>>>>
>>>>> ...so it may be as simple as just fixing up nfsd4_process_cb_update().
>>>>> Allowing parallel recalls would certainly be a good thing.
>>> Thank you Jeff for pointing this out.
>>>
>>>> Thanks for the research. I was about to do the same.
>>>>
>>>> I think we do allow parallel recalls -- IIUC, callback_wq
>>>> single-threads only the dispatch of RPC calls, not their
>>>> processing. Note the use of rpc_call_async().
>>>>
>>>> So nfsd4_process_cb_update() is protecting modifications of
>>>> cl_cb_client and the backchannel transport. We might wrap that in
>>>> a mutex, for example. But I don't see strong evidence (yet) that
>>>> this design is a bottleneck when it is working properly.
>>>>
>>>> However, if for some reason, a work item sleeps, that would
>>>> block forward progress of the work queue, and would be Bad (tm).
>>>>
>>>>
>>>>> That said, a workqueue per client would be a great place to start. I
>>>>> don't see any reason to serialize callbacks to different clients.
>>>> I volunteer to take care of that for v6.10.
>> Since you're going to make callback workqueue per client, do we still need
>> a fix in nfsd to shut down the callback when a client is about to enter
>> courtesy state and there is pending RPC calls.
> I would rather just close down the transports for courtesy clients.
> But that doesn't seem to be the root cause, so let's put this aside
> for a bit.
>
>
>> With callback workqueue per client, it fixes the problem of all callbacks
>> hang when a job get stuck in the workqueue. The fix in nfsd prevents a
>> stuck job to loop until the client reconnects which might be a very long
>> time or never if that client is no longer used.
> The question I have is will this unresponsive client cause other
> issues, such as:
>
>   - a hang when the server tries to unexport

exportfs -u does not hang, but the share can not be un-exported.

>   or shutdown

Shutdown does not hang since __destroy_client is called, which calls
nfsd4_shutdown_callback to set NFSD4_CLIENT_CB_KILL.

echo "expire" > /proc/fs/nfsd/X/ctl does hang since it waits for
cl_rpc_users to drop to 0. But we can fix that by dropping the
wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0) and
just going ahead with expire_client like shutdown does.
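
In diff form the idea is roughly this (untested sketch, not a
submitted patch):

    --- a/fs/nfsd/nfs4state.c
    +++ b/fs/nfsd/nfs4state.c
    @@ force_expire_client
     	clp->cl_time = 0;
     	spin_unlock(&nn->client_lock);
     
    -	wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0);
     	spin_lock(&nn->client_lock);
     	already_expired = list_empty(&clp->cl_lru);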

>   - CPU or memory consumption for each retried callback

Yes, the loop does consume CPU cycles since it's rescheduled to run
after 25ms.

-Dai

>
> That is an ongoing concern.
>
Dai Ngo April 1, 2024, 8:17 p.m. UTC | #19
On 4/1/24 12:55 PM, Dai Ngo wrote:
>
> On 4/1/24 10:49 AM, Chuck Lever wrote:
>> On Mon, Apr 01, 2024 at 09:46:25AM -0700, Dai Ngo wrote:
>>> On 4/1/24 9:00 AM, Dai Ngo wrote:
>>>> On 4/1/24 6:34 AM, Chuck Lever wrote:
>>>>> On Mon, Apr 01, 2024 at 08:49:49AM -0400, Jeff Layton wrote:
>>>>>> On Sat, 2024-03-30 at 16:30 -0700, Dai Ngo wrote:
>>>>>>> On 3/30/24 11:28 AM, Chuck Lever wrote:
>>>>>>>> On Sat, Mar 30, 2024 at 10:46:08AM -0700, Dai Ngo wrote:
>>>>>>>>> On 3/29/24 4:42 PM, Chuck Lever wrote:
>>>>>>>>>> On Fri, Mar 29, 2024 at 10:57:22AM -0700, Dai Ngo wrote:
>>>>>>>>>>> On 3/29/24 7:55 AM, Chuck Lever wrote:
>>>>>>>>>> It could be straightforward, however, to move the callback_wq 
>>>>>>>>>> into
>>>>>>>>>> struct nfs4_client so that each client can have its own 
>>>>>>>>>> workqueue.
>>>>>>>>>> Then we can take some time and design something less brittle and
>>>>>>>>>> more scalable (and maybe come up with some test 
>>>>>>>>>> infrastructure so
>>>>>>>>>> this stuff doesn't break as often).
>>>>>>>>> IMHO I don't see why the callback workqueue has to be different
>>>>>>>>> from the laundry_wq or nfsd_filecache_wq used by nfsd.
>>>>>>>> You mean, you don't see why callback_wq has to be ordered, while
>>>>>>>> the others are not so constrained? Quite possibly it does not have
>>>>>>>> to be ordered.
>>>>>>> Yes, I looked at the all the nfsd4_callback_ops on nfsd and they
>>>>>>> seems to take into account of concurrency and use locks 
>>>>>>> appropriately.
>>>>>>> For each type of work I don't see why one work has to wait for
>>>>>>> the previous work to complete before proceed.
>>>>>>>
>>>>>>>> But we might have lost the bit of history that explains why, so
>>>>>>>> let's be cautious about making broad changes here until we have a
>>>>>>>> good operational understanding of the code and some robust test
>>>>>>>> cases to check any changes we make.
>>>>>>> Understand, you make the call.
>>>>>> commit 88382036674770173128417e4c09e9e549f82d54
>>>>>> Author: J. Bruce Fields <bfields@fieldses.org>
>>>>>> Date:   Mon Nov 14 11:13:43 2016 -0500
>>>>>>
>>>>>>       nfsd: update workqueue creation
>>>>>>            No real change in functionality, but the old interface
>>>>>> seems to be
>>>>>>       deprecated.
>>>>>>            We don't actually care about ordering necessarily, but
>>>>>> we do depend on
>>>>>>       running at most one work item at a time: 
>>>>>> nfsd4_process_cb_update()
>>>>>>       assumes that no other thread is running it, and that no new
>>>>>> callbacks
>>>>>>       are starting while it's running.
>>>>>>            Reviewed-by: Jeff Layton <jlayton@redhat.com>
>>>>>>       Signed-off-by: J. Bruce Fields <bfields@redhat.com>
>>>>>>
>>>>>>
>>>>>> ...so it may be as simple as just fixing up 
>>>>>> nfsd4_process_cb_update().
>>>>>> Allowing parallel recalls would certainly be a good thing.
>>>> Thank you Jeff for pointing this out.
>>>>
>>>>> Thanks for the research. I was about to do the same.
>>>>>
>>>>> I think we do allow parallel recalls -- IIUC, callback_wq
>>>>> single-threads only the dispatch of RPC calls, not their
>>>>> processing. Note the use of rpc_call_async().
>>>>>
>>>>> So nfsd4_process_cb_update() is protecting modifications of
>>>>> cl_cb_client and the backchannel transport. We might wrap that in
>>>>> a mutex, for example. But I don't see strong evidence (yet) that
>>>>> this design is a bottleneck when it is working properly.
>>>>>
>>>>> However, if for some reason, a work item sleeps, that would
>>>>> block forward progress of the work queue, and would be Bad (tm).
>>>>>
>>>>>
>>>>>> That said, a workqueue per client would be a great place to start. I
>>>>>> don't see any reason to serialize callbacks to different clients.
>>>>> I volunteer to take care of that for v6.10.
>>> Since you're going to make callback workqueue per client, do we 
>>> still need
>>> a fix in nfsd to shut down the callback when a client is about to enter
>>> courtesy state and there is pending RPC calls.
>> I would rather just close down the transports for courtesy clients.
>> But that doesn't seem to be the root cause, so let's put this aside
>> for a bit.
>>
>>
>>> With callback workqueue per client, it fixes the problem of all 
>>> callbacks
>>> hang when a job get stuck in the workqueue. The fix in nfsd prevents a
>>> stuck job to loop until the client reconnects which might be a very 
>>> long
>>> time or never if that client is no longer used.
>> The question I have is will this unresponsive client cause other
>> issues, such as:
>>
>>   - a hang when the server tries to unexport
>
> exportfs -u does not hang, but the share can not be un-exported.

Unexport does not hang.

-Dai

>
>>   or shutdown
>
> shutdown does not hang since __destroy_client is called which calls
> nfsd4_shutdown_callback to set NFSD4_CLIENT_CB_KILL.
>
> echo "expire' > /proc/fs/nfsd/X/ctl does hang since it waits for
> cl_rpc_users to drop to 0. But we can fix that by dropping the
> wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0) and
> just go ahead and expire_client likes shutdown.
>
>>   - CPU or memory consumption for each retried callback
>
> Yes, the loop does consume CPU cycles since it's rescheduled to run
> after 25ms.
>
> -Dai
>
>>
>> That is an ongoing concern.
>>
>
Chuck Lever April 2, 2024, 1:58 p.m. UTC | #20
On Mon, Apr 01, 2024 at 12:55:16PM -0700, Dai Ngo wrote:
> 
> On 4/1/24 10:49 AM, Chuck Lever wrote:
> > The question I have is will this unresponsive client cause other
> > issues, such as:
> > 
> >   - a hang when the server tries to unexport
> 
> exportfs -u does not hang, but the share can not be un-exported.

What does "can not be un-exported" mean? Does "exportfs -u" fail?
Seems like this is a UX bug.
Dai Ngo April 2, 2024, 2:29 p.m. UTC | #21
Unexport works fine. It was my mistake.

-Dai

> On Apr 2, 2024, at 6:58 AM, Chuck Lever III <chuck.lever@oracle.com> wrote:
> 
> On Mon, Apr 01, 2024 at 12:55:16PM -0700, Dai Ngo wrote:
>> 
>>> On 4/1/24 10:49 AM, Chuck Lever wrote:
>>> The question I have is will this unresponsive client cause other
>>> issues, such as:
>>> 
>>>  - a hang when the server tries to unexport
>> 
>> exportfs -u does not hang, but the share can not be un-exported.
> 
> What does "can not be un-exported" mean? Does "exportfs -u" fail?
> Seems like this is a UX bug.
> 
> 
> --
> Chuck Lever
diff mbox series

Patch

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 87c9547989f6..e5b50c96be6a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1568,3 +1568,13 @@  bool nfsd4_run_cb(struct nfsd4_callback *cb)
 		nfsd41_cb_inflight_end(clp);
 	return queued;
 }
+
+void nfsd41_cb_recall_any_cancel(struct nfs4_client *clp)
+{
+	if (test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) &&
+			cancel_delayed_work(&clp->cl_ra->ra_cb.cb_work)) {
+		clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
+		atomic_add_unless(&clp->cl_rpc_users, -1, 0);
+		nfsd41_cb_inflight_end(clp);
+	}
+}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1a93c7fcf76c..0e1db57c9a19 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2402,6 +2402,7 @@  __destroy_client(struct nfs4_client *clp)
 	}
 	nfsd4_return_all_client_layouts(clp);
 	nfsd4_shutdown_copy(clp);
+	nfsd41_cb_recall_any_cancel(clp);
 	nfsd4_shutdown_callback(clp);
 	if (clp->cl_cb_conn.cb_xprt)
 		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2980,6 +2981,12 @@  static void force_expire_client(struct nfs4_client *clp)
 	clp->cl_time = 0;
 	spin_unlock(&nn->client_lock);
 
+	/*
+	 * no need to send and wait for CB_RECALL_ANY
+	 * when client is about to be destroyed
+	 */
+	nfsd41_cb_recall_any_cancel(clp);
+
 	wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0);
 	spin_lock(&nn->client_lock);
 	already_expired = list_empty(&clp->cl_lru);
@@ -6617,7 +6624,8 @@  deleg_reaper(struct nfsd_net *nn)
 		clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) |
 						BIT(RCA4_TYPE_MASK_WDATA_DLG);
 		trace_nfsd_cb_recall_any(clp->cl_ra);
-		nfsd4_run_cb(&clp->cl_ra->ra_cb);
+		if (!nfsd4_run_cb(&clp->cl_ra->ra_cb))
+			clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
 	}
 }
 
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 01c6f3445646..259b4af7d226 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -735,6 +735,7 @@  extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *
 extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
 		const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
 extern bool nfsd4_run_cb(struct nfsd4_callback *cb);
+extern void nfsd41_cb_recall_any_cancel(struct nfs4_client *clp);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
 extern void nfsd4_shutdown_callback(struct nfs4_client *);