diff mbox series

[RFC,v18,02/11] NFSD: Add courtesy client state, macro and spinlock to support courteous server

Message ID 1648182891-32599-3-git-send-email-dai.ngo@oracle.com (mailing list archive)
State New, archived
Headers show
Series NFSD: Initial implementation of NFSv4 Courteous Server | expand

Commit Message

Dai Ngo March 25, 2022, 4:34 a.m. UTC
Update nfs4_client to add:
 . cl_cs_client_state: courtesy client state
 . cl_cs_lock: spinlock to synchronize access to cl_cs_client_state
 . cl_cs_list: list used by laundromat to process courtesy clients

Modify alloc_client to initialize these fields.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
 fs/nfsd/nfs4state.c |  2 ++
 fs/nfsd/nfsd.h      |  1 +
 fs/nfsd/state.h     | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+)

Comments

J. Bruce Fields March 29, 2022, 3:47 p.m. UTC | #1
On Thu, Mar 24, 2022 at 09:34:42PM -0700, Dai Ngo wrote:
> Update nfs4_client to add:
>  . cl_cs_client_state: courtesy client state
>  . cl_cs_lock: spinlock to synchronize access to cl_cs_client_state
>  . cl_cs_list: list used by laundromat to process courtesy clients
> 
> Modify alloc_client to initialize these fields.
> 
> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> ---
>  fs/nfsd/nfs4state.c |  2 ++
>  fs/nfsd/nfsd.h      |  1 +
>  fs/nfsd/state.h     | 33 +++++++++++++++++++++++++++++++++
>  3 files changed, 36 insertions(+)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 234e852fcdfa..a65d59510681 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2009,12 +2009,14 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
>  	INIT_LIST_HEAD(&clp->cl_delegations);
>  	INIT_LIST_HEAD(&clp->cl_lru);
>  	INIT_LIST_HEAD(&clp->cl_revoked);
> +	INIT_LIST_HEAD(&clp->cl_cs_list);
>  #ifdef CONFIG_NFSD_PNFS
>  	INIT_LIST_HEAD(&clp->cl_lo_states);
>  #endif
>  	INIT_LIST_HEAD(&clp->async_copies);
>  	spin_lock_init(&clp->async_lock);
>  	spin_lock_init(&clp->cl_lock);
> +	spin_lock_init(&clp->cl_cs_lock);
>  	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
>  	return clp;
>  err_no_hashtbl:
> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> index 4fc1fd639527..23996c6ca75e 100644
> --- a/fs/nfsd/nfsd.h
> +++ b/fs/nfsd/nfsd.h
> @@ -336,6 +336,7 @@ void		nfsd_lockd_shutdown(void);
>  #define COMPOUND_ERR_SLACK_SPACE	16     /* OP_SETATTR */
>  
>  #define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
> +#define	NFSD_COURTESY_CLIENT_TIMEOUT	(24 * 60 * 60)	/* seconds */
>  
>  /*
>   * The following attributes are currently not supported by the NFSv4 server:
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 95457cfd37fc..40e390abc842 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -283,6 +283,35 @@ struct nfsd4_sessionid {
>  #define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
>  
>  /*
> + * CLIENT_  CLIENT_ CLIENT_
> + * COURTESY EXPIRED RECONNECTED      Meaning                  Where set
> + * -----------------------------------------------------------------------------
> + * | false | false | false | Confirmed, active    | Default                    |
> + * |---------------------------------------------------------------------------|
> + * | true  | false | false | Courtesy state.      | nfs4_get_client_reaplist   |
> + * |       |       |       | Lease/lock/share     |                            |
> + * |       |       |       | reservation conflict |                            |
> + * |       |       |       | can cause Courtesy   |                            |
> + * |       |       |       | client to be expired |                            |
> + * |---------------------------------------------------------------------------|
> + * | false | true  | false | Courtesy client to be| nfs4_laundromat            |
> + * |       |       |       | expired by Laundromat| nfsd4_lm_lock_expired      |
> + * |       |       |       | due to conflict     | nfsd4_discard_courtesy_clnt |
> + * |       |       |       |                      | nfsd4_expire_courtesy_clnt |
> + * |---------------------------------------------------------------------------|
> + * | false | false | true  | Courtesy client      | nfsd4_courtesy_clnt_expired|
> + * |       |       |       | reconnected,         |                            |
> + * |       |       |       | becoming active      |                            |
> + * -----------------------------------------------------------------------------

These are mutually exclusive values, not bits that may be set to 0 or 1, so
the three boolean columns are confusing.  I'd just structure the table
like:

	client state	meaning			where set
	0		Confirmed, active	Default
	CLIENT_COURTESY	Courtesy state....	nfs4_get_client_reaplist
	CLIENT_EXPIRED	Courtesy client to be..	nfs4_laundromat

etc.

--b.

> + */
> +
> +enum courtesy_client_state {
> +	NFSD4_CLIENT_COURTESY = 1,
> +	NFSD4_CLIENT_EXPIRED,
> +	NFSD4_CLIENT_RECONNECTED,
> +};
> +
> +/*
>   * struct nfs4_client - one per client.  Clientids live here.
>   *
>   * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
> @@ -385,6 +414,10 @@ struct nfs4_client {
>  	struct list_head	async_copies;	/* list of async copies */
>  	spinlock_t		async_lock;	/* lock for async copies */
>  	atomic_t		cl_cb_inflight;	/* Outstanding callbacks */
> +
> +	enum courtesy_client_state	cl_cs_client_state;
> +	spinlock_t		cl_cs_lock;
> +	struct list_head	cl_cs_list;
>  };
>  
>  /* struct nfs4_client_reset
> -- 
> 2.9.5
Dai Ngo March 29, 2022, 4:20 p.m. UTC | #2
On 3/29/22 8:47 AM, J. Bruce Fields wrote:
> On Thu, Mar 24, 2022 at 09:34:42PM -0700, Dai Ngo wrote:
>> Update nfs4_client to add:
>>   . cl_cs_client_state: courtesy client state
>>   . cl_cs_lock: spinlock to synchronize access to cl_cs_client_state
>>   . cl_cs_list: list used by laundromat to process courtesy clients
>>
>> Modify alloc_client to initialize these fields.
>>
>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>> ---
>>   fs/nfsd/nfs4state.c |  2 ++
>>   fs/nfsd/nfsd.h      |  1 +
>>   fs/nfsd/state.h     | 33 +++++++++++++++++++++++++++++++++
>>   3 files changed, 36 insertions(+)
>>
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index 234e852fcdfa..a65d59510681 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -2009,12 +2009,14 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
>>   	INIT_LIST_HEAD(&clp->cl_delegations);
>>   	INIT_LIST_HEAD(&clp->cl_lru);
>>   	INIT_LIST_HEAD(&clp->cl_revoked);
>> +	INIT_LIST_HEAD(&clp->cl_cs_list);
>>   #ifdef CONFIG_NFSD_PNFS
>>   	INIT_LIST_HEAD(&clp->cl_lo_states);
>>   #endif
>>   	INIT_LIST_HEAD(&clp->async_copies);
>>   	spin_lock_init(&clp->async_lock);
>>   	spin_lock_init(&clp->cl_lock);
>> +	spin_lock_init(&clp->cl_cs_lock);
>>   	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
>>   	return clp;
>>   err_no_hashtbl:
>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
>> index 4fc1fd639527..23996c6ca75e 100644
>> --- a/fs/nfsd/nfsd.h
>> +++ b/fs/nfsd/nfsd.h
>> @@ -336,6 +336,7 @@ void		nfsd_lockd_shutdown(void);
>>   #define COMPOUND_ERR_SLACK_SPACE	16     /* OP_SETATTR */
>>   
>>   #define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
>> +#define	NFSD_COURTESY_CLIENT_TIMEOUT	(24 * 60 * 60)	/* seconds */
>>   
>>   /*
>>    * The following attributes are currently not supported by the NFSv4 server:
>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>> index 95457cfd37fc..40e390abc842 100644
>> --- a/fs/nfsd/state.h
>> +++ b/fs/nfsd/state.h
>> @@ -283,6 +283,35 @@ struct nfsd4_sessionid {
>>   #define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
>>   
>>   /*
>> + * CLIENT_  CLIENT_ CLIENT_
>> + * COURTESY EXPIRED RECONNECTED      Meaning                  Where set
>> + * -----------------------------------------------------------------------------
>> + * | false | false | false | Confirmed, active    | Default                    |
>> + * |---------------------------------------------------------------------------|
>> + * | true  | false | false | Courtesy state.      | nfs4_get_client_reaplist   |
>> + * |       |       |       | Lease/lock/share     |                            |
>> + * |       |       |       | reservation conflict |                            |
>> + * |       |       |       | can cause Courtesy   |                            |
>> + * |       |       |       | client to be expired |                            |
>> + * |---------------------------------------------------------------------------|
>> + * | false | true  | false | Courtesy client to be| nfs4_laundromat            |
>> + * |       |       |       | expired by Laundromat| nfsd4_lm_lock_expired      |
>> + * |       |       |       | due to conflict     | nfsd4_discard_courtesy_clnt |
>> + * |       |       |       |                      | nfsd4_expire_courtesy_clnt |
>> + * |---------------------------------------------------------------------------|
>> + * | false | false | true  | Courtesy client      | nfsd4_courtesy_clnt_expired|
>> + * |       |       |       | reconnected,         |                            |
>> + * |       |       |       | becoming active      |                            |
>> + * -----------------------------------------------------------------------------
> These are mutually exclusive values, not bits that may set to 0 or 1, so
> the three boolean columns are confusing.  I'd just structure the table
> like:
>
> 	client state	meaning			where set
> 	0		Confirmed, active	Default
> 	CLIENT_COURTESY	Courtesy state....	nfs4_get_client_reaplist
> 	CLIENT_EXPIRED	Courtesy client to be..	nfs4_laundromat
>
> etc.

will fix in v19.

Thanks,
-Dai

>
> --b.
>
>> + */
>> +
>> +enum courtesy_client_state {
>> +	NFSD4_CLIENT_COURTESY = 1,
>> +	NFSD4_CLIENT_EXPIRED,
>> +	NFSD4_CLIENT_RECONNECTED,
>> +};
>> +
>> +/*
>>    * struct nfs4_client - one per client.  Clientids live here.
>>    *
>>    * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
>> @@ -385,6 +414,10 @@ struct nfs4_client {
>>   	struct list_head	async_copies;	/* list of async copies */
>>   	spinlock_t		async_lock;	/* lock for async copies */
>>   	atomic_t		cl_cb_inflight;	/* Outstanding callbacks */
>> +
>> +	enum courtesy_client_state	cl_cs_client_state;
>> +	spinlock_t		cl_cs_lock;
>> +	struct list_head	cl_cs_list;
>>   };
>>   
>>   /* struct nfs4_client_reset
>> -- 
>> 2.9.5
J. Bruce Fields March 29, 2022, 4:30 p.m. UTC | #3
On Tue, Mar 29, 2022 at 09:20:02AM -0700, dai.ngo@oracle.com wrote:
> 
> On 3/29/22 8:47 AM, J. Bruce Fields wrote:
> >On Thu, Mar 24, 2022 at 09:34:42PM -0700, Dai Ngo wrote:
> >>Update nfs4_client to add:
> >>  . cl_cs_client_state: courtesy client state
> >>  . cl_cs_lock: spinlock to synchronize access to cl_cs_client_state
> >>  . cl_cs_list: list used by laundromat to process courtesy clients
> >>
> >>Modify alloc_client to initialize these fields.
> >>
> >>Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> >>---
> >>  fs/nfsd/nfs4state.c |  2 ++
> >>  fs/nfsd/nfsd.h      |  1 +
> >>  fs/nfsd/state.h     | 33 +++++++++++++++++++++++++++++++++
> >>  3 files changed, 36 insertions(+)
> >>
> >>diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> >>index 234e852fcdfa..a65d59510681 100644
> >>--- a/fs/nfsd/nfs4state.c
> >>+++ b/fs/nfsd/nfs4state.c
> >>@@ -2009,12 +2009,14 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
> >>  	INIT_LIST_HEAD(&clp->cl_delegations);
> >>  	INIT_LIST_HEAD(&clp->cl_lru);
> >>  	INIT_LIST_HEAD(&clp->cl_revoked);
> >>+	INIT_LIST_HEAD(&clp->cl_cs_list);
> >>  #ifdef CONFIG_NFSD_PNFS
> >>  	INIT_LIST_HEAD(&clp->cl_lo_states);
> >>  #endif
> >>  	INIT_LIST_HEAD(&clp->async_copies);
> >>  	spin_lock_init(&clp->async_lock);
> >>  	spin_lock_init(&clp->cl_lock);
> >>+	spin_lock_init(&clp->cl_cs_lock);
> >>  	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
> >>  	return clp;
> >>  err_no_hashtbl:
> >>diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> >>index 4fc1fd639527..23996c6ca75e 100644
> >>--- a/fs/nfsd/nfsd.h
> >>+++ b/fs/nfsd/nfsd.h
> >>@@ -336,6 +336,7 @@ void		nfsd_lockd_shutdown(void);
> >>  #define COMPOUND_ERR_SLACK_SPACE	16     /* OP_SETATTR */
> >>  #define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
> >>+#define	NFSD_COURTESY_CLIENT_TIMEOUT	(24 * 60 * 60)	/* seconds */
> >>  /*
> >>   * The following attributes are currently not supported by the NFSv4 server:
> >>diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> >>index 95457cfd37fc..40e390abc842 100644
> >>--- a/fs/nfsd/state.h
> >>+++ b/fs/nfsd/state.h
> >>@@ -283,6 +283,35 @@ struct nfsd4_sessionid {
> >>  #define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
> >>  /*
> >>+ * CLIENT_  CLIENT_ CLIENT_
> >>+ * COURTESY EXPIRED RECONNECTED      Meaning                  Where set
> >>+ * -----------------------------------------------------------------------------
> >>+ * | false | false | false | Confirmed, active    | Default                    |
> >>+ * |---------------------------------------------------------------------------|
> >>+ * | true  | false | false | Courtesy state.      | nfs4_get_client_reaplist   |
> >>+ * |       |       |       | Lease/lock/share     |                            |
> >>+ * |       |       |       | reservation conflict |                            |
> >>+ * |       |       |       | can cause Courtesy   |                            |
> >>+ * |       |       |       | client to be expired |                            |
> >>+ * |---------------------------------------------------------------------------|
> >>+ * | false | true  | false | Courtesy client to be| nfs4_laundromat            |
> >>+ * |       |       |       | expired by Laundromat| nfsd4_lm_lock_expired      |
> >>+ * |       |       |       | due to conflict     | nfsd4_discard_courtesy_clnt |
> >>+ * |       |       |       |                      | nfsd4_expire_courtesy_clnt |
> >>+ * |---------------------------------------------------------------------------|
> >>+ * | false | false | true  | Courtesy client      | nfsd4_courtesy_clnt_expired|
> >>+ * |       |       |       | reconnected,         |                            |
> >>+ * |       |       |       | becoming active      |                            |
> >>+ * -----------------------------------------------------------------------------

By the way, where is a client returned to the normal (0) state?  That
has to happen at some point.

How is CLIENT_EXPIRED treated differently from cl_time == 0, and why?

Why are RECONNECTED clients discarded in so many cases?  (E.g. whenever
a bind_conn_to_session fails).

--b.


> >These are mutually exclusive values, not bits that may set to 0 or 1, so
> >the three boolean columns are confusing.  I'd just structure the table
> >like:
> >
> >	client state	meaning			where set
> >	0		Confirmed, active	Default
> >	CLIENT_COURTESY	Courtesy state....	nfs4_get_client_reaplist
> >	CLIENT_EXPIRED	Courtesy client to be..	nfs4_laundromat
> >
> >etc.
> 
> will fix in v19.
> 
> Thanks,
> -Dai
> 
> >
> >--b.
> >
> >>+ */
> >>+
> >>+enum courtesy_client_state {
> >>+	NFSD4_CLIENT_COURTESY = 1,
> >>+	NFSD4_CLIENT_EXPIRED,
> >>+	NFSD4_CLIENT_RECONNECTED,
> >>+};
> >>+
> >>+/*
> >>   * struct nfs4_client - one per client.  Clientids live here.
> >>   *
> >>   * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
> >>@@ -385,6 +414,10 @@ struct nfs4_client {
> >>  	struct list_head	async_copies;	/* list of async copies */
> >>  	spinlock_t		async_lock;	/* lock for async copies */
> >>  	atomic_t		cl_cb_inflight;	/* Outstanding callbacks */
> >>+
> >>+	enum courtesy_client_state	cl_cs_client_state;
> >>+	spinlock_t		cl_cs_lock;
> >>+	struct list_head	cl_cs_list;
> >>  };
> >>  /* struct nfs4_client_reset
> >>-- 
> >>2.9.5
J. Bruce Fields March 29, 2022, 4:42 p.m. UTC | #4
On Tue, Mar 29, 2022 at 12:30:11PM -0400, J. Bruce Fields wrote:
> On Tue, Mar 29, 2022 at 09:20:02AM -0700, dai.ngo@oracle.com wrote:
> > 
> > On 3/29/22 8:47 AM, J. Bruce Fields wrote:
> > >On Thu, Mar 24, 2022 at 09:34:42PM -0700, Dai Ngo wrote:
> > >>Update nfs4_client to add:
> > >>  . cl_cs_client_state: courtesy client state
> > >>  . cl_cs_lock: spinlock to synchronize access to cl_cs_client_state
> > >>  . cl_cs_list: list used by laundromat to process courtesy clients
> > >>
> > >>Modify alloc_client to initialize these fields.
> > >>
> > >>Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> > >>---
> > >>  fs/nfsd/nfs4state.c |  2 ++
> > >>  fs/nfsd/nfsd.h      |  1 +
> > >>  fs/nfsd/state.h     | 33 +++++++++++++++++++++++++++++++++
> > >>  3 files changed, 36 insertions(+)
> > >>
> > >>diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > >>index 234e852fcdfa..a65d59510681 100644
> > >>--- a/fs/nfsd/nfs4state.c
> > >>+++ b/fs/nfsd/nfs4state.c
> > >>@@ -2009,12 +2009,14 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
> > >>  	INIT_LIST_HEAD(&clp->cl_delegations);
> > >>  	INIT_LIST_HEAD(&clp->cl_lru);
> > >>  	INIT_LIST_HEAD(&clp->cl_revoked);
> > >>+	INIT_LIST_HEAD(&clp->cl_cs_list);
> > >>  #ifdef CONFIG_NFSD_PNFS
> > >>  	INIT_LIST_HEAD(&clp->cl_lo_states);
> > >>  #endif
> > >>  	INIT_LIST_HEAD(&clp->async_copies);
> > >>  	spin_lock_init(&clp->async_lock);
> > >>  	spin_lock_init(&clp->cl_lock);
> > >>+	spin_lock_init(&clp->cl_cs_lock);
> > >>  	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
> > >>  	return clp;
> > >>  err_no_hashtbl:
> > >>diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> > >>index 4fc1fd639527..23996c6ca75e 100644
> > >>--- a/fs/nfsd/nfsd.h
> > >>+++ b/fs/nfsd/nfsd.h
> > >>@@ -336,6 +336,7 @@ void		nfsd_lockd_shutdown(void);
> > >>  #define COMPOUND_ERR_SLACK_SPACE	16     /* OP_SETATTR */
> > >>  #define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
> > >>+#define	NFSD_COURTESY_CLIENT_TIMEOUT	(24 * 60 * 60)	/* seconds */
> > >>  /*
> > >>   * The following attributes are currently not supported by the NFSv4 server:
> > >>diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > >>index 95457cfd37fc..40e390abc842 100644
> > >>--- a/fs/nfsd/state.h
> > >>+++ b/fs/nfsd/state.h
> > >>@@ -283,6 +283,35 @@ struct nfsd4_sessionid {
> > >>  #define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
> > >>  /*
> > >>+ * CLIENT_  CLIENT_ CLIENT_
> > >>+ * COURTESY EXPIRED RECONNECTED      Meaning                  Where set
> > >>+ * -----------------------------------------------------------------------------
> > >>+ * | false | false | false | Confirmed, active    | Default                    |
> > >>+ * |---------------------------------------------------------------------------|
> > >>+ * | true  | false | false | Courtesy state.      | nfs4_get_client_reaplist   |
> > >>+ * |       |       |       | Lease/lock/share     |                            |
> > >>+ * |       |       |       | reservation conflict |                            |
> > >>+ * |       |       |       | can cause Courtesy   |                            |
> > >>+ * |       |       |       | client to be expired |                            |
> > >>+ * |---------------------------------------------------------------------------|
> > >>+ * | false | true  | false | Courtesy client to be| nfs4_laundromat            |
> > >>+ * |       |       |       | expired by Laundromat| nfsd4_lm_lock_expired      |
> > >>+ * |       |       |       | due to conflict     | nfsd4_discard_courtesy_clnt |
> > >>+ * |       |       |       |                      | nfsd4_expire_courtesy_clnt |
> > >>+ * |---------------------------------------------------------------------------|
> > >>+ * | false | false | true  | Courtesy client      | nfsd4_courtesy_clnt_expired|
> > >>+ * |       |       |       | reconnected,         |                            |
> > >>+ * |       |       |       | becoming active      |                            |
> > >>+ * -----------------------------------------------------------------------------
> 
> By the way, where is a client returned to the normal (0) state?  That
> has to happen at some point.
> 
> How is CLIENT_EXPIRED treated differently from cl_time == 0, and why?
> 
> Why are RECONNECTED clients discarded in so many cases?  (E.g. whenever
> a bind_conn_to_session fails).

A priori I just don't see how it can be right to treat a reconnected
client in any way differently from a normal confirmed client.

Once we've told the client that its lease is still good, we have to
treat it like any other client, don't we?

--b.
Dai Ngo March 29, 2022, 6:19 p.m. UTC | #5
On 3/29/22 9:30 AM, J. Bruce Fields wrote:
> On Tue, Mar 29, 2022 at 09:20:02AM -0700, dai.ngo@oracle.com wrote:
>> On 3/29/22 8:47 AM, J. Bruce Fields wrote:
>>> On Thu, Mar 24, 2022 at 09:34:42PM -0700, Dai Ngo wrote:
>>>> Update nfs4_client to add:
>>>>   . cl_cs_client_state: courtesy client state
>>>>   . cl_cs_lock: spinlock to synchronize access to cl_cs_client_state
>>>>   . cl_cs_list: list used by laundromat to process courtesy clients
>>>>
>>>> Modify alloc_client to initialize these fields.
>>>>
>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>>> ---
>>>>   fs/nfsd/nfs4state.c |  2 ++
>>>>   fs/nfsd/nfsd.h      |  1 +
>>>>   fs/nfsd/state.h     | 33 +++++++++++++++++++++++++++++++++
>>>>   3 files changed, 36 insertions(+)
>>>>
>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>>> index 234e852fcdfa..a65d59510681 100644
>>>> --- a/fs/nfsd/nfs4state.c
>>>> +++ b/fs/nfsd/nfs4state.c
>>>> @@ -2009,12 +2009,14 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
>>>>   	INIT_LIST_HEAD(&clp->cl_delegations);
>>>>   	INIT_LIST_HEAD(&clp->cl_lru);
>>>>   	INIT_LIST_HEAD(&clp->cl_revoked);
>>>> +	INIT_LIST_HEAD(&clp->cl_cs_list);
>>>>   #ifdef CONFIG_NFSD_PNFS
>>>>   	INIT_LIST_HEAD(&clp->cl_lo_states);
>>>>   #endif
>>>>   	INIT_LIST_HEAD(&clp->async_copies);
>>>>   	spin_lock_init(&clp->async_lock);
>>>>   	spin_lock_init(&clp->cl_lock);
>>>> +	spin_lock_init(&clp->cl_cs_lock);
>>>>   	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
>>>>   	return clp;
>>>>   err_no_hashtbl:
>>>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
>>>> index 4fc1fd639527..23996c6ca75e 100644
>>>> --- a/fs/nfsd/nfsd.h
>>>> +++ b/fs/nfsd/nfsd.h
>>>> @@ -336,6 +336,7 @@ void		nfsd_lockd_shutdown(void);
>>>>   #define COMPOUND_ERR_SLACK_SPACE	16     /* OP_SETATTR */
>>>>   #define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
>>>> +#define	NFSD_COURTESY_CLIENT_TIMEOUT	(24 * 60 * 60)	/* seconds */
>>>>   /*
>>>>    * The following attributes are currently not supported by the NFSv4 server:
>>>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>>>> index 95457cfd37fc..40e390abc842 100644
>>>> --- a/fs/nfsd/state.h
>>>> +++ b/fs/nfsd/state.h
>>>> @@ -283,6 +283,35 @@ struct nfsd4_sessionid {
>>>>   #define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
>>>>   /*
>>>> + * CLIENT_  CLIENT_ CLIENT_
>>>> + * COURTESY EXPIRED RECONNECTED      Meaning                  Where set
>>>> + * -----------------------------------------------------------------------------
>>>> + * | false | false | false | Confirmed, active    | Default                    |
>>>> + * |---------------------------------------------------------------------------|
>>>> + * | true  | false | false | Courtesy state.      | nfs4_get_client_reaplist   |
>>>> + * |       |       |       | Lease/lock/share     |                            |
>>>> + * |       |       |       | reservation conflict |                            |
>>>> + * |       |       |       | can cause Courtesy   |                            |
>>>> + * |       |       |       | client to be expired |                            |
>>>> + * |---------------------------------------------------------------------------|
>>>> + * | false | true  | false | Courtesy client to be| nfs4_laundromat            |
>>>> + * |       |       |       | expired by Laundromat| nfsd4_lm_lock_expired      |
>>>> + * |       |       |       | due to conflict     | nfsd4_discard_courtesy_clnt |
>>>> + * |       |       |       |                      | nfsd4_expire_courtesy_clnt |
>>>> + * |---------------------------------------------------------------------------|
>>>> + * | false | false | true  | Courtesy client      | nfsd4_courtesy_clnt_expired|
>>>> + * |       |       |       | reconnected,         |                            |
>>>> + * |       |       |       | becoming active      |                            |
>>>> + * -----------------------------------------------------------------------------
> By the way, where is a client returned to the normal (0) state?  That
> has to happen at some point.

For 4.1, a courtesy client reconnect is detected in nfsd4_sequence and
nfsd4_bind_conn_to_session. For 4.0, a courtesy client reconnect is
detected in set_client.

>
> How is CLIENT_EXPIRED treated differently from cl_time == 0, and why?

cl_time == 0 means the client is being destroyed (almost) immediately
either by the laundromat or force_expire_client.

CLIENT_EXPIRED means the client will be destroyed by the laundromat
and this depends on when the laundromat runs. When we set CLIENT_COURTESY
we don't clear cl_time since the client is not really expired yet.

We could replace CLIENT_EXPIRED with (cl_time == 0). However,
to set cl_time = 0 we need to acquire the nn->client_lock, which
causes a deadlock when we try to resolve lock conflicts
from nfs4_resolve_deny_conflicts_locked (fp->fi_lock -> nn->client_lock).
We use the cl_cs_lock to set CLIENT_EXPIRED.

>
> Why are RECONNECTED clients discarded in so many cases?  (E.g. whenever
> a bind_conn_to_session fails).

find_in_sessionid_hashtbl: we discard the courtesy client when it
reconnects and there is an error from nfsd4_get_session_locked. This
should be a rare condition so rather than reverting the client
state back to courtesy, it is simpler just to discard it.

nfsd4_create_session/find_confirmed_client: I think the only time
the courtesy client sends CREATE_SESSION, before sending the SEQUENCE
to reconnect after missing its leases, is when it wants to do clientid
trunking. This should be a rare condition so instead of dealing
with it we just do not allow it and discard the client for now.

nfsd4_destroy_clientid/find_confirmed_client: instead of destroying
the courtesy client here we just let the laundromat destroy it
as if the client already expired.

nfsd4_setclientid_confirm/find_confirmed_client: there should not
be any courtesy client found from nfsd4_setclientid_confirm, it
should be detected and discarded in nfsd4_setclientid.

-Dai

>>> These are mutually exclusive values, not bits that may set to 0 or 1, so
>>> the three boolean columns are confusing.  I'd just structure the table
>>> like:
>>>
>>> 	client state	meaning			where set
>>> 	0		Confirmed, active	Default
>>> 	CLIENT_COURTESY	Courtesy state....	nfs4_get_client_reaplist
>>> 	CLIENT_EXPIRED	Courtesy client to be..	nfs4_laundromat
>>>
>>> etc.
>> will fix in v19.
>>
>> Thanks,
>> -Dai
>>
>>> --b.
>>>
>>>> + */
>>>> +
>>>> +enum courtesy_client_state {
>>>> +	NFSD4_CLIENT_COURTESY = 1,
>>>> +	NFSD4_CLIENT_EXPIRED,
>>>> +	NFSD4_CLIENT_RECONNECTED,
>>>> +};
>>>> +
>>>> +/*
>>>>    * struct nfs4_client - one per client.  Clientids live here.
>>>>    *
>>>>    * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
>>>> @@ -385,6 +414,10 @@ struct nfs4_client {
>>>>   	struct list_head	async_copies;	/* list of async copies */
>>>>   	spinlock_t		async_lock;	/* lock for async copies */
>>>>   	atomic_t		cl_cb_inflight;	/* Outstanding callbacks */
>>>> +
>>>> +	enum courtesy_client_state	cl_cs_client_state;
>>>> +	spinlock_t		cl_cs_lock;
>>>> +	struct list_head	cl_cs_list;
>>>>   };
>>>>   /* struct nfs4_client_reset
>>>> -- 
>>>> 2.9.5
J. Bruce Fields March 29, 2022, 6:39 p.m. UTC | #6
On Tue, Mar 29, 2022 at 11:19:51AM -0700, dai.ngo@oracle.com wrote:
> 
> On 3/29/22 9:30 AM, J. Bruce Fields wrote:
> >On Tue, Mar 29, 2022 at 09:20:02AM -0700, dai.ngo@oracle.com wrote:
> >>On 3/29/22 8:47 AM, J. Bruce Fields wrote:
> >>>On Thu, Mar 24, 2022 at 09:34:42PM -0700, Dai Ngo wrote:
> >>>>Update nfs4_client to add:
> >>>>  . cl_cs_client_state: courtesy client state
> >>>>  . cl_cs_lock: spinlock to synchronize access to cl_cs_client_state
> >>>>  . cl_cs_list: list used by laundromat to process courtesy clients
> >>>>
> >>>>Modify alloc_client to initialize these fields.
> >>>>
> >>>>Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> >>>>---
> >>>>  fs/nfsd/nfs4state.c |  2 ++
> >>>>  fs/nfsd/nfsd.h      |  1 +
> >>>>  fs/nfsd/state.h     | 33 +++++++++++++++++++++++++++++++++
> >>>>  3 files changed, 36 insertions(+)
> >>>>
> >>>>diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> >>>>index 234e852fcdfa..a65d59510681 100644
> >>>>--- a/fs/nfsd/nfs4state.c
> >>>>+++ b/fs/nfsd/nfs4state.c
> >>>>@@ -2009,12 +2009,14 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
> >>>>  	INIT_LIST_HEAD(&clp->cl_delegations);
> >>>>  	INIT_LIST_HEAD(&clp->cl_lru);
> >>>>  	INIT_LIST_HEAD(&clp->cl_revoked);
> >>>>+	INIT_LIST_HEAD(&clp->cl_cs_list);
> >>>>  #ifdef CONFIG_NFSD_PNFS
> >>>>  	INIT_LIST_HEAD(&clp->cl_lo_states);
> >>>>  #endif
> >>>>  	INIT_LIST_HEAD(&clp->async_copies);
> >>>>  	spin_lock_init(&clp->async_lock);
> >>>>  	spin_lock_init(&clp->cl_lock);
> >>>>+	spin_lock_init(&clp->cl_cs_lock);
> >>>>  	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
> >>>>  	return clp;
> >>>>  err_no_hashtbl:
> >>>>diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> >>>>index 4fc1fd639527..23996c6ca75e 100644
> >>>>--- a/fs/nfsd/nfsd.h
> >>>>+++ b/fs/nfsd/nfsd.h
> >>>>@@ -336,6 +336,7 @@ void		nfsd_lockd_shutdown(void);
> >>>>  #define COMPOUND_ERR_SLACK_SPACE	16     /* OP_SETATTR */
> >>>>  #define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
> >>>>+#define	NFSD_COURTESY_CLIENT_TIMEOUT	(24 * 60 * 60)	/* seconds */
> >>>>  /*
> >>>>   * The following attributes are currently not supported by the NFSv4 server:
> >>>>diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> >>>>index 95457cfd37fc..40e390abc842 100644
> >>>>--- a/fs/nfsd/state.h
> >>>>+++ b/fs/nfsd/state.h
> >>>>@@ -283,6 +283,35 @@ struct nfsd4_sessionid {
> >>>>  #define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
> >>>>  /*
> >>>>+ * CLIENT_  CLIENT_ CLIENT_
> >>>>+ * COURTESY EXPIRED RECONNECTED      Meaning                  Where set
> >>>>+ * -----------------------------------------------------------------------------
> >>>>+ * | false | false | false | Confirmed, active    | Default                    |
> >>>>+ * |---------------------------------------------------------------------------|
> >>>>+ * | true  | false | false | Courtesy state.      | nfs4_get_client_reaplist   |
> >>>>+ * |       |       |       | Lease/lock/share     |                            |
> >>>>+ * |       |       |       | reservation conflict |                            |
> >>>>+ * |       |       |       | can cause Courtesy   |                            |
> >>>>+ * |       |       |       | client to be expired |                            |
> >>>>+ * |---------------------------------------------------------------------------|
> >>>>+ * | false | true  | false | Courtesy client to be| nfs4_laundromat            |
> >>>>+ * |       |       |       | expired by Laundromat| nfsd4_lm_lock_expired      |
> >>>>+ * |       |       |       | due to conflict     | nfsd4_discard_courtesy_clnt |
> >>>>+ * |       |       |       |                      | nfsd4_expire_courtesy_clnt |
> >>>>+ * |---------------------------------------------------------------------------|
> >>>>+ * | false | false | true  | Courtesy client      | nfsd4_courtesy_clnt_expired|
> >>>>+ * |       |       |       | reconnected,         |                            |
> >>>>+ * |       |       |       | becoming active      |                            |
> >>>>+ * -----------------------------------------------------------------------------
> >By the way, where is a client returned to the normal (0) state?  That
> >has to happen at some point.
> 
> For 4.1 courtesy client reconnects is detected in nfsd4_sequence,
> nfsd4_bind_conn_to_session.

Those are the places where NFSD4_CLIENT_RECONNECTED is set, which isn't
the question I asked.

> >Why are RECONNECTED clients discarded in so many cases?  (E.g. whenever
> >a bind_conn_to_session fails).
> 
> find_in_sessionid_hashtbl: we discard the courtesy client when it
> reconnects and there is an error from nfsd4_get_session_locked. This
> should be a rare condition, so rather than reverting the client
> state back to courtesy, it is simpler just to discard it.

That may be a rare situation, but I don't believe the behavior of
discarding the client in this case is correct.

> nfsd4_create_session/find_confirmed_client: I think the only time
> the courtesy client sends CREATE_SESSION, before sending the SEQUENCE
> to reconnect after missing its leases, is when it wants to do clientid
> trunking. This should be a rare condition so instead of dealing
> with it we just do not allow it and discard the client for now.

We can't wave away incorrect behavior with "but it's rare".  Users with
heavy and/or unusual workloads hit rare conditions.  Clients may change
their behavior over time.  (E.g., trunking may become more common.)

--b.

> nfsd4_destroy_clientid/find_confirmed_client: instead of destroying
> the courtesy client here, we just let the laundromat destroy it
> as if the client had already expired.
> 
> nfsd4_setclientid_confirm/find_confirmed_client: there should not
> be any courtesy client found from nfsd4_setclientid_confirm, it
> should be detected and discarded in nfsd4_setclientid.
> 
> -Dai
> 
> >>>These are mutually exclusive values, not bits that may set to 0 or 1, so
> >>>the three boolean columns are confusing.  I'd just structure the table
> >>>like:
> >>>
> >>>	client state	meaning			where set
> >>>	0		Confirmed, active	Default
> >>>	CLIENT_COURTESY	Courtesy state....	nfs4_get_client_reaplist
> >>>	CLIENT_EXPIRED	Courtesy client to be..	nfs4_laundromat
> >>>
> >>>etc.
> >>will fix in v19.
> >>
> >>Thanks,
> >>-Dai
> >>
> >>>--b.
> >>>
> >>>>+ */
> >>>>+
> >>>>+enum courtesy_client_state {
> >>>>+	NFSD4_CLIENT_COURTESY = 1,
> >>>>+	NFSD4_CLIENT_EXPIRED,
> >>>>+	NFSD4_CLIENT_RECONNECTED,
> >>>>+};
> >>>>+
> >>>>+/*
> >>>>   * struct nfs4_client - one per client.  Clientids live here.
> >>>>   *
> >>>>   * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
> >>>>@@ -385,6 +414,10 @@ struct nfs4_client {
> >>>>  	struct list_head	async_copies;	/* list of async copies */
> >>>>  	spinlock_t		async_lock;	/* lock for async copies */
> >>>>  	atomic_t		cl_cb_inflight;	/* Outstanding callbacks */
> >>>>+
> >>>>+	enum courtesy_client_state	cl_cs_client_state;
> >>>>+	spinlock_t		cl_cs_lock;
> >>>>+	struct list_head	cl_cs_list;
> >>>>  };
> >>>>  /* struct nfs4_client_reset
> >>>>-- 
> >>>>2.9.5
Chuck Lever March 29, 2022, 7:32 p.m. UTC | #7
> On Mar 29, 2022, at 2:39 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
> 
> On Tue, Mar 29, 2022 at 11:19:51AM -0700, dai.ngo@oracle.com wrote:
>> 
>> On 3/29/22 9:30 AM, J. Bruce Fields wrote:
>>> On Tue, Mar 29, 2022 at 09:20:02AM -0700, dai.ngo@oracle.com wrote:
>>>> On 3/29/22 8:47 AM, J. Bruce Fields wrote:
>>>>> On Thu, Mar 24, 2022 at 09:34:42PM -0700, Dai Ngo wrote:
>>>>>> Update nfs4_client to add:
>>>>>> . cl_cs_client_state: courtesy client state
>>>>>> . cl_cs_lock: spinlock to synchronize access to cl_cs_client_state
>>>>>> . cl_cs_list: list used by laundromat to process courtesy clients
>>>>>> 
>>>>>> Modify alloc_client to initialize these fields.
>>>>>> 
>>>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>>>>> ---
>>>>>> fs/nfsd/nfs4state.c |  2 ++
>>>>>> fs/nfsd/nfsd.h      |  1 +
>>>>>> fs/nfsd/state.h     | 33 +++++++++++++++++++++++++++++++++
>>>>>> 3 files changed, 36 insertions(+)
>>>>>> 
>>>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>>>>> index 234e852fcdfa..a65d59510681 100644
>>>>>> --- a/fs/nfsd/nfs4state.c
>>>>>> +++ b/fs/nfsd/nfs4state.c
>>>>>> @@ -2009,12 +2009,14 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
>>>>>> 	INIT_LIST_HEAD(&clp->cl_delegations);
>>>>>> 	INIT_LIST_HEAD(&clp->cl_lru);
>>>>>> 	INIT_LIST_HEAD(&clp->cl_revoked);
>>>>>> +	INIT_LIST_HEAD(&clp->cl_cs_list);
>>>>>> #ifdef CONFIG_NFSD_PNFS
>>>>>> 	INIT_LIST_HEAD(&clp->cl_lo_states);
>>>>>> #endif
>>>>>> 	INIT_LIST_HEAD(&clp->async_copies);
>>>>>> 	spin_lock_init(&clp->async_lock);
>>>>>> 	spin_lock_init(&clp->cl_lock);
>>>>>> +	spin_lock_init(&clp->cl_cs_lock);
>>>>>> 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
>>>>>> 	return clp;
>>>>>> err_no_hashtbl:
>>>>>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
>>>>>> index 4fc1fd639527..23996c6ca75e 100644
>>>>>> --- a/fs/nfsd/nfsd.h
>>>>>> +++ b/fs/nfsd/nfsd.h
>>>>>> @@ -336,6 +336,7 @@ void		nfsd_lockd_shutdown(void);
>>>>>> #define COMPOUND_ERR_SLACK_SPACE	16     /* OP_SETATTR */
>>>>>> #define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
>>>>>> +#define	NFSD_COURTESY_CLIENT_TIMEOUT	(24 * 60 * 60)	/* seconds */
>>>>>> /*
>>>>>>  * The following attributes are currently not supported by the NFSv4 server:
>>>>>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>>>>>> index 95457cfd37fc..40e390abc842 100644
>>>>>> --- a/fs/nfsd/state.h
>>>>>> +++ b/fs/nfsd/state.h
>>>>>> @@ -283,6 +283,35 @@ struct nfsd4_sessionid {
>>>>>> #define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
>>>>>> /*
>>>>>> + * CLIENT_  CLIENT_ CLIENT_
>>>>>> + * COURTESY EXPIRED RECONNECTED      Meaning                  Where set
>>>>>> + * -----------------------------------------------------------------------------
>>>>>> + * | false | false | false | Confirmed, active    | Default                    |
>>>>>> + * |---------------------------------------------------------------------------|
>>>>>> + * | true  | false | false | Courtesy state.      | nfs4_get_client_reaplist   |
>>>>>> + * |       |       |       | Lease/lock/share     |                            |
>>>>>> + * |       |       |       | reservation conflict |                            |
>>>>>> + * |       |       |       | can cause Courtesy   |                            |
>>>>>> + * |       |       |       | client to be expired |                            |
>>>>>> + * |---------------------------------------------------------------------------|
>>>>>> + * | false | true  | false | Courtesy client to be| nfs4_laundromat            |
>>>>>> + * |       |       |       | expired by Laundromat| nfsd4_lm_lock_expired      |
>>>>>> + * |       |       |       | due to conflict     | nfsd4_discard_courtesy_clnt |
>>>>>> + * |       |       |       |                      | nfsd4_expire_courtesy_clnt |
>>>>>> + * |---------------------------------------------------------------------------|
>>>>>> + * | false | false | true  | Courtesy client      | nfsd4_courtesy_clnt_expired|
>>>>>> + * |       |       |       | reconnected,         |                            |
>>>>>> + * |       |       |       | becoming active      |                            |
>>>>>> + * -----------------------------------------------------------------------------
>>> By the way, where is a client returned to the normal (0) state?  That
>>> has to happen at some point.
>> 
>> For 4.1 courtesy client reconnects is detected in nfsd4_sequence,
>> nfsd4_bind_conn_to_session.
> 
> Those are the places where NFSD4_CLIENT_RECONNECTED is set, which isn't
> the question I asked.

"reconnected" simply means the client has gotten back in touch.

The server then has to decide whether to allow the client to
become active again or it needs to purge it. That decision
is different for each operation and minor version. Look for
"if (cl_cs_client_state == NFSD4_CLIENT_RECONNECTED)" for how
those choices are made.


>>> Why are RECONNECTED clients discarded in so many cases?  (E.g. whenever
>>> a bind_conn_to_session fails).
>> 
>> find_in_sessionid_hashtbl: we discard the courtesy client when it
>> reconnects and there is error from nfsd4_get_session_locked. This
>> should be a rare condition so rather than reverting the client
>> state back to courtesy, it is simpler just to discard it.
> 
> That may be a rare situation, but I don't believe the behavior of
> discarding the client in this case is correct.

Can you explain this? It's a courtesy client... the server can
decide it's expired at that point, can't it? IOW what breaks?


>> nfsd4_create_session/find_confirmed_client: I think the only time
>> the courtesy client sends CREATE_SESSION, before sending the SEQUENCE
>> to reconnect after missing its leases, is when it wants to do clientid
>> trunking. This should be a rare condition so instead of dealing
>> with it we just do not allow it and discard the client for now.
> 
> We can't wave away incorrect behavior with "but it's rare".  Users with
> heavy and/or unusual workloads hit rare conditions.  Clients may change
> their behavior over time.  (E.g., trunking may become more common.)


--
Chuck Lever
J. Bruce Fields March 29, 2022, 7:49 p.m. UTC | #8
On Tue, Mar 29, 2022 at 07:32:57PM +0000, Chuck Lever III wrote:
> 
> 
> > On Mar 29, 2022, at 2:39 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
> > 
> > On Tue, Mar 29, 2022 at 11:19:51AM -0700, dai.ngo@oracle.com wrote:
> >> 
> >> On 3/29/22 9:30 AM, J. Bruce Fields wrote:
> >>> On Tue, Mar 29, 2022 at 09:20:02AM -0700, dai.ngo@oracle.com wrote:
> >>>> On 3/29/22 8:47 AM, J. Bruce Fields wrote:
> >>>>> On Thu, Mar 24, 2022 at 09:34:42PM -0700, Dai Ngo wrote:
> >>>>>> Update nfs4_client to add:
> >>>>>> . cl_cs_client_state: courtesy client state
> >>>>>> . cl_cs_lock: spinlock to synchronize access to cl_cs_client_state
> >>>>>> . cl_cs_list: list used by laundromat to process courtesy clients
> >>>>>> 
> >>>>>> Modify alloc_client to initialize these fields.
> >>>>>> 
> >>>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> >>>>>> ---
> >>>>>> fs/nfsd/nfs4state.c |  2 ++
> >>>>>> fs/nfsd/nfsd.h      |  1 +
> >>>>>> fs/nfsd/state.h     | 33 +++++++++++++++++++++++++++++++++
> >>>>>> 3 files changed, 36 insertions(+)
> >>>>>> 
> >>>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> >>>>>> index 234e852fcdfa..a65d59510681 100644
> >>>>>> --- a/fs/nfsd/nfs4state.c
> >>>>>> +++ b/fs/nfsd/nfs4state.c
> >>>>>> @@ -2009,12 +2009,14 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
> >>>>>> 	INIT_LIST_HEAD(&clp->cl_delegations);
> >>>>>> 	INIT_LIST_HEAD(&clp->cl_lru);
> >>>>>> 	INIT_LIST_HEAD(&clp->cl_revoked);
> >>>>>> +	INIT_LIST_HEAD(&clp->cl_cs_list);
> >>>>>> #ifdef CONFIG_NFSD_PNFS
> >>>>>> 	INIT_LIST_HEAD(&clp->cl_lo_states);
> >>>>>> #endif
> >>>>>> 	INIT_LIST_HEAD(&clp->async_copies);
> >>>>>> 	spin_lock_init(&clp->async_lock);
> >>>>>> 	spin_lock_init(&clp->cl_lock);
> >>>>>> +	spin_lock_init(&clp->cl_cs_lock);
> >>>>>> 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
> >>>>>> 	return clp;
> >>>>>> err_no_hashtbl:
> >>>>>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> >>>>>> index 4fc1fd639527..23996c6ca75e 100644
> >>>>>> --- a/fs/nfsd/nfsd.h
> >>>>>> +++ b/fs/nfsd/nfsd.h
> >>>>>> @@ -336,6 +336,7 @@ void		nfsd_lockd_shutdown(void);
> >>>>>> #define COMPOUND_ERR_SLACK_SPACE	16     /* OP_SETATTR */
> >>>>>> #define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
> >>>>>> +#define	NFSD_COURTESY_CLIENT_TIMEOUT	(24 * 60 * 60)	/* seconds */
> >>>>>> /*
> >>>>>>  * The following attributes are currently not supported by the NFSv4 server:
> >>>>>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> >>>>>> index 95457cfd37fc..40e390abc842 100644
> >>>>>> --- a/fs/nfsd/state.h
> >>>>>> +++ b/fs/nfsd/state.h
> >>>>>> @@ -283,6 +283,35 @@ struct nfsd4_sessionid {
> >>>>>> #define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
> >>>>>> /*
> >>>>>> + * CLIENT_  CLIENT_ CLIENT_
> >>>>>> + * COURTESY EXPIRED RECONNECTED      Meaning                  Where set
> >>>>>> + * -----------------------------------------------------------------------------
> >>>>>> + * | false | false | false | Confirmed, active    | Default                    |
> >>>>>> + * |---------------------------------------------------------------------------|
> >>>>>> + * | true  | false | false | Courtesy state.      | nfs4_get_client_reaplist   |
> >>>>>> + * |       |       |       | Lease/lock/share     |                            |
> >>>>>> + * |       |       |       | reservation conflict |                            |
> >>>>>> + * |       |       |       | can cause Courtesy   |                            |
> >>>>>> + * |       |       |       | client to be expired |                            |
> >>>>>> + * |---------------------------------------------------------------------------|
> >>>>>> + * | false | true  | false | Courtesy client to be| nfs4_laundromat            |
> >>>>>> + * |       |       |       | expired by Laundromat| nfsd4_lm_lock_expired      |
> >>>>>> + * |       |       |       | due to conflict     | nfsd4_discard_courtesy_clnt |
> >>>>>> + * |       |       |       |                      | nfsd4_expire_courtesy_clnt |
> >>>>>> + * |---------------------------------------------------------------------------|
> >>>>>> + * | false | false | true  | Courtesy client      | nfsd4_courtesy_clnt_expired|
> >>>>>> + * |       |       |       | reconnected,         |                            |
> >>>>>> + * |       |       |       | becoming active      |                            |
> >>>>>> + * -----------------------------------------------------------------------------
> >>> By the way, where is a client returned to the normal (0) state?  That
> >>> has to happen at some point.
> >> 
> >> For 4.1 courtesy client reconnects is detected in nfsd4_sequence,
> >> nfsd4_bind_conn_to_session.
> > 
> > Those are the places where NFSD4_CLIENT_RECONNECTED is set, which isn't
> > the question I asked.
> 
> "reconnected" simply means the client has gotten back in touch.

Again, my question was: when is cl_cs_client_state set back to 0?  As
far as I can tell, the answer is never.  That means, even long after the
client has reconnected, it's left in a weird state where it can be
suddenly expired for all sorts of reasons.

> The server then has to decide whether to allow the client to
> become active again or it needs to purge it. That decision
> is different for each operation and minor version. Look for
> "if (cl_cs_client_state == NFSD4_CLIENT_RECONNECTED)" for how
> those choices are made.
> 
> 
> >>> Why are RECONNECTED clients discarded in so many cases?  (E.g. whenever
> >>> a bind_conn_to_session fails).
> >> 
> >> find_in_sessionid_hashtbl: we discard the courtesy client when it
> >> reconnects and there is error from nfsd4_get_session_locked. This
> >> should be a rare condition so rather than reverting the client
> >> state back to courtesy, it is simpler just to discard it.
> > 
> > That may be a rare situation, but I don't believe the behavior of
> > discarding the client in this case is correct.
> 
> Can you explain this? It's a courtesy client... the server can
> decide it's expired at that point, can't it? IOW what breaks?

I'm not worried about courtesy clients, I'm worried about clients that
were courtesy clients but have since successfully renewed their state.
Expiring them for a failed bind_conn_to_session isn't right.

--b.

> 
> 
> >> nfsd4_create_session/find_confirmed_client: I think the only time
> >> the courtesy client sends CREATE_SESSION, before sending the SEQUENCE
> >> to reconnect after missing its leases, is when it wants to do clientid
> >> trunking. This should be a rare condition so instead of dealing
> >> with it we just do not allow it and discard the client for now.
> > 
> > We can't wave away incorrect behavior with "but it's rare".  Users with
> > heavy and/or unusual workloads hit rare conditions.  Clients may change
> > their behavior over time.  (E.g., trunking may become more common.)
> 
> 
> --
> Chuck Lever
> 
>
Chuck Lever March 29, 2022, 7:58 p.m. UTC | #9
> On Mar 29, 2022, at 3:49 PM, Bruce Fields <bfields@fieldses.org> wrote:
> 
> On Tue, Mar 29, 2022 at 07:32:57PM +0000, Chuck Lever III wrote:
>> 
>> 
>>> On Mar 29, 2022, at 2:39 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
>>> 
>>> On Tue, Mar 29, 2022 at 11:19:51AM -0700, dai.ngo@oracle.com wrote:
>>>> 
>>>> On 3/29/22 9:30 AM, J. Bruce Fields wrote:
>>>>> On Tue, Mar 29, 2022 at 09:20:02AM -0700, dai.ngo@oracle.com wrote:
>>>>>> On 3/29/22 8:47 AM, J. Bruce Fields wrote:
>>>>>>> On Thu, Mar 24, 2022 at 09:34:42PM -0700, Dai Ngo wrote:
>>>>>>>> Update nfs4_client to add:
>>>>>>>> . cl_cs_client_state: courtesy client state
>>>>>>>> . cl_cs_lock: spinlock to synchronize access to cl_cs_client_state
>>>>>>>> . cl_cs_list: list used by laundromat to process courtesy clients
>>>>>>>> 
>>>>>>>> Modify alloc_client to initialize these fields.
>>>>>>>> 
>>>>>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>>>>>>> ---
>>>>>>>> fs/nfsd/nfs4state.c |  2 ++
>>>>>>>> fs/nfsd/nfsd.h      |  1 +
>>>>>>>> fs/nfsd/state.h     | 33 +++++++++++++++++++++++++++++++++
>>>>>>>> 3 files changed, 36 insertions(+)
>>>>>>>> 
>>>>>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>>>>>>> index 234e852fcdfa..a65d59510681 100644
>>>>>>>> --- a/fs/nfsd/nfs4state.c
>>>>>>>> +++ b/fs/nfsd/nfs4state.c
>>>>>>>> @@ -2009,12 +2009,14 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
>>>>>>>> 	INIT_LIST_HEAD(&clp->cl_delegations);
>>>>>>>> 	INIT_LIST_HEAD(&clp->cl_lru);
>>>>>>>> 	INIT_LIST_HEAD(&clp->cl_revoked);
>>>>>>>> +	INIT_LIST_HEAD(&clp->cl_cs_list);
>>>>>>>> #ifdef CONFIG_NFSD_PNFS
>>>>>>>> 	INIT_LIST_HEAD(&clp->cl_lo_states);
>>>>>>>> #endif
>>>>>>>> 	INIT_LIST_HEAD(&clp->async_copies);
>>>>>>>> 	spin_lock_init(&clp->async_lock);
>>>>>>>> 	spin_lock_init(&clp->cl_lock);
>>>>>>>> +	spin_lock_init(&clp->cl_cs_lock);
>>>>>>>> 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
>>>>>>>> 	return clp;
>>>>>>>> err_no_hashtbl:
>>>>>>>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
>>>>>>>> index 4fc1fd639527..23996c6ca75e 100644
>>>>>>>> --- a/fs/nfsd/nfsd.h
>>>>>>>> +++ b/fs/nfsd/nfsd.h
>>>>>>>> @@ -336,6 +336,7 @@ void		nfsd_lockd_shutdown(void);
>>>>>>>> #define COMPOUND_ERR_SLACK_SPACE	16     /* OP_SETATTR */
>>>>>>>> #define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
>>>>>>>> +#define	NFSD_COURTESY_CLIENT_TIMEOUT	(24 * 60 * 60)	/* seconds */
>>>>>>>> /*
>>>>>>>> * The following attributes are currently not supported by the NFSv4 server:
>>>>>>>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>>>>>>>> index 95457cfd37fc..40e390abc842 100644
>>>>>>>> --- a/fs/nfsd/state.h
>>>>>>>> +++ b/fs/nfsd/state.h
>>>>>>>> @@ -283,6 +283,35 @@ struct nfsd4_sessionid {
>>>>>>>> #define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
>>>>>>>> /*
>>>>>>>> + * CLIENT_  CLIENT_ CLIENT_
>>>>>>>> + * COURTESY EXPIRED RECONNECTED      Meaning                  Where set
>>>>>>>> + * -----------------------------------------------------------------------------
>>>>>>>> + * | false | false | false | Confirmed, active    | Default                    |
>>>>>>>> + * |---------------------------------------------------------------------------|
>>>>>>>> + * | true  | false | false | Courtesy state.      | nfs4_get_client_reaplist   |
>>>>>>>> + * |       |       |       | Lease/lock/share     |                            |
>>>>>>>> + * |       |       |       | reservation conflict |                            |
>>>>>>>> + * |       |       |       | can cause Courtesy   |                            |
>>>>>>>> + * |       |       |       | client to be expired |                            |
>>>>>>>> + * |---------------------------------------------------------------------------|
>>>>>>>> + * | false | true  | false | Courtesy client to be| nfs4_laundromat            |
>>>>>>>> + * |       |       |       | expired by Laundromat| nfsd4_lm_lock_expired      |
>>>>>>>> + * |       |       |       | due to conflict     | nfsd4_discard_courtesy_clnt |
>>>>>>>> + * |       |       |       |                      | nfsd4_expire_courtesy_clnt |
>>>>>>>> + * |---------------------------------------------------------------------------|
>>>>>>>> + * | false | false | true  | Courtesy client      | nfsd4_courtesy_clnt_expired|
>>>>>>>> + * |       |       |       | reconnected,         |                            |
>>>>>>>> + * |       |       |       | becoming active      |                            |
>>>>>>>> + * -----------------------------------------------------------------------------
>>>>> By the way, where is a client returned to the normal (0) state?  That
>>>>> has to happen at some point.
>>>> 
>>>> For 4.1 courtesy client reconnects is detected in nfsd4_sequence,
>>>> nfsd4_bind_conn_to_session.
>>> 
>>> Those are the places where NFSD4_CLIENT_RECONNECTED is set, which isn't
>>> the question I asked.
>> 
>> "reconnected" simply means the client has gotten back in touch.
> 
> Again, my question was: when is cl_cs_client_state set back to 0?  As
> far as I can tell, the answer is never.  That means, even long after the
> client has reconnected, it's left in a weird state where it can be
> suddenly expired for all sorts of reasons.

Got it. Agreed, cl_cs_client_state should be reinitialized if
a courtesy client is transitioned back to "active".

Dai, would you add

+enum courtesy_client_state {
>>>	NFSD4_CLIENT_ACTIVE = 0,
+	NFSD4_CLIENT_COURTESY,
+	NFSD4_CLIENT_EXPIRED,
+	NFSD4_CLIENT_RECONNECTED,
+};

And set cl_cs_client_state to ACTIVE where the client is
allowed to transition back to being active?


>> The server then has to decide whether to allow the client to
>> become active again or it needs to purge it. That decision
>> is different for each operation and minor version. Look for
>> "if (cl_cs_client_state == NFSD4_CLIENT_RECONNECTED)" for how
>> those choices are made.
>> 
>> 
>>>>> Why are RECONNECTED clients discarded in so many cases?  (E.g. whenever
>>>>> a bind_conn_to_session fails).
>>>> 
>>>> find_in_sessionid_hashtbl: we discard the courtesy client when it
>>>> reconnects and there is error from nfsd4_get_session_locked. This
>>>> should be a rare condition so rather than reverting the client
>>>> state back to courtesy, it is simpler just to discard it.
>>> 
>>> That may be a rare situation, but I don't believe the behavior of
>>> discarding the client in this case is correct.
>> 
>> Can you explain this? It's a courtesy client... the server can
>> decide it's expired at that point, can't it? IOW what breaks?
> 
> I'm not worried about courtesy clients, I'm worried about clients that
> were courtesy clients but have since successfully renewed their state.
> Expiring them for a failed bind_conn_to_session isn't right.


--
Chuck Lever
J. Bruce Fields March 29, 2022, 8:01 p.m. UTC | #10
On Tue, Mar 29, 2022 at 07:58:46PM +0000, Chuck Lever III wrote:
> Got it. Agreed, cl_cs_client_state should be reinitialized if
> a courtesy client is transitioned back to "active".
> 
> Dai, would you add
> 
> +enum courtesy_client_state {
> >>>	NFSD4_CLIENT_ACTIVE = 0,
> +	NFSD4_CLIENT_COURTESY,
> +	NFSD4_CLIENT_EXPIRED,
> +	NFSD4_CLIENT_RECONNECTED,
> +};
> 
> And set cl_cs_client_state to ACTIVE where the client is
> allowed to transition back to being active?

I'm not clear then what the RECONNECTED->ACTIVE transition would be.

My feeling is that the RECONNECTED state shouldn't exist, and that there
should only be a transition of EXPIRED back to ACTIVE.

--b.
Chuck Lever March 29, 2022, 8:20 p.m. UTC | #11
> On Mar 29, 2022, at 4:01 PM, Bruce Fields <bfields@fieldses.org> wrote:
> 
> On Tue, Mar 29, 2022 at 07:58:46PM +0000, Chuck Lever III wrote:
>> Got it. Agreed, cl_cs_client_state should be reinitialized if
>> a courtesy client is transitioned back to "active".
>> 
>> Dai, would you add
>> 
>> +enum courtesy_client_state {
>>>>> 	NFSD4_CLIENT_ACTIVE = 0,
>> +	NFSD4_CLIENT_COURTESY,
>> +	NFSD4_CLIENT_EXPIRED,
>> +	NFSD4_CLIENT_RECONNECTED,
>> +};
>> 
>> And set cl_cs_client_state to ACTIVE where the client is
>> allowed to transition back to being active?
> 
> I'm not clear then what the RECONNECTED->ACTIVE transition would be.
> 
> My feeling is that the RECONNECTED state shouldn't exist, and that there
> should only be a transition of EXPIRED back to ACTIVE.

Audit the places that check for NFSD4_CLIENT_RECONNECTED.
Some of them will expire a reconnected client, some will
let it transition back to active. My impression from Dai
was that the server cannot transition a courtesy client
back to active in _every_ case.

If you can demonstrate that in every case where RECONNECTED
is found that a client should be transitioned to ACTIVE
rather than discarded, then yes, we should get rid of
RECONNECTED in favor of going from COURTESY -> ACTIVE.


--
Chuck Lever
Dai Ngo March 29, 2022, 8:50 p.m. UTC | #12
On 3/29/22 1:01 PM, Bruce Fields wrote:
> On Tue, Mar 29, 2022 at 07:58:46PM +0000, Chuck Lever III wrote:
>> Got it. Agreed, cl_cs_client_state should be reinitialized if
>> a courtesy client is transitioned back to "active".
>>
>> Dai, would you add
>>
>> +enum courtesy_client_state {
>>>>> 	NFSD4_CLIENT_ACTIVE = 0,
>> +	NFSD4_CLIENT_COURTESY,
>> +	NFSD4_CLIENT_EXPIRED,
>> +	NFSD4_CLIENT_RECONNECTED,
>> +};
>>
>> And set cl_cs_client_state to ACTIVE where the client is
>> allowed to transition back to being active?

fix in v19.

> I'm not clear then what the RECONNECTED->ACTIVE transition would be.
>
> My feeling is that the RECONNECTED state shouldn't exist, and that there
> should only be a transition of EXPIRED back to ACTIVE.

For the client to be truly active, we need to create the client record.
We do not want to create the client record when we merely detect that
the client has reconnected, because not all callers want the client to
be active; we leave it for the callers to decide. Also, some callers of
nfsd4_courtesy_clnt_expired hold nn->client_lock, so we cannot create
the client record there.

Leaving the NFSD4_CLIENT_RECONNECTED state set does not really
cause any functional problem, since the RECONNECTED state is meant
to be used temporarily within the context of the same request. But
I will reset the state back to NFSD4_CLIENT_ACTIVE for clarity.


-Dai

>
> --b.
Dai Ngo March 29, 2022, 9:45 p.m. UTC | #13
On 3/29/22 11:39 AM, J. Bruce Fields wrote:
> On Tue, Mar 29, 2022 at 11:19:51AM -0700, dai.ngo@oracle.com wrote:
>> On 3/29/22 9:30 AM, J. Bruce Fields wrote:
>>> On Tue, Mar 29, 2022 at 09:20:02AM -0700, dai.ngo@oracle.com wrote:
>>>> On 3/29/22 8:47 AM, J. Bruce Fields wrote:
>>>>> On Thu, Mar 24, 2022 at 09:34:42PM -0700, Dai Ngo wrote:
>>>>>> Update nfs4_client to add:
>>>>>>   . cl_cs_client_state: courtesy client state
>>>>>>   . cl_cs_lock: spinlock to synchronize access to cl_cs_client_state
>>>>>>   . cl_cs_list: list used by laundromat to process courtesy clients
>>>>>>
>>>>>> Modify alloc_client to initialize these fields.
>>>>>>
>>>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>>>>> ---
>>>>>>   fs/nfsd/nfs4state.c |  2 ++
>>>>>>   fs/nfsd/nfsd.h      |  1 +
>>>>>>   fs/nfsd/state.h     | 33 +++++++++++++++++++++++++++++++++
>>>>>>   3 files changed, 36 insertions(+)
>>>>>>
>>>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>>>>> index 234e852fcdfa..a65d59510681 100644
>>>>>> --- a/fs/nfsd/nfs4state.c
>>>>>> +++ b/fs/nfsd/nfs4state.c
>>>>>> @@ -2009,12 +2009,14 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
>>>>>>   	INIT_LIST_HEAD(&clp->cl_delegations);
>>>>>>   	INIT_LIST_HEAD(&clp->cl_lru);
>>>>>>   	INIT_LIST_HEAD(&clp->cl_revoked);
>>>>>> +	INIT_LIST_HEAD(&clp->cl_cs_list);
>>>>>>   #ifdef CONFIG_NFSD_PNFS
>>>>>>   	INIT_LIST_HEAD(&clp->cl_lo_states);
>>>>>>   #endif
>>>>>>   	INIT_LIST_HEAD(&clp->async_copies);
>>>>>>   	spin_lock_init(&clp->async_lock);
>>>>>>   	spin_lock_init(&clp->cl_lock);
>>>>>> +	spin_lock_init(&clp->cl_cs_lock);
>>>>>>   	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
>>>>>>   	return clp;
>>>>>>   err_no_hashtbl:
>>>>>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
>>>>>> index 4fc1fd639527..23996c6ca75e 100644
>>>>>> --- a/fs/nfsd/nfsd.h
>>>>>> +++ b/fs/nfsd/nfsd.h
>>>>>> @@ -336,6 +336,7 @@ void		nfsd_lockd_shutdown(void);
>>>>>>   #define COMPOUND_ERR_SLACK_SPACE	16     /* OP_SETATTR */
>>>>>>   #define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
>>>>>> +#define	NFSD_COURTESY_CLIENT_TIMEOUT	(24 * 60 * 60)	/* seconds */
>>>>>>   /*
>>>>>>    * The following attributes are currently not supported by the NFSv4 server:
>>>>>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>>>>>> index 95457cfd37fc..40e390abc842 100644
>>>>>> --- a/fs/nfsd/state.h
>>>>>> +++ b/fs/nfsd/state.h
>>>>>> @@ -283,6 +283,35 @@ struct nfsd4_sessionid {
>>>>>>   #define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
>>>>>>   /*
>>>>>> + * CLIENT_  CLIENT_ CLIENT_
>>>>>> + * COURTESY EXPIRED RECONNECTED      Meaning                  Where set
>>>>>> + * -----------------------------------------------------------------------------
>>>>>> + * | false | false | false | Confirmed, active    | Default                    |
>>>>>> + * |---------------------------------------------------------------------------|
>>>>>> + * | true  | false | false | Courtesy state.      | nfs4_get_client_reaplist   |
>>>>>> + * |       |       |       | Lease/lock/share     |                            |
>>>>>> + * |       |       |       | reservation conflict |                            |
>>>>>> + * |       |       |       | can cause Courtesy   |                            |
>>>>>> + * |       |       |       | client to be expired |                            |
>>>>>> + * |---------------------------------------------------------------------------|
>>>>>> + * | false | true  | false | Courtesy client to be| nfs4_laundromat            |
>>>>>> + * |       |       |       | expired by Laundromat| nfsd4_lm_lock_expired      |
>>>>>> + * |       |       |       | due to conflict     | nfsd4_discard_courtesy_clnt |
>>>>>> + * |       |       |       |                      | nfsd4_expire_courtesy_clnt |
>>>>>> + * |---------------------------------------------------------------------------|
>>>>>> + * | false | false | true  | Courtesy client      | nfsd4_courtesy_clnt_expired|
>>>>>> + * |       |       |       | reconnected,         |                            |
>>>>>> + * |       |       |       | becoming active      |                            |
>>>>>> + * -----------------------------------------------------------------------------
>>> By the way, where is a client returned to the normal (0) state?  That
>>> has to happen at some point.
>> For 4.1 courtesy client reconnects is detected in nfsd4_sequence,
>> nfsd4_bind_conn_to_session.
> Those are the places where NFSD4_CLIENT_RECONNECTED is set, which isn't
> the question I asked.
>>> Why are RECONNECTED clients discarded in so many cases?  (E.g. whenever
>>> a bind_conn_to_session fails).
>> find_in_sessionid_hashtbl: we discard the courtesy client when it
>> reconnects and there is error from nfsd4_get_session_locked. This
>> should be a rare condition so rather than reverting the client
>> state back to courtesy, it is simpler just to discard it.
> That may be a rare situation, but I don't believe the behavior of
> discarding the client in this case is correct.
>
>> nfsd4_create_session/find_confirmed_client: I think the only time
>> the courtesy client sends CREATE_SESSION, before sending the SEQUENCE
>> to reconnect after missing its leases, is when it wants to do clientid
>> trunking. This should be a rare condition so instead of dealing
>> with it we just do not allow it and discard the client for now.
> We can't wave away incorrect behavior with "but it's rare".  Users with
> heavy and/or unusual workloads hit rare conditions.  Clients may change
> their behavior over time.  (E.g., trunking may become more common.)

This does not prevent the courtesy client from doing trunking in all
cases. It only prevents the courtesy client from doing trunking without
first reconnecting to the server.

I think this behavior is the same as if the server does not support courtesy
clients; the server can expire the courtesy client anytime it wants. If the
courtesy client reconnected successfully then by the time nfsd4_create_session/
find_confirmed_client is called the client has already become active
so the server will process the request normally.

Also to handle cases when the courtesy client reconnects after it was in
EXPIRED state, we want to force the client to recover its state starting
with EXCHANGE_ID so we have to return BAD_SESSION on CREATE_SESSION request.

-Dai
J. Bruce Fields March 30, 2022, 12:12 a.m. UTC | #14
On Tue, Mar 29, 2022 at 02:45:28PM -0700, dai.ngo@oracle.com wrote:
> This does not prevent the courtesy client from doing trunking in all
> cases. It is only prevent the courtesy client from doing trunking without
> first reconnect to the server.
> 
> I think this behavior is the same as if the server does not support courtesy
> client; the server can expire the courtesy anytime it wants. If the
> courtesy client reconnected successfully then by the time nfsd4_create_session/
> find_confirmed_client is called the client already becomes active
> so the server will process the request normally.

I'm not sure what you mean here.  All a client has to do to reconnect is
successfully renew its lease.  That doesn't necessarily require calling
CREATE_SESSION again.

> Also to handle cases when the courtesy client reconnects after it was in
> EXPIRED state, we want to force the client to recover its state starting
> with EXCHANGE_ID so we have to return BAD_SESSION on CREATE_SESSION request.

The client should not have to send EXCHANGE_ID.

--b.
Dai Ngo March 30, 2022, 1:17 a.m. UTC | #15
On 3/29/22 5:12 PM, J. Bruce Fields wrote:
> On Tue, Mar 29, 2022 at 02:45:28PM -0700, dai.ngo@oracle.com wrote:
>> This does not prevent the courtesy client from doing trunking in all
>> cases. It is only prevent the courtesy client from doing trunking without
>> first reconnect to the server.
>>
>> I think this behavior is the same as if the server does not support courtesy
>> client; the server can expire the courtesy anytime it wants. If the
>> courtesy client reconnected successfully then by the time nfsd4_create_session/
>> find_confirmed_client is called the client already becomes active
>> so the server will process the request normally.
> I'm not sure what you mean here.  All a client has to do to reconnect is
> succesfully renew its lease.

For 4.1 the client renews its lease via the SEQUENCE, either stand-alone
or in a compound. Once the SEQUENCE completes successfully then the
subsequent CREATE_SESSION is processed normally. However, if the client
did not send the SEQUENCE first, then the server returns BAD_SESSION for the
CREATE_SESSION request.

>    That doesn't necessarily require calling
> CREATE_SESSION again.
>
>> Also to handle cases when the courtesy client reconnects after it was in
>> EXPIRED state, we want to force the client to recover its state starting
>> with EXCHANGE_ID so we have to return BAD_SESSION on CREATE_SESSION request.
> The client should not have to send EXCHANGE_ID.

For 4.1 the expired courtesy client must send EXCHANGE_ID to reconnect
and start a new session. I don't see how the *expired* courtesy client can
access the export again without sending the EXCHANGE_ID. Attached is the
pcap that shows how the courtesy client recovers once it's in
CLIENT_EXPIRED state.

-Dai

>
> --b.
J. Bruce Fields March 30, 2022, 1:48 a.m. UTC | #16
On Tue, Mar 29, 2022 at 06:17:29PM -0700, dai.ngo@oracle.com wrote:
> 
> On 3/29/22 5:12 PM, J. Bruce Fields wrote:
> >On Tue, Mar 29, 2022 at 02:45:28PM -0700, dai.ngo@oracle.com wrote:
> >>This does not prevent the courtesy client from doing trunking in all
> >>cases. It is only prevent the courtesy client from doing trunking without
> >>first reconnect to the server.
> >>
> >>I think this behavior is the same as if the server does not support courtesy
> >>client; the server can expire the courtesy anytime it wants. If the
> >>courtesy client reconnected successfully then by the time nfsd4_create_session/
> >>find_confirmed_client is called the client already becomes active
> >>so the server will process the request normally.
> >I'm not sure what you mean here.  All a client has to do to reconnect is
> >succesfully renew its lease.
> 
> For 4.1 the client renews its lease via the SEQUENCE, either stand-alone
> or in a compound. Once the SEQUENCE completes successfully then the
> subsequent CREATE_SESSION is processed normally. However, if the client
> did not send the SEQUENCE first then server returns BAD_SESSION for the
> CREATE_SESSION request.
> 
> >   That doesn't necessarily require calling
> >CREATE_SESSION again.
> >
> >>Also to handle cases when the courtesy client reconnects after it was in
> >>EXPIRED state, we want to force the client to recover its state starting
> >>with EXCHANGE_ID so we have to return BAD_SESSION on CREATE_SESSION request.
> >The client should not have to send EXCHANGE_ID.
> 
> For 4.1 the expired courtesy client must send EXCHANGE_ID to reconnect
> to start new session. I don't see how the *expired* courtesy client can
> access the export again without sending the EXCHANGE_ID. Attached is the
> pcap that shows how the courtesy client recovers once it's in
> CLIENT_EXPIRED state.

Oh, sorry, sure, we're talking about an actual expired client.  That's
fine.

--b.
diff mbox series

Patch

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 234e852fcdfa..a65d59510681 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2009,12 +2009,14 @@  static struct nfs4_client *alloc_client(struct xdr_netobj name)
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_lru);
 	INIT_LIST_HEAD(&clp->cl_revoked);
+	INIT_LIST_HEAD(&clp->cl_cs_list);
 #ifdef CONFIG_NFSD_PNFS
 	INIT_LIST_HEAD(&clp->cl_lo_states);
 #endif
 	INIT_LIST_HEAD(&clp->async_copies);
 	spin_lock_init(&clp->async_lock);
 	spin_lock_init(&clp->cl_lock);
+	spin_lock_init(&clp->cl_cs_lock);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	return clp;
 err_no_hashtbl:
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 4fc1fd639527..23996c6ca75e 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -336,6 +336,7 @@  void		nfsd_lockd_shutdown(void);
 #define COMPOUND_ERR_SLACK_SPACE	16     /* OP_SETATTR */
 
 #define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
+#define	NFSD_COURTESY_CLIENT_TIMEOUT	(24 * 60 * 60)	/* seconds */
 
 /*
  * The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 95457cfd37fc..40e390abc842 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -283,6 +283,35 @@  struct nfsd4_sessionid {
 #define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
 
 /*
+ * CLIENT_  CLIENT_ CLIENT_
+ * COURTESY EXPIRED RECONNECTED      Meaning                  Where set
+ * -----------------------------------------------------------------------------
+ * | false | false | false | Confirmed, active    | Default                    |
+ * |---------------------------------------------------------------------------|
+ * | true  | false | false | Courtesy state.      | nfs4_get_client_reaplist   |
+ * |       |       |       | Lease/lock/share     |                            |
+ * |       |       |       | reservation conflict |                            |
+ * |       |       |       | can cause Courtesy   |                            |
+ * |       |       |       | client to be expired |                            |
+ * |---------------------------------------------------------------------------|
+ * | false | true  | false | Courtesy client to be| nfs4_laundromat            |
+ * |       |       |       | expired by Laundromat| nfsd4_lm_lock_expired      |
+ * |       |       |       | due to conflict     | nfsd4_discard_courtesy_clnt |
+ * |       |       |       |                      | nfsd4_expire_courtesy_clnt |
+ * |---------------------------------------------------------------------------|
+ * | false | false | true  | Courtesy client      | nfsd4_courtesy_clnt_expired|
+ * |       |       |       | reconnected,         |                            |
+ * |       |       |       | becoming active      |                            |
+ * -----------------------------------------------------------------------------
+ */
+
+enum courtesy_client_state {
+	NFSD4_CLIENT_COURTESY = 1,
+	NFSD4_CLIENT_EXPIRED,
+	NFSD4_CLIENT_RECONNECTED,
+};
+
+/*
  * struct nfs4_client - one per client.  Clientids live here.
  *
  * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
@@ -385,6 +414,10 @@  struct nfs4_client {
 	struct list_head	async_copies;	/* list of async copies */
 	spinlock_t		async_lock;	/* lock for async copies */
 	atomic_t		cl_cb_inflight;	/* Outstanding callbacks */
+
+	enum courtesy_client_state	cl_cs_client_state;
+	spinlock_t		cl_cs_lock;
+	struct list_head	cl_cs_list;
 };
 
 /* struct nfs4_client_reset