
[v5,13/13] NFSD: Use rhashtable for managing nfs4_file objects

Message ID 166665109477.50761.4457095370494745929.stgit@klimt.1015granger.net (mailing list archive)
State New, archived
Series A course change, for sure

Commit Message

Chuck Lever Oct. 24, 2022, 10:38 p.m. UTC
fh_match() is costly, especially when filehandles are large (as is
the case for NFSv4). It needs to be used sparingly when searching
data structures. Unfortunately, with common workloads, I see
multiple thousands of objects stored in file_hashtbl[], which always
has only 256 buckets, which makes the hash chains quite lengthy.

Walking long hash chains with the state_lock held blocks other
activity that needs that lock.

To help mitigate the cost of searching with fh_match(), replace the
nfs4_file hash table with an rhashtable, which can dynamically
resize its bucket array to minimize hash chain length. The ideal for
this use case is one bucket per inode.

The result is an improvement in the latency of NFSv4 operations
and the reduction of nfsd CPU utilization due to the cost of
fh_match() and the CPU cache misses incurred while walking long
hash chains in the nfs4_file hash table.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c |   64 +++++++++++++++++++++++++--------------------------
 fs/nfsd/state.h     |    4 ---
 2 files changed, 31 insertions(+), 37 deletions(-)
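
For readers coming to the rhltable API cold, below is a minimal,
self-contained sketch of the init/lookup/insert/remove pattern the
patch switches to. Everything in it (demo_entry, demo_tbl, a plain u64
key) is illustrative only; nfsd's actual key and hash parameters live
in nfs4_file_rhash_params, whose definition is not visible in this
diff.

/*
 * Illustrative sketch -- not nfsd code. An rhltable is a resizable
 * hash table whose buckets chain together entries that share a key.
 */
#include <linux/rhashtable.h>

struct demo_entry {
	u64			key;	/* lookup key */
	struct rhlist_head	rlist;	/* chains entries sharing a key */
};

static struct rhltable demo_tbl;

static const struct rhashtable_params demo_params = {
	.key_len		= sizeof(u64),
	.key_offset		= offsetof(struct demo_entry, key),
	.head_offset		= offsetof(struct demo_entry, rlist),
	.automatic_shrinking	= true,	/* bucket array grows and shrinks */
};

static int demo_setup(void)
{
	return rhltable_init(&demo_tbl, &demo_params);
}

static bool demo_contains(u64 key)
{
	struct rhlist_head *tmp, *list;
	struct demo_entry *e;
	bool found = false;

	rcu_read_lock();
	list = rhltable_lookup(&demo_tbl, &key, demo_params);
	rhl_for_each_entry_rcu(e, tmp, list, rlist) {
		/* every entry on @list already matches @key */
		found = true;
		break;
	}
	rcu_read_unlock();
	return found;
}

static int demo_add(struct demo_entry *e)
{
	return rhltable_insert_key(&demo_tbl, &e->key, &e->rlist,
				   demo_params);
}

static void demo_del(struct demo_entry *e)
{
	rhltable_remove(&demo_tbl, &e->rlist, demo_params);
}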

Comments

NeilBrown Oct. 24, 2022, 11:43 p.m. UTC | #1
On Tue, 25 Oct 2022, Chuck Lever wrote:
> fh_match() is costly, especially when filehandles are large (as is
> the case for NFSv4). It needs to be used sparingly when searching
> data structures. Unfortunately, with common workloads, I see
> multiple thousands of objects stored in file_hashtbl[], which always
> has only 256 buckets, which makes the hash chains quite lengthy.
> 
> Walking long hash chains with the state_lock held blocks other
> activity that needs that lock.
> 
> To help mitigate the cost of searching with fh_match(), replace the
> nfs4_file hash table with an rhashtable, which can dynamically
> resize its bucket array to minimize hash chain length. The ideal for
> this use case is one bucket per inode.
> 
> The result is an improvement in the latency of NFSv4 operations
> and the reduction of nfsd CPU utilization due to the cost of
> fh_match() and the CPU cache misses incurred while walking long
> hash chains in the nfs4_file hash table.
> 
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  fs/nfsd/nfs4state.c |   64 +++++++++++++++++++++++++--------------------------
>  fs/nfsd/state.h     |    4 ---
>  2 files changed, 31 insertions(+), 37 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 681cb2daa843..5b90e5a6a04f 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -591,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
>  void
>  put_nfs4_file(struct nfs4_file *fi)
>  {
> -	might_lock(&state_lock);
> -
> -	if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
> +	if (refcount_dec_and_test(&fi->fi_ref)) {
>  		remove_nfs4_file_locked(fi);
> -		spin_unlock(&state_lock);
>  		WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
>  		WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
>  		call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
> @@ -709,20 +706,6 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
>  	return ret & OWNER_HASH_MASK;
>  }
>  
> -/* hash table for nfs4_file */
> -#define FILE_HASH_BITS                   8
> -#define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
> -
> -static unsigned int file_hashval(const struct svc_fh *fh)
> -{
> -	struct inode *inode = d_inode(fh->fh_dentry);
> -
> -	/* XXX: why not (here & in file cache) use inode? */
> -	return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
> -}
> -
> -static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
> -
>  static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
>  
>  static const struct rhashtable_params nfs4_file_rhash_params = {
> @@ -4683,12 +4666,13 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
>  static noinline_for_stack struct nfs4_file *
>  find_nfs4_file(const struct svc_fh *fhp)
>  {
> -	unsigned int hashval = file_hashval(fhp);
> +	struct rhlist_head *tmp, *list;
>  	struct nfs4_file *fi = NULL;
>  
>  	rcu_read_lock();
> -	hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
> -				 lockdep_is_held(&state_lock)) {
> +	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
> +			       nfs4_file_rhash_params);
> +	rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
>  		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
>  			if (!refcount_inc_not_zero(&fi->fi_ref))
>  				fi = NULL;
> @@ -4708,33 +4692,45 @@ find_nfs4_file(const struct svc_fh *fhp)
>  static noinline_for_stack struct nfs4_file *
>  insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
>  {
> -	unsigned int hashval = file_hashval(fhp);
> +	struct rhlist_head *tmp, *list;
>  	struct nfs4_file *ret = NULL;
>  	bool alias_found = false;
>  	struct nfs4_file *fi;
> +	int err;
>  
> -	spin_lock(&state_lock);
> -	hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
> -				 lockdep_is_held(&state_lock)) {
> +	rcu_read_lock();
> +

As mentioned separately, we need some sort of lock around this loop and
the later insert.

> +	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
> +			       nfs4_file_rhash_params);
> +	rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
>  		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
>  			if (refcount_inc_not_zero(&fi->fi_ref))
>  				ret = fi;
>  		} else if (d_inode(fhp->fh_dentry) == fi->fi_inode)
>  			fi->fi_aliased = alias_found = true;

This d_inode() == fi->fi_inode test is no longer relevant.  Everything in
the list must have the same inode.  Best remove it.

Thanks,
NeilBrown

>  	}
> -	if (likely(ret == NULL)) {
> -		init_nfs4_file(fhp, new);
> -		hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]);
> -		new->fi_aliased = alias_found;
> -		ret = new;
> -	}
> -	spin_unlock(&state_lock);
> +	if (ret)
> +		goto out_unlock;
> +
> +	init_nfs4_file(fhp, new);
> +	err = rhltable_insert_key(&nfs4_file_rhltable, fhp,
> +				  &new->fi_rlist,
> +				  nfs4_file_rhash_params);
> +	if (err)
> +		goto out_unlock;
> +
> +	new->fi_aliased = alias_found;
> +	ret = new;
> +
> +out_unlock:
> +	rcu_read_unlock();
>  	return ret;
>  }
>  
>  static noinline_for_stack void remove_nfs4_file_locked(struct nfs4_file *fi)
>  {
> -	hlist_del_rcu(&fi->fi_hash);
> +	rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist,
> +			nfs4_file_rhash_params);
>  }
>  
>  /*
> @@ -5624,6 +5620,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
>  	 * If not found, create the nfs4_file struct
>  	 */
>  	fp = insert_nfs4_file(open->op_file, current_fh);
> +	if (unlikely(!fp))
> +		return nfserr_jukebox;
>  	if (fp != open->op_file) {
>  		status = nfs4_check_deleg(cl, open, &dp);
>  		if (status)
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 190fc7e418a4..eadd7f465bf5 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -536,16 +536,12 @@ struct nfs4_clnt_odstate {
>   * inode can have multiple filehandles associated with it, so there is
>   * (potentially) a many to one relationship between this struct and struct
>   * inode.
> - *
> - * These are hashed by filehandle in the file_hashtbl, which is protected by
> - * the global state_lock spinlock.
>   */
>  struct nfs4_file {
>  	refcount_t		fi_ref;
>  	struct inode *		fi_inode;
>  	bool			fi_aliased;
>  	spinlock_t		fi_lock;
> -	struct hlist_node       fi_hash;	/* hash on fi_fhandle */
>  	struct rhlist_head	fi_rlist;
>  	struct list_head        fi_stateids;
>  	union {
> 
> 
>
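
To make the locking concern above concrete: without state_lock, two
nfsd threads opening the same file can both miss in the lookup loop of
insert_nfs4_file() and both insert an nfs4_file for the same
filehandle. A minimal sketch of one way to close that race follows; the
lock name and placement are hypothetical, placed alongside
insert_nfs4_file() in fs/nfsd/nfs4state.c, and not taken from this
series, which may resolve the race differently. In the pre-patch code
the global state_lock provided the same guarantee.

/*
 * Hypothetical sketch only. Serializing all inserters with one lock
 * makes the "lookup missed, so insert the new nfs4_file" sequence in
 * insert_nfs4_file() atomic with respect to other inserters.
 */
static DEFINE_SPINLOCK(nfs4_file_insert_lock);

static struct nfs4_file *
insert_nfs4_file_serialized(struct nfs4_file *new, const struct svc_fh *fhp)
{
	struct nfs4_file *fp;

	spin_lock(&nfs4_file_insert_lock);
	/* the RCU lookup and conditional rhltable insert happen inside */
	fp = insert_nfs4_file(new, fhp);
	spin_unlock(&nfs4_file_insert_lock);
	return fp;
}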
Chuck Lever Oct. 25, 2022, 12:19 a.m. UTC | #2
> On Oct 24, 2022, at 7:43 PM, NeilBrown <neilb@suse.de> wrote:
> 
> On Tue, 25 Oct 2022, Chuck Lever wrote:
>> fh_match() is costly, especially when filehandles are large (as is
>> the case for NFSv4). It needs to be used sparingly when searching
>> data structures. Unfortunately, with common workloads, I see
>> multiple thousands of objects stored in file_hashtbl[], which always
>> has only 256 buckets, which makes the hash chains quite lengthy.
>> 
>> Walking long hash chains with the state_lock held blocks other
>> activity that needs that lock.
>> 
>> To help mitigate the cost of searching with fh_match(), replace the
>> nfs4_file hash table with an rhashtable, which can dynamically
>> resize its bucket array to minimize hash chain length. The ideal for
>> this use case is one bucket per inode.
>> 
>> The result is an improvement in the latency of NFSv4 operations
>> and the reduction of nfsd CPU utilization due to the cost of
>> fh_match() and the CPU cache misses incurred while walking long
>> hash chains in the nfs4_file hash table.
>> 
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>> fs/nfsd/nfs4state.c |   64 +++++++++++++++++++++++++--------------------------
>> fs/nfsd/state.h     |    4 ---
>> 2 files changed, 31 insertions(+), 37 deletions(-)
>> 
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index 681cb2daa843..5b90e5a6a04f 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -591,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
>> void
>> put_nfs4_file(struct nfs4_file *fi)
>> {
>> -	might_lock(&state_lock);
>> -
>> -	if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
>> +	if (refcount_dec_and_test(&fi->fi_ref)) {
>> 		remove_nfs4_file_locked(fi);
>> -		spin_unlock(&state_lock);
>> 		WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
>> 		WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
>> 		call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
>> @@ -709,20 +706,6 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
>> 	return ret & OWNER_HASH_MASK;
>> }
>> 
>> -/* hash table for nfs4_file */
>> -#define FILE_HASH_BITS                   8
>> -#define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
>> -
>> -static unsigned int file_hashval(const struct svc_fh *fh)
>> -{
>> -	struct inode *inode = d_inode(fh->fh_dentry);
>> -
>> -	/* XXX: why not (here & in file cache) use inode? */
>> -	return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
>> -}
>> -
>> -static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
>> -
>> static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
>> 
>> static const struct rhashtable_params nfs4_file_rhash_params = {
>> @@ -4683,12 +4666,13 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
>> static noinline_for_stack struct nfs4_file *
>> find_nfs4_file(const struct svc_fh *fhp)
>> {
>> -	unsigned int hashval = file_hashval(fhp);
>> +	struct rhlist_head *tmp, *list;
>> 	struct nfs4_file *fi = NULL;
>> 
>> 	rcu_read_lock();
>> -	hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
>> -				 lockdep_is_held(&state_lock)) {
>> +	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
>> +			       nfs4_file_rhash_params);
>> +	rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
>> 		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
>> 			if (!refcount_inc_not_zero(&fi->fi_ref))
>> 				fi = NULL;
>> @@ -4708,33 +4692,45 @@ find_nfs4_file(const struct svc_fh *fhp)
>> static noinline_for_stack struct nfs4_file *
>> insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
>> {
>> -	unsigned int hashval = file_hashval(fhp);
>> +	struct rhlist_head *tmp, *list;
>> 	struct nfs4_file *ret = NULL;
>> 	bool alias_found = false;
>> 	struct nfs4_file *fi;
>> +	int err;
>> 
>> -	spin_lock(&state_lock);
>> -	hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
>> -				 lockdep_is_held(&state_lock)) {
>> +	rcu_read_lock();
>> +
> 
> As mentioned separately, we need some sort of lock around this loop and
> the later insert.

Yep. I just ran out of time today.


>> +	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
>> +			       nfs4_file_rhash_params);
>> +	rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
>> 		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
>> 			if (refcount_inc_not_zero(&fi->fi_ref))
>> 				ret = fi;
>> 		} else if (d_inode(fhp->fh_dentry) == fi->fi_inode)
>> 			fi->fi_aliased = alias_found = true;
> 
> This d_inode() == fi->fi_inode test is no longer relevant.  Everything in
> the list must have the same inode.  Best remove it.

My understanding is that more than one inode can hash into one of
these bucket lists. I don't see how rhltable_insert() can prevent
that from happening, and that will certainly be true if we go back
to using a fixed-size table.

I will try to test this assumption before posting the next revision.


> Thanks,
> NeilBrown
> 
>> 	}
>> -	if (likely(ret == NULL)) {
>> -		init_nfs4_file(fhp, new);
>> -		hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]);
>> -		new->fi_aliased = alias_found;
>> -		ret = new;
>> -	}
>> -	spin_unlock(&state_lock);
>> +	if (ret)
>> +		goto out_unlock;
>> +
>> +	init_nfs4_file(fhp, new);
>> +	err = rhltable_insert_key(&nfs4_file_rhltable, fhp,
>> +				  &new->fi_rlist,
>> +				  nfs4_file_rhash_params);
>> +	if (err)
>> +		goto out_unlock;
>> +
>> +	new->fi_aliased = alias_found;
>> +	ret = new;
>> +
>> +out_unlock:
>> +	rcu_read_unlock();
>> 	return ret;
>> }
>> 
>> static noinline_for_stack void remove_nfs4_file_locked(struct nfs4_file *fi)
>> {
>> -	hlist_del_rcu(&fi->fi_hash);
>> +	rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist,
>> +			nfs4_file_rhash_params);
>> }
>> 
>> /*
>> @@ -5624,6 +5620,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
>> 	 * If not found, create the nfs4_file struct
>> 	 */
>> 	fp = insert_nfs4_file(open->op_file, current_fh);
>> +	if (unlikely(!fp))
>> +		return nfserr_jukebox;
>> 	if (fp != open->op_file) {
>> 		status = nfs4_check_deleg(cl, open, &dp);
>> 		if (status)
>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>> index 190fc7e418a4..eadd7f465bf5 100644
>> --- a/fs/nfsd/state.h
>> +++ b/fs/nfsd/state.h
>> @@ -536,16 +536,12 @@ struct nfs4_clnt_odstate {
>>  * inode can have multiple filehandles associated with it, so there is
>>  * (potentially) a many to one relationship between this struct and struct
>>  * inode.
>> - *
>> - * These are hashed by filehandle in the file_hashtbl, which is protected by
>> - * the global state_lock spinlock.
>>  */
>> struct nfs4_file {
>> 	refcount_t		fi_ref;
>> 	struct inode *		fi_inode;
>> 	bool			fi_aliased;
>> 	spinlock_t		fi_lock;
>> -	struct hlist_node       fi_hash;	/* hash on fi_fhandle */
>> 	struct rhlist_head	fi_rlist;
>> 	struct list_head        fi_stateids;
>> 	union {
>> 
>> 
>> 

--
Chuck Lever
NeilBrown Oct. 25, 2022, 1:01 a.m. UTC | #3
On Tue, 25 Oct 2022, Chuck Lever III wrote:
> 
> > On Oct 24, 2022, at 7:43 PM, NeilBrown <neilb@suse.de> wrote:
> > 
> > On Tue, 25 Oct 2022, Chuck Lever wrote:
> >> fh_match() is costly, especially when filehandles are large (as is
> >> the case for NFSv4). It needs to be used sparingly when searching
> >> data structures. Unfortunately, with common workloads, I see
> >> multiple thousands of objects stored in file_hashtbl[], which always
> >> has only 256 buckets, which makes the hash chains quite lengthy.
> >> 
> >> Walking long hash chains with the state_lock held blocks other
> >> activity that needs that lock.
> >> 
> >> To help mitigate the cost of searching with fh_match(), replace the
> >> nfs4_file hash table with an rhashtable, which can dynamically
> >> resize its bucket array to minimize hash chain length. The ideal for
> >> this use case is one bucket per inode.
> >> 
> >> The result is an improvement in the latency of NFSv4 operations
> >> and the reduction of nfsd CPU utilization due to the cost of
> >> fh_match() and the CPU cache misses incurred while walking long
> >> hash chains in the nfs4_file hash table.
> >> 
> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> >> ---
> >> fs/nfsd/nfs4state.c |   64 +++++++++++++++++++++++++--------------------------
> >> fs/nfsd/state.h     |    4 ---
> >> 2 files changed, 31 insertions(+), 37 deletions(-)
> >> 
> >> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> >> index 681cb2daa843..5b90e5a6a04f 100644
> >> --- a/fs/nfsd/nfs4state.c
> >> +++ b/fs/nfsd/nfs4state.c
> >> @@ -591,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
> >> void
> >> put_nfs4_file(struct nfs4_file *fi)
> >> {
> >> -	might_lock(&state_lock);
> >> -
> >> -	if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
> >> +	if (refcount_dec_and_test(&fi->fi_ref)) {
> >> 		remove_nfs4_file_locked(fi);
> >> -		spin_unlock(&state_lock);
> >> 		WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
> >> 		WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
> >> 		call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
> >> @@ -709,20 +706,6 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
> >> 	return ret & OWNER_HASH_MASK;
> >> }
> >> 
> >> -/* hash table for nfs4_file */
> >> -#define FILE_HASH_BITS                   8
> >> -#define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
> >> -
> >> -static unsigned int file_hashval(const struct svc_fh *fh)
> >> -{
> >> -	struct inode *inode = d_inode(fh->fh_dentry);
> >> -
> >> -	/* XXX: why not (here & in file cache) use inode? */
> >> -	return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
> >> -}
> >> -
> >> -static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
> >> -
> >> static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
> >> 
> >> static const struct rhashtable_params nfs4_file_rhash_params = {
> >> @@ -4683,12 +4666,13 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
> >> static noinline_for_stack struct nfs4_file *
> >> find_nfs4_file(const struct svc_fh *fhp)
> >> {
> >> -	unsigned int hashval = file_hashval(fhp);
> >> +	struct rhlist_head *tmp, *list;
> >> 	struct nfs4_file *fi = NULL;
> >> 
> >> 	rcu_read_lock();
> >> -	hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
> >> -				 lockdep_is_held(&state_lock)) {
> >> +	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
> >> +			       nfs4_file_rhash_params);
> >> +	rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
> >> 		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
> >> 			if (!refcount_inc_not_zero(&fi->fi_ref))
> >> 				fi = NULL;
> >> @@ -4708,33 +4692,45 @@ find_nfs4_file(const struct svc_fh *fhp)
> >> static noinline_for_stack struct nfs4_file *
> >> insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
> >> {
> >> -	unsigned int hashval = file_hashval(fhp);
> >> +	struct rhlist_head *tmp, *list;
> >> 	struct nfs4_file *ret = NULL;
> >> 	bool alias_found = false;
> >> 	struct nfs4_file *fi;
> >> +	int err;
> >> 
> >> -	spin_lock(&state_lock);
> >> -	hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
> >> -				 lockdep_is_held(&state_lock)) {
> >> +	rcu_read_lock();
> >> +
> > 
> > As mentioned separately, we need some sort of lock around this loop and
> > the later insert.
> 
> Yep. I just ran out of time today.
> 
> 
> >> +	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
> >> +			       nfs4_file_rhash_params);
> >> +	rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
> >> 		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
> >> 			if (refcount_inc_not_zero(&fi->fi_ref))
> >> 				ret = fi;
> >> 		} else if (d_inode(fhp->fh_dentry) == fi->fi_inode)
> >> 			fi->fi_aliased = alias_found = true;
> > 
> > This d_inode() == fi->fi_inode test is no longer relevant.  Everything in
> > the list must have the same inode.  Best remove it.
> 
> My understanding is that more than one inode can hash into one of
> these bucket lists. I don't see how rhltable_insert() can prevent
> that from happening, and that will certainly be true if we go back
> to using a fixed-size table.

With rhltable each bucket is a list of lists.  An rhlist_head contains
two pointers.  "rhead.next" points to the next entry in the bucket which
will have a different key (different inode in this case).  "next" points
to the next entry with the same key - different filehandle, same inode,
in this case.

The list you get back from rhltable_lookup() isn't the full bucket list.
It is just the list within the bucket of all elements with the same key.
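
For reference, the two pointers described here are the fields of
struct rhlist_head in the kernel's rhashtable headers; the comments
below are annotations added for this discussion, not from the source:

struct rhash_head {
	struct rhash_head __rcu		*next;	/* next key in this bucket */
};

struct rhlist_head {
	struct rhash_head		rhead;	/* bucket chain: different keys */
	struct rhlist_head __rcu	*next;	/* same key: same inode,
						 * another filehandle, in
						 * nfsd's case */
};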

NeilBrown


> 
> I will try to test this assumption before posting the next revision.
>
Chuck Lever Oct. 25, 2022, 4:58 p.m. UTC | #4
> On Oct 24, 2022, at 9:01 PM, NeilBrown <neilb@suse.de> wrote:
> 
> On Tue, 25 Oct 2022, Chuck Lever III wrote:
>> 
>>> On Oct 24, 2022, at 7:43 PM, NeilBrown <neilb@suse.de> wrote:
>> 
>>>> +	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
>>>> +			       nfs4_file_rhash_params);
>>>> +	rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
>>>> 		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
>>>> 			if (refcount_inc_not_zero(&fi->fi_ref))
>>>> 				ret = fi;
>>>> 		} else if (d_inode(fhp->fh_dentry) == fi->fi_inode)
>>>> 			fi->fi_aliased = alias_found = true;
>>> 
>>> This d_inode() == fi->fi_inode test is no longer relevant.  Everything in
>>> the list must have the same inode.  Best remove it.
>> 
>> My understanding is that more than one inode can hash into one of
>> these bucket lists. I don't see how rhltable_insert() can prevent
>> that from happening, and that will certainly be true if we go back
>> to using a fixed-size table.
> 
> With rhltable each bucket is a list of lists.  An rhlist_head contains
> two pointers.  "rhead.next" points to the next entry in the bucket which
> will have a different key (different inode in this case).  "next" points
> to the next entry with the same key - different filehandle, same inode,
> in this case.

That's the detail I didn't yet have, thanks. I will remove the extra
inode test and give it a try.


> The list you get back from rhltable_lookup() isn't the full bucket list.
> It is just the list within the bucket of all elements with the same key.


--
Chuck Lever

Patch

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 681cb2daa843..5b90e5a6a04f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -591,11 +591,8 @@  static void nfsd4_free_file_rcu(struct rcu_head *rcu)
 void
 put_nfs4_file(struct nfs4_file *fi)
 {
-	might_lock(&state_lock);
-
-	if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
+	if (refcount_dec_and_test(&fi->fi_ref)) {
 		remove_nfs4_file_locked(fi);
-		spin_unlock(&state_lock);
 		WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
 		WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
 		call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
@@ -709,20 +706,6 @@  static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
 	return ret & OWNER_HASH_MASK;
 }
 
-/* hash table for nfs4_file */
-#define FILE_HASH_BITS                   8
-#define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
-
-static unsigned int file_hashval(const struct svc_fh *fh)
-{
-	struct inode *inode = d_inode(fh->fh_dentry);
-
-	/* XXX: why not (here & in file cache) use inode? */
-	return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
-}
-
-static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
-
 static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
 
 static const struct rhashtable_params nfs4_file_rhash_params = {
@@ -4683,12 +4666,13 @@  move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
 static noinline_for_stack struct nfs4_file *
 find_nfs4_file(const struct svc_fh *fhp)
 {
-	unsigned int hashval = file_hashval(fhp);
+	struct rhlist_head *tmp, *list;
 	struct nfs4_file *fi = NULL;
 
 	rcu_read_lock();
-	hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
-				 lockdep_is_held(&state_lock)) {
+	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
+			       nfs4_file_rhash_params);
+	rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
 		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
 			if (!refcount_inc_not_zero(&fi->fi_ref))
 				fi = NULL;
@@ -4708,33 +4692,45 @@  find_nfs4_file(const struct svc_fh *fhp)
 static noinline_for_stack struct nfs4_file *
 insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
 {
-	unsigned int hashval = file_hashval(fhp);
+	struct rhlist_head *tmp, *list;
 	struct nfs4_file *ret = NULL;
 	bool alias_found = false;
 	struct nfs4_file *fi;
+	int err;
 
-	spin_lock(&state_lock);
-	hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
-				 lockdep_is_held(&state_lock)) {
+	rcu_read_lock();
+
+	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
+			       nfs4_file_rhash_params);
+	rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
 		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
 			if (refcount_inc_not_zero(&fi->fi_ref))
 				ret = fi;
 		} else if (d_inode(fhp->fh_dentry) == fi->fi_inode)
 			fi->fi_aliased = alias_found = true;
 	}
-	if (likely(ret == NULL)) {
-		init_nfs4_file(fhp, new);
-		hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]);
-		new->fi_aliased = alias_found;
-		ret = new;
-	}
-	spin_unlock(&state_lock);
+	if (ret)
+		goto out_unlock;
+
+	init_nfs4_file(fhp, new);
+	err = rhltable_insert_key(&nfs4_file_rhltable, fhp,
+				  &new->fi_rlist,
+				  nfs4_file_rhash_params);
+	if (err)
+		goto out_unlock;
+
+	new->fi_aliased = alias_found;
+	ret = new;
+
+out_unlock:
+	rcu_read_unlock();
 	return ret;
 }
 
 static noinline_for_stack void remove_nfs4_file_locked(struct nfs4_file *fi)
 {
-	hlist_del_rcu(&fi->fi_hash);
+	rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist,
+			nfs4_file_rhash_params);
 }
 
 /*
@@ -5624,6 +5620,8 @@  nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	 * If not found, create the nfs4_file struct
 	 */
 	fp = insert_nfs4_file(open->op_file, current_fh);
+	if (unlikely(!fp))
+		return nfserr_jukebox;
 	if (fp != open->op_file) {
 		status = nfs4_check_deleg(cl, open, &dp);
 		if (status)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 190fc7e418a4..eadd7f465bf5 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -536,16 +536,12 @@  struct nfs4_clnt_odstate {
  * inode can have multiple filehandles associated with it, so there is
  * (potentially) a many to one relationship between this struct and struct
  * inode.
- *
- * These are hashed by filehandle in the file_hashtbl, which is protected by
- * the global state_lock spinlock.
  */
 struct nfs4_file {
 	refcount_t		fi_ref;
 	struct inode *		fi_inode;
 	bool			fi_aliased;
 	spinlock_t		fi_lock;
-	struct hlist_node       fi_hash;	/* hash on fi_fhandle */
 	struct rhlist_head	fi_rlist;
 	struct list_head        fi_stateids;
 	union {