Message ID | 166696848023.106044.12150149492678240864.stgit@klimt.1015granger.net (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | A course adjustment, for sure... | expand |
On Fri, 2022-10-28 at 10:48 -0400, Chuck Lever wrote: > fh_match() is costly, especially when filehandles are large (as is > the case for NFSv4). It needs to be used sparingly when searching > data structures. Unfortunately, with common workloads, I see > multiple thousands of objects stored in file_hashtbl[], which has > just 256 buckets, making its bucket hash chains quite lengthy. > > Walking long hash chains with the state_lock held blocks other > activity that needs that lock. Sizable hash chains are a common > occurrence once the server has handed out some delegations, for > example -- IIUC, each delegated file is held open on the server by > an nfs4_file object. > > To help mitigate the cost of searching with fh_match(), replace the > nfs4_file hash table with an rhashtable, which can dynamically > resize its bucket array to minimize hash chain length. > > The result of this modification is an improvement in the latency of > NFSv4 operations, and the reduction of nfsd CPU utilization due to > eliminating the cost of multiple calls to fh_match() and reducing > the CPU cache misses incurred while walking long hash chains in the > nfs4_file hash table. 
> > Signed-off-by: Chuck Lever <chuck.lever@oracle.com> > Reviewed-by: NeilBrown <neilb@suse.de> > --- > fs/nfsd/nfs4state.c | 77 ++++++++++++++++++++++++++------------------------- > fs/nfsd/state.h | 4 --- > 2 files changed, 40 insertions(+), 41 deletions(-) > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > index c2ef2db9c84c..c78b66e678dd 100644 > --- a/fs/nfsd/nfs4state.c > +++ b/fs/nfsd/nfs4state.c > @@ -591,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu) > void > put_nfs4_file(struct nfs4_file *fi) > { > - might_lock(&state_lock); > - > - if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { > + if (refcount_dec_and_test(&fi->fi_ref)) { > nfsd4_file_hash_remove(fi); > - spin_unlock(&state_lock); > WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); > WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); > call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); > @@ -709,20 +706,6 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) > return ret & OWNER_HASH_MASK; > } > > -/* hash table for nfs4_file */ > -#define FILE_HASH_BITS 8 > -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) > - > -static unsigned int file_hashval(const struct svc_fh *fh) > -{ > - struct inode *inode = d_inode(fh->fh_dentry); > - > - /* XXX: why not (here & in file cache) use inode? 
*/ > - return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS); > -} > - > -static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; > - > static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp; > > static const struct rhashtable_params nfs4_file_rhash_params = { > @@ -4687,12 +4670,14 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) > static noinline_for_stack struct nfs4_file * > nfsd4_file_hash_lookup(const struct svc_fh *fhp) > { > - unsigned int hashval = file_hashval(fhp); > + struct inode *inode = d_inode(fhp->fh_dentry); > + struct rhlist_head *tmp, *list; > struct nfs4_file *fi; > > rcu_read_lock(); > - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, > - lockdep_is_held(&state_lock)) { > + list = rhltable_lookup(&nfs4_file_rhltable, &inode, > + nfs4_file_rhash_params); > + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { > if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { > if (refcount_inc_not_zero(&fi->fi_ref)) { > rcu_read_unlock(); > @@ -4706,40 +4691,56 @@ nfsd4_file_hash_lookup(const struct svc_fh *fhp) > > /* > * On hash insertion, identify entries with the same inode but > - * distinct filehandles. They will all be in the same hash bucket > - * because nfs4_file's are hashed by the address in the fi_inode > - * field. > + * distinct filehandles. They will all be on the list returned > + * by rhltable_lookup(). > + * > + * inode->i_lock prevents racing insertions from adding an entry > + * for the same inode/fhp pair twice. 
> */ > static noinline_for_stack struct nfs4_file * > nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) > { > - unsigned int hashval = file_hashval(fhp); > + struct inode *inode = d_inode(fhp->fh_dentry); > + struct rhlist_head *tmp, *list; > struct nfs4_file *ret = NULL; > bool alias_found = false; > struct nfs4_file *fi; > + int err; > > - spin_lock(&state_lock); > - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, > - lockdep_is_held(&state_lock)) { > + rcu_read_lock(); > + spin_lock(&inode->i_lock); > + > + list = rhltable_lookup(&nfs4_file_rhltable, &inode, > + nfs4_file_rhash_params); > + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { > if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { > if (refcount_inc_not_zero(&fi->fi_ref)) > ret = fi; > - } else if (d_inode(fhp->fh_dentry) == fi->fi_inode) > + } else > fi->fi_aliased = alias_found = true; > } > - if (likely(ret == NULL)) { > - nfsd4_file_init(fhp, new); > - hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); > - new->fi_aliased = alias_found; > - ret = new; > - } > - spin_unlock(&state_lock); > + if (ret) > + goto out_unlock; > + > + nfsd4_file_init(fhp, new); > + err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist, > + nfs4_file_rhash_params); > + if (err) > + goto out_unlock; > + > + new->fi_aliased = alias_found; > + ret = new; > + > +out_unlock: > + spin_unlock(&inode->i_lock); > + rcu_read_unlock(); > return ret; > } > > static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) > { > - hlist_del_rcu(&fi->fi_hash); > + rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist, > + nfs4_file_rhash_params); > } > > /* > @@ -5629,6 +5630,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf > * If not found, create the nfs4_file struct > */ > fp = nfsd4_file_hash_insert(open->op_file, current_fh); > + if (unlikely(!fp)) > + return nfserr_jukebox; > if (fp != open->op_file) { > status = nfs4_check_deleg(cl, open, 
&dp); > if (status) > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > index 190fc7e418a4..eadd7f465bf5 100644 > --- a/fs/nfsd/state.h > +++ b/fs/nfsd/state.h > @@ -536,16 +536,12 @@ struct nfs4_clnt_odstate { > * inode can have multiple filehandles associated with it, so there is > * (potentially) a many to one relationship between this struct and struct > * inode. > - * > - * These are hashed by filehandle in the file_hashtbl, which is protected by > - * the global state_lock spinlock. > */ > struct nfs4_file { > refcount_t fi_ref; > struct inode * fi_inode; > bool fi_aliased; > spinlock_t fi_lock; > - struct hlist_node fi_hash; /* hash on fi_fhandle */ > struct rhlist_head fi_rlist; > struct list_head fi_stateids; > union { > > You can add this to 13 and 14: Reviewed-by: Jeff Layton <jlayton@redhat.com> That said, I'd probably prefer to see the two squashed together, since you're adding unused infrastructure in 13.
On Mon, 2022-10-31 at 12:54 -0400, Jeff Layton wrote: > On Fri, 2022-10-28 at 10:48 -0400, Chuck Lever wrote: > > fh_match() is costly, especially when filehandles are large (as is > > the case for NFSv4). It needs to be used sparingly when searching > > data structures. Unfortunately, with common workloads, I see > > multiple thousands of objects stored in file_hashtbl[], which has > > just 256 buckets, making its bucket hash chains quite lengthy. > > > > Walking long hash chains with the state_lock held blocks other > > activity that needs that lock. Sizable hash chains are a common > > occurrence once the server has handed out some delegations, for > > example -- IIUC, each delegated file is held open on the server by > > an nfs4_file object. > > > > To help mitigate the cost of searching with fh_match(), replace the > > nfs4_file hash table with an rhashtable, which can dynamically > > resize its bucket array to minimize hash chain length. > > > > The result of this modification is an improvement in the latency of > > NFSv4 operations, and the reduction of nfsd CPU utilization due to > > eliminating the cost of multiple calls to fh_match() and reducing > > the CPU cache misses incurred while walking long hash chains in the > > nfs4_file hash table. 
> > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com> > > Reviewed-by: NeilBrown <neilb@suse.de> > > --- > > fs/nfsd/nfs4state.c | 77 ++++++++++++++++++++++++++------------------------- > > fs/nfsd/state.h | 4 --- > > 2 files changed, 40 insertions(+), 41 deletions(-) > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > > index c2ef2db9c84c..c78b66e678dd 100644 > > --- a/fs/nfsd/nfs4state.c > > +++ b/fs/nfsd/nfs4state.c > > @@ -591,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu) > > void > > put_nfs4_file(struct nfs4_file *fi) > > { > > - might_lock(&state_lock); > > - > > - if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { > > + if (refcount_dec_and_test(&fi->fi_ref)) { > > nfsd4_file_hash_remove(fi); > > - spin_unlock(&state_lock); > > WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); > > WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); > > call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); > > @@ -709,20 +706,6 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) > > return ret & OWNER_HASH_MASK; > > } > > > > -/* hash table for nfs4_file */ > > -#define FILE_HASH_BITS 8 > > -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) > > - > > -static unsigned int file_hashval(const struct svc_fh *fh) > > -{ > > - struct inode *inode = d_inode(fh->fh_dentry); > > - > > - /* XXX: why not (here & in file cache) use inode? 
*/ > > - return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS); > > -} > > - > > -static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; > > - > > static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp; > > > > static const struct rhashtable_params nfs4_file_rhash_params = { > > @@ -4687,12 +4670,14 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) > > static noinline_for_stack struct nfs4_file * > > nfsd4_file_hash_lookup(const struct svc_fh *fhp) > > { > > - unsigned int hashval = file_hashval(fhp); > > + struct inode *inode = d_inode(fhp->fh_dentry); > > + struct rhlist_head *tmp, *list; > > struct nfs4_file *fi; > > > > rcu_read_lock(); > > - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, > > - lockdep_is_held(&state_lock)) { > > + list = rhltable_lookup(&nfs4_file_rhltable, &inode, > > + nfs4_file_rhash_params); > > + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { > > if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { > > if (refcount_inc_not_zero(&fi->fi_ref)) { > > rcu_read_unlock(); > > @@ -4706,40 +4691,56 @@ nfsd4_file_hash_lookup(const struct svc_fh *fhp) > > > > /* > > * On hash insertion, identify entries with the same inode but > > - * distinct filehandles. They will all be in the same hash bucket > > - * because nfs4_file's are hashed by the address in the fi_inode > > - * field. > > + * distinct filehandles. They will all be on the list returned > > + * by rhltable_lookup(). > > + * > > + * inode->i_lock prevents racing insertions from adding an entry > > + * for the same inode/fhp pair twice. 
> > */ > > static noinline_for_stack struct nfs4_file * > > nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) > > { > > - unsigned int hashval = file_hashval(fhp); > > + struct inode *inode = d_inode(fhp->fh_dentry); > > + struct rhlist_head *tmp, *list; > > struct nfs4_file *ret = NULL; > > bool alias_found = false; > > struct nfs4_file *fi; > > + int err; > > > > - spin_lock(&state_lock); > > - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, > > - lockdep_is_held(&state_lock)) { > > + rcu_read_lock(); > > + spin_lock(&inode->i_lock); > > + > > + list = rhltable_lookup(&nfs4_file_rhltable, &inode, > > + nfs4_file_rhash_params); > > + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { > > if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { > > if (refcount_inc_not_zero(&fi->fi_ref)) > > ret = fi; > > - } else if (d_inode(fhp->fh_dentry) == fi->fi_inode) > > + } else > > fi->fi_aliased = alias_found = true; > > } > > - if (likely(ret == NULL)) { > > - nfsd4_file_init(fhp, new); > > - hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); > > - new->fi_aliased = alias_found; > > - ret = new; > > - } > > - spin_unlock(&state_lock); > > + if (ret) > > + goto out_unlock; > > + > > + nfsd4_file_init(fhp, new); > > + err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist, > > + nfs4_file_rhash_params); > > + if (err) > > + goto out_unlock; > > + > > + new->fi_aliased = alias_found; > > + ret = new; > > + > > +out_unlock: > > + spin_unlock(&inode->i_lock); > > + rcu_read_unlock(); > > return ret; > > } > > > > static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) > > { > > - hlist_del_rcu(&fi->fi_hash); > > + rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist, > > + nfs4_file_rhash_params); > > } > > > > /* > > @@ -5629,6 +5630,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf > > * If not found, create the nfs4_file struct > > */ > > fp = nfsd4_file_hash_insert(open->op_file, 
current_fh); > > + if (unlikely(!fp)) > > + return nfserr_jukebox; > > if (fp != open->op_file) { > > status = nfs4_check_deleg(cl, open, &dp); > > if (status) > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > > index 190fc7e418a4..eadd7f465bf5 100644 > > --- a/fs/nfsd/state.h > > +++ b/fs/nfsd/state.h > > @@ -536,16 +536,12 @@ struct nfs4_clnt_odstate { > > * inode can have multiple filehandles associated with it, so there is > > * (potentially) a many to one relationship between this struct and struct > > * inode. > > - * > > - * These are hashed by filehandle in the file_hashtbl, which is protected by > > - * the global state_lock spinlock. > > */ > > struct nfs4_file { > > refcount_t fi_ref; > > struct inode * fi_inode; > > bool fi_aliased; > > spinlock_t fi_lock; > > - struct hlist_node fi_hash; /* hash on fi_fhandle */ > > struct rhlist_head fi_rlist; > > struct list_head fi_stateids; > > union { > > > > > > You can add this to 13 and 14: > > Reviewed-by: Jeff Layton <jlayton@redhat.com> > > That said, I'd probably prefer to see the two squashed together, since > you're adding unused infrastructure in 13. > Erm, maybe make that my kernel.org address instead? I try to use that for upstream work.
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c2ef2db9c84c..c78b66e678dd 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -591,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu) void put_nfs4_file(struct nfs4_file *fi) { - might_lock(&state_lock); - - if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { + if (refcount_dec_and_test(&fi->fi_ref)) { nfsd4_file_hash_remove(fi); - spin_unlock(&state_lock); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); @@ -709,20 +706,6 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) return ret & OWNER_HASH_MASK; } -/* hash table for nfs4_file */ -#define FILE_HASH_BITS 8 -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) - -static unsigned int file_hashval(const struct svc_fh *fh) -{ - struct inode *inode = d_inode(fh->fh_dentry); - - /* XXX: why not (here & in file cache) use inode? */ - return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS); -} - -static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; - static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp; static const struct rhashtable_params nfs4_file_rhash_params = { @@ -4687,12 +4670,14 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) static noinline_for_stack struct nfs4_file * nfsd4_file_hash_lookup(const struct svc_fh *fhp) { - unsigned int hashval = file_hashval(fhp); + struct inode *inode = d_inode(fhp->fh_dentry); + struct rhlist_head *tmp, *list; struct nfs4_file *fi; rcu_read_lock(); - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { + list = rhltable_lookup(&nfs4_file_rhltable, &inode, + nfs4_file_rhash_params); + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { if (refcount_inc_not_zero(&fi->fi_ref)) { rcu_read_unlock(); @@ -4706,40 +4691,56 @@ nfsd4_file_hash_lookup(const struct 
svc_fh *fhp) /* * On hash insertion, identify entries with the same inode but - * distinct filehandles. They will all be in the same hash bucket - * because nfs4_file's are hashed by the address in the fi_inode - * field. + * distinct filehandles. They will all be on the list returned + * by rhltable_lookup(). + * + * inode->i_lock prevents racing insertions from adding an entry + * for the same inode/fhp pair twice. */ static noinline_for_stack struct nfs4_file * nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) { - unsigned int hashval = file_hashval(fhp); + struct inode *inode = d_inode(fhp->fh_dentry); + struct rhlist_head *tmp, *list; struct nfs4_file *ret = NULL; bool alias_found = false; struct nfs4_file *fi; + int err; - spin_lock(&state_lock); - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { + rcu_read_lock(); + spin_lock(&inode->i_lock); + + list = rhltable_lookup(&nfs4_file_rhltable, &inode, + nfs4_file_rhash_params); + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { if (refcount_inc_not_zero(&fi->fi_ref)) ret = fi; - } else if (d_inode(fhp->fh_dentry) == fi->fi_inode) + } else fi->fi_aliased = alias_found = true; } - if (likely(ret == NULL)) { - nfsd4_file_init(fhp, new); - hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); - new->fi_aliased = alias_found; - ret = new; - } - spin_unlock(&state_lock); + if (ret) + goto out_unlock; + + nfsd4_file_init(fhp, new); + err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist, + nfs4_file_rhash_params); + if (err) + goto out_unlock; + + new->fi_aliased = alias_found; + ret = new; + +out_unlock: + spin_unlock(&inode->i_lock); + rcu_read_unlock(); return ret; } static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) { - hlist_del_rcu(&fi->fi_hash); + rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist, + nfs4_file_rhash_params); } /* @@ -5629,6 +5630,8 @@ 
nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * If not found, create the nfs4_file struct */ fp = nfsd4_file_hash_insert(open->op_file, current_fh); + if (unlikely(!fp)) + return nfserr_jukebox; if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 190fc7e418a4..eadd7f465bf5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -536,16 +536,12 @@ struct nfs4_clnt_odstate { * inode can have multiple filehandles associated with it, so there is * (potentially) a many to one relationship between this struct and struct * inode. - * - * These are hashed by filehandle in the file_hashtbl, which is protected by - * the global state_lock spinlock. */ struct nfs4_file { refcount_t fi_ref; struct inode * fi_inode; bool fi_aliased; spinlock_t fi_lock; - struct hlist_node fi_hash; /* hash on fi_fhandle */ struct rhlist_head fi_rlist; struct list_head fi_stateids; union {