diff mbox

[v2,04/18] nfsd: add a new struct file caching facility to nfsd

Message ID 1438809216-4846-5-git-send-email-jeff.layton@primarydata.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jeff Layton Aug. 5, 2015, 9:13 p.m. UTC
Currently, NFSv2/3 reads and writes have to open a file, do the read or
write and then close it again for each RPC. This is highly inefficient,
especially when the underlying filesystem has a relatively slow open
routine.

This patch adds a new open file cache to knfsd. Rather than doing an
open for each RPC, the read/write handlers can call into this cache to
see if there is one already there for the correct filehandle and
NFS_MAY_READ/WRITE flags.

If there isn't an entry, then we create a new one and attempt to
perform the open. If there is, then we wait until the entry is fully
instantiated and return it if it is at the end of the wait. If it's
not, then we attempt to take over construction.

Since the main goal is to speed up NFSv2/3 I/O, we don't want to
close these files on last put of these objects. We need to keep them
around for a little while since we never know when the next READ/WRITE
will come in.

Cache entries have a hardcoded 1s timeout, and we have a recurring
workqueue job that walks the cache and purges any entries that have
expired.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/nfsd/Makefile    |   3 +-
 fs/nfsd/filecache.c | 333 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/filecache.h |  21 ++++
 fs/nfsd/nfssvc.c    |  10 +-
 4 files changed, 365 insertions(+), 2 deletions(-)
 create mode 100644 fs/nfsd/filecache.c

Comments

Kinglong Mee Aug. 7, 2015, 3:28 p.m. UTC | #1
On 8/6/2015 05:13, Jeff Layton wrote:
> Currently, NFSv2/3 reads and writes have to open a file, do the read or
> write and then close it again for each RPC. This is highly inefficient,
> especially when the underlying filesystem has a relatively slow open
> routine.
> 
> This patch adds a new open file cache to knfsd. Rather than doing an
> open for each RPC, the read/write handlers can call into this cache to
> see if there is one already there for the correct filehandle and
> NFS_MAY_READ/WRITE flags.
> 
> If there isn't an entry, then we create a new one and attempt to
> perform the open. If there is, then we wait until the entry is fully
> instantiated and return it if it is at the end of the wait. If it's
> not, then we attempt to take over construction.
> 
> Since the main goal is to speed up NFSv2/3 I/O, we don't want to
> close these files on last put of these objects. We need to keep them
> around for a little while since we never know when the next READ/WRITE
> will come in.
> 
> Cache entries have a hardcoded 1s timeout, and we have a recurring
> workqueue job that walks the cache and purges any entries that have
> expired.
> 
> Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
... snip ... 
> diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
> index 9051ee54faa3..adf7e78b8e43 100644
> --- a/fs/nfsd/filecache.h
> +++ b/fs/nfsd/filecache.h
> @@ -4,6 +4,7 @@
>  #include <linux/jhash.h>
>  #include <linux/sunrpc/xdr.h>
>  
> +#include "nfsfh.h"
>  #include "export.h"
>  
>  /* hash table for nfs4_file */
> @@ -22,4 +23,24 @@ file_hashval(struct knfsd_fh *fh)
>  	return nfsd_fh_hashval(fh) & (NFSD_FILE_HASH_SIZE - 1);
>  }
>  
> +struct nfsd_file {

There is already an nfs4_file in nfsd; it's not easy for a newcomer to distinguish the two.
More comments or a more meaningful name would be better.

> +	struct hlist_node	nf_node;
> +	struct list_head	nf_dispose;
> +	struct rcu_head		nf_rcu;
> +	struct file		*nf_file;
> +	unsigned long		nf_time;
> +#define NFSD_FILE_HASHED	(0)

Why not using hlist_unhashed()? 

thanks,
Kinglong Mee

> +#define NFSD_FILE_PENDING	(1)
> +	unsigned long		nf_flags;
> +	struct knfsd_fh		nf_handle;
> +	unsigned int		nf_hashval;
> +	atomic_t		nf_ref;
> +	unsigned char		nf_may;
> +};
> +
> +int nfsd_file_cache_init(void);
> +void nfsd_file_cache_shutdown(void);
> +void nfsd_file_put(struct nfsd_file *nf);
> +__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> +		  unsigned int may_flags, struct nfsd_file **nfp);
>  #endif /* _FS_NFSD_FILECACHE_H */
> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> index ced9944201a0..0572441e23ec 100644
> --- a/fs/nfsd/nfssvc.c
> +++ b/fs/nfsd/nfssvc.c
> @@ -23,6 +23,7 @@
>  #include "cache.h"
>  #include "vfs.h"
>  #include "netns.h"
> +#include "filecache.h"
>  
>  #define NFSDDBG_FACILITY	NFSDDBG_SVC
>  
> @@ -233,11 +234,17 @@ static int nfsd_startup_generic(int nrservs)
>  	if (!nfsd_laundry_wq)
>  		goto out_racache;
>  
> -	ret = nfs4_state_start();
> +	ret = nfsd_file_cache_init();
>  	if (ret)
>  		goto out_wq;
> +
> +	ret = nfs4_state_start();
> +	if (ret)
> +		goto out_nfsd_file;
>  	return 0;
>  
> +out_nfsd_file:
> +	nfsd_file_cache_shutdown();
>  out_wq:
>  	destroy_workqueue(nfsd_laundry_wq);
>  	nfsd_laundry_wq = NULL;
> @@ -254,6 +261,7 @@ static void nfsd_shutdown_generic(void)
>  		return;
>  
>  	nfs4_state_shutdown();
> +	nfsd_file_cache_shutdown();
>  	nfsd_racache_shutdown();
>  }
>  
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton Aug. 7, 2015, 5:18 p.m. UTC | #2
On Fri, 7 Aug 2015 23:28:41 +0800
Kinglong Mee <kinglongmee@gmail.com> wrote:

> On 8/6/2015 05:13, Jeff Layton wrote:
> > Currently, NFSv2/3 reads and writes have to open a file, do the read or
> > write and then close it again for each RPC. This is highly inefficient,
> > especially when the underlying filesystem has a relatively slow open
> > routine.
> > 
> > This patch adds a new open file cache to knfsd. Rather than doing an
> > open for each RPC, the read/write handlers can call into this cache to
> > see if there is one already there for the correct filehandle and
> > NFS_MAY_READ/WRITE flags.
> > 
> > If there isn't an entry, then we create a new one and attempt to
> > perform the open. If there is, then we wait until the entry is fully
> > instantiated and return it if it is at the end of the wait. If it's
> > not, then we attempt to take over construction.
> > 
> > Since the main goal is to speed up NFSv2/3 I/O, we don't want to
> > close these files on last put of these objects. We need to keep them
> > around for a little while since we never know when the next READ/WRITE
> > will come in.
> > 
> > Cache entries have a hardcoded 1s timeout, and we have a recurring
> > workqueue job that walks the cache and purges any entries that have
> > expired.
> > 
> > Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> ... snip ... 
> > diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
> > index 9051ee54faa3..adf7e78b8e43 100644
> > --- a/fs/nfsd/filecache.h
> > +++ b/fs/nfsd/filecache.h
> > @@ -4,6 +4,7 @@
> >  #include <linux/jhash.h>
> >  #include <linux/sunrpc/xdr.h>
> >  
> > +#include "nfsfh.h"
> >  #include "export.h"
> >  
> >  /* hash table for nfs4_file */
> > @@ -22,4 +23,24 @@ file_hashval(struct knfsd_fh *fh)
> >  	return nfsd_fh_hashval(fh) & (NFSD_FILE_HASH_SIZE - 1);
> >  }
> >  
> > +struct nfsd_file {
> 
> There is a nfs4_file in nfsd, it's not easy to distinguish them for a new folk.
> More comments or a meaningful name is better.
> 

Maybe. Again, any suggestions? My hope was that eventually we can unify
the two caches somehow but maybe that's naive.

> > +	struct hlist_node	nf_node;
> > +	struct list_head	nf_dispose;
> > +	struct rcu_head		nf_rcu;
> > +	struct file		*nf_file;
> > +	unsigned long		nf_time;
> > +#define NFSD_FILE_HASHED	(0)
> 
> Why not using hlist_unhashed()? 
> 

These entries are removed from the list using hlist_del_rcu(), and
hlist_unhashed will not return true after that.


> > +#define NFSD_FILE_PENDING	(1)
> > +	unsigned long		nf_flags;
> > +	struct knfsd_fh		nf_handle;
> > +	unsigned int		nf_hashval;
> > +	atomic_t		nf_ref;
> > +	unsigned char		nf_may;
> > +};
> > +
> > +int nfsd_file_cache_init(void);
> > +void nfsd_file_cache_shutdown(void);
> > +void nfsd_file_put(struct nfsd_file *nf);
> > +__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > +		  unsigned int may_flags, struct nfsd_file **nfp);
> >  #endif /* _FS_NFSD_FILECACHE_H */
> > diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> > index ced9944201a0..0572441e23ec 100644
> > --- a/fs/nfsd/nfssvc.c
> > +++ b/fs/nfsd/nfssvc.c
> > @@ -23,6 +23,7 @@
> >  #include "cache.h"
> >  #include "vfs.h"
> >  #include "netns.h"
> > +#include "filecache.h"
> >  
> >  #define NFSDDBG_FACILITY	NFSDDBG_SVC
> >  
> > @@ -233,11 +234,17 @@ static int nfsd_startup_generic(int nrservs)
> >  	if (!nfsd_laundry_wq)
> >  		goto out_racache;
> >  
> > -	ret = nfs4_state_start();
> > +	ret = nfsd_file_cache_init();
> >  	if (ret)
> >  		goto out_wq;
> > +
> > +	ret = nfs4_state_start();
> > +	if (ret)
> > +		goto out_nfsd_file;
> >  	return 0;
> >  
> > +out_nfsd_file:
> > +	nfsd_file_cache_shutdown();
> >  out_wq:
> >  	destroy_workqueue(nfsd_laundry_wq);
> >  	nfsd_laundry_wq = NULL;
> > @@ -254,6 +261,7 @@ static void nfsd_shutdown_generic(void)
> >  		return;
> >  
> >  	nfs4_state_shutdown();
> > +	nfsd_file_cache_shutdown();
> >  	nfsd_racache_shutdown();
> >  }
> >  
> >
Kinglong Mee Aug. 8, 2015, 12:14 a.m. UTC | #3
On 8/8/2015 01:18, Jeff Layton wrote:
> On Fri, 7 Aug 2015 23:28:41 +0800
> Kinglong Mee <kinglongmee@gmail.com> wrote:
> 
>> On 8/6/2015 05:13, Jeff Layton wrote:
>>> Currently, NFSv2/3 reads and writes have to open a file, do the read or
>>> write and then close it again for each RPC. This is highly inefficient,
>>> especially when the underlying filesystem has a relatively slow open
>>> routine.
>>>
>>> This patch adds a new open file cache to knfsd. Rather than doing an
>>> open for each RPC, the read/write handlers can call into this cache to
>>> see if there is one already there for the correct filehandle and
>>> NFS_MAY_READ/WRITE flags.
>>>
>>> If there isn't an entry, then we create a new one and attempt to
>>> perform the open. If there is, then we wait until the entry is fully
>>> instantiated and return it if it is at the end of the wait. If it's
>>> not, then we attempt to take over construction.
>>>
>>> Since the main goal is to speed up NFSv2/3 I/O, we don't want to
>>> close these files on last put of these objects. We need to keep them
>>> around for a little while since we never know when the next READ/WRITE
>>> will come in.
>>>
>>> Cache entries have a hardcoded 1s timeout, and we have a recurring
>>> workqueue job that walks the cache and purges any entries that have
>>> expired.
>>>
>>> Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
>> ... snip ... 
>>> diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
>>> index 9051ee54faa3..adf7e78b8e43 100644
>>> --- a/fs/nfsd/filecache.h
>>> +++ b/fs/nfsd/filecache.h
>>> @@ -4,6 +4,7 @@
>>>  #include <linux/jhash.h>
>>>  #include <linux/sunrpc/xdr.h>
>>>  
>>> +#include "nfsfh.h"
>>>  #include "export.h"
>>>  
>>>  /* hash table for nfs4_file */
>>> @@ -22,4 +23,24 @@ file_hashval(struct knfsd_fh *fh)
>>>  	return nfsd_fh_hashval(fh) & (NFSD_FILE_HASH_SIZE - 1);
>>>  }
>>>  
>>> +struct nfsd_file {
>>
>> There is a nfs4_file in nfsd, it's not easy to distinguish them for a new folk.
>> More comments or a meaningful name is better.
>>
> 
> Maybe. Again, any suggestions? My hope was that eventually we can unify
> the two caches somehow but maybe that's naive.

I cannot find a better name for the new file cache. More comments.

I don't agree with merging those two into one cache.

NFSv4's file cache is a stateful resource of the client which will exist until close
or lease expiry. But the nfsd_file cache is a temporary resource for NFSv2/v3 clients,
which have no state. 

Also, for nfsv4's conflict checking, should we check the temporary file cache
for nfsv2/v3 too?

> 
>>> +	struct hlist_node	nf_node;
>>> +	struct list_head	nf_dispose;
>>> +	struct rcu_head		nf_rcu;
>>> +	struct file		*nf_file;
>>> +	unsigned long		nf_time;
>>> +#define NFSD_FILE_HASHED	(0)
>>
>> Why not using hlist_unhashed()? 
>>
> 
> These entries are removed from the list using hlist_del_rcu(), and
> hlist_unhashed will not return true after that.

As I understand it, NFSD_FILE_HASHED means the file cache entry has been added to 
nfsd_file_hashtbl, and the global file count has been increased.

When hlist_del_rcu is called, the file cache entry has been deleted from the hashtbl,
and the NFSD_FILE_HASHED bit is cleared. 

As used in the code, the bit and hlist_unhashed() have the same meaning.
Is there any mistake in my understanding?

+static void
+nfsd_file_unhash(struct nfsd_file *nf)
+{
+	if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+		hlist_del_rcu(&nf->nf_node);
+		nfsd_file_count_dec();
+	}
+}

thanks,
Kinglong Mee

> 
> 
>>> +#define NFSD_FILE_PENDING	(1)
>>> +	unsigned long		nf_flags;
>>> +	struct knfsd_fh		nf_handle;
>>> +	unsigned int		nf_hashval;
>>> +	atomic_t		nf_ref;
>>> +	unsigned char		nf_may;
>>> +};
>>> +
>>> +int nfsd_file_cache_init(void);
>>> +void nfsd_file_cache_shutdown(void);
>>> +void nfsd_file_put(struct nfsd_file *nf);
>>> +__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>> +		  unsigned int may_flags, struct nfsd_file **nfp);
>>>  #endif /* _FS_NFSD_FILECACHE_H */
>>> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
>>> index ced9944201a0..0572441e23ec 100644
>>> --- a/fs/nfsd/nfssvc.c
>>> +++ b/fs/nfsd/nfssvc.c
>>> @@ -23,6 +23,7 @@
>>>  #include "cache.h"
>>>  #include "vfs.h"
>>>  #include "netns.h"
>>> +#include "filecache.h"
>>>  
>>>  #define NFSDDBG_FACILITY	NFSDDBG_SVC
>>>  
>>> @@ -233,11 +234,17 @@ static int nfsd_startup_generic(int nrservs)
>>>  	if (!nfsd_laundry_wq)
>>>  		goto out_racache;
>>>  
>>> -	ret = nfs4_state_start();
>>> +	ret = nfsd_file_cache_init();
>>>  	if (ret)
>>>  		goto out_wq;
>>> +
>>> +	ret = nfs4_state_start();
>>> +	if (ret)
>>> +		goto out_nfsd_file;
>>>  	return 0;
>>>  
>>> +out_nfsd_file:
>>> +	nfsd_file_cache_shutdown();
>>>  out_wq:
>>>  	destroy_workqueue(nfsd_laundry_wq);
>>>  	nfsd_laundry_wq = NULL;
>>> @@ -254,6 +261,7 @@ static void nfsd_shutdown_generic(void)
>>>  		return;
>>>  
>>>  	nfs4_state_shutdown();
>>> +	nfsd_file_cache_shutdown();
>>>  	nfsd_racache_shutdown();
>>>  }
>>>  
>>>
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton Aug. 8, 2015, 10:36 a.m. UTC | #4
On Sat, 8 Aug 2015 08:14:14 +0800
Kinglong Mee <kinglongmee@gmail.com> wrote:

> On 8/8/2015 01:18, Jeff Layton wrote:
> > On Fri, 7 Aug 2015 23:28:41 +0800
> > Kinglong Mee <kinglongmee@gmail.com> wrote:
> > 
> >> On 8/6/2015 05:13, Jeff Layton wrote:
> >>> Currently, NFSv2/3 reads and writes have to open a file, do the read or
> >>> write and then close it again for each RPC. This is highly inefficient,
> >>> especially when the underlying filesystem has a relatively slow open
> >>> routine.
> >>>
> >>> This patch adds a new open file cache to knfsd. Rather than doing an
> >>> open for each RPC, the read/write handlers can call into this cache to
> >>> see if there is one already there for the correct filehandle and
> >>> NFS_MAY_READ/WRITE flags.
> >>>
> >>> If there isn't an entry, then we create a new one and attempt to
> >>> perform the open. If there is, then we wait until the entry is fully
> >>> instantiated and return it if it is at the end of the wait. If it's
> >>> not, then we attempt to take over construction.
> >>>
> >>> Since the main goal is to speed up NFSv2/3 I/O, we don't want to
> >>> close these files on last put of these objects. We need to keep them
> >>> around for a little while since we never know when the next READ/WRITE
> >>> will come in.
> >>>
> >>> Cache entries have a hardcoded 1s timeout, and we have a recurring
> >>> workqueue job that walks the cache and purges any entries that have
> >>> expired.
> >>>
> >>> Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> >> ... snip ... 
> >>> diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
> >>> index 9051ee54faa3..adf7e78b8e43 100644
> >>> --- a/fs/nfsd/filecache.h
> >>> +++ b/fs/nfsd/filecache.h
> >>> @@ -4,6 +4,7 @@
> >>>  #include <linux/jhash.h>
> >>>  #include <linux/sunrpc/xdr.h>
> >>>  
> >>> +#include "nfsfh.h"
> >>>  #include "export.h"
> >>>  
> >>>  /* hash table for nfs4_file */
> >>> @@ -22,4 +23,24 @@ file_hashval(struct knfsd_fh *fh)
> >>>  	return nfsd_fh_hashval(fh) & (NFSD_FILE_HASH_SIZE - 1);
> >>>  }
> >>>  
> >>> +struct nfsd_file {
> >>
> >> There is a nfs4_file in nfsd, it's not easy to distinguish them for a new folk.
> >> More comments or a meaningful name is better.
> >>
> > 
> > Maybe. Again, any suggestions? My hope was that eventually we can unify
> > the two caches somehow but maybe that's naive.
> 
> I cannot find a better name for the new file cache. More comments.
> 
> I don't agree with merging those two into one cache.
> 
> nfsv4's file cache is a state resource of client which will exist since close
> or lease expire. But nfsd_file cache is a temporary resource for nfsv2/v3 client
> without state. 
> 

You're probably right here. It was idle thought from when I first
started this work. What we probably would want to do however is to
layer the nfs4_file cache on top of this cache instead of having it
manage filps on its own.

I tried to design this cache so that it can handle O_RDWR opens, even
though the current callers don't actually ever request those. It should
be possible to hook up the nfs4_file cache to it, though I'd prefer to
wait until we have this code in place first and then do that later.

> Also, for nfsv4's conflict checking, should we check the temporary file cache
> for nfsv2/v3 too?
> 

You mean for share/deny modes? We traditionally have not done that, and
I don't see a compelling reason to start now. That would be a separate
project in its own right, IMO. We'd need to lift the share/deny mode
handling into this new cache.

There's also the problem of there not being any protocol support for
that in NFSv2/3. What would we return to the client if there's a deny
mode conflict when it's trying to do (e.g.) a READ?


> > 
> >>> +	struct hlist_node	nf_node;
> >>> +	struct list_head	nf_dispose;
> >>> +	struct rcu_head		nf_rcu;
> >>> +	struct file		*nf_file;
> >>> +	unsigned long		nf_time;
> >>> +#define NFSD_FILE_HASHED	(0)
> >>
> >> Why not using hlist_unhashed()? 
> >>
> > 
> > These entries are removed from the list using hlist_del_rcu(), and
> > hlist_unhashed will not return true after that.
> 
> As I understand, NFSD_FILE_HASHED means the file cache is added to 
> nfsd_file_hashtbl, and increased the global file count.
> 
> With calling hlist_del_rcu, file cache has be deleted from the hashtbl,
> and clear the NFSD_FILE_HASHED bit. 
> 

That's correct.

> As using in the codes, the bit and hlist_unhashed have the same meaning.
> Any mistake of my understand?
> 

hlist_unhashed() won't work here:

static inline int hlist_unhashed(const struct hlist_node *h)
{
        return !h->pprev;
}

...but:

static inline void hlist_del_rcu(struct hlist_node *n)
{
        __hlist_del(n);
        n->pprev = LIST_POISON2;
}

...so after a hlist_del_rcu, hlist_unhashed will still return false. In
order to get it to return true, we'd need to use hlist_del_init, but
that would mean that we couldn't safely traverse the hash chain under
the rcu_read_lock.

> +static void
> +nfsd_file_unhash(struct nfsd_file *nf)
> +{
> +	if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> +		hlist_del_rcu(&nf->nf_node);
> +		nfsd_file_count_dec();
> +	}
> +}
> 
> thanks,
> Kinglong Mee
> 
> > 
> > 
> >>> +#define NFSD_FILE_PENDING	(1)
> >>> +	unsigned long		nf_flags;
> >>> +	struct knfsd_fh		nf_handle;
> >>> +	unsigned int		nf_hashval;
> >>> +	atomic_t		nf_ref;
> >>> +	unsigned char		nf_may;
> >>> +};
> >>> +
> >>> +int nfsd_file_cache_init(void);
> >>> +void nfsd_file_cache_shutdown(void);
> >>> +void nfsd_file_put(struct nfsd_file *nf);
> >>> +__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >>> +		  unsigned int may_flags, struct nfsd_file **nfp);
> >>>  #endif /* _FS_NFSD_FILECACHE_H */
> >>> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> >>> index ced9944201a0..0572441e23ec 100644
> >>> --- a/fs/nfsd/nfssvc.c
> >>> +++ b/fs/nfsd/nfssvc.c
> >>> @@ -23,6 +23,7 @@
> >>>  #include "cache.h"
> >>>  #include "vfs.h"
> >>>  #include "netns.h"
> >>> +#include "filecache.h"
> >>>  
> >>>  #define NFSDDBG_FACILITY	NFSDDBG_SVC
> >>>  
> >>> @@ -233,11 +234,17 @@ static int nfsd_startup_generic(int nrservs)
> >>>  	if (!nfsd_laundry_wq)
> >>>  		goto out_racache;
> >>>  
> >>> -	ret = nfs4_state_start();
> >>> +	ret = nfsd_file_cache_init();
> >>>  	if (ret)
> >>>  		goto out_wq;
> >>> +
> >>> +	ret = nfs4_state_start();
> >>> +	if (ret)
> >>> +		goto out_nfsd_file;
> >>>  	return 0;
> >>>  
> >>> +out_nfsd_file:
> >>> +	nfsd_file_cache_shutdown();
> >>>  out_wq:
> >>>  	destroy_workqueue(nfsd_laundry_wq);
> >>>  	nfsd_laundry_wq = NULL;
> >>> @@ -254,6 +261,7 @@ static void nfsd_shutdown_generic(void)
> >>>  		return;
> >>>  
> >>>  	nfs4_state_shutdown();
> >>> +	nfsd_file_cache_shutdown();
> >>>  	nfsd_racache_shutdown();
> >>>  }
> >>>  
> >>>
> > 
> >
Christoph Hellwig Aug. 9, 2015, 7:17 a.m. UTC | #5
On Wed, Aug 05, 2015 at 05:13:22PM -0400, Jeff Layton wrote:
> Currently, NFSv2/3 reads and writes have to open a file, do the read or
> write and then close it again for each RPC. This is highly inefficient,
> especially when the underlying filesystem has a relatively slow open
> routine.

.. as do many NFSv4 reads and writes as they often get special stateids
passed from the client.  Seems a little odd that we take more care
of this for the legacy protocols than the current one.

I think I'd rather see a nfs4_file cache and turn v2/3 into something
like the special stateid case.  Did you consider that?
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton Aug. 9, 2015, 11:19 a.m. UTC | #6
On Sun, 9 Aug 2015 00:17:02 -0700
Christoph Hellwig <hch@infradead.org> wrote:

> On Wed, Aug 05, 2015 at 05:13:22PM -0400, Jeff Layton wrote:
> > Currently, NFSv2/3 reads and writes have to open a file, do the read or
> > write and then close it again for each RPC. This is highly inefficient,
> > especially when the underlying filesystem has a relatively slow open
> > routine.
> 
> .. as do many NFSv4 reads and writes as they often get special stateids
> passed from the client.  Seems a little odd that we take more care
> of this for the legacy protocols than the current one.
> 

This is just an initial step. I'd like to see the nfs4_file cache
layered on top of this eventually. That's a bit more work though, and I
wanted to get this piece merged first before I did that part.

> I think I'd rather see a nfs4_file cache and turn v2/3 into something
> like the special stateid case.  Did you consider that?

I started looking at extending the nfs4_file cache to NFSv2/3, but it's
actually rather difficult...

We traditionally haven't dealt with share/deny modes in NFSv3, so we'd
need a mechanism to bypass all of that stuff for legacy protocols. We'd
also have to convert that cache from one that frees objects when the
last one is put to one that keeps them around for a bit.

That also means that the v2/3 opens have to keep around extra fields
that aren't really needed, and deal with the really godawful locking of
the NFSv4 code.

I really think this approach is a better one. We can still use this
cache from the NFSv4 code and wiring it in shouldn't be too hard. It's
mostly a matter of plumbing in struct nfsd_file objects where that code
is passing around struct file objects now.
Christoph Hellwig Aug. 10, 2015, 8:28 a.m. UTC | #7
> I started looking at extending the nfs4_file cache to NFSv2/3, but it's
> actually rather difficult...
> 
> We traditionally haven't dealt with share/deny modes in NFSv3, so we'd
> need a mechanism to bypass all of that stuff for legacy protocols. We'd
> also have to convert that cache from one that frees objects when the
> last one is put to one that keeps them around for a bit.
> 
> That also means that the v2/3 opens have to keep around extra fields
> that aren't really needed, and deal with the really godawful locking of
> the NFSv4 code.
> 
> I really think this approach is a better one. We can still use this
> cache from the NFSv4 code and wiring it in shouldn't be too hard. It's
> mostly a matter of plumbing in struct nfsd_file objects where that code
> is passing around struct file objects now.

Ok.  I'd still like to it wired up to NFSv4 - with my changes to make all
temporary file opens happens from nfs4_preprocess_stateid_op it should
be fairly simple.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton Aug. 10, 2015, 11:31 a.m. UTC | #8
On Mon, 10 Aug 2015 01:28:07 -0700
Christoph Hellwig <hch@infradead.org> wrote:

> > I started looking at extending the nfs4_file cache to NFSv2/3, but it's
> > actually rather difficult...
> > 
> > We traditionally haven't dealt with share/deny modes in NFSv3, so we'd
> > need a mechanism to bypass all of that stuff for legacy protocols. We'd
> > also have to convert that cache from one that frees objects when the
> > last one is put to one that keeps them around for a bit.
> > 
> > That also means that the v2/3 opens have to keep around extra fields
> > that aren't really needed, and deal with the really godawful locking of
> > the NFSv4 code.
> > 
> > I really think this approach is a better one. We can still use this
> > cache from the NFSv4 code and wiring it in shouldn't be too hard. It's
> > mostly a matter of plumbing in struct nfsd_file objects where that code
> > is passing around struct file objects now.
> 
> Ok.  I'd still like to it wired up to NFSv4 - with my changes to make all
> temporary file opens happens from nfs4_preprocess_stateid_op it should
> be fairly simple.


Ok, I should be able to add that in. I'll plan to do so before I send
out a v3 of the patchset.
Kinglong Mee Aug. 10, 2015, 11:36 a.m. UTC | #9
On 8/8/2015 18:36, Jeff Layton wrote:
> On Sat, 8 Aug 2015 08:14:14 +0800
> Kinglong Mee <kinglongmee@gmail.com> wrote:
> 
>> On 8/8/2015 01:18, Jeff Layton wrote:
>>> On Fri, 7 Aug 2015 23:28:41 +0800
>>> Kinglong Mee <kinglongmee@gmail.com> wrote:
>>>
>>>> On 8/6/2015 05:13, Jeff Layton wrote:
>>>>> Currently, NFSv2/3 reads and writes have to open a file, do the read or
>>>>> write and then close it again for each RPC. This is highly inefficient,
>>>>> especially when the underlying filesystem has a relatively slow open
>>>>> routine.
>>>>>
>>>>> This patch adds a new open file cache to knfsd. Rather than doing an
>>>>> open for each RPC, the read/write handlers can call into this cache to
>>>>> see if there is one already there for the correct filehandle and
>>>>> NFS_MAY_READ/WRITE flags.
>>>>>
>>>>> If there isn't an entry, then we create a new one and attempt to
>>>>> perform the open. If there is, then we wait until the entry is fully
>>>>> instantiated and return it if it is at the end of the wait. If it's
>>>>> not, then we attempt to take over construction.
>>>>>
>>>>> Since the main goal is to speed up NFSv2/3 I/O, we don't want to
>>>>> close these files on last put of these objects. We need to keep them
>>>>> around for a little while since we never know when the next READ/WRITE
>>>>> will come in.
>>>>>
>>>>> Cache entries have a hardcoded 1s timeout, and we have a recurring
>>>>> workqueue job that walks the cache and purges any entries that have
>>>>> expired.
>>>>>
>>>>> Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
>>>> ... snip ... 
>>>>> diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
>>>>> index 9051ee54faa3..adf7e78b8e43 100644
>>>>> --- a/fs/nfsd/filecache.h
>>>>> +++ b/fs/nfsd/filecache.h
>>>>> @@ -4,6 +4,7 @@
>>>>>  #include <linux/jhash.h>
>>>>>  #include <linux/sunrpc/xdr.h>
>>>>>  
>>>>> +#include "nfsfh.h"
>>>>>  #include "export.h"
>>>>>  
>>>>>  /* hash table for nfs4_file */
>>>>> @@ -22,4 +23,24 @@ file_hashval(struct knfsd_fh *fh)
>>>>>  	return nfsd_fh_hashval(fh) & (NFSD_FILE_HASH_SIZE - 1);
>>>>>  }
>>>>>  
>>>>> +struct nfsd_file {
>>>>
>>>> There is a nfs4_file in nfsd, it's not easy to distinguish them for a new folk.
>>>> More comments or a meaningful name is better.
>>>>
>>>
>>> Maybe. Again, any suggestions? My hope was that eventually we can unify
>>> the two caches somehow but maybe that's naive.
>>
>> I cannot find a better name for the new file cache. More comments.
>>
>> I don't agree with merging those two into one cache.
>>
>> nfsv4's file cache is a state resource of client which will exist since close
>> or lease expire. But nfsd_file cache is a temporary resource for nfsv2/v3 client
>> without state. 
>>
> 
> You're probably right here. It was idle thought from when I first
> started this work. What we probably would want to do however is to
> layer the nfs4_file cache on top of this cache instead of having it
> manage filps on its own.
> 
> I tried to design this cache so that it can handle O_RDWR opens, even
> though the current callers don't actually ever request those. It should
> be possible to hook up the nfs4_file cache to it, though I'd prefer to
> wait until we have this code in place first and then do that later.
> 
>> Also, for nfsv4's conflict checking, should we check the temporary file cache
>> for nfsv2/v3 too?
>>
> 
> You mean for share/deny modes? We traditionally have not done that, and
> I don't see a compelling reason to start now. That would be a separate
> project in its own right, IMO. We'd need to lift the share/deny mode
> handling into this new cache.

OK.
I will look forward to the new project.

> 
> There's also the problem of there not being any protocol support for
> that in NFSv2/3. What would we return to the client if there's a deny
> mode conflict when it's trying to do (e.g.) a READ?
> 

It's my worry too.
Without this cache, this case only influences a single NFSv2/3 RPC's processing time,
but with this cache, it's more than 1s at worst.

> 
>>>
>>>>> +	struct hlist_node	nf_node;
>>>>> +	struct list_head	nf_dispose;
>>>>> +	struct rcu_head		nf_rcu;
>>>>> +	struct file		*nf_file;
>>>>> +	unsigned long		nf_time;
>>>>> +#define NFSD_FILE_HASHED	(0)
>>>>
>>>> Why not using hlist_unhashed()? 
>>>>
>>>
>>> These entries are removed from the list using hlist_del_rcu(), and
>>> hlist_unhashed will not return true after that.
>>
>> As I understand, NFSD_FILE_HASHED means the file cache is added to 
>> nfsd_file_hashtbl, and increased the global file count.
>>
>> With calling hlist_del_rcu, file cache has be deleted from the hashtbl,
>> and clear the NFSD_FILE_HASHED bit. 
>>
> 
> That's correct.
> 
>> As using in the codes, the bit and hlist_unhashed have the same meaning.
>> Any mistake of my understand?
>>
> 
> hlist_unhashed() won't work here:
> 
> static inline int hlist_unhashed(const struct hlist_node *h)
> {
>         return !h->pprev;
> }
> 
> ...but:
> 
> static inline void hlist_del_rcu(struct hlist_node *n)
> {
>         __hlist_del(n);
>         n->pprev = LIST_POISON2;
> }

Got it.
You are right.

thanks,
Kinglong Mee
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 9a6028e120c6..8908bb467727 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -10,7 +10,8 @@  obj-$(CONFIG_NFSD)	+= nfsd.o
 nfsd-y			+= trace.o
 
 nfsd-y 			+= nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
-			   export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
+			   export.o auth.o lockd.o nfscache.o nfsxdr.o \
+			   stats.o filecache.o
 nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
 nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
 nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
new file mode 100644
index 000000000000..5278b8d9e79a
--- /dev/null
+++ b/fs/nfsd/filecache.c
@@ -0,0 +1,333 @@ 
+/*
+ * Open file cache.
+ *
+ * (c) 2015 - Jeff Layton <jeff.layton@primarydata.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+
+#include "vfs.h"
+#include "nfsd.h"
+#include "nfsfh.h"
+#include "filecache.h"
+
+#define NFSDDBG_FACILITY	NFSDDBG_FH
+
+/* Min time we should keep around a file cache entry */
+#define NFSD_FILE_EXPIRE	(HZ)
+
+/* We only care about NFSD_MAY_READ/WRITE for this cache */
+#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE)
+
+struct nfsd_fcache_bucket {
+	struct hlist_head	nfb_head;
+	spinlock_t		nfb_lock;
+};
+
+static struct nfsd_fcache_bucket	*nfsd_file_hashtbl;
+
+/* Count of hashed nfsd_file objects */
+static atomic_t				nfsd_file_count;
+
+/* Periodic job for cleaning nfsd_file cache */
+static struct delayed_work		nfsd_file_cache_clean_work;
+
+static void
+nfsd_file_count_inc(void)
+{
+	if (atomic_inc_return(&nfsd_file_count) == 1)
+		queue_delayed_work(nfsd_laundry_wq, &nfsd_file_cache_clean_work,
+					NFSD_FILE_EXPIRE);
+}
+
+static void
+nfsd_file_count_dec(void)
+{
+	if (atomic_dec_and_test(&nfsd_file_count))
+		cancel_delayed_work(&nfsd_file_cache_clean_work);
+}
+
+static struct nfsd_file *
+nfsd_file_alloc(struct knfsd_fh *fh, unsigned int may, unsigned int hashval)
+{
+	struct nfsd_file *nf;
+
+	/* FIXME: create a new slabcache for these? */
+	nf = kzalloc(sizeof(*nf), GFP_KERNEL);
+	if (nf) {
+		INIT_HLIST_NODE(&nf->nf_node);
+		INIT_LIST_HEAD(&nf->nf_dispose);
+		nf->nf_time = jiffies;
+		fh_copy_shallow(&nf->nf_handle, fh);
+		nf->nf_hashval = hashval;
+		atomic_set(&nf->nf_ref, 1);
+		nf->nf_may = NFSD_FILE_MAY_MASK & may;
+	}
+	return nf;
+}
+
+static void
+nfsd_file_put_final(struct nfsd_file *nf)
+{
+	if (nf->nf_file)
+		fput(nf->nf_file);
+	kfree_rcu(nf, nf_rcu);
+}
+
+static void
+nfsd_file_unhash(struct nfsd_file *nf)
+{
+	if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+		hlist_del_rcu(&nf->nf_node);
+		nfsd_file_count_dec();
+	}
+}
+
+static void
+nfsd_file_put_locked(struct nfsd_file *nf, struct list_head *dispose)
+{
+	if (!atomic_dec_and_test(&nf->nf_ref)) {
+		nf->nf_time = jiffies;
+		return;
+	}
+
+	nfsd_file_unhash(nf);
+	list_add(&nf->nf_dispose, dispose);
+}
+
+void
+nfsd_file_put(struct nfsd_file *nf)
+{
+	if (!atomic_dec_and_lock(&nf->nf_ref,
+				&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock)) {
+		nf->nf_time = jiffies;
+		return;
+	}
+
+	nfsd_file_unhash(nf);
+	spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+	nfsd_file_put_final(nf);
+}
+
+static void
+nfsd_file_dispose_list(struct list_head *dispose)
+{
+	struct nfsd_file *nf;
+
+	/* Drop the final reference on each entry moved to the dispose list */
+	while (!list_empty(dispose)) {
+		nf = list_first_entry(dispose, struct nfsd_file, nf_dispose);
+		list_del(&nf->nf_dispose);
+		nfsd_file_put_final(nf);
+	}
+}
+
+static void
+nfsd_file_cache_prune(void)
+{
+	unsigned int		i;
+	struct nfsd_file	*nf;
+	struct hlist_node	*tmp;
+	LIST_HEAD(dispose);
+
+	for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+		if (hlist_empty(&nfsd_file_hashtbl[i].nfb_head))
+			continue;
+
+		spin_lock(&nfsd_file_hashtbl[i].nfb_lock);
+		hlist_for_each_entry_safe(nf, tmp,
+				&nfsd_file_hashtbl[i].nfb_head, nf_node) {
+
+			/* does someone else have a reference? */
+			if (atomic_read(&nf->nf_ref) > 1)
+				continue;
+
+			/* skip it if it was touched recently (not yet expired) */
+			if (time_after(nf->nf_time + NFSD_FILE_EXPIRE, jiffies))
+				continue;
+
+			/* Ok, it's expired...unhash it */
+			nfsd_file_unhash(nf);
+
+			/* ...and put the hash reference */
+			nfsd_file_put_locked(nf, &dispose);
+		}
+		spin_unlock(&nfsd_file_hashtbl[i].nfb_lock);
+		nfsd_file_dispose_list(&dispose);
+	}
+}
+
+static void
+nfsd_file_cache_cleaner(struct work_struct *work)
+{
+	if (!atomic_read(&nfsd_file_count))
+		return;
+
+	nfsd_file_cache_prune();
+
+	if (atomic_read(&nfsd_file_count))
+		queue_delayed_work(nfsd_laundry_wq, &nfsd_file_cache_clean_work,
+					NFSD_FILE_EXPIRE);
+}
+
+int
+nfsd_file_cache_init(void)
+{
+	unsigned int i;
+
+	if (nfsd_file_hashtbl)
+		return 0;
+
+	nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE,
+				sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
+	if (!nfsd_file_hashtbl)
+		goto out_nomem;
+
+	for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+		INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head);
+		spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock);
+	}
+
+	INIT_DELAYED_WORK(&nfsd_file_cache_clean_work, nfsd_file_cache_cleaner);
+	return 0;
+out_nomem:
+	printk(KERN_ERR "nfsd: failed to init nfsd file cache\n");
+	return -ENOMEM;
+}
+
+void
+nfsd_file_cache_shutdown(void)
+{
+	unsigned int		i;
+	struct nfsd_file	*nf;
+	LIST_HEAD(dispose);
+
+	/* Stop the cleaner, then unhash and dispose of every cached file */
+	cancel_delayed_work_sync(&nfsd_file_cache_clean_work);
+	for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+		spin_lock(&nfsd_file_hashtbl[i].nfb_lock);
+		while (!hlist_empty(&nfsd_file_hashtbl[i].nfb_head)) {
+			nf = hlist_entry(nfsd_file_hashtbl[i].nfb_head.first,
+					 struct nfsd_file, nf_node);
+			nfsd_file_unhash(nf);
+			/* put the hash reference */
+			nfsd_file_put_locked(nf, &dispose);
+		}
+		spin_unlock(&nfsd_file_hashtbl[i].nfb_lock);
+		nfsd_file_dispose_list(&dispose);
+	}
+	kfree(nfsd_file_hashtbl);
+	nfsd_file_hashtbl = NULL;
+}
+
+/*
+ * Search nfsd_file_hashtbl[] for file. We hash on the filehandle and also on
+ * the NFSD_MAY_READ/WRITE flags. If the file is open for r/w, then it's usable
+ * for either.
+ */
+static struct nfsd_file *
+nfsd_file_find_locked(struct knfsd_fh *fh, unsigned int may_flags,
+			unsigned int hashval)
+{
+	struct nfsd_file *nf;
+	unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
+
+	hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
+				 nf_node) {
+		if ((need & nf->nf_may) != need)
+			continue;
+		if (fh_match(&nf->nf_handle, fh)) {
+			if (atomic_inc_not_zero(&nf->nf_ref))
+				return nf;
+		}
+	}
+	return NULL;
+}
+
+__be32
+nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		  unsigned int may_flags, struct nfsd_file **pnf)
+{
+	__be32	status = nfs_ok;
+	struct nfsd_file *nf, *new = NULL;
+	struct knfsd_fh *fh = &fhp->fh_handle;
+	unsigned int hashval = file_hashval(fh);
+
+	/* Mask off any extraneous bits */
+	may_flags &= NFSD_FILE_MAY_MASK;
+retry:
+	rcu_read_lock();
+	nf = nfsd_file_find_locked(fh, may_flags, hashval);
+	rcu_read_unlock();
+	if (nf)
+		goto wait_for_construction;
+
+	if (!new) {
+		new = nfsd_file_alloc(&fhp->fh_handle, may_flags, hashval);
+		if (!new)
+			return nfserr_jukebox;
+	}
+
+	spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+	nf = nfsd_file_find_locked(fh, may_flags, hashval);
+	if (likely(nf == NULL)) {
+		/* Take reference for the hashtable */
+		atomic_inc(&new->nf_ref);
+		__set_bit(NFSD_FILE_HASHED, &new->nf_flags);
+		__set_bit(NFSD_FILE_PENDING, &new->nf_flags);
+		hlist_add_head_rcu(&new->nf_node,
+				&nfsd_file_hashtbl[hashval].nfb_head);
+		spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+		nfsd_file_count_inc();
+		nf = new;
+		new = NULL;
+		goto open_file;
+	}
+	spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+
+wait_for_construction:
+	wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE);
+
+	/* Did construction of this file fail? */
+	if (!nf->nf_file) {
+		/*
+		 * We can only take over construction for this nfsd_file if the
+		 * MAY flags are equal. Otherwise, we put the reference and try
+		 * again.
+		 */
+		if (may_flags != nf->nf_may) {
+			nfsd_file_put(nf);
+			goto retry;
+		}
+
+		/* try to take over construction for this file */
+		if (test_and_set_bit(NFSD_FILE_PENDING, &nf->nf_flags))
+			goto wait_for_construction;
+		goto open_file;
+	}
+
+	/*
+	 * We have a file that was opened in the context of another rqst. We
+	 * must check permissions. Since we're dealing with open files here,
+	 * we always want to set the OWNER_OVERRIDE bit.
+	 */
+	status = fh_verify(rqstp, fhp, S_IFREG, may_flags);
+	if (status == nfs_ok)
+		status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
+						may_flags|NFSD_MAY_OWNER_OVERRIDE);
+out:
+	if (status == nfs_ok)
+		*pnf = nf;
+	else
+		nfsd_file_put(nf);
+
+	if (new)
+		nfsd_file_put(new);
+	return status;
+open_file:
+	status = nfsd_open(rqstp, fhp, S_IFREG, may_flags, &nf->nf_file);
+	clear_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+	wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
+	goto out;
+}
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 9051ee54faa3..adf7e78b8e43 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -4,6 +4,7 @@ 
 #include <linux/jhash.h>
 #include <linux/sunrpc/xdr.h>
 
+#include "nfsfh.h"
 #include "export.h"
 
 /* hash table for nfs4_file */
@@ -22,4 +23,24 @@  file_hashval(struct knfsd_fh *fh)
 	return nfsd_fh_hashval(fh) & (NFSD_FILE_HASH_SIZE - 1);
 }
 
+struct nfsd_file {
+	struct hlist_node	nf_node;
+	struct list_head	nf_dispose;
+	struct rcu_head		nf_rcu;
+	struct file		*nf_file;
+	unsigned long		nf_time;
+#define NFSD_FILE_HASHED	(0)
+#define NFSD_FILE_PENDING	(1)
+	unsigned long		nf_flags;
+	struct knfsd_fh		nf_handle;
+	unsigned int		nf_hashval;
+	atomic_t		nf_ref;
+	unsigned char		nf_may;
+};
+
+int nfsd_file_cache_init(void);
+void nfsd_file_cache_shutdown(void);
+void nfsd_file_put(struct nfsd_file *nf);
+__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		  unsigned int may_flags, struct nfsd_file **nfp);
 #endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ced9944201a0..0572441e23ec 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -23,6 +23,7 @@ 
 #include "cache.h"
 #include "vfs.h"
 #include "netns.h"
+#include "filecache.h"
 
 #define NFSDDBG_FACILITY	NFSDDBG_SVC
 
@@ -233,11 +234,17 @@  static int nfsd_startup_generic(int nrservs)
 	if (!nfsd_laundry_wq)
 		goto out_racache;
 
-	ret = nfs4_state_start();
+	ret = nfsd_file_cache_init();
 	if (ret)
 		goto out_wq;
+
+	ret = nfs4_state_start();
+	if (ret)
+		goto out_nfsd_file;
 	return 0;
 
+out_nfsd_file:
+	nfsd_file_cache_shutdown();
 out_wq:
 	destroy_workqueue(nfsd_laundry_wq);
 	nfsd_laundry_wq = NULL;
@@ -254,6 +261,7 @@  static void nfsd_shutdown_generic(void)
 		return;
 
 	nfs4_state_shutdown();
+	nfsd_file_cache_shutdown();
 	nfsd_racache_shutdown();
 }