diff mbox

[v3,14/20] nfsd: close cached files prior to a REMOVE or RENAME that would replace target

Message ID 1440069440-27454-15-git-send-email-jeff.layton@primarydata.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jeff Layton Aug. 20, 2015, 11:17 a.m. UTC
It's not uncommon for some workloads to do a bunch of I/O to a file and
delete it just afterward. If knfsd has a cached open file however, then
the file may still be open when the dentry is unlinked. If the
underlying filesystem is nfs, then that could trigger it to do a
sillyrename.

On a REMOVE or RENAME scan the nfsd_file cache for open files that
correspond to the inode, and proactively unhash and put their
references. This should prevent any delete-on-last-close activity from
occurring, solely due to knfsd's open file cache.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/nfsd/filecache.c | 25 +++++++++++++++++++++++++
 fs/nfsd/filecache.h |  1 +
 fs/nfsd/trace.h     | 17 +++++++++++++++++
 fs/nfsd/vfs.c       | 17 +++++++++++++++--
 4 files changed, 58 insertions(+), 2 deletions(-)

Comments

J. Bruce Fields Aug. 26, 2015, 8 p.m. UTC | #1
On Thu, Aug 20, 2015 at 07:17:14AM -0400, Jeff Layton wrote:
> It's not uncommon for some workloads to do a bunch of I/O to a file and
> delete it just afterward. If knfsd has a cached open file however, then
> the file may still be open when the dentry is unlinked. If the
> underlying filesystem is nfs, then that could trigger it to do a
> sillyrename.

Possibly worth noting that situation doesn't currently occur upstream.

(And, another justification worth noting: space used by a file should be
deallocated on last unlink or close.  People do sometimes notice if it's
not, especially if the file is large.)

> On a REMOVE or RENAME scan the nfsd_file cache for open files that
> correspond to the inode, and proactively unhash and put their
> references. This should prevent any delete-on-last-close activity from
> occurring, solely due to knfsd's open file cache.

Is there anything here to prevent a new cache entry being added after
nfsd_file_close_inode and before the file is actually removed?

--b.

> 
> Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> ---
>  fs/nfsd/filecache.c | 25 +++++++++++++++++++++++++
>  fs/nfsd/filecache.h |  1 +
>  fs/nfsd/trace.h     | 17 +++++++++++++++++
>  fs/nfsd/vfs.c       | 17 +++++++++++++++--
>  4 files changed, 58 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index e48b536762aa..4bd683f03b6e 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -283,6 +283,31 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
>  	return NULL;
>  }
>  
> +/**
> + * nfsd_file_close_inode - attempt to forcibly close a nfsd_file
> + * @inode: inode of the file to attempt to remove
> + *
> + * Walk the whole hash bucket, looking for any files that correspond to "inode".
> + * If any do, then unhash them and put the hashtable reference to them.
> + */
> +void
> +nfsd_file_close_inode(struct inode *inode)
> +{
> +	struct nfsd_file	*nf;
> +	struct hlist_node	*tmp;
> +	unsigned int		hashval = (unsigned int)hash_ptr(inode, NFSD_FILE_HASH_BITS);
> +	LIST_HEAD(dispose);
> +
> +	spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
> +	hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) {
> +		if (inode == nf->nf_inode)
> +			nfsd_file_unhash_and_release_locked(nf, &dispose);
> +	}
> +	spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
> +	trace_nfsd_file_close_inode(hashval, inode, !list_empty(&dispose));
> +	nfsd_file_dispose_list(&dispose);
> +}
> +
>  __be32
>  nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  		  unsigned int may_flags, struct nfsd_file **pnf)
> diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
> index debd558ef786..191cdb25aa66 100644
> --- a/fs/nfsd/filecache.h
> +++ b/fs/nfsd/filecache.h
> @@ -26,6 +26,7 @@ int nfsd_file_cache_init(void);
>  void nfsd_file_cache_shutdown(void);
>  void nfsd_file_put(struct nfsd_file *nf);
>  struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
> +void nfsd_file_close_inode(struct inode *inode);
>  __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  		  unsigned int may_flags, struct nfsd_file **nfp);
>  #endif /* _FS_NFSD_FILECACHE_H */
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index 2dac872d31e8..95af3b9c7b66 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -139,6 +139,23 @@ TRACE_EVENT(nfsd_file_acquire,
>  			show_nf_may(__entry->nf_may), __entry->nf_file,
>  			be32_to_cpu(__entry->status))
>  );
> +
> +TRACE_EVENT(nfsd_file_close_inode,
> +	TP_PROTO(unsigned int hash, struct inode *inode, int found),
> +	TP_ARGS(hash, inode, found),
> +	TP_STRUCT__entry(
> +		__field(unsigned int, hash)
> +		__field(struct inode *, inode)
> +		__field(int, found)
> +	),
> +	TP_fast_assign(
> +		__entry->hash = hash;
> +		__entry->inode = inode;
> +		__entry->found = found;
> +	),
> +	TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash,
> +			__entry->inode, __entry->found)
> +);
>  #endif /* _NFSD_TRACE_H */
>  
>  #undef TRACE_INCLUDE_PATH
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 6cfd96adcc71..98d3b9d96480 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -1583,6 +1583,15 @@ out_nfserr:
>  	goto out_unlock;
>  }
>  
> +static void
> +nfsd_close_cached_files(struct dentry *dentry)
> +{
> +	struct inode *inode = d_inode(dentry);
> +
> +	if (inode && S_ISREG(inode->i_mode))
> +		nfsd_file_close_inode(inode);
> +}
> +
>  /*
>   * Rename a file
>   * N.B. After this call _both_ ffhp and tfhp need an fh_put
> @@ -1652,6 +1661,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
>  	if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
>  		goto out_dput_new;
>  
> +	nfsd_close_cached_files(ndentry);
>  	host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
>  	if (!host_err) {
>  		host_err = commit_metadata(tfhp);
> @@ -1721,10 +1731,13 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
>  	if (!type)
>  		type = d_inode(rdentry)->i_mode & S_IFMT;
>  
> -	if (type != S_IFDIR)
> +	if (type != S_IFDIR) {
> +		nfsd_close_cached_files(rdentry);
>  		host_err = vfs_unlink(dirp, rdentry, NULL);
> -	else
> +	} else {
>  		host_err = vfs_rmdir(dirp, rdentry);
> +	}
> +
>  	if (!host_err)
>  		host_err = commit_metadata(fhp);
>  	dput(rdentry);
> -- 
> 2.4.3
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton Aug. 26, 2015, 10:53 p.m. UTC | #2
On Wed, 26 Aug 2015 16:00:32 -0400
"J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Thu, Aug 20, 2015 at 07:17:14AM -0400, Jeff Layton wrote:
> > It's not uncommon for some workloads to do a bunch of I/O to a file and
> > delete it just afterward. If knfsd has a cached open file however, then
> > the file may still be open when the dentry is unlinked. If the
> > underlying filesystem is nfs, then that could trigger it to do a
> > sillyrename.
> 
> Possibly worth noting that situation doesn't currently occur upstream.
> 
> (And, another justification worth noting: space used by a file should be
> deallocated on last unlink or close.  People do sometimes notice if it's
> not, especially if the file is large.)
> 

Good points.

> > On a REMOVE or RENAME scan the nfsd_file cache for open files that
> > correspond to the inode, and proactively unhash and put their
> > references. This should prevent any delete-on-last-close activity from
> > occurring, solely due to knfsd's open file cache.
> 
> Is there anything here to prevent a new cache entry being added after
> nfsd_file_close_inode and before the file is actually removed?
> 

No, nothing -- it's strictly best effort.

What might make sense is to consider looking at the dentry associated
with the struct file when putting the last reference to the nfsd_file.
If it's unhashed, then we could unhash the nfsd_file and put the hash
reference for it.

That won't prevent silly renames in the case of NFS being reexported,
of course, but it should ensure that we don't leave the thing open
indefinitely in the case of such a race.

I'll  have to think about that one as well...

> --b.
> 
> > 
> > Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> > ---
> >  fs/nfsd/filecache.c | 25 +++++++++++++++++++++++++
> >  fs/nfsd/filecache.h |  1 +
> >  fs/nfsd/trace.h     | 17 +++++++++++++++++
> >  fs/nfsd/vfs.c       | 17 +++++++++++++++--
> >  4 files changed, 58 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index e48b536762aa..4bd683f03b6e 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -283,6 +283,31 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
> >  	return NULL;
> >  }
> >  
> > +/**
> > + * nfsd_file_close_inode - attempt to forcibly close a nfsd_file
> > + * @inode: inode of the file to attempt to remove
> > + *
> > + * Walk the whole hash bucket, looking for any files that correspond to "inode".
> > + * If any do, then unhash them and put the hashtable reference to them.
> > + */
> > +void
> > +nfsd_file_close_inode(struct inode *inode)
> > +{
> > +	struct nfsd_file	*nf;
> > +	struct hlist_node	*tmp;
> > +	unsigned int		hashval = (unsigned int)hash_ptr(inode, NFSD_FILE_HASH_BITS);
> > +	LIST_HEAD(dispose);
> > +
> > +	spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
> > +	hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) {
> > +		if (inode == nf->nf_inode)
> > +			nfsd_file_unhash_and_release_locked(nf, &dispose);
> > +	}
> > +	spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
> > +	trace_nfsd_file_close_inode(hashval, inode, !list_empty(&dispose));
> > +	nfsd_file_dispose_list(&dispose);
> > +}
> > +
> >  __be32
> >  nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >  		  unsigned int may_flags, struct nfsd_file **pnf)
> > diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
> > index debd558ef786..191cdb25aa66 100644
> > --- a/fs/nfsd/filecache.h
> > +++ b/fs/nfsd/filecache.h
> > @@ -26,6 +26,7 @@ int nfsd_file_cache_init(void);
> >  void nfsd_file_cache_shutdown(void);
> >  void nfsd_file_put(struct nfsd_file *nf);
> >  struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
> > +void nfsd_file_close_inode(struct inode *inode);
> >  __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >  		  unsigned int may_flags, struct nfsd_file **nfp);
> >  #endif /* _FS_NFSD_FILECACHE_H */
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index 2dac872d31e8..95af3b9c7b66 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -139,6 +139,23 @@ TRACE_EVENT(nfsd_file_acquire,
> >  			show_nf_may(__entry->nf_may), __entry->nf_file,
> >  			be32_to_cpu(__entry->status))
> >  );
> > +
> > +TRACE_EVENT(nfsd_file_close_inode,
> > +	TP_PROTO(unsigned int hash, struct inode *inode, int found),
> > +	TP_ARGS(hash, inode, found),
> > +	TP_STRUCT__entry(
> > +		__field(unsigned int, hash)
> > +		__field(struct inode *, inode)
> > +		__field(int, found)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->hash = hash;
> > +		__entry->inode = inode;
> > +		__entry->found = found;
> > +	),
> > +	TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash,
> > +			__entry->inode, __entry->found)
> > +);
> >  #endif /* _NFSD_TRACE_H */
> >  
> >  #undef TRACE_INCLUDE_PATH
> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index 6cfd96adcc71..98d3b9d96480 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -1583,6 +1583,15 @@ out_nfserr:
> >  	goto out_unlock;
> >  }
> >  
> > +static void
> > +nfsd_close_cached_files(struct dentry *dentry)
> > +{
> > +	struct inode *inode = d_inode(dentry);
> > +
> > +	if (inode && S_ISREG(inode->i_mode))
> > +		nfsd_file_close_inode(inode);
> > +}
> > +
> >  /*
> >   * Rename a file
> >   * N.B. After this call _both_ ffhp and tfhp need an fh_put
> > @@ -1652,6 +1661,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
> >  	if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
> >  		goto out_dput_new;
> >  
> > +	nfsd_close_cached_files(ndentry);
> >  	host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
> >  	if (!host_err) {
> >  		host_err = commit_metadata(tfhp);
> > @@ -1721,10 +1731,13 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
> >  	if (!type)
> >  		type = d_inode(rdentry)->i_mode & S_IFMT;
> >  
> > -	if (type != S_IFDIR)
> > +	if (type != S_IFDIR) {
> > +		nfsd_close_cached_files(rdentry);
> >  		host_err = vfs_unlink(dirp, rdentry, NULL);
> > -	else
> > +	} else {
> >  		host_err = vfs_rmdir(dirp, rdentry);
> > +	}
> > +
> >  	if (!host_err)
> >  		host_err = commit_metadata(fhp);
> >  	dput(rdentry);
> > -- 
> > 2.4.3
J. Bruce Fields Aug. 27, 2015, 1:38 p.m. UTC | #3
On Wed, Aug 26, 2015 at 06:53:31PM -0400, Jeff Layton wrote:
> On Wed, 26 Aug 2015 16:00:32 -0400
> "J. Bruce Fields" <bfields@fieldses.org> wrote:
> 
> > On Thu, Aug 20, 2015 at 07:17:14AM -0400, Jeff Layton wrote:
> > > It's not uncommon for some workloads to do a bunch of I/O to a file and
> > > delete it just afterward. If knfsd has a cached open file however, then
> > > the file may still be open when the dentry is unlinked. If the
> > > underlying filesystem is nfs, then that could trigger it to do a
> > > sillyrename.
> > 
> > Possibly worth noting that situation doesn't currently occur upstream.
> > 
> > (And, another justification worth noting: space used by a file should be
> > deallocated on last unlink or close.  People do sometimes notice if it's
> > not, especially if the file is large.)
> > 
> 
> Good points.
> 
> > > On a REMOVE or RENAME scan the nfsd_file cache for open files that
> > > correspond to the inode, and proactively unhash and put their
> > > references. This should prevent any delete-on-last-close activity from
> > > occurring, solely due to knfsd's open file cache.
> > 
> > Is there anything here to prevent a new cache entry being added after
> > nfsd_file_close_inode and before the file is actually removed?
> > 
> 
> No, nothing -- it's strictly best effort.

Unfortunately I think this is something we really want to guarantee.

--b.

> What might make sense is to consider looking at the dentry associated
> with the struct file when putting the last reference to the nfsd_file.
> If it's unhashed, then we could unhash the nfsd_file and put the hash
> reference for it.
> 
> That won't prevent silly renames in the case of NFS being reexported,
> of course, but it should ensure that we don't leave the thing open
> indefinitely in the case of such a race.
> 
> I'll  have to think about that one as well...
> 
> > --b.
> > 
> > > 
> > > Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> > > ---
> > >  fs/nfsd/filecache.c | 25 +++++++++++++++++++++++++
> > >  fs/nfsd/filecache.h |  1 +
> > >  fs/nfsd/trace.h     | 17 +++++++++++++++++
> > >  fs/nfsd/vfs.c       | 17 +++++++++++++++--
> > >  4 files changed, 58 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > > index e48b536762aa..4bd683f03b6e 100644
> > > --- a/fs/nfsd/filecache.c
> > > +++ b/fs/nfsd/filecache.c
> > > @@ -283,6 +283,31 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
> > >  	return NULL;
> > >  }
> > >  
> > > +/**
> > > + * nfsd_file_close_inode - attempt to forcibly close a nfsd_file
> > > + * @inode: inode of the file to attempt to remove
> > > + *
> > > + * Walk the whole hash bucket, looking for any files that correspond to "inode".
> > > + * If any do, then unhash them and put the hashtable reference to them.
> > > + */
> > > +void
> > > +nfsd_file_close_inode(struct inode *inode)
> > > +{
> > > +	struct nfsd_file	*nf;
> > > +	struct hlist_node	*tmp;
> > > +	unsigned int		hashval = (unsigned int)hash_ptr(inode, NFSD_FILE_HASH_BITS);
> > > +	LIST_HEAD(dispose);
> > > +
> > > +	spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
> > > +	hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) {
> > > +		if (inode == nf->nf_inode)
> > > +			nfsd_file_unhash_and_release_locked(nf, &dispose);
> > > +	}
> > > +	spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
> > > +	trace_nfsd_file_close_inode(hashval, inode, !list_empty(&dispose));
> > > +	nfsd_file_dispose_list(&dispose);
> > > +}
> > > +
> > >  __be32
> > >  nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > >  		  unsigned int may_flags, struct nfsd_file **pnf)
> > > diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
> > > index debd558ef786..191cdb25aa66 100644
> > > --- a/fs/nfsd/filecache.h
> > > +++ b/fs/nfsd/filecache.h
> > > @@ -26,6 +26,7 @@ int nfsd_file_cache_init(void);
> > >  void nfsd_file_cache_shutdown(void);
> > >  void nfsd_file_put(struct nfsd_file *nf);
> > >  struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
> > > +void nfsd_file_close_inode(struct inode *inode);
> > >  __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > >  		  unsigned int may_flags, struct nfsd_file **nfp);
> > >  #endif /* _FS_NFSD_FILECACHE_H */
> > > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > > index 2dac872d31e8..95af3b9c7b66 100644
> > > --- a/fs/nfsd/trace.h
> > > +++ b/fs/nfsd/trace.h
> > > @@ -139,6 +139,23 @@ TRACE_EVENT(nfsd_file_acquire,
> > >  			show_nf_may(__entry->nf_may), __entry->nf_file,
> > >  			be32_to_cpu(__entry->status))
> > >  );
> > > +
> > > +TRACE_EVENT(nfsd_file_close_inode,
> > > +	TP_PROTO(unsigned int hash, struct inode *inode, int found),
> > > +	TP_ARGS(hash, inode, found),
> > > +	TP_STRUCT__entry(
> > > +		__field(unsigned int, hash)
> > > +		__field(struct inode *, inode)
> > > +		__field(int, found)
> > > +	),
> > > +	TP_fast_assign(
> > > +		__entry->hash = hash;
> > > +		__entry->inode = inode;
> > > +		__entry->found = found;
> > > +	),
> > > +	TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash,
> > > +			__entry->inode, __entry->found)
> > > +);
> > >  #endif /* _NFSD_TRACE_H */
> > >  
> > >  #undef TRACE_INCLUDE_PATH
> > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > index 6cfd96adcc71..98d3b9d96480 100644
> > > --- a/fs/nfsd/vfs.c
> > > +++ b/fs/nfsd/vfs.c
> > > @@ -1583,6 +1583,15 @@ out_nfserr:
> > >  	goto out_unlock;
> > >  }
> > >  
> > > +static void
> > > +nfsd_close_cached_files(struct dentry *dentry)
> > > +{
> > > +	struct inode *inode = d_inode(dentry);
> > > +
> > > +	if (inode && S_ISREG(inode->i_mode))
> > > +		nfsd_file_close_inode(inode);
> > > +}
> > > +
> > >  /*
> > >   * Rename a file
> > >   * N.B. After this call _both_ ffhp and tfhp need an fh_put
> > > @@ -1652,6 +1661,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
> > >  	if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
> > >  		goto out_dput_new;
> > >  
> > > +	nfsd_close_cached_files(ndentry);
> > >  	host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
> > >  	if (!host_err) {
> > >  		host_err = commit_metadata(tfhp);
> > > @@ -1721,10 +1731,13 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
> > >  	if (!type)
> > >  		type = d_inode(rdentry)->i_mode & S_IFMT;
> > >  
> > > -	if (type != S_IFDIR)
> > > +	if (type != S_IFDIR) {
> > > +		nfsd_close_cached_files(rdentry);
> > >  		host_err = vfs_unlink(dirp, rdentry, NULL);
> > > -	else
> > > +	} else {
> > >  		host_err = vfs_rmdir(dirp, rdentry);
> > > +	}
> > > +
> > >  	if (!host_err)
> > >  		host_err = commit_metadata(fhp);
> > >  	dput(rdentry);
> > > -- 
> > > 2.4.3
> 
> 
> -- 
> Jeff Layton <jlayton@poochiereds.net>
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton Aug. 28, 2015, 12:19 p.m. UTC | #4
On Thu, 27 Aug 2015 09:38:06 -0400
"J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Wed, Aug 26, 2015 at 06:53:31PM -0400, Jeff Layton wrote:
> > On Wed, 26 Aug 2015 16:00:32 -0400
> > "J. Bruce Fields" <bfields@fieldses.org> wrote:
> > 
> > > On Thu, Aug 20, 2015 at 07:17:14AM -0400, Jeff Layton wrote:
> > > > It's not uncommon for some workloads to do a bunch of I/O to a file and
> > > > delete it just afterward. If knfsd has a cached open file however, then
> > > > the file may still be open when the dentry is unlinked. If the
> > > > underlying filesystem is nfs, then that could trigger it to do a
> > > > sillyrename.
> > > 
> > > Possibly worth noting that situation doesn't currently occur upstream.
> > > 
> > > (And, another justification worth noting: space used by a file should be
> > > deallocated on last unlink or close.  People do sometimes notice if it's
> > > not, especially if the file is large.)
> > > 
> > 
> > Good points.
> > 
> > > > On a REMOVE or RENAME scan the nfsd_file cache for open files that
> > > > correspond to the inode, and proactively unhash and put their
> > > > references. This should prevent any delete-on-last-close activity from
> > > > occurring, solely due to knfsd's open file cache.
> > > 
> > > Is there anything here to prevent a new cache entry being added after
> > > nfsd_file_close_inode and before the file is actually removed?
> > > 
> > 
> > No, nothing -- it's strictly best effort.
> 
> Unfortunately I think this is something we really want to guarantee.
> 

That should be doable.

One question though -- if we hit this race, what's the right way to
handle it?

We don't want to return nfserr_stale or anything since we won't know if
the inode will really be stale after the remove completes. We could
just be removing one link from a multiply-linked inode.

We also don't want to make the caller wait out nfsd_file_acquire, as the
file could be open via NFSv4 and those references might not get put for
quite some time.

What semantics should we be aiming for?
J. Bruce Fields Aug. 28, 2015, 5:58 p.m. UTC | #5
On Fri, Aug 28, 2015 at 08:19:46AM -0400, Jeff Layton wrote:
> On Thu, 27 Aug 2015 09:38:06 -0400
> "J. Bruce Fields" <bfields@fieldses.org> wrote:
> 
> > On Wed, Aug 26, 2015 at 06:53:31PM -0400, Jeff Layton wrote:
> > > On Wed, 26 Aug 2015 16:00:32 -0400
> > > "J. Bruce Fields" <bfields@fieldses.org> wrote:
> > > 
> > > > On Thu, Aug 20, 2015 at 07:17:14AM -0400, Jeff Layton wrote:
> > > > > It's not uncommon for some workloads to do a bunch of I/O to a file and
> > > > > delete it just afterward. If knfsd has a cached open file however, then
> > > > > the file may still be open when the dentry is unlinked. If the
> > > > > underlying filesystem is nfs, then that could trigger it to do a
> > > > > sillyrename.
> > > > 
> > > > Possibly worth noting that situation doesn't currently occur upstream.
> > > > 
> > > > (And, another justification worth noting: space used by a file should be
> > > > deallocated on last unlink or close.  People do sometimes notice if it's
> > > > not, especially if the file is large.)
> > > > 
> > > 
> > > Good points.
> > > 
> > > > > On a REMOVE or RENAME scan the nfsd_file cache for open files that
> > > > > correspond to the inode, and proactively unhash and put their
> > > > > references. This should prevent any delete-on-last-close activity from
> > > > > occurring, solely due to knfsd's open file cache.
> > > > 
> > > > Is there anything here to prevent a new cache entry being added after
> > > > nfsd_file_close_inode and before the file is actually removed?
> > > > 
> > > 
> > > No, nothing -- it's strictly best effort.
> > 
> > Unfortunately I think this is something we really want to guarantee.
> > 
> 
> That should be doable.
> 
> One question though -- if we hit this race, what's the right way to
> handle it?
> 
> We don't want to return nfserr_stale or anything since we won't know if
> the inode will really be stale after the remove completes. We could
> just be removing one link from a multiply-linked inode.
> 
> We also don't want to make the caller wait out nfsd_file_acquire, as the
> file could be open via NFSv4 and those references might not get put for
> quite some time.
> 
> What semantics should we be aiming for?

Hm.

The RENAME or REMOVE has to succeed regardless.

If the other operation is an open or something lookup-like, then it has
to succeed or see ENOENT.  If it's a filehandle-based operation like
READ or WRITE then it's got to succeed or get STALE, and if the latter
then it better be really and truly gone.  Continuing to process a READ
or WRITE after the REMOVE continues is fine, I think, it just means some
application on some other v3 client has an open that we're honoring a
moment longer than we're required to.

Now I'd have to look back at your code....  If the caching were just an
optimization, we could just it off temporarily, but I don't think we
can.

Marking the file dead somehow and failing further attempts to use it
might work.  I guess that's equivalent to your suggestion to unhash it
in this case.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton Aug. 31, 2015, 4:50 p.m. UTC | #6
On Fri, 28 Aug 2015 13:58:34 -0400
"J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Fri, Aug 28, 2015 at 08:19:46AM -0400, Jeff Layton wrote:
> > On Thu, 27 Aug 2015 09:38:06 -0400
> > "J. Bruce Fields" <bfields@fieldses.org> wrote:
> > 
> > > On Wed, Aug 26, 2015 at 06:53:31PM -0400, Jeff Layton wrote:
> > > > On Wed, 26 Aug 2015 16:00:32 -0400
> > > > "J. Bruce Fields" <bfields@fieldses.org> wrote:
> > > > 
> > > > > On Thu, Aug 20, 2015 at 07:17:14AM -0400, Jeff Layton wrote:
> > > > > > It's not uncommon for some workloads to do a bunch of I/O to a file and
> > > > > > delete it just afterward. If knfsd has a cached open file however, then
> > > > > > the file may still be open when the dentry is unlinked. If the
> > > > > > underlying filesystem is nfs, then that could trigger it to do a
> > > > > > sillyrename.
> > > > > 
> > > > > Possibly worth noting that situation doesn't currently occur upstream.
> > > > > 
> > > > > (And, another justification worth noting: space used by a file should be
> > > > > deallocated on last unlink or close.  People do sometimes notice if it's
> > > > > not, especially if the file is large.)
> > > > > 
> > > > 
> > > > Good points.
> > > > 
> > > > > > On a REMOVE or RENAME scan the nfsd_file cache for open files that
> > > > > > correspond to the inode, and proactively unhash and put their
> > > > > > references. This should prevent any delete-on-last-close activity from
> > > > > > occurring, solely due to knfsd's open file cache.
> > > > > 
> > > > > Is there anything here to prevent a new cache entry being added after
> > > > > nfsd_file_close_inode and before the file is actually removed?
> > > > > 
> > > > 
> > > > No, nothing -- it's strictly best effort.
> > > 
> > > Unfortunately I think this is something we really want to guarantee.
> > > 
> > 
> > That should be doable.
> > 
> > One question though -- if we hit this race, what's the right way to
> > handle it?
> > 
> > We don't want to return nfserr_stale or anything since we won't know if
> > the inode will really be stale after the remove completes. We could
> > just be removing one link from a multiply-linked inode.
> > 
> > We also don't want to make the caller wait out nfsd_file_acquire, as the
> > file could be open via NFSv4 and those references might not get put for
> > quite some time.
> > 
> > What semantics should we be aiming for?
> 
> Hm.
> 
> The RENAME or REMOVE has to succeed regardless.
> 
> If the other operation is an open or something lookup-like, then it has
> to succeed or see ENOENT.  If it's a filehandle-based operation like
> READ or WRITE then it's got to succeed or get STALE, and if the latter
> then it better be really and truly gone.  Continuing to process a READ
> or WRITE after the REMOVE continues is fine, I think, it just means some
> application on some other v3 client has an open that we're honoring a
> moment longer than we're required to.
> 
> Now I'd have to look back at your code....  If the caching were just an
> optimization, we could just it off temporarily, but I don't think we
> can.
> 
> Marking the file dead somehow and failing further attempts to use it
> might work.  I guess that's equivalent to your suggestion to unhash it
> in this case.
> 

Yeah, though that is a rather substantive change. Right now, we just
tear these things down when we put the last reference. If we have to
prevent new references from being acquired during the duration of the
unlink, then that changes the entire semantics of the cache.

We'll need to allow some sort of object to persist in the cache (to act
as a placeholder), but still allow the struct file to be closed. I'll
have to think about how we can make that work. It'll probably make this
quite a bit more complex...
diff mbox

Patch

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index e48b536762aa..4bd683f03b6e 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -283,6 +283,31 @@  nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
 	return NULL;
 }
 
+/**
+ * nfsd_file_close_inode - attempt to forcibly close a nfsd_file
+ * @inode: inode of the file to attempt to remove
+ *
+ * Walk the whole hash bucket, looking for any files that correspond to "inode".
+ * If any do, then unhash them and put the hashtable reference to them.
+ */
+void
+nfsd_file_close_inode(struct inode *inode)
+{
+	struct nfsd_file	*nf;
+	struct hlist_node	*tmp;
+	unsigned int		hashval = (unsigned int)hash_ptr(inode, NFSD_FILE_HASH_BITS);
+	LIST_HEAD(dispose);
+
+	spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+	hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) {
+		if (inode == nf->nf_inode)
+			nfsd_file_unhash_and_release_locked(nf, &dispose);
+	}
+	spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+	trace_nfsd_file_close_inode(hashval, inode, !list_empty(&dispose));
+	nfsd_file_dispose_list(&dispose);
+}
+
 __be32
 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		  unsigned int may_flags, struct nfsd_file **pnf)
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index debd558ef786..191cdb25aa66 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -26,6 +26,7 @@  int nfsd_file_cache_init(void);
 void nfsd_file_cache_shutdown(void);
 void nfsd_file_put(struct nfsd_file *nf);
 struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
+void nfsd_file_close_inode(struct inode *inode);
 __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		  unsigned int may_flags, struct nfsd_file **nfp);
 #endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 2dac872d31e8..95af3b9c7b66 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -139,6 +139,23 @@  TRACE_EVENT(nfsd_file_acquire,
 			show_nf_may(__entry->nf_may), __entry->nf_file,
 			be32_to_cpu(__entry->status))
 );
+
+TRACE_EVENT(nfsd_file_close_inode,
+	TP_PROTO(unsigned int hash, struct inode *inode, int found),
+	TP_ARGS(hash, inode, found),
+	TP_STRUCT__entry(
+		__field(unsigned int, hash)
+		__field(struct inode *, inode)
+		__field(int, found)
+	),
+	TP_fast_assign(
+		__entry->hash = hash;
+		__entry->inode = inode;
+		__entry->found = found;
+	),
+	TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash,
+			__entry->inode, __entry->found)
+);
 #endif /* _NFSD_TRACE_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6cfd96adcc71..98d3b9d96480 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1583,6 +1583,15 @@  out_nfserr:
 	goto out_unlock;
 }
 
+static void
+nfsd_close_cached_files(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+
+	if (inode && S_ISREG(inode->i_mode))
+		nfsd_file_close_inode(inode);
+}
+
 /*
  * Rename a file
  * N.B. After this call _both_ ffhp and tfhp need an fh_put
@@ -1652,6 +1661,7 @@  nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
 		goto out_dput_new;
 
+	nfsd_close_cached_files(ndentry);
 	host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
 	if (!host_err) {
 		host_err = commit_metadata(tfhp);
@@ -1721,10 +1731,13 @@  nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	if (!type)
 		type = d_inode(rdentry)->i_mode & S_IFMT;
 
-	if (type != S_IFDIR)
+	if (type != S_IFDIR) {
+		nfsd_close_cached_files(rdentry);
 		host_err = vfs_unlink(dirp, rdentry, NULL);
-	else
+	} else {
 		host_err = vfs_rmdir(dirp, rdentry);
+	}
+
 	if (!host_err)
 		host_err = commit_metadata(fhp);
 	dput(rdentry);