diff mbox

[RFC] NFSD: fix cannot umounting mount points under pseudo root

Message ID 20150505155203.GD27106@fieldses.org (mailing list archive)
State New, archived
Headers show

Commit Message

J. Bruce Fields May 5, 2015, 3:52 p.m. UTC
On Tue, May 05, 2015 at 10:18:01AM -0400, J. Bruce Fields wrote:
> On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> > On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > > + * Unlike lookup_one_len, it should be called without the parent
> > > + * i_mutex held, and will take the i_mutex itself if necessary.
> > > + */
> > > +struct dentry *lookup_one_len_unlocked(const char *name,
> > > +				       struct dentry *base, int len)
> > > +{
> > > +	struct qstr this;
> > > +	unsigned int c;
> > > +	int err;
> > > +	struct dentry *ret;
> > > +
> > > +	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> > 
> > Remove this line.
> 
> Whoops, thanks.
> 
> > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > index a30e79900086..cc7995762190 100644
> > > --- a/fs/nfsd/vfs.c
> > > +++ b/fs/nfsd/vfs.c
> > > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > >  		host_err = PTR_ERR(dentry);
> > >  		if (IS_ERR(dentry))
> > >  			goto out_nfserr;
> > > +		if (!S_ISREG(d_inode(dentry)->i_mode)) {
> > 
> > Got a crash here tested by pynfs,
> 
> OK, I guess I just forgot to take into account negative dentries.
> Testing that now with (!(d_inode(dentry) && S_ISREG(..))).

And I also forgot nfs3xdr.c.

--b.

commit 6c942d342f9f
Author: NeilBrown <neilb@suse.de>
Date:   Fri May 1 11:53:26 2015 +1000

    nfsd: don't hold i_mutex over userspace upcalls
    
    We need information about exports when crossing mountpoints during
    lookup or NFSv4 readdir.  If we don't already have that information
    cached, we may have to ask (and wait for) rpc.mountd.
    
    In both cases we currently hold the i_mutex on the parent of the
    directory we're asking rpc.mountd about.  We've seen situations where
    rpc.mountd performs some operation on that directory that tries to take
    the i_mutex again, resulting in deadlock.
    
    With some care, we may be able to avoid that in rpc.mountd.  But it
    seems better just to avoid holding a mutex while waiting on userspace.
    
    It appears that lookup_one_len is pretty much the only operation that
    needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
    something like
    
    	mutex_lock()
    	lookup_one_len()
    	mutex_unlock()
    
    In many cases though the lookup would have been cached and not required
    the i_mutex, so it's more efficient to create a lookup_one_len() variant
    that only takes the i_mutex when necessary.
    
    Signed-off-by: J. Bruce Fields <bfields@redhat.com>

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

NeilBrown May 5, 2015, 10:26 p.m. UTC | #1
On Tue, 5 May 2015 11:52:03 -0400 "J. Bruce Fields" <bfields@fieldses.org>
wrote:

> On Tue, May 05, 2015 at 10:18:01AM -0400, J. Bruce Fields wrote:
> > On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> > > On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > > > + * Unlike lookup_one_len, it should be called without the parent
> > > > + * i_mutex held, and will take the i_mutex itself if necessary.
> > > > + */
> > > > +struct dentry *lookup_one_len_unlocked(const char *name,
> > > > +				       struct dentry *base, int len)
> > > > +{
> > > > +	struct qstr this;
> > > > +	unsigned int c;
> > > > +	int err;
> > > > +	struct dentry *ret;
> > > > +
> > > > +	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> > > 
> > > Remove this line.
> > 
> > Whoops, thanks.
> > 
> > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > index a30e79900086..cc7995762190 100644
> > > > --- a/fs/nfsd/vfs.c
> > > > +++ b/fs/nfsd/vfs.c
> > > > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > >  		host_err = PTR_ERR(dentry);
> > > >  		if (IS_ERR(dentry))
> > > >  			goto out_nfserr;
> > > > +		if (!S_ISREG(d_inode(dentry)->i_mode)) {
> > > 
> > > Got a crash here tested by pynfs,
> > 
> > OK, I guess I just forgot to take into account negative dentries.
> > Testing that now with (!(d_inode(dentry) && S_ISREG(..))).
> 
> And I also forgot nfs3xdr.c.
> 
> --b.
> 
> commit 6c942d342f9f
> Author: NeilBrown <neilb@suse.de>
> Date:   Fri May 1 11:53:26 2015 +1000
> 
>     nfsd: don't hold i_mutex over userspace upcalls
>     
>     We need information about exports when crossing mountpoints during
>     lookup or NFSv4 readdir.  If we don't already have that information
>     cached, we may have to ask (and wait for) rpc.mountd.
>     
>     In both cases we currently hold the i_mutex on the parent of the
>     directory we're asking rpc.mountd about.  We've seen situations where
>     rpc.mountd performs some operation on that directory that tries to take
>     the i_mutex again, resulting in deadlock.
>     
>     With some care, we may be able to avoid that in rpc.mountd.  But it
>     seems better just to avoid holding a mutex while waiting on userspace.
>     
>     It appears that lookup_one_len is pretty much the only operation that
>     needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
>     something like
>     
>     	mutex_lock()
>     	lookup_one_len()
>     	mutex_unlock()
>     
>     In many cases though the lookup would have been cached and not required
>     the i_mutex, so it's more efficient to create a lookup_one_len() variant
>     that only takes the i_mutex when necessary.
>     
>     Signed-off-by: J. Bruce Fields <bfields@redhat.com>
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 4a8d998b7274..0f554bf41dea 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
>   *
>   * Note that this routine is purely a helper for filesystem usage and should
>   * not be called by generic code.
> + *
> + * Must be called with the parented i_mutex held.

"parented"?

    * base->i_mutex must be held by caller.

??


>   */
>  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>  {
> @@ -2182,6 +2184,66 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>  }
>  EXPORT_SYMBOL(lookup_one_len);
>  
> +/**
> + * lookup_one_len - filesystem helper to lookup single pathname component
      lookup_one_len_unlocked - ..



> + * @name:	pathname component to lookup
> + * @base:	base directory to lookup from
> + * @len:	maximum length @len should be interpreted to
> + *
> + * Note that this routine is purely a helper for filesystem usage and should
> + * not be called by generic code.
> + *
> + * Unlike lookup_one_len, it should be called without the parent
> + * i_mutex held, and will take the i_mutex itself if necessary.
> + */
> +struct dentry *lookup_one_len_unlocked(const char *name,
> +				       struct dentry *base, int len)
> +{
> +	struct qstr this;
> +	unsigned int c;
> +	int err;
> +	struct dentry *ret;
> +
> +	this.name = name;
> +	this.len = len;
> +	this.hash = full_name_hash(name, len);
> +	if (!len)
> +		return ERR_PTR(-EACCES);
> +
> +	if (unlikely(name[0] == '.')) {
> +		if (len < 2 || (len == 2 && name[1] == '.'))
> +			return ERR_PTR(-EACCES);
> +	}
> +
> +	while (len--) {
> +		c = *(const unsigned char *)name++;
> +		if (c == '/' || c == '\0')
> +			return ERR_PTR(-EACCES);
> +	}
> +	/*
> +	 * See if the low-level filesystem might want
> +	 * to use its own hash..
> +	 */
> +	if (base->d_flags & DCACHE_OP_HASH) {
> +		int err = base->d_op->d_hash(base, &this);
> +		if (err < 0)
> +			return ERR_PTR(err);
> +	}
> +
> +	err = inode_permission(base->d_inode, MAY_EXEC);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	ret = __d_lookup(base, &this);
> +	if (ret)
> +		return ret;

__d_lookup() is a little too low level, as it doesn't call d_revalidate() and
it doesn't retry if the rename_lock seqlock says it should.

The latter doesn't really matter as we would fall through to the safe
mutex-protected version.
The former isn't much of an issue for most filesystems under nfsd, but still
needs to be handled to some extent.
Also, the comment for __d_lookup says

 * __d_lookup callers must be commented.

The simplest resolution would be:

	/* __d_lookup() is used to try to get a quick answer and avoid the
	 * mutex.  A false-negative does no harm.
	 */
	ret = __d_lookup(base, &this);
	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
		dput(ret);
		ret = NULL;
	}
	if (ret)
		return ret;


It probably wouldn't be too much harder to add the d_revalidate() call, and
call d_invalidate() if it returns zero.  

Maybe.


	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
		int err = d_revalidate(ret, 0);
		if (err == 0) {
			d_invalidate(ret);
			dput(ret);
			ret = NULL;
		} else if (err < 0) {
			dput(ret);
			return ERR_PTR(err);
		}
	}


> +	mutex_lock(&base->d_inode->i_mutex);
> +	ret =  __lookup_hash(&this, base, 0);
> +	mutex_unlock(&base->d_inode->i_mutex);
> +	return ret;
> +}
> +EXPORT_SYMBOL(lookup_one_len_unlocked);
> +
>  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
>  		 struct path *path, int *empty)
>  {
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index f6e7cbabac5a..01dcd494f781 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
>  		} else
>  			dchild = dget(dparent);
>  	} else
> -		dchild = lookup_one_len(name, dparent, namlen);
> +		dchild = lookup_one_len_unlocked(name, dparent, namlen);
>  	if (IS_ERR(dchild))
>  		return rv;
>  	if (d_mountpoint(dchild))
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 158badf945df..2c1adaa0bd2f 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
>  	__be32 nfserr;
>  	int ignore_crossmnt = 0;
>  
> -	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
> +	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
>  	if (IS_ERR(dentry))
>  		return nfserrno(PTR_ERR(dentry));
>  	if (d_really_is_negative(dentry)) {
>  		/*
> -		 * nfsd_buffered_readdir drops the i_mutex between
> -		 * readdir and calling this callback, leaving a window
> -		 * where this directory entry could have gone away.
> +		 * we're not holding the i_mutex here, so there's
> +		 * a window where this directory entry could have gone
> +		 * away.
>  		 */
>  		dput(dentry);
>  		return nfserr_noent;
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index a30e79900086..4accdb00976b 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  		host_err = PTR_ERR(dentry);
>  		if (IS_ERR(dentry))
>  			goto out_nfserr;
> +		if (!(d_inode(dentry) && S_ISREG(d_inode(dentry)->i_mode))) {
> +			/*
> +			 * Never mind, we're not going to open this
> +			 * anyway.  And we don't want to hold it over
> +			 * the userspace upcalls in nfsd_mountpoint. */
> +			fh_unlock(fhp);
> +		}

You could avoid the test by just putting the fh_unlock(fhp) inside the 

		if (nfsd_mountpoint(dentry, exp)) {

block.  It might also be appropriate for nfsd_mountpoint to fail on
non-directories.

Up to you though.

Thanks.  Feel free to add
  *-by: NeilBrown <neilb@suse.de>

as you see fit.

NeilBrown


>  		/*
>  		 * check if we have crossed a mount point ...
>  		 */
> @@ -1876,7 +1883,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  	offset = *offsetp;
>  
>  	while (1) {
> -		struct inode *dir_inode = file_inode(file);
>  		unsigned int reclen;
>  
>  		cdp->err = nfserr_eof; /* will be cleared on successful read */
> @@ -1895,15 +1901,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  		if (!size)
>  			break;
>  
> -		/*
> -		 * Various filldir functions may end up calling back into
> -		 * lookup_one_len() and the file system's ->lookup() method.
> -		 * These expect i_mutex to be held, as it would within readdir.
> -		 */
> -		host_err = mutex_lock_killable(&dir_inode->i_mutex);
> -		if (host_err)
> -			break;
> -
>  		de = (struct buffered_dirent *)buf.dirent;
>  		while (size > 0) {
>  			offset = de->offset;
> @@ -1920,7 +1917,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  			size -= reclen;
>  			de = (struct buffered_dirent *)((char *)de + reclen);
>  		}
> -		mutex_unlock(&dir_inode->i_mutex);
>  		if (size > 0) /* We bailed out early */
>  			break;
>  
> diff --git a/include/linux/namei.h b/include/linux/namei.h
> index c8990779f0c3..bb3a2f7cca67 100644
> --- a/include/linux/namei.h
> +++ b/include/linux/namei.h
> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
>  extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
>  
>  extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
>  
>  extern int follow_down_one(struct path *);
>  extern int follow_down(struct path *);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields May 7, 2015, 3:31 p.m. UTC | #2
On Tue, May 05, 2015 at 11:52:03AM -0400, J. Bruce Fields wrote:
> On Tue, May 05, 2015 at 10:18:01AM -0400, J. Bruce Fields wrote:
> > On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> > > On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > > > + * Unlike lookup_one_len, it should be called without the parent
> > > > + * i_mutex held, and will take the i_mutex itself if necessary.
> > > > + */
> > > > +struct dentry *lookup_one_len_unlocked(const char *name,
> > > > +				       struct dentry *base, int len)
> > > > +{
> > > > +	struct qstr this;
> > > > +	unsigned int c;
> > > > +	int err;
> > > > +	struct dentry *ret;
> > > > +
> > > > +	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> > > 
> > > Remove this line.
> > 
> > Whoops, thanks.
> > 
> > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > index a30e79900086..cc7995762190 100644
> > > > --- a/fs/nfsd/vfs.c
> > > > +++ b/fs/nfsd/vfs.c
> > > > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > >  		host_err = PTR_ERR(dentry);
> > > >  		if (IS_ERR(dentry))
> > > >  			goto out_nfserr;
> > > > +		if (!S_ISREG(d_inode(dentry)->i_mode)) {
> > > 
> > > Got a crash here tested by pynfs,
> > 
> > OK, I guess I just forgot to take into account negative dentries.
> > Testing that now with (!(d_inode(dentry) && S_ISREG(..))).
> 
> And I also forgot nfs3xdr.c.

But, this doesn't look good.

OK, to be fair I'm not sure whether this was already happening before this
patch.

--b.

[57287.226846] ======================================================
[57287.227499] [ INFO: possible circular locking dependency detected ]
[57287.228009] 4.1.0-rc2-22946-g6c942d3 #174 Not tainted
[57287.228009] -------------------------------------------------------
[57287.228009] updatedb/19312 is trying to acquire lock:
[57287.228009]  (&mm->mmap_sem){++++++}, at: [<ffffffff811980bf>] might_fault+0x5f/0xb0
[57287.228009] 
but task is already holding lock:
[57287.228009]  (&xfs_dir_ilock_class){++++..}, at: [<ffffffff81400c1a>] xfs_ilock+0x10a/0x2e0
[57287.228009] 
which lock already depends on the new lock.

[57287.228009] 
the existing dependency chain (in reverse order) is:
[57287.228009] 
-> #2 (&xfs_dir_ilock_class){++++..}:
[57287.228009]        [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
[57287.228009]        [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
[57287.228009]        [<ffffffff810c8e0d>] down_read_nested+0x4d/0x70
[57287.228009]        [<ffffffff81400c1a>] xfs_ilock+0x10a/0x2e0
[57287.228009]        [<ffffffff81400e68>] xfs_ilock_attr_map_shared+0x38/0x50
[57287.228009]        [<ffffffff8139b9e1>] xfs_attr_get+0xc1/0x180
[57287.228009]        [<ffffffff8140ff87>] xfs_xattr_get+0x37/0x50
[57287.228009]        [<ffffffff811e866f>] generic_getxattr+0x4f/0x70
[57287.228009]        [<ffffffff816535d2>] inode_doinit_with_dentry+0x152/0x670
[57287.228009]        [<ffffffff81657f8b>] sb_finish_set_opts+0xdb/0x260
[57287.228009]        [<ffffffff81658604>] selinux_set_mnt_opts+0x2c4/0x600
[57287.228009]        [<ffffffff816589ff>] superblock_doinit+0xbf/0xd0
[57287.228009]        [<ffffffff81658a6d>] selinux_sb_kern_mount+0x3d/0xa0
[57287.228009]        [<ffffffff8164efa6>] security_sb_kern_mount+0x16/0x20
[57287.228009]        [<ffffffff811c29dd>] mount_fs+0x7d/0x190
[57287.228009]        [<ffffffff811e2258>] vfs_kern_mount+0x68/0x160
[57287.228009]        [<ffffffff811e58b4>] do_mount+0x204/0xc60
[57287.228009]        [<ffffffff811e660f>] SyS_mount+0x6f/0xb0
[57287.228009]        [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
[57287.228009] 
-> #1 (&isec->lock){+.+.+.}:
[57287.228009]        [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
[57287.228009]        [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
[57287.228009]        [<ffffffff81b675f3>] mutex_lock_nested+0x63/0x400
[57287.228009]        [<ffffffff81653525>] inode_doinit_with_dentry+0xa5/0x670
[57287.228009]        [<ffffffff81653b0c>] selinux_d_instantiate+0x1c/0x20
[57287.228009]        [<ffffffff8164eb2b>] security_d_instantiate+0x1b/0x30
[57287.228009]        [<ffffffff811d9ce4>] d_instantiate+0x54/0x80
[57287.228009]        [<ffffffff811894fe>] __shmem_file_setup+0xce/0x250
[57287.228009]        [<ffffffff8118c5f8>] shmem_zero_setup+0x28/0x70
[57287.228009]        [<ffffffff811a1818>] mmap_region+0x5c8/0x5e0
[57287.228009]        [<ffffffff811a1ba3>] do_mmap_pgoff+0x373/0x420
[57287.228009]        [<ffffffff8118cb00>] vm_mmap_pgoff+0x90/0xc0
[57287.228009]        [<ffffffff811a0000>] SyS_mmap_pgoff+0x1b0/0x240
[57287.228009]        [<ffffffff81008c92>] SyS_mmap+0x22/0x30
[57287.228009]        [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
[57287.228009] 
-> #0 (&mm->mmap_sem){++++++}:
[57287.228009]        [<ffffffff810ccafb>] check_prev_add+0x51b/0x860
[57287.228009]        [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
[57287.228009]        [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
[57287.228009]        [<ffffffff811980ec>] might_fault+0x8c/0xb0
[57287.228009]        [<ffffffff811d3ff2>] filldir+0x92/0x120
[57287.228009]        [<ffffffff813ec51a>] xfs_dir2_sf_getdents.isra.10+0x1ba/0x220
[57287.228009]        [<ffffffff813ed0de>] xfs_readdir+0x15e/0x300
[57287.228009]        [<ffffffff813f116b>] xfs_file_readdir+0x2b/0x30
[57287.228009]        [<ffffffff811d3dca>] iterate_dir+0x9a/0x140
[57287.228009]        [<ffffffff811d42b1>] SyS_getdents+0x81/0x100
[57287.228009]        [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
[57287.228009] 
other info that might help us debug this:

[57287.228009] Chain exists of:
  &mm->mmap_sem --> &isec->lock --> &xfs_dir_ilock_class

[57287.228009]  Possible unsafe locking scenario:

[57287.228009]        CPU0                    CPU1
[57287.228009]        ----                    ----
[57287.228009]   lock(&xfs_dir_ilock_class);
[57287.228009]                                lock(&isec->lock);
[57287.228009]                                lock(&xfs_dir_ilock_class);
[57287.228009]   lock(&mm->mmap_sem);
[57287.228009] 
 *** DEADLOCK ***

[57287.228009] 2 locks held by updatedb/19312:
[57287.228009]  #0:  (&type->i_mutex_dir_key#5){+.+.+.}, at: [<ffffffff811d3d91>] iterate_dir+0x61/0x140
[57287.228009]  #1:  (&xfs_dir_ilock_class){++++..}, at: [<ffffffff81400c1a>] xfs_ilock+0x10a/0x2e0
[57287.228009] 
stack backtrace:
[57287.228009] CPU: 0 PID: 19312 Comm: updatedb Not tainted 4.1.0-rc2-22946-g6c942d3 #174
[57287.228009] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153950- 04/01/2014
[57287.228009]  ffffffff82c61ed0 ffff880060d0faa8 ffffffff81b61a1a 0000000080000001
[57287.228009]  ffffffff82c345d0 ffff880060d0faf8 ffffffff810cb843 0000000000000000
[57287.228009]  ffff880060d0fb28 0000000000000000 0000000000000001 0000000000000000
[57287.228009] Call Trace:
[57287.228009]  [<ffffffff81b61a1a>] dump_stack+0x4f/0x7b
[57287.228009]  [<ffffffff810cb843>] print_circular_bug+0x203/0x310
[57287.228009]  [<ffffffff810ccafb>] check_prev_add+0x51b/0x860
[57287.228009]  [<ffffffff810ce018>] ? __lock_acquire+0x448/0x1920
[57287.228009]  [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
[57287.228009]  [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
[57287.228009]  [<ffffffff811980bf>] ? might_fault+0x5f/0xb0
[57287.228009]  [<ffffffff811980ec>] might_fault+0x8c/0xb0
[57287.228009]  [<ffffffff811980bf>] ? might_fault+0x5f/0xb0
[57287.228009]  [<ffffffff811d3ff2>] filldir+0x92/0x120
[57287.228009]  [<ffffffff81400e24>] ? xfs_ilock_data_map_shared+0x34/0x40
[57287.228009]  [<ffffffff813ec51a>] xfs_dir2_sf_getdents.isra.10+0x1ba/0x220
[57287.228009]  [<ffffffff81400c1a>] ? xfs_ilock+0x10a/0x2e0
[57287.228009]  [<ffffffff813ed0de>] xfs_readdir+0x15e/0x300
[57287.228009]  [<ffffffff810cd6b5>] ? mark_held_locks+0x75/0xa0
[57287.228009]  [<ffffffff81b6800a>] ? mutex_lock_killable_nested+0x27a/0x4e0
[57287.228009]  [<ffffffff813f116b>] xfs_file_readdir+0x2b/0x30
[57287.228009]  [<ffffffff811d3dca>] iterate_dir+0x9a/0x140
[57287.228009]  [<ffffffff811dfceb>] ? __fget_light+0x7b/0xa0
[57287.228009]  [<ffffffff811d42b1>] SyS_getdents+0x81/0x100
[57287.228009]  [<ffffffff811d3f60>] ? fillonedir+0xf0/0xf0
[57287.228009]  [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
NeilBrown May 7, 2015, 10:42 p.m. UTC | #3
On Thu, 7 May 2015 11:31:16 -0400 "J. Bruce Fields" <bfields@fieldses.org>
wrote:

> On Tue, May 05, 2015 at 11:52:03AM -0400, J. Bruce Fields wrote:
> > On Tue, May 05, 2015 at 10:18:01AM -0400, J. Bruce Fields wrote:
> > > On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> > > > On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > > > > + * Unlike lookup_one_len, it should be called without the parent
> > > > > + * i_mutex held, and will take the i_mutex itself if necessary.
> > > > > + */
> > > > > +struct dentry *lookup_one_len_unlocked(const char *name,
> > > > > +				       struct dentry *base, int len)
> > > > > +{
> > > > > +	struct qstr this;
> > > > > +	unsigned int c;
> > > > > +	int err;
> > > > > +	struct dentry *ret;
> > > > > +
> > > > > +	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> > > > 
> > > > Remove this line.
> > > 
> > > Whoops, thanks.
> > > 
> > > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > > index a30e79900086..cc7995762190 100644
> > > > > --- a/fs/nfsd/vfs.c
> > > > > +++ b/fs/nfsd/vfs.c
> > > > > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > >  		host_err = PTR_ERR(dentry);
> > > > >  		if (IS_ERR(dentry))
> > > > >  			goto out_nfserr;
> > > > > +		if (!S_ISREG(d_inode(dentry)->i_mode)) {
> > > > 
> > > > Got a crash here tested by pynfs,
> > > 
> > > OK, I guess I just forgot to take into account negative dentries.
> > > Testing that now with (!(d_inode(dentry) && S_ISREG(..))).
> > 
> > And I also forgot nfs3xdr.c.
> 
> But, this doesn't look good.
> 
> OK, to be fair I'm not sure whether this was already happening before this
> patch.

None of the stacks show any code that we changed, so I'm guessing it is a
separate issues as you hint at.

If I didn't have a list as long as my arm of things I really should be doing,
I'd go diving into the XFS code ....

NeilBrown

> 
> --b.
> 
> [57287.226846] ======================================================
> [57287.227499] [ INFO: possible circular locking dependency detected ]
> [57287.228009] 4.1.0-rc2-22946-g6c942d3 #174 Not tainted
> [57287.228009] -------------------------------------------------------
> [57287.228009] updatedb/19312 is trying to acquire lock:
> [57287.228009]  (&mm->mmap_sem){++++++}, at: [<ffffffff811980bf>] might_fault+0x5f/0xb0
> [57287.228009] 
> but task is already holding lock:
> [57287.228009]  (&xfs_dir_ilock_class){++++..}, at: [<ffffffff81400c1a>] xfs_ilock+0x10a/0x2e0
> [57287.228009] 
> which lock already depends on the new lock.
> 
> [57287.228009] 
> the existing dependency chain (in reverse order) is:
> [57287.228009] 
> -> #2 (&xfs_dir_ilock_class){++++..}:
> [57287.228009]        [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
> [57287.228009]        [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
> [57287.228009]        [<ffffffff810c8e0d>] down_read_nested+0x4d/0x70
> [57287.228009]        [<ffffffff81400c1a>] xfs_ilock+0x10a/0x2e0
> [57287.228009]        [<ffffffff81400e68>] xfs_ilock_attr_map_shared+0x38/0x50
> [57287.228009]        [<ffffffff8139b9e1>] xfs_attr_get+0xc1/0x180
> [57287.228009]        [<ffffffff8140ff87>] xfs_xattr_get+0x37/0x50
> [57287.228009]        [<ffffffff811e866f>] generic_getxattr+0x4f/0x70
> [57287.228009]        [<ffffffff816535d2>] inode_doinit_with_dentry+0x152/0x670
> [57287.228009]        [<ffffffff81657f8b>] sb_finish_set_opts+0xdb/0x260
> [57287.228009]        [<ffffffff81658604>] selinux_set_mnt_opts+0x2c4/0x600
> [57287.228009]        [<ffffffff816589ff>] superblock_doinit+0xbf/0xd0
> [57287.228009]        [<ffffffff81658a6d>] selinux_sb_kern_mount+0x3d/0xa0
> [57287.228009]        [<ffffffff8164efa6>] security_sb_kern_mount+0x16/0x20
> [57287.228009]        [<ffffffff811c29dd>] mount_fs+0x7d/0x190
> [57287.228009]        [<ffffffff811e2258>] vfs_kern_mount+0x68/0x160
> [57287.228009]        [<ffffffff811e58b4>] do_mount+0x204/0xc60
> [57287.228009]        [<ffffffff811e660f>] SyS_mount+0x6f/0xb0
> [57287.228009]        [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
> [57287.228009] 
> -> #1 (&isec->lock){+.+.+.}:
> [57287.228009]        [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
> [57287.228009]        [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
> [57287.228009]        [<ffffffff81b675f3>] mutex_lock_nested+0x63/0x400
> [57287.228009]        [<ffffffff81653525>] inode_doinit_with_dentry+0xa5/0x670
> [57287.228009]        [<ffffffff81653b0c>] selinux_d_instantiate+0x1c/0x20
> [57287.228009]        [<ffffffff8164eb2b>] security_d_instantiate+0x1b/0x30
> [57287.228009]        [<ffffffff811d9ce4>] d_instantiate+0x54/0x80
> [57287.228009]        [<ffffffff811894fe>] __shmem_file_setup+0xce/0x250
> [57287.228009]        [<ffffffff8118c5f8>] shmem_zero_setup+0x28/0x70
> [57287.228009]        [<ffffffff811a1818>] mmap_region+0x5c8/0x5e0
> [57287.228009]        [<ffffffff811a1ba3>] do_mmap_pgoff+0x373/0x420
> [57287.228009]        [<ffffffff8118cb00>] vm_mmap_pgoff+0x90/0xc0
> [57287.228009]        [<ffffffff811a0000>] SyS_mmap_pgoff+0x1b0/0x240
> [57287.228009]        [<ffffffff81008c92>] SyS_mmap+0x22/0x30
> [57287.228009]        [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
> [57287.228009] 
> -> #0 (&mm->mmap_sem){++++++}:
> [57287.228009]        [<ffffffff810ccafb>] check_prev_add+0x51b/0x860
> [57287.228009]        [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
> [57287.228009]        [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
> [57287.228009]        [<ffffffff811980ec>] might_fault+0x8c/0xb0
> [57287.228009]        [<ffffffff811d3ff2>] filldir+0x92/0x120
> [57287.228009]        [<ffffffff813ec51a>] xfs_dir2_sf_getdents.isra.10+0x1ba/0x220
> [57287.228009]        [<ffffffff813ed0de>] xfs_readdir+0x15e/0x300
> [57287.228009]        [<ffffffff813f116b>] xfs_file_readdir+0x2b/0x30
> [57287.228009]        [<ffffffff811d3dca>] iterate_dir+0x9a/0x140
> [57287.228009]        [<ffffffff811d42b1>] SyS_getdents+0x81/0x100
> [57287.228009]        [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
> [57287.228009] 
> other info that might help us debug this:
> 
> [57287.228009] Chain exists of:
>   &mm->mmap_sem --> &isec->lock --> &xfs_dir_ilock_class
> 
> [57287.228009]  Possible unsafe locking scenario:
> 
> [57287.228009]        CPU0                    CPU1
> [57287.228009]        ----                    ----
> [57287.228009]   lock(&xfs_dir_ilock_class);
> [57287.228009]                                lock(&isec->lock);
> [57287.228009]                                lock(&xfs_dir_ilock_class);
> [57287.228009]   lock(&mm->mmap_sem);
> [57287.228009] 
>  *** DEADLOCK ***
> 
> [57287.228009] 2 locks held by updatedb/19312:
> [57287.228009]  #0:  (&type->i_mutex_dir_key#5){+.+.+.}, at: [<ffffffff811d3d91>] iterate_dir+0x61/0x140
> [57287.228009]  #1:  (&xfs_dir_ilock_class){++++..}, at: [<ffffffff81400c1a>] xfs_ilock+0x10a/0x2e0
> [57287.228009] 
> stack backtrace:
> [57287.228009] CPU: 0 PID: 19312 Comm: updatedb Not tainted 4.1.0-rc2-22946-g6c942d3 #174
> [57287.228009] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153950- 04/01/2014
> [57287.228009]  ffffffff82c61ed0 ffff880060d0faa8 ffffffff81b61a1a 0000000080000001
> [57287.228009]  ffffffff82c345d0 ffff880060d0faf8 ffffffff810cb843 0000000000000000
> [57287.228009]  ffff880060d0fb28 0000000000000000 0000000000000001 0000000000000000
> [57287.228009] Call Trace:
> [57287.228009]  [<ffffffff81b61a1a>] dump_stack+0x4f/0x7b
> [57287.228009]  [<ffffffff810cb843>] print_circular_bug+0x203/0x310
> [57287.228009]  [<ffffffff810ccafb>] check_prev_add+0x51b/0x860
> [57287.228009]  [<ffffffff810ce018>] ? __lock_acquire+0x448/0x1920
> [57287.228009]  [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
> [57287.228009]  [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
> [57287.228009]  [<ffffffff811980bf>] ? might_fault+0x5f/0xb0
> [57287.228009]  [<ffffffff811980ec>] might_fault+0x8c/0xb0
> [57287.228009]  [<ffffffff811980bf>] ? might_fault+0x5f/0xb0
> [57287.228009]  [<ffffffff811d3ff2>] filldir+0x92/0x120
> [57287.228009]  [<ffffffff81400e24>] ? xfs_ilock_data_map_shared+0x34/0x40
> [57287.228009]  [<ffffffff813ec51a>] xfs_dir2_sf_getdents.isra.10+0x1ba/0x220
> [57287.228009]  [<ffffffff81400c1a>] ? xfs_ilock+0x10a/0x2e0
> [57287.228009]  [<ffffffff813ed0de>] xfs_readdir+0x15e/0x300
> [57287.228009]  [<ffffffff810cd6b5>] ? mark_held_locks+0x75/0xa0
> [57287.228009]  [<ffffffff81b6800a>] ? mutex_lock_killable_nested+0x27a/0x4e0
> [57287.228009]  [<ffffffff813f116b>] xfs_file_readdir+0x2b/0x30
> [57287.228009]  [<ffffffff811d3dca>] iterate_dir+0x9a/0x140
> [57287.228009]  [<ffffffff811dfceb>] ? __fget_light+0x7b/0xa0
> [57287.228009]  [<ffffffff811d42b1>] SyS_getdents+0x81/0x100
> [57287.228009]  [<ffffffff811d3f60>] ? fillonedir+0xf0/0xf0
> [57287.228009]  [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields May 8, 2015, 2:10 p.m. UTC | #4
On Fri, May 08, 2015 at 08:42:30AM +1000, NeilBrown wrote:
> On Thu, 7 May 2015 11:31:16 -0400 "J. Bruce Fields" <bfields@fieldses.org>
> wrote:
> 
> > On Tue, May 05, 2015 at 11:52:03AM -0400, J. Bruce Fields wrote:
> > > On Tue, May 05, 2015 at 10:18:01AM -0400, J. Bruce Fields wrote:
> > > > On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> > > > > On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > > > > > + * Unlike lookup_one_len, it should be called without the parent
> > > > > > + * i_mutex held, and will take the i_mutex itself if necessary.
> > > > > > + */
> > > > > > +struct dentry *lookup_one_len_unlocked(const char *name,
> > > > > > +				       struct dentry *base, int len)
> > > > > > +{
> > > > > > +	struct qstr this;
> > > > > > +	unsigned int c;
> > > > > > +	int err;
> > > > > > +	struct dentry *ret;
> > > > > > +
> > > > > > +	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> > > > > 
> > > > > Remove this line.
> > > > 
> > > > Whoops, thanks.
> > > > 
> > > > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > > > index a30e79900086..cc7995762190 100644
> > > > > > --- a/fs/nfsd/vfs.c
> > > > > > +++ b/fs/nfsd/vfs.c
> > > > > > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > > >  		host_err = PTR_ERR(dentry);
> > > > > >  		if (IS_ERR(dentry))
> > > > > >  			goto out_nfserr;
> > > > > > +		if (!S_ISREG(d_inode(dentry)->i_mode)) {
> > > > > 
> > > > > Got a crash here tested by pynfs,
> > > > 
> > > > OK, I guess I just forgot to take into account negative dentries.
> > > > Testing that now with (!(d_inode(dentry) && S_ISREG(..))).
> > > 
> > > And I also forgot nfs3xdr.c.
> > 
> > But, this doesn't look good.
> > 
> > OK, to be fair I'm not sure whether this was already happening before this
> > patch.
> 
> None of the stacks show any code that we changed, so I'm guessing it is a
> separate issues as you hint at.
> 
> If I didn't have a list as long as my arm of things I really should be doing,
> I'd go diving into the XFS code ....

Yeah.  Looks like it might be intermittent.  I'll keep an eye on it and
report to xfs people, I guess....

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields May 8, 2015, 4:15 p.m. UTC | #5
On Wed, May 06, 2015 at 08:26:28AM +1000, NeilBrown wrote:
> On Tue, 5 May 2015 11:52:03 -0400 "J. Bruce Fields" <bfields@fieldses.org>
> > diff --git a/fs/namei.c b/fs/namei.c
> > index 4a8d998b7274..0f554bf41dea 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
> >   *
> >   * Note that this routine is purely a helper for filesystem usage and should
> >   * not be called by generic code.
> > + *
> > + * Must be called with the parented i_mutex held.
> 
> "parented"?
> 
>     * base->i_mutex must be held by caller.
> 
> ??

Thanks.

> > +struct dentry *lookup_one_len_unlocked(const char *name,
...
> > +	ret = __d_lookup(base, &this);
> > +	if (ret)
> > +		return ret;
> 
> __d_lookup() is a little too low level, as it doesn't call d_revalidate() and
> it doesn't retry if the rename_lock seqlock says it should.
> 
> The latter doesn't really matter as we would fall through to the safe
> mutex-protected version.
> The former isn't much of an issue for most filesystems under nfsd, but still
> needs to be handled to some extent.
> Also, the comment for __d_lookup says
> 
>  * __d_lookup callers must be commented.
> 
> The simplest resolution would be:
> 
> 	/* __d_lookup() is used to try to get a quick answer and avoid the
> 	 * mutex.  A false-negative does no harm.
> 	 */
> 	ret = __d_lookup(base, &this);
> 	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
> 		dput(ret);
> 		ret = NULL;
> 	}
> 	if (ret)
> 		return ret;

That looks right to me....  I'll probably take the simple option.

> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index a30e79900086..4accdb00976b 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >  		host_err = PTR_ERR(dentry);
> >  		if (IS_ERR(dentry))
> >  			goto out_nfserr;
> > +		if (!(d_inode(dentry) && S_ISREG(d_inode(dentry)->i_mode))) {
> > +			/*
> > +			 * Never mind, we're not going to open this
> > +			 * anyway.  And we don't want to hold it over
> > +			 * the userspace upcalls in nfsd_mountpoint. */
> > +			fh_unlock(fhp);
> > +		}
> 
> You could avoid the test by just putting the fh_unlock(fhp) inside the 
> 
> 		if (nfsd_mountpoint(dentry, exp)) {
> 
> block.  It might also be appropriate for nfsd_mountpoint to fail on
> non-directories.

If check_export()'s to be believed, our support for regular-file
exports is intentional. So even if nfsd_mountpoint() is true, it's
possible we might still be about to open the result.

But I think we're OK:
 
The only reason for keeping the i_mutex here in the open case is to
avoid the situation where a client does OPEN(dirfh, "foo") and gets a
reply that grants a delegation even though the file has actually been
renamed to "bar".  (Thus violating the protocol, since the delegation's
supposed to guarantee you'll be notified before the file's renamed.)

(So the i_mutex is actually overkill, it would be enough to detect a
rename and then refuse the delegation (which we're always allowed to
do).)

But actually, if this is a mountpoint then I suppose we're safe from a
rename.  So, right, we can just move that unlock into the mountpoint
case.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/namei.c b/fs/namei.c
index 4a8d998b7274..0f554bf41dea 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2139,6 +2139,8 @@  static struct dentry *lookup_hash(struct nameidata *nd)
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * Must be called with the parented i_mutex held.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2182,6 +2184,66 @@  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		return ret;
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cbabac5a..01dcd494f781 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@  compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		} else
 			dchild = dget(dparent);
 	} else
-		dchild = lookup_one_len(name, dparent, namlen);
+		dchild = lookup_one_len_unlocked(name, dparent, namlen);
 	if (IS_ERR(dchild))
 		return rv;
 	if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 158badf945df..2c1adaa0bd2f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2804,14 +2804,14 @@  nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a30e79900086..4accdb00976b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,6 +217,13 @@  nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
+		if (!(d_inode(dentry) && S_ISREG(d_inode(dentry)->i_mode))) {
+			/*
+			 * Never mind, we're not going to open this
+			 * anyway.  And we don't want to hold it over
+			 * the userspace upcalls in nfsd_mountpoint. */
+			fh_unlock(fhp);
+		}
 		/*
 		 * check if we have crossed a mount point ...
 		 */
@@ -1876,7 +1883,6 @@  static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1895,15 +1901,6 @@  static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1920,7 +1917,6 @@  static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index c8990779f0c3..bb3a2f7cca67 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -62,6 +62,7 @@  extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);