diff mbox

[3/3] VFS: close race between getcwd() and d_move()

Message ID 151019772763.30101.16040338743875884111.stgit@noble (mailing list archive)
State New, archived
Headers show

Commit Message

NeilBrown Nov. 9, 2017, 3:22 a.m. UTC
d_move() will call __d_drop() and then __d_rehash()
on the dentry being moved.  This creates a small window
when the dentry appears to be unhashed.  Many tests
of d_unhashed() are made under ->d_lock and so are safe
from racing with this window, but some aren't.
In particular, getcwd() calls d_unlinked() (which calls
d_unhashed()) without d_lock protection, so it can race.

This race has been seen in practice with lustre, which uses d_move() as
part of name lookup.  See:
   https://jira.hpdd.intel.com/browse/LU-9735
It could race with a regular rename(), and result in ENOENT instead
of either the 'before' or 'after' name.

The race can be demonstrated with a simple program which
has two threads, one renaming a directory back and forth
while another calls getcwd() within that directory: it should never
fail, but does.  See:
  https://patchwork.kernel.org/patch/9455345/

We could fix this race by taking d_lock and rechecking when
d_unhashed() reports true.  Alternatively, we can remove the window,
which is the approach this patch takes.

When __d_drop and __d_rehash are used to move a dentry, an extra
flag is passed which causes d_hash.pprev to not be cleared, and
to not be tested.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 fs/dcache.c |   31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

Comments

Nikolay Borisov Nov. 9, 2017, 11:41 a.m. UTC | #1
On  9.11.2017 05:22, NeilBrown wrote:
> d_move() will call __d_drop() and then __d_rehash()
> on the dentry being moved.  This creates a small window
> when the dentry appears to be unhashed.  Many tests
> of d_unhashed() are made under ->d_lock and so are safe
> from racing with this window, but some aren't.
> In particular, getcwd() calls d_unlinked() (which calls
> d_unhashed()) without d_lock protection, so it can race.
> 
> This race has been seen in practice with lustre, which uses d_move() as
> part of name lookup.  See:
>    https://jira.hpdd.intel.com/browse/LU-9735
> It could race with a regular rename(), and result in ENOENT instead
> of either the 'before' or 'after' name.
> 
> The race can be demonstrated with a simple program which
> has two threads, one renaming a directory back and forth
> while another calls getcwd() within that directory: it should never
> fail, but does.  See:
>   https://patchwork.kernel.org/patch/9455345/
> 
> We could fix this race by taking d_lock and rechecking when
> d_unhashed() reports true.  Alternatively, we can remove the window,
> which is the approach this patch takes.
> 
> When __d_drop and __d_rehash are used to move a dentry, an extra
> flag is passed which causes d_hash.pprev to not be cleared, and
> to not be tested.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
>  fs/dcache.c |   31 ++++++++++++++++++++-----------
>  1 file changed, 20 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/dcache.c b/fs/dcache.c
> index d5952306206b..3130d62f29c9 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -471,8 +471,11 @@ static void dentry_lru_add(struct dentry *dentry)
>   * reason (NFS timeouts or autofs deletes).
>   *
>   * __d_drop requires dentry->d_lock.
> + * ___d_drop takes an extra @moving argument.
> + * If true, d_hash.pprev is not cleared, so there is no transient d_unhashed()
> + * state.
>   */
> -void __d_drop(struct dentry *dentry)
> +static void inline ___d_drop(struct dentry *dentry, bool moving)
>  {
>  	if (!d_unhashed(dentry)) {
>  		struct hlist_bl_head *b;
> @@ -493,12 +496,18 @@ void __d_drop(struct dentry *dentry)
>  		} else
>  			hlist_bl_lock(b);
>  		__hlist_bl_del(&dentry->d_hash);
> -		dentry->d_hash.pprev = NULL;
> +		if (likely(!moving))
> +			dentry->d_hash.pprev = NULL;

nit: isn't a bit more explicit if (unlikely(moving)). I suspect the end
result is the same, however it's easy to miss the !. It's not a big deal
but just wondering.

>  		hlist_bl_unlock(b);
>  		/* After this call, in-progress rcu-walk path lookup will fail. */
>  		write_seqcount_invalidate(&dentry->d_seq);
>  	}
>  }
> +
> +void __d_drop(struct dentry *dentry)
> +{
> +	___d_drop(dentry, false);
> +}
>  EXPORT_SYMBOL(__d_drop);
>  
>  void d_drop(struct dentry *dentry)
> @@ -2387,10 +2396,10 @@ void d_delete(struct dentry * dentry)
>  }
>  EXPORT_SYMBOL(d_delete);
>  
> -static void __d_rehash(struct dentry *entry)
> +static void __d_rehash(struct dentry *entry, bool moving)
>  {
>  	struct hlist_bl_head *b = d_hash(entry->d_name.hash);
> -	BUG_ON(!d_unhashed(entry));
> +	BUG_ON(!moving && !d_unhashed(entry));
>  	hlist_bl_lock(b);
>  	hlist_bl_add_head_rcu(&entry->d_hash, b);
>  	hlist_bl_unlock(b);
> @@ -2406,7 +2415,7 @@ static void __d_rehash(struct dentry *entry)
>  void d_rehash(struct dentry * entry)
>  {
>  	spin_lock(&entry->d_lock);
> -	__d_rehash(entry);
> +	__d_rehash(entry, false);
>  	spin_unlock(&entry->d_lock);
>  }
>  EXPORT_SYMBOL(d_rehash);
> @@ -2580,7 +2589,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
>  		raw_write_seqcount_end(&dentry->d_seq);
>  		fsnotify_update_flags(dentry);
>  	}
> -	__d_rehash(dentry);
> +	__d_rehash(dentry, false);
>  	if (dir)
>  		end_dir_add(dir, n);
>  	spin_unlock(&dentry->d_lock);
> @@ -2642,7 +2651,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
>  			alias = NULL;
>  		} else {
>  			__dget_dlock(alias);
> -			__d_rehash(alias);
> +			__d_rehash(alias, false);
>  			spin_unlock(&alias->d_lock);
>  		}
>  		spin_unlock(&inode->i_lock);
> @@ -2828,8 +2837,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
>  
>  	/* unhash both */
>  	/* __d_drop does write_seqcount_barrier, but they're OK to nest. */
> -	__d_drop(dentry);
> -	__d_drop(target);
> +	___d_drop(dentry, true);
> +	___d_drop(target, exchange);
>  
>  	/* Switch the names.. */
>  	if (exchange)
> @@ -2838,9 +2847,9 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
>  		copy_name(dentry, target);
>  
>  	/* rehash in new place(s) */
> -	__d_rehash(dentry);
> +	__d_rehash(dentry, true);
>  	if (exchange)
> -		__d_rehash(target);
> +		__d_rehash(target, true);
>  
>  	/* ... and switch them in the tree */
>  	if (IS_ROOT(dentry)) {
> 
> 
>
Matthew Wilcox Nov. 9, 2017, 1:08 p.m. UTC | #2
On Thu, Nov 09, 2017 at 01:41:24PM +0200, Nikolay Borisov wrote:
> On  9.11.2017 05:22, NeilBrown wrote:
> > @@ -493,12 +496,18 @@ void __d_drop(struct dentry *dentry)
> >  		} else
> >  			hlist_bl_lock(b);
> >  		__hlist_bl_del(&dentry->d_hash);
> > -		dentry->d_hash.pprev = NULL;
> > +		if (likely(!moving))
> > +			dentry->d_hash.pprev = NULL;
> 
> nit: isn't a bit more explicit if (unlikely(moving)). I suspect the end
> result is the same, however it's easy to miss the !. It's not a big deal
> but just wondering.

umm ... you just suggested the exact opposite of what the patch is
intended to do.  likely()/unlikely() only hint to the compiler the
probabilities of the branch; they don't change the meaning of the
condition.
Nikolay Borisov Nov. 9, 2017, 4:02 p.m. UTC | #3
On  9.11.2017 15:08, Matthew Wilcox wrote:
> On Thu, Nov 09, 2017 at 01:41:24PM +0200, Nikolay Borisov wrote:
>> On  9.11.2017 05:22, NeilBrown wrote:
>>> @@ -493,12 +496,18 @@ void __d_drop(struct dentry *dentry)
>>>  		} else
>>>  			hlist_bl_lock(b);
>>>  		__hlist_bl_del(&dentry->d_hash);
>>> -		dentry->d_hash.pprev = NULL;
>>> +		if (likely(!moving))
>>> +			dentry->d_hash.pprev = NULL;
>>
>> nit: isn't a bit more explicit if (unlikely(moving)). I suspect the end
>> result is the same, however it's easy to miss the !. It's not a big deal
>> but just wondering.
> 
> umm ... you just suggested the exact opposite of what the patch is
> intended to do.  likely()/unlikely() only hint to the compiler the
> probabilities of the branch; they don't change the meaning of the
> condition.

brainfart, disregard my comment doh....

> 
>
Linus Torvalds Nov. 9, 2017, 8:23 p.m. UTC | #4
On Wed, Nov 8, 2017 at 7:22 PM, NeilBrown <neilb@suse.com> wrote:
> d_move() will call __d_drop() and then __d_rehash()
> on the dentry being moved.  This creates a small window
> when the dentry appears to be unhashed.  Many tests
> of d_unhashed() are made under ->d_lock and so are safe
> from racing with this window, but some aren't.
> In particular, getcwd() calls d_unlinked() (which calls
> d_unhashed()) without d_lock protection, so it can race.

Hmm.

I see what you're doing, but I don't necessarily agree.

I would actually almost prefer that we simply change __d_move() itself.

The problem is that __d_move() really wants to move the hashed things
atomically, but instead of doing that it does a "unhash and then
rehash".

How nasty would it be to just expand the calls to __d_drop/__d_rehash
into __d_move itself, and take both hash list locks at the same time
(with the usual ordering and checking if it's the same list, of
course).

                     Linus
diff mbox

Patch

diff --git a/fs/dcache.c b/fs/dcache.c
index d5952306206b..3130d62f29c9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -471,8 +471,11 @@  static void dentry_lru_add(struct dentry *dentry)
  * reason (NFS timeouts or autofs deletes).
  *
  * __d_drop requires dentry->d_lock.
+ * ___d_drop takes an extra @moving argument.
+ * If true, d_hash.pprev is not cleared, so there is no transient d_unhashed()
+ * state.
  */
-void __d_drop(struct dentry *dentry)
+static void inline ___d_drop(struct dentry *dentry, bool moving)
 {
 	if (!d_unhashed(dentry)) {
 		struct hlist_bl_head *b;
@@ -493,12 +496,18 @@  void __d_drop(struct dentry *dentry)
 		} else
 			hlist_bl_lock(b);
 		__hlist_bl_del(&dentry->d_hash);
-		dentry->d_hash.pprev = NULL;
+		if (likely(!moving))
+			dentry->d_hash.pprev = NULL;
 		hlist_bl_unlock(b);
 		/* After this call, in-progress rcu-walk path lookup will fail. */
 		write_seqcount_invalidate(&dentry->d_seq);
 	}
 }
+
+void __d_drop(struct dentry *dentry)
+{
+	___d_drop(dentry, false);
+}
 EXPORT_SYMBOL(__d_drop);
 
 void d_drop(struct dentry *dentry)
@@ -2387,10 +2396,10 @@  void d_delete(struct dentry * dentry)
 }
 EXPORT_SYMBOL(d_delete);
 
-static void __d_rehash(struct dentry *entry)
+static void __d_rehash(struct dentry *entry, bool moving)
 {
 	struct hlist_bl_head *b = d_hash(entry->d_name.hash);
-	BUG_ON(!d_unhashed(entry));
+	BUG_ON(!moving && !d_unhashed(entry));
 	hlist_bl_lock(b);
 	hlist_bl_add_head_rcu(&entry->d_hash, b);
 	hlist_bl_unlock(b);
@@ -2406,7 +2415,7 @@  static void __d_rehash(struct dentry *entry)
 void d_rehash(struct dentry * entry)
 {
 	spin_lock(&entry->d_lock);
-	__d_rehash(entry);
+	__d_rehash(entry, false);
 	spin_unlock(&entry->d_lock);
 }
 EXPORT_SYMBOL(d_rehash);
@@ -2580,7 +2589,7 @@  static inline void __d_add(struct dentry *dentry, struct inode *inode)
 		raw_write_seqcount_end(&dentry->d_seq);
 		fsnotify_update_flags(dentry);
 	}
-	__d_rehash(dentry);
+	__d_rehash(dentry, false);
 	if (dir)
 		end_dir_add(dir, n);
 	spin_unlock(&dentry->d_lock);
@@ -2642,7 +2651,7 @@  struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
 			alias = NULL;
 		} else {
 			__dget_dlock(alias);
-			__d_rehash(alias);
+			__d_rehash(alias, false);
 			spin_unlock(&alias->d_lock);
 		}
 		spin_unlock(&inode->i_lock);
@@ -2828,8 +2837,8 @@  static void __d_move(struct dentry *dentry, struct dentry *target,
 
 	/* unhash both */
 	/* __d_drop does write_seqcount_barrier, but they're OK to nest. */
-	__d_drop(dentry);
-	__d_drop(target);
+	___d_drop(dentry, true);
+	___d_drop(target, exchange);
 
 	/* Switch the names.. */
 	if (exchange)
@@ -2838,9 +2847,9 @@  static void __d_move(struct dentry *dentry, struct dentry *target,
 		copy_name(dentry, target);
 
 	/* rehash in new place(s) */
-	__d_rehash(dentry);
+	__d_rehash(dentry, true);
 	if (exchange)
-		__d_rehash(target);
+		__d_rehash(target, true);
 
 	/* ... and switch them in the tree */
 	if (IS_ROOT(dentry)) {