diff mbox series

[10/11] VFS: take a shared lock for create/remove directory operations.

Message ID 20241220030830.272429-11-neilb@suse.de (mailing list archive)
State New
Headers show
Series Allow concurrent changes in a directory | expand

Commit Message

NeilBrown Dec. 20, 2024, 2:54 a.m. UTC
With this patch the VFS takes a shared lock on the directory (i_rwsem)
when performing create or remove operations.  Rename is as yet
unchanged.

Not all callers are changed, only the common ones in fs/namei.c

While the directory only has a shared lock, the dentry being updated has
an exclusive lock using a bit in ->d_flags.  Waiters use
wait_var_event_spinlock(), and a wakeup is only sent in the unusual case
that some other task is actually waiting - indicated by another d_flags
bit.

Once the exclusive "update" lock is obtained on the dentry we must make
sure it wasn't unlinked or renamed while we slept.  If it was we repeat
the lookup.

The filesystem operations that expect an exclusive lock are still
provided with exclusion, but this is handled by inode_dir_lock().

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/dcache.c            |  9 ++++++-
 fs/namei.c             | 53 ++++++++++++++++++++++++++++++++++++++----
 include/linux/dcache.h |  4 ++++
 3 files changed, 60 insertions(+), 6 deletions(-)

Comments

Al Viro Dec. 23, 2024, 5:19 a.m. UTC | #1
On Fri, Dec 20, 2024 at 01:54:28PM +1100, NeilBrown wrote:

> Once the exclusive "update" lock is obtained on the dentry we must make
> sure it wasn't unlinked or renamed while we slept.  If it was we repeat
> the lookup.

> +		if (
> +			/* Was unlinked while we waited ?*/
> +			d_unhashed(dentry) ||
> +			/* Or was dentry renamed ?? */
> +			dentry->d_parent != base ||
> +			dentry->d_name.hash != last->hash ||
> +			!d_same_name(dentry, base, last)
> +		) {
> +			rcu_read_unlock();
> +			spin_unlock(&dentry->d_lock);
> +			lock_map_release(&dentry->d_update_map);
> +			dput(dentry);
> +			goto retry;
> +		}
> +		rcu_read_unlock();
> +	}
> +	dentry->d_flags |= DCACHE_PAR_UPDATE;
> +	spin_unlock(&dentry->d_lock);

... and now __d_unalias() moves it to new place, making all the checks
you've just done completely useless.
NeilBrown Dec. 23, 2024, 7:11 a.m. UTC | #2
On Mon, 23 Dec 2024, Al Viro wrote:
> On Fri, Dec 20, 2024 at 01:54:28PM +1100, NeilBrown wrote:
> 
> > Once the exclusive "update" lock is obtained on the dentry we must make
> > sure it wasn't unlinked or renamed while we slept.  If it was we repeat
> > the lookup.
> 
> > +		if (
> > +			/* Was unlinked while we waited ?*/
> > +			d_unhashed(dentry) ||
> > +			/* Or was dentry renamed ?? */
> > +			dentry->d_parent != base ||
> > +			dentry->d_name.hash != last->hash ||
> > +			!d_same_name(dentry, base, last)
> > +		) {
> > +			rcu_read_unlock();
> > +			spin_unlock(&dentry->d_lock);
> > +			lock_map_release(&dentry->d_update_map);
> > +			dput(dentry);
> > +			goto retry;
> > +		}
> > +		rcu_read_unlock();
> > +	}
> > +	dentry->d_flags |= DCACHE_PAR_UPDATE;
> > +	spin_unlock(&dentry->d_lock);
> 
> ... and now __d_unalias() moves it to new place, making all the checks
> you've just done completely useless.
> 

... Yes, thanks.

So I need __d_unalias() to effectively do a "try_lock" of
DCACHE_PAR_UPDATE and hold the lock across __d_move().
I think that would address the concerned you raised.

Did you see anything else that might be problematic?

Thanks,
NeilBrown
Al Viro Dec. 23, 2024, 7:26 a.m. UTC | #3
On Mon, Dec 23, 2024 at 06:11:16PM +1100, NeilBrown wrote:
> ... Yes, thanks.
> 
> So I need __d_unalias() to effectively do a "try_lock" of
> DCACHE_PAR_UPDATE and hold the lock across __d_move().
> I think that would address the concerned you raised.
> 
> Did you see anything else that might be problematic?

That might work with ->d_parent, but it won't help with ->d_name
in same-parent case of __d_unalias()...
NeilBrown Dec. 23, 2024, 8:40 p.m. UTC | #4
On Mon, 23 Dec 2024, Al Viro wrote:
> On Mon, Dec 23, 2024 at 06:11:16PM +1100, NeilBrown wrote:
> > ... Yes, thanks.
> > 
> > So I need __d_unalias() to effectively do a "try_lock" of
> > DCACHE_PAR_UPDATE and hold the lock across __d_move().
> > I think that would address the concerned you raised.
> > 
> > Did you see anything else that might be problematic?
> 
> That might work with ->d_parent, but it won't help with ->d_name
> in same-parent case of __d_unalias()...
> 

Why would the same-parent case be any different?
Certainly it doesn't need s_vfs_rename_mutex and it there is no second
parent to get a shared lock on.  But we would still need to set
DCACHE_PAR_UPDATE under ->d_lock on "alias".  If we found that it was
already set and instead failed with -ESTALE, that would prevent
__d_unalias from changing anything including ->d_name after
lookup_and_lock has checked that the parent and d_name are unchanged
(until done_lookup_and_lock is called of course).

What am I missing?
Thanks,
NeilBrown
diff mbox series

Patch

diff --git a/fs/dcache.c b/fs/dcache.c
index ebe849474bd8..3fb3af83add5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1636,9 +1636,10 @@  EXPORT_SYMBOL(d_invalidate);
  * available. On a success the dentry is returned. The name passed in is
  * copied and the copy passed in may be reused after this call.
  */
- 
+
 static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 {
+	static struct lock_class_key __key;
 	struct dentry *dentry;
 	char *dname;
 	int err;
@@ -1697,6 +1698,8 @@  static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	INIT_HLIST_NODE(&dentry->d_sib);
 	d_set_d_op(dentry, dentry->d_sb->s_d_op);
 
+	lockdep_init_map(&dentry->d_update_map, "DCACHE_PAR_UPDATE", &__key, 0);
+
 	if (dentry->d_op && dentry->d_op->d_init) {
 		err = dentry->d_op->d_init(dentry);
 		if (err) {
@@ -3030,6 +3033,10 @@  static int __d_unalias(struct dentry *dentry, struct dentry *alias)
  * In that case, we know that the inode will be a regular file, and also this
  * will only occur during atomic_open. So we need to check for the dentry
  * being already hashed only in the final case.
+ *
+ * @dentry must have a valid ->d_parent and that directory must be
+ * locked (i_rwsem) either exclusively or shared.  If shared then
+ * @dentry must have %DCACHE_PAR_LOOKUP or %DCACHE_PAR_UPDATE set.
  */
 struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 {
diff --git a/fs/namei.c b/fs/namei.c
index 68750b15dbf4..fb40ae64dc8d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1703,6 +1703,15 @@  struct dentry *lookup_one_qstr_excl(const struct qstr *name,
 }
 EXPORT_SYMBOL(lookup_one_qstr_excl);
 
+static bool check_dentry_locked(struct dentry *de)
+{
+	if (de->d_flags & DCACHE_PAR_UPDATE) {
+		de->d_flags |= DCACHE_PAR_WAITER;
+		return true;
+	}
+	return false;
+}
+
 static struct dentry *lookup_and_lock(const struct qstr *last,
 				      struct dentry *base,
 				      unsigned int lookup_flags)
@@ -1710,10 +1719,36 @@  static struct dentry *lookup_and_lock(const struct qstr *last,
 	struct dentry *dentry;
 	int err;
 
-	inode_lock_nested(base->d_inode, I_MUTEX_PARENT);
+	inode_lock_shared_nested(base->d_inode, I_MUTEX_PARENT);
+retry:
 	dentry = lookup_one_qstr_excl(last, base, lookup_flags);
 	if (IS_ERR(dentry))
 		goto out;
+	lock_acquire_exclusive(&dentry->d_update_map, 0, 0, NULL, _THIS_IP_);
+	spin_lock(&dentry->d_lock);
+	wait_var_event_spinlock(&dentry->d_flags,
+				!check_dentry_locked(dentry),
+				&dentry->d_lock);
+	if (d_is_positive(dentry)) {
+		rcu_read_lock(); /* needed for d_same_name() */
+		if (
+			/* Was unlinked while we waited ?*/
+			d_unhashed(dentry) ||
+			/* Or was dentry renamed ?? */
+			dentry->d_parent != base ||
+			dentry->d_name.hash != last->hash ||
+			!d_same_name(dentry, base, last)
+		) {
+			rcu_read_unlock();
+			spin_unlock(&dentry->d_lock);
+			lock_map_release(&dentry->d_update_map);
+			dput(dentry);
+			goto retry;
+		}
+		rcu_read_unlock();
+	}
+	dentry->d_flags |= DCACHE_PAR_UPDATE;
+	spin_unlock(&dentry->d_lock);
 	err = -EEXIST;
 	if ((lookup_flags & LOOKUP_EXCL) && d_is_positive(dentry))
 		goto err;
@@ -1723,10 +1758,11 @@  static struct dentry *lookup_and_lock(const struct qstr *last,
 	return dentry;
 
 err:
-	dput(dentry);
-	dentry = ERR_PTR(err);
+	done_lookup_and_lock(base, dentry);
+	return ERR_PTR(err);
+
 out:
-	inode_unlock(base->d_inode);
+	inode_unlock_shared(base->d_inode);
 	return dentry;
 }
 
@@ -2795,8 +2831,15 @@  EXPORT_SYMBOL(user_path_locked_at);
 
 void done_lookup_and_lock(struct dentry *parent, struct dentry *child)
 {
+	lock_map_release(&child->d_update_map);
+	spin_lock(&child->d_lock);
+	if (child->d_flags & DCACHE_PAR_WAITER)
+		wake_up_var_locked(&child->d_flags, &child->d_lock);
+	child->d_flags &= ~(DCACHE_PAR_UPDATE | DCACHE_PAR_WAITER);
+	spin_unlock(&child->d_lock);
+
+	inode_unlock_shared(parent->d_inode);
 	dput(child);
-	inode_unlock(d_inode(parent));
 }
 EXPORT_SYMBOL(done_lookup_and_lock);
 
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index fc7f571bd5bb..6d404c296ac0 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -102,6 +102,8 @@  struct dentry {
 					 * possible!
 					 */
 
+	/* lockdep tracking of DCACHE_PAR_UPDATE locks */
+	struct lockdep_map		d_update_map;
 	union {
 		struct list_head d_lru;		/* LRU list */
 		wait_queue_head_t *d_wait;	/* in-lookup ones only */
@@ -220,6 +222,8 @@  struct dentry_operations {
 #define DCACHE_DENTRY_CURSOR		BIT(25)
 #define DCACHE_NORCU			BIT(26) /* No RCU delay for freeing */
 
+#define DCACHE_PAR_UPDATE		BIT(27) /* Locked for update */
+#define DCACHE_PAR_WAITER		BIT(28) /* someone is waiting for PAR_UPDATE */
 extern seqlock_t rename_lock;
 
 /*