diff mbox series

[14/19] VFS: Ensure no async updates happening in directory being removed.

Message ID 20250206054504.2950516-15-neilb@suse.de
State New
Headers show
Series RFC: Allow concurrent and async changes in a directory | expand

Commit Message

NeilBrown Feb. 6, 2025, 5:42 a.m. UTC
vfs_rmdir takes an exclusive lock on the target directory to ensure
nothing new is created in it while the rmdir progresses.  With the
possibility of async updates continuing after the inode lock is dropped
we now need extra protection.

Any async updates will have DCACHE_PAR_UPDATE set on the dentry.  We
simply wait for that flag to be cleared on all children.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/dcache.c |  2 +-
 fs/namei.c  | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

Comments

Christian Brauner Feb. 6, 2025, 2:06 p.m. UTC | #1
On Thu, Feb 06, 2025 at 04:42:51PM +1100, NeilBrown wrote:
> vfs_rmdir takes an exclusive lock on the target directory to ensure
> nothing new is created in it while the rmdir progresses.  With the

It also excludes concurrent mount operations.

> possibility of async updates continuing after the inode lock is dropped
> we now need extra protection.
> 
> Any async updates will have DCACHE_PAR_UPDATE set on the dentry.  We
> simply wait for that flag to be cleared on all children.
> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/dcache.c |  2 +-
>  fs/namei.c  | 40 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 41 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/dcache.c b/fs/dcache.c
> index fb331596f1b1..90dee859d138 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -53,7 +53,7 @@
>   *   - d_lru
>   *   - d_count
>   *   - d_unhashed()
> - *   - d_parent and d_chilren
> + *   - d_parent and d_children
>   *   - childrens' d_sib and d_parent
>   *   - d_u.d_alias, d_inode
>   *
> diff --git a/fs/namei.c b/fs/namei.c
> index 3a107d6098be..e8a85c9f431c 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -1839,6 +1839,27 @@ bool d_update_lock(struct dentry *dentry,
>  	return true;
>  }
>  
> +static void d_update_wait(struct dentry *dentry, unsigned int subclass)
> +{
> +	/* Note this may only ever be called in a context where we have
> +	 * a lock preventing this dentry from becoming locked, possibly
> +	 * an update lock on the parent dentry.  The must be a smp_mb()
> +	 * after that lock is taken and before this is called so that
> +	 * the following test is safe. d_update_lock() provides that
> +	 * barrier.
> +	 */
> +	if (!(dentry->d_flags & DCACHE_PAR_UPDATE))
> +		return
> +	lock_acquire_exclusive(&dentry->d_update_map, subclass,
> +			       0, NULL, _THIS_IP_);
> +	spin_lock(&dentry->d_lock);
> +	wait_var_event_spinlock(&dentry->d_flags,
> +				!check_dentry_locked(dentry),
> +				&dentry->d_lock);
> +	spin_unlock(&dentry->d_lock);
> +	lock_map_release(&dentry->d_update_map);
> +}
> +
>  bool d_update_trylock(struct dentry *dentry,
>  		      struct dentry *base,
>  		      const struct qstr *last)
> @@ -4688,6 +4709,7 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
>  		     struct dentry *dentry)
>  {
>  	int error = may_delete(idmap, dir, dentry, 1);
> +	struct dentry *child;
>  
>  	if (error)
>  		return error;
> @@ -4697,6 +4719,24 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
>  
>  	dget(dentry);
>  	inode_lock(dentry->d_inode);
> +	/*
> +	 * Some children of dentry might be active in an async update.
> +	 * We need to wait for them.  New children cannot be locked
> +	 * while the inode lock is held.
> +	 */
> +again:
> +	spin_lock(&dentry->d_lock);
> +	for (child = d_first_child(dentry); child;
> +	     child = d_next_sibling(child)) {
> +		if (child->d_flags & DCACHE_PAR_UPDATE) {
> +			dget(child);
> +			spin_unlock(&dentry->d_lock);
> +			d_update_wait(child, I_MUTEX_CHILD);
> +			dput(child);
> +			goto again;
> +		}
> +	}
> +	spin_unlock(&dentry->d_lock);

That looks like it can cause stalls when you call rmdir on a directory
that has a lots of children and a larg-ish subset of them has pending
async updates, no?
NeilBrown Feb. 7, 2025, 2:17 a.m. UTC | #2
On Fri, 07 Feb 2025, Christian Brauner wrote:
> On Thu, Feb 06, 2025 at 04:42:51PM +1100, NeilBrown wrote:
> > vfs_rmdir takes an exclusive lock on the target directory to ensure
> > nothing new is created in it while the rmdir progresses.  With the
> 
> It also excludes concurrent mount operations.

And it excludes chown and ACL changes.  I doubt those are important.  I
do need to check mount/unmount.

> 
> > possibility of async updates continuing after the inode lock is dropped
> > we now need extra protection.
> > 
> > Any async updates will have DCACHE_PAR_UPDATE set on the dentry.  We
> > simply wait for that flag to be cleared on all children.
> > 
> > Signed-off-by: NeilBrown <neilb@suse.de>
> > ---
> >  fs/dcache.c |  2 +-
> >  fs/namei.c  | 40 ++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 41 insertions(+), 1 deletion(-)
> > 
> > diff --git a/fs/dcache.c b/fs/dcache.c
> > index fb331596f1b1..90dee859d138 100644
> > --- a/fs/dcache.c
> > +++ b/fs/dcache.c
> > @@ -53,7 +53,7 @@
> >   *   - d_lru
> >   *   - d_count
> >   *   - d_unhashed()
> > - *   - d_parent and d_chilren
> > + *   - d_parent and d_children
> >   *   - childrens' d_sib and d_parent
> >   *   - d_u.d_alias, d_inode
> >   *
> > diff --git a/fs/namei.c b/fs/namei.c
> > index 3a107d6098be..e8a85c9f431c 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -1839,6 +1839,27 @@ bool d_update_lock(struct dentry *dentry,
> >  	return true;
> >  }
> >  
> > +static void d_update_wait(struct dentry *dentry, unsigned int subclass)
> > +{
> > +	/* Note this may only ever be called in a context where we have
> > +	 * a lock preventing this dentry from becoming locked, possibly
> > +	 * an update lock on the parent dentry.  The must be a smp_mb()
> > +	 * after that lock is taken and before this is called so that
> > +	 * the following test is safe. d_update_lock() provides that
> > +	 * barrier.
> > +	 */
> > +	if (!(dentry->d_flags & DCACHE_PAR_UPDATE))
> > +		return
> > +	lock_acquire_exclusive(&dentry->d_update_map, subclass,
> > +			       0, NULL, _THIS_IP_);
> > +	spin_lock(&dentry->d_lock);
> > +	wait_var_event_spinlock(&dentry->d_flags,
> > +				!check_dentry_locked(dentry),
> > +				&dentry->d_lock);
> > +	spin_unlock(&dentry->d_lock);
> > +	lock_map_release(&dentry->d_update_map);
> > +}
> > +
> >  bool d_update_trylock(struct dentry *dentry,
> >  		      struct dentry *base,
> >  		      const struct qstr *last)
> > @@ -4688,6 +4709,7 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
> >  		     struct dentry *dentry)
> >  {
> >  	int error = may_delete(idmap, dir, dentry, 1);
> > +	struct dentry *child;
> >  
> >  	if (error)
> >  		return error;
> > @@ -4697,6 +4719,24 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
> >  
> >  	dget(dentry);
> >  	inode_lock(dentry->d_inode);
> > +	/*
> > +	 * Some children of dentry might be active in an async update.
> > +	 * We need to wait for them.  New children cannot be locked
> > +	 * while the inode lock is held.
> > +	 */
> > +again:
> > +	spin_lock(&dentry->d_lock);
> > +	for (child = d_first_child(dentry); child;
> > +	     child = d_next_sibling(child)) {
> > +		if (child->d_flags & DCACHE_PAR_UPDATE) {
> > +			dget(child);
> > +			spin_unlock(&dentry->d_lock);
> > +			d_update_wait(child, I_MUTEX_CHILD);
> > +			dput(child);
> > +			goto again;
> > +		}
> > +	}
> > +	spin_unlock(&dentry->d_lock);
> 
> That looks like it can cause stalls when you call rmdir on a directory
> that has a lots of children and a larg-ish subset of them has pending
> async updates, no?
> 

It can certainly block waiting for other operations to complete, but
that is already the case when waiting for an exclusive lock on i_rwsem. 
Any thread that has already tried to get that lock might get it before
rmdir eventually succeeds.  So I don't think that is a behavioural
change.

I'm not concerned about walking the sibling list under a spinlock if the
list if very long.  Maybe I could periodically take a ref to the current
child, drop and reclaim the spinlock, and hopefully continue from there.
Doing that on a non-D_PAR_UPDATE dentry should be safe.  I wonder if
that complexity is worth it.

Thanks,
NeilBrown
Al Viro Feb. 7, 2025, 9:06 p.m. UTC | #3
On Thu, Feb 06, 2025 at 04:42:51PM +1100, NeilBrown wrote:
> vfs_rmdir takes an exclusive lock on the target directory to ensure
> nothing new is created in it while the rmdir progresses.  With the
> possibility of async updates continuing after the inode lock is dropped
> we now need extra protection.
> 
> Any async updates will have DCACHE_PAR_UPDATE set on the dentry.  We
> simply wait for that flag to be cleared on all children.

> +static void d_update_wait(struct dentry *dentry, unsigned int subclass)
> +{
> +	/* Note this may only ever be called in a context where we have
> +	 * a lock preventing this dentry from becoming locked, possibly
> +	 * an update lock on the parent dentry.  The must be a smp_mb()
> +	 * after that lock is taken and before this is called so that
> +	 * the following test is safe. d_update_lock() provides that
> +	 * barrier.
> +	 */
> +	if (!(dentry->d_flags & DCACHE_PAR_UPDATE))
> +		return
> +	lock_acquire_exclusive(&dentry->d_update_map, subclass,
> +			       0, NULL, _THIS_IP_);

What the fuck?

> +	spin_lock(&dentry->d_lock);
> +	wait_var_event_spinlock(&dentry->d_flags,
> +				!check_dentry_locked(dentry),
> +				&dentry->d_lock);
> +	spin_unlock(&dentry->d_lock);
> +	lock_map_release(&dentry->d_update_map);
> +}

OK, I realize that it compiles, but it should've raised all
kinds of red flags for anyone reading that.  return + <newline> is
already fishy, but having the next line indented *less* than that
return is firmly in the "somebody's trying to hide something nasty
here" territory, even without parsing the damn thing.
diff mbox series

Patch

diff --git a/fs/dcache.c b/fs/dcache.c
index fb331596f1b1..90dee859d138 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -53,7 +53,7 @@ 
  *   - d_lru
  *   - d_count
  *   - d_unhashed()
- *   - d_parent and d_chilren
+ *   - d_parent and d_children
  *   - childrens' d_sib and d_parent
  *   - d_u.d_alias, d_inode
  *
diff --git a/fs/namei.c b/fs/namei.c
index 3a107d6098be..e8a85c9f431c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1839,6 +1839,27 @@  bool d_update_lock(struct dentry *dentry,
 	return true;
 }
 
+static void d_update_wait(struct dentry *dentry, unsigned int subclass)
+{
+	/* Note this may only ever be called in a context where we have
+	 * a lock preventing this dentry from becoming locked, possibly
+	 * an update lock on the parent dentry.  The must be a smp_mb()
+	 * after that lock is taken and before this is called so that
+	 * the following test is safe. d_update_lock() provides that
+	 * barrier.
+	 */
+	if (!(dentry->d_flags & DCACHE_PAR_UPDATE))
+		return
+	lock_acquire_exclusive(&dentry->d_update_map, subclass,
+			       0, NULL, _THIS_IP_);
+	spin_lock(&dentry->d_lock);
+	wait_var_event_spinlock(&dentry->d_flags,
+				!check_dentry_locked(dentry),
+				&dentry->d_lock);
+	spin_unlock(&dentry->d_lock);
+	lock_map_release(&dentry->d_update_map);
+}
+
 bool d_update_trylock(struct dentry *dentry,
 		      struct dentry *base,
 		      const struct qstr *last)
@@ -4688,6 +4709,7 @@  int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
 		     struct dentry *dentry)
 {
 	int error = may_delete(idmap, dir, dentry, 1);
+	struct dentry *child;
 
 	if (error)
 		return error;
@@ -4697,6 +4719,24 @@  int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
 
 	dget(dentry);
 	inode_lock(dentry->d_inode);
+	/*
+	 * Some children of dentry might be active in an async update.
+	 * We need to wait for them.  New children cannot be locked
+	 * while the inode lock is held.
+	 */
+again:
+	spin_lock(&dentry->d_lock);
+	for (child = d_first_child(dentry); child;
+	     child = d_next_sibling(child)) {
+		if (child->d_flags & DCACHE_PAR_UPDATE) {
+			dget(child);
+			spin_unlock(&dentry->d_lock);
+			d_update_wait(child, I_MUTEX_CHILD);
+			dput(child);
+			goto again;
+		}
+	}
+	spin_unlock(&dentry->d_lock);
 
 	error = -EBUSY;
 	if (is_local_mountpoint(dentry) ||