
[v4] fs/namespace: defer RCU sync for MNT_DETACH umount

Message ID 20250408210350.749901-12-echanude@redhat.com (mailing list archive)
State New
Series [v4] fs/namespace: defer RCU sync for MNT_DETACH umount

Commit Message

Eric Chanudet April 8, 2025, 8:58 p.m. UTC
Defer releasing the detached file-system when calling namespace_unlock()
during a lazy umount to return faster.

When requesting MNT_DETACH, the caller does not expect the file-system
to be shut down upon returning from the syscall. Calling
synchronize_rcu_expedited() has a significant cost on RT kernels, which
default to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
mount in a separate list and put it on a workqueue to run after the RCU
grace period.

w/o patch, 6.15-rc1 PREEMPT_RT:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
    0.02455 +- 0.00107 seconds time elapsed  ( +-  4.36% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
    0.02555 +- 0.00114 seconds time elapsed  ( +-  4.46% )

w/ patch, 6.15-rc1 PREEMPT_RT:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
    0.026311 +- 0.000869 seconds time elapsed  ( +-  3.30% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
    0.003194 +- 0.000160 seconds time elapsed  ( +-  5.01% )

Signed-off-by: Alexander Larsson <alexl@redhat.com>
Signed-off-by: Lucas Karpinski <lkarpins@redhat.com>
Signed-off-by: Eric Chanudet <echanude@redhat.com>
---

Attempt to re-spin this series based on the feedback received in v3 that
pointed out the need to wait for the grace period in namespace_unlock()
before calling the deferred mntput().

v4:
- Use queue_rcu_work() to defer free_mounts() for lazy umounts
- Drop lazy_unlock global and refactor using a helper
v3: https://lore.kernel.org/all/20240626201129.272750-2-lkarpins@redhat.com/
- Removed unneeded code for lazy umount case.
- Don't block within interrupt context.
v2: https://lore.kernel.org/all/20240426195429.28547-1-lkarpins@redhat.com/
- Only defer releasing umount'ed filesystems for lazy umounts
v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/

 fs/namespace.c | 52 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 45 insertions(+), 7 deletions(-)

Comments

Christian Brauner April 9, 2025, 10:37 a.m. UTC | #1
On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> Defer releasing the detached file-system when calling namespace_unlock()
> during a lazy umount to return faster.
> 
> When requesting MNT_DETACH, the caller does not expect the file-system
> to be shut down upon returning from the syscall. Calling
> synchronize_rcu_expedited() has a significant cost on RT kernel that
> defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
> mount in a separate list and put it on a workqueue to run post RCU
> grace-period.
> 
> w/o patch, 6.15-rc1 PREEMPT_RT:
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>     0.02455 +- 0.00107 seconds time elapsed  ( +-  4.36% )
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
>     0.02555 +- 0.00114 seconds time elapsed  ( +-  4.46% )
> 
> w/ patch, 6.15-rc1 PREEMPT_RT:
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>     0.026311 +- 0.000869 seconds time elapsed  ( +-  3.30% )
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
>     0.003194 +- 0.000160 seconds time elapsed  ( +-  5.01% )
> 
> Signed-off-by: Alexander Larsson <alexl@redhat.com>
> Signed-off-by: Lucas Karpinski <lkarpins@redhat.com>
> Signed-off-by: Eric Chanudet <echanude@redhat.com>
> ---
> 
> Attempt to re-spin this series based on the feedback received in v3 that
> pointed out the need to wait the grace-period in namespace_unlock()
> before calling the deferred mntput().

I still hate this with a passion because it adds another special-sauce
path into the unlock path. I've folded the following diff into it so it
at least doesn't start passing that pointless boolean and doesn't
introduce __namespace_unlock(). Just use a global variable and pick the
value off of it just as we do with the lists. Testing this now:

diff --git a/fs/namespace.c b/fs/namespace.c
index e5b0b920dd97..25599428706c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -82,8 +82,9 @@ static struct hlist_head *mount_hashtable __ro_after_init;
 static struct hlist_head *mountpoint_hashtable __ro_after_init;
 static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
-static HLIST_HEAD(unmounted);  /* protected by namespace_sem */
-static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static bool unmounted_lazily;          /* protected by namespace_sem */
+static HLIST_HEAD(unmounted);          /* protected by namespace_sem */
+static LIST_HEAD(ex_mountpoints);      /* protected by namespace_sem */
 static DEFINE_SEQLOCK(mnt_ns_tree_lock);

 #ifdef CONFIG_FSNOTIFY
@@ -1807,17 +1808,18 @@ static void free_mounts(struct hlist_head *mount_list)

 static void defer_free_mounts(struct work_struct *work)
 {
-       struct deferred_free_mounts *d = container_of(
-               to_rcu_work(work), struct deferred_free_mounts, rwork);
+       struct deferred_free_mounts *d;

+       d = container_of(to_rcu_work(work), struct deferred_free_mounts, rwork);
        free_mounts(&d->release_list);
        kfree(d);
 }

-static void __namespace_unlock(bool lazy)
+static void namespace_unlock(void)
 {
        HLIST_HEAD(head);
        LIST_HEAD(list);
+       bool defer = unmounted_lazily;

        hlist_move_list(&unmounted, &head);
        list_splice_init(&ex_mountpoints, &list);
@@ -1840,29 +1842,21 @@ static void __namespace_unlock(bool lazy)
        if (likely(hlist_empty(&head)))
                return;

-       if (lazy) {
-               struct deferred_free_mounts *d =
-                       kmalloc(sizeof(*d), GFP_KERNEL);
+       if (defer) {
+               struct deferred_free_mounts *d;

-               if (unlikely(!d))
-                       goto out;
-
-               hlist_move_list(&head, &d->release_list);
-               INIT_RCU_WORK(&d->rwork, defer_free_mounts);
-               queue_rcu_work(system_wq, &d->rwork);
-               return;
+               d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL);
+               if (d) {
+                       hlist_move_list(&head, &d->release_list);
+                       INIT_RCU_WORK(&d->rwork, defer_free_mounts);
+                       queue_rcu_work(system_wq, &d->rwork);
+                       return;
+               }
        }
-
-out:
        synchronize_rcu_expedited();
        free_mounts(&head);
 }

-static inline void namespace_unlock(void)
-{
-       __namespace_unlock(false);
-}
-
 static inline void namespace_lock(void)
 {
        down_write(&namespace_sem);
@@ -2094,7 +2088,7 @@ static int do_umount(struct mount *mnt, int flags)
        }
 out:
        unlock_mount_hash();
-       __namespace_unlock(flags & MNT_DETACH);
+       namespace_unlock();
        return retval;
 }


> 
> v4:
> - Use queue_rcu_work() to defer free_mounts() for lazy umounts
> - Drop lazy_unlock global and refactor using a helper
> v3: https://lore.kernel.org/all/20240626201129.272750-2-lkarpins@redhat.com/
> - Removed unneeded code for lazy umount case.
> - Don't block within interrupt context.
> v2: https://lore.kernel.org/all/20240426195429.28547-1-lkarpins@redhat.com/
> - Only defer releasing umount'ed filesystems for lazy umounts
> v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/
> 
>  fs/namespace.c | 52 +++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 45 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 14935a0500a2..e5b0b920dd97 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init;
>  static unsigned int mp_hash_mask __ro_after_init;
>  static unsigned int mp_hash_shift __ro_after_init;
>  
> +struct deferred_free_mounts {
> +	struct rcu_work rwork;
> +	struct hlist_head release_list;
> +};
> +
>  static __initdata unsigned long mhash_entries;
>  static int __init set_mhash_entries(char *str)
>  {
> @@ -1789,11 +1794,29 @@ static bool need_notify_mnt_list(void)
>  }
>  #endif
>  
> -static void namespace_unlock(void)
> +static void free_mounts(struct hlist_head *mount_list)
>  {
> -	struct hlist_head head;
>  	struct hlist_node *p;
>  	struct mount *m;
> +
> +	hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
> +		hlist_del(&m->mnt_umount);
> +		mntput(&m->mnt);
> +	}
> +}
> +
> +static void defer_free_mounts(struct work_struct *work)
> +{
> +	struct deferred_free_mounts *d = container_of(
> +		to_rcu_work(work), struct deferred_free_mounts, rwork);
> +
> +	free_mounts(&d->release_list);
> +	kfree(d);
> +}
> +
> +static void __namespace_unlock(bool lazy)
> +{
> +	HLIST_HEAD(head);
>  	LIST_HEAD(list);
>  
>  	hlist_move_list(&unmounted, &head);
> @@ -1817,12 +1840,27 @@ static void namespace_unlock(void)
>  	if (likely(hlist_empty(&head)))
>  		return;
>  
> -	synchronize_rcu_expedited();
> +	if (lazy) {
> +		struct deferred_free_mounts *d =
> +			kmalloc(sizeof(*d), GFP_KERNEL);
>  
> -	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> -		hlist_del(&m->mnt_umount);
> -		mntput(&m->mnt);
> +		if (unlikely(!d))
> +			goto out;
> +
> +		hlist_move_list(&head, &d->release_list);
> +		INIT_RCU_WORK(&d->rwork, defer_free_mounts);
> +		queue_rcu_work(system_wq, &d->rwork);
> +		return;
>  	}
> +
> +out:
> +	synchronize_rcu_expedited();
> +	free_mounts(&head);
> +}
> +
> +static inline void namespace_unlock(void)
> +{
> +	__namespace_unlock(false);
>  }
>  
>  static inline void namespace_lock(void)
> @@ -2056,7 +2094,7 @@ static int do_umount(struct mount *mnt, int flags)
>  	}
>  out:
>  	unlock_mount_hash();
> -	namespace_unlock();
> +	__namespace_unlock(flags & MNT_DETACH);
>  	return retval;
>  }
>  
> -- 
> 2.49.0
>
Mateusz Guzik April 9, 2025, 1:04 p.m. UTC | #2
On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> Defer releasing the detached file-system when calling namespace_unlock()
> during a lazy umount to return faster.
> 
> When requesting MNT_DETACH, the caller does not expect the file-system
> to be shut down upon returning from the syscall. Calling
> synchronize_rcu_expedited() has a significant cost on RT kernel that
> defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
> mount in a separate list and put it on a workqueue to run post RCU
> grace-period.
> 
> w/o patch, 6.15-rc1 PREEMPT_RT:
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>     0.02455 +- 0.00107 seconds time elapsed  ( +-  4.36% )
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
>     0.02555 +- 0.00114 seconds time elapsed  ( +-  4.46% )
> 
> w/ patch, 6.15-rc1 PREEMPT_RT:
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>     0.026311 +- 0.000869 seconds time elapsed  ( +-  3.30% )
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
>     0.003194 +- 0.000160 seconds time elapsed  ( +-  5.01% )
> 

Christian wants the patch done differently and posted his diff, so I'm
not going to comment on it.

I do have some feedback about the commit message though.

In v1 it points out a real user that runs into it, while this one does
not. So I would rewrite this and put in bench results from the actual
consumer -- as it is, one is left to wonder why patching up lazy unmount
is of any significance.

I had to look up what rcupdate.rcu_normal_after_boot=1 is. Docs claim it
makes everyone use normal grace-periods, which explains the difference.
But without that, one is left to wonder if perhaps there is a perf bug in
RCU instead where this is taking longer than it should despite the
option. Thus I would also denote how the delay shows up.

v1 for reference:
> v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/
Sebastian Andrzej Siewior April 9, 2025, 1:14 p.m. UTC | #3
On 2025-04-09 12:37:06 [+0200], Christian Brauner wrote:
> I still hate this with a passion because it adds another special-sauce
> path into the unlock path. I've folded the following diff into it so it
> at least doesn't start passing that pointless boolean and doesn't
> introduce __namespace_unlock(). Just use a global variable and pick the
> value off of it just as we do with the lists. Testing this now:

I tried to apply this on top of the previous one but all chunks
failed.

One question: Do we need this lazy/MNT_DETACH case? Couldn't we handle
them all via queue_rcu_work()?
If so, couldn't we make deferred_free_mounts global and have two
release lists, say release_list and release_list_next_gp? The first one
will be used if queue_rcu_work() returns true, otherwise the second.
Then once defer_free_mounts() is done and release_list_next_gp is not
empty, it would move release_list_next_gp -> release_list and invoke
queue_rcu_work().
This would avoid the kmalloc, synchronize_rcu_expedited() and the
special-sauce.
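
For illustration, a minimal sketch of that two-list scheme: release_list,
release_list_next_gp and defer_free_mounts() are the names used above,
free_mounts() is the helper from the patch, while deferred_mounts_work and
queue_deferred_free() are invented here, and the synchronization between
the worker and callers holding namespace_sem is elided:

/* Hypothetical sketch only -- not part of the posted patch. */
static HLIST_HEAD(release_list);		/* owned by the pending work item */
static HLIST_HEAD(release_list_next_gp);	/* filled while the work is pending */
static struct rcu_work deferred_mounts_work;	/* INIT_RCU_WORK(..., defer_free_mounts) at init */

static void defer_free_mounts(struct work_struct *work)
{
	free_mounts(&release_list);
	/* Anything that arrived meanwhile waits for the next grace period. */
	if (!hlist_empty(&release_list_next_gp)) {
		hlist_move_list(&release_list_next_gp, &release_list);
		queue_rcu_work(system_unbound_wq, &deferred_mounts_work);
	}
}

static void queue_deferred_free(struct hlist_head *head)
{
	struct hlist_node *p;
	struct mount *m;

	if (queue_rcu_work(system_unbound_wq, &deferred_mounts_work)) {
		/* Work was idle: it now owns release_list. */
		hlist_move_list(head, &release_list);
		return;
	}
	/* Work already pending: collect entries for the next grace period. */
	hlist_for_each_entry_safe(m, p, head, mnt_umount) {
		hlist_del(&m->mnt_umount);
		hlist_add_head(&m->mnt_umount, &release_list_next_gp);
	}
}

The kmalloc() and the synchronize_rcu_expedited() fallback disappear, at
the cost of serializing all deferred frees behind a single work item,
which is the trade-off discussed in the replies below.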

> diff --git a/fs/namespace.c b/fs/namespace.c
> index e5b0b920dd97..25599428706c 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1840,29 +1842,21 @@ static void __namespace_unlock(bool lazy)> +               d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL);
> +               if (d) {
> +                       hlist_move_list(&head, &d->release_list);
> +                       INIT_RCU_WORK(&d->rwork, defer_free_mounts);
> +                       queue_rcu_work(system_wq, &d->rwork);

Couldn't we do system_unbound_wq?

Sebastian
Mateusz Guzik April 9, 2025, 2:02 p.m. UTC | #4
On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
> them all via queue_rcu_work()?
> If so, couldn't we have make deferred_free_mounts global and have two
> release_list, say release_list and release_list_next_gp? The first one
> will be used if queue_rcu_work() returns true, otherwise the second.
> Then once defer_free_mounts() is done and release_list_next_gp not
> empty, it would move release_list_next_gp -> release_list and invoke
> queue_rcu_work().
> This would avoid the kmalloc, synchronize_rcu_expedited() and the
> special-sauce.
> 

To my understanding it was preferred for non-lazy unmount consumers to
wait until the mntput before returning.

That aside, if I understood your approach, it would de facto serialize all
of these?

As in with the posted patches you can have different worker threads
progress in parallel as they all get a private list to iterate.

With your proposal only one can do any work.

One has to assume with sufficient mount/unmount traffic this can
eventually get into trouble.
Sebastian Andrzej Siewior April 9, 2025, 2:25 p.m. UTC | #5
On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
> On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
> > them all via queue_rcu_work()?
> > If so, couldn't we have make deferred_free_mounts global and have two
> > release_list, say release_list and release_list_next_gp? The first one
> > will be used if queue_rcu_work() returns true, otherwise the second.
> > Then once defer_free_mounts() is done and release_list_next_gp not
> > empty, it would move release_list_next_gp -> release_list and invoke
> > queue_rcu_work().
> > This would avoid the kmalloc, synchronize_rcu_expedited() and the
> > special-sauce.
> > 
> 
> To my understanding it was preferred for non-lazy unmount consumers to
> wait until the mntput before returning.
> 
> That aside if I understood your approach it would de facto serialize all
> of these?
> 
> As in with the posted patches you can have different worker threads
> progress in parallel as they all get a private list to iterate.
> 
> With your proposal only one can do any work.
> 
> One has to assume with sufficient mount/unmount traffic this can
> eventually get into trouble.

Right, it would serialize them within the same worker thread. With one
worker for each put you would schedule multiple workers from the RCU
callback. Given the system_wq, you will schedule them all on the CPU
which invokes the RCU callback. This kind of serializes it, too.

The mntput() callback uses spinlock_t for locking and then it frees
resources. It does not look like it waits for something nor takes ages.
So it might not be needed to split each put into its own worker on a
different CPU… One busy bee might be enough ;)

Sebastian
Christian Brauner April 9, 2025, 4:04 p.m. UTC | #6
On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
> > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
> > > them all via queue_rcu_work()?
> > > If so, couldn't we have make deferred_free_mounts global and have two
> > > release_list, say release_list and release_list_next_gp? The first one
> > > will be used if queue_rcu_work() returns true, otherwise the second.
> > > Then once defer_free_mounts() is done and release_list_next_gp not
> > > empty, it would move release_list_next_gp -> release_list and invoke
> > > queue_rcu_work().
> > > This would avoid the kmalloc, synchronize_rcu_expedited() and the
> > > special-sauce.
> > > 
> > 
> > To my understanding it was preferred for non-lazy unmount consumers to
> > wait until the mntput before returning.
> > 
> > That aside if I understood your approach it would de facto serialize all
> > of these?
> > 
> > As in with the posted patches you can have different worker threads
> > progress in parallel as they all get a private list to iterate.
> > 
> > With your proposal only one can do any work.
> > 
> > One has to assume with sufficient mount/unmount traffic this can
> > eventually get into trouble.
> 
> Right, it would serialize them within the same worker thread. With one
> worker for each put you would schedule multiple worker from the RCU
> callback. Given the system_wq you will schedule them all on the CPU
> which invokes the RCU callback. This kind of serializes it, too.
> 
> The mntput() callback uses spinlock_t for locking and then it frees
> resources. It does not look like it waits for something nor takes ages.
> So it might not be needed to split each put into its own worker on a
> different CPU… One busy bee might be enough ;)

Unmounting can trigger a very large number of mounts to be unmounted. If
you're on a container-heavy system or services that all propagate to
each other in different mount namespaces, mount propagation will generate
a ton of umounts. So this cannot be underestimated.

If a mount tree is wasted without MNT_DETACH, it will pass UMOUNT_SYNC to
umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts
during the unmount.

If a concurrent path lookup calls legitimize_mnt() on such a mount and
sees that MNT_SYNC_UMOUNT is set, it will back off, as it knows that the
concurrent unmounter holds the last reference, and __legitimize_mnt()
can thus simply drop the reference count. The final mntput() will be
done by the umounter.

The synchronize_rcu() call in namespace_unlock() takes care that the
last mntput() doesn't happen until path walking has dropped out of RCU
mode.

Without it, it's possible that a non-MNT_DETACH umounter gets a spurious
EBUSY error because a concurrent lazy path walk will suddenly put the
last reference via mntput().

I'm unclear how that's handled in whatever it is you're proposing.
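
For readers following along, a heavily simplified sketch of the back-off
described above -- this is not the actual __legitimize_mnt() code, the
function name is made up, and the slow path is omitted:

/* Hypothetical illustration only. */
static int legitimize_mnt_sketch(struct mount *mnt, unsigned int seq)
{
	mnt_add_count(mnt, 1);				/* optimistic grab in RCU-walk */
	smp_mb();
	if (likely(!read_seqretry(&mount_lock, seq)))
		return 0;				/* reference is legitimate */

	if (mnt->mnt.mnt_flags & MNT_SYNC_UMOUNT) {
		/*
		 * A UMOUNT_SYNC unmounter holds the last reference and will
		 * do the final mntput().  The synchronize_rcu() in
		 * namespace_unlock() guarantees that this mntput() cannot
		 * have happened yet while we are still inside the RCU read
		 * section, so simply undo the optimistic grab and back off.
		 */
		mnt_add_count(mnt, -1);
		return 1;
	}
	/* Otherwise fall back to the locked slow path (not shown). */
	return -1;
}

Deferring the final free to a workqueue that runs after the grace period
keeps that guarantee intact for the lazy case, which is why the RCU wait
cannot simply be dropped for UMOUNT_SYNC callers.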
Eric Chanudet April 9, 2025, 4:08 p.m. UTC | #7
> > > On Wed, Apr 09, 2025 at 12:37:06PM +0200, Christian Brauner wrote:
> > > > On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> > > > > Attempt to re-spin this series based on the feedback received in v3 that
> > > > > pointed out the need to wait the grace-period in namespace_unlock()
> > > > > before calling the deferred mntput().
> > > > 
> > > > I still hate this with a passion because it adds another special-sauce
> > > > path into the unlock path. I've folded the following diff into it so it
> > > > at least doesn't start passing that pointless boolean and doesn't
> > > > introduce __namespace_unlock(). Just use a global variable and pick the
> > > > value off of it just as we do with the lists. Testing this now:

My apologies, I went with the feedback from v3[1] and failed to parse
the context surrounding it.

[1] https://lore.kernel.org/all/Znx-WGU5Wx6RaJyD@casper.infradead.org/

> > > > @@ -2094,7 +2088,7 @@ static int do_umount(struct mount *mnt, int flags)
> > > >         }
> > > >  out:
> > > >         unlock_mount_hash();
> > > > -       __namespace_unlock(flags & MNT_DETACH);
> > > > +       namespace_unlock();
> > > >         return retval;
> > > >  }
> > > > 
> > > > 

I believe you skipped setting unmounted_lazily in this hunk?

With this, I have applied your patch for the following discussion and
down thread. Happy to send a v5, should this patch be deemed worth
pursuing.
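
For concreteness, the missing piece would presumably be a hunk along these
lines, using the unmounted_lazily name from the folded-in diff; whether
and where the flag needs to be reset again is left to the real series:

@@ static int do_umount(struct mount *mnt, int flags)
 	}
 out:
 	unlock_mount_hash();
+	/* namespace_sem is still held here, so the store is serialized. */
+	unmounted_lazily = flags & MNT_DETACH;
 	namespace_unlock();
 	return retval;
 }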

On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
> > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
> > > them all via queue_rcu_work()?
> > > If so, couldn't we have make deferred_free_mounts global and have two
> > > release_list, say release_list and release_list_next_gp? The first one
> > > will be used if queue_rcu_work() returns true, otherwise the second.
> > > Then once defer_free_mounts() is done and release_list_next_gp not
> > > empty, it would move release_list_next_gp -> release_list and invoke
> > > queue_rcu_work().
> > > This would avoid the kmalloc, synchronize_rcu_expedited() and the
> > > special-sauce.
> > > 
> > 
> > To my understanding it was preferred for non-lazy unmount consumers to
> > wait until the mntput before returning.

Unless I misunderstand the statement, and from the previous thread[2],
this is a requirement of the user API.

[2] https://lore.kernel.org/all/Y8m+M%2FffIEEWbfmv@ZenIV/

> > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > On 2025-04-09 12:37:06 [+0200], Christian Brauner wrote:
> > > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > > index e5b0b920dd97..25599428706c 100644
> > > > --- a/fs/namespace.c
> > > > +++ b/fs/namespace.c
> > > > @@ -1840,29 +1842,21 @@ static void __namespace_unlock(bool lazy)
> > > …
> > > > +               d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL);
> > > > +               if (d) {
> > > > +                       hlist_move_list(&head, &d->release_list);
> > > > +                       INIT_RCU_WORK(&d->rwork, defer_free_mounts);
> > > > +                       queue_rcu_work(system_wq, &d->rwork);
> > > 
> > > Couldn't we do system_unbound_wq?

I think we can, afaict we don't need locality? I'll run some tests with
system_unbound_wq.

Thanks,
Christian Brauner April 9, 2025, 4:09 p.m. UTC | #8
On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-04-09 12:37:06 [+0200], Christian Brauner wrote:
> > I still hate this with a passion because it adds another special-sauce
> > path into the unlock path. I've folded the following diff into it so it
> > at least doesn't start passing that pointless boolean and doesn't
> > introduce __namespace_unlock(). Just use a global variable and pick the
> > value off of it just as we do with the lists. Testing this now:
> 
> I tried to apply this on top of the previous one but it all chunks
> failed.

See https://web.git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git/log/?h=vfs-6.16.mount
Eric Chanudet April 9, 2025, 4:41 p.m. UTC | #9
On Wed, Apr 09, 2025 at 03:04:43PM +0200, Mateusz Guzik wrote:
> On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> > Defer releasing the detached file-system when calling namespace_unlock()
> > during a lazy umount to return faster.
> > 
> > When requesting MNT_DETACH, the caller does not expect the file-system
> > to be shut down upon returning from the syscall. Calling
> > synchronize_rcu_expedited() has a significant cost on RT kernel that
> > defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
> > mount in a separate list and put it on a workqueue to run post RCU
> > grace-period.
> > 
> > w/o patch, 6.15-rc1 PREEMPT_RT:
> > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
> >     0.02455 +- 0.00107 seconds time elapsed  ( +-  4.36% )
> > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
> >     0.02555 +- 0.00114 seconds time elapsed  ( +-  4.46% )
> > 
> > w/ patch, 6.15-rc1 PREEMPT_RT:
> > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
> >     0.026311 +- 0.000869 seconds time elapsed  ( +-  3.30% )
> > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
> >     0.003194 +- 0.000160 seconds time elapsed  ( +-  5.01% )
> > 
> 
> Christian wants the patch done differently and posted his diff, so I'm
> not going to comment on it.
> 
> I do have some feedback about the commit message though.
> 
> In v1 it points out a real user which runs into it, while this one does
> not. So I would rewrite this and put in bench results from the actual
> consumer -- as it is one is left to wonder why patching up lazy unmount
> is of any significance.

Certainly. Doing the test mentioned in v1 again with v4+Christian's
suggested changes:
- QEMU x86_64, 8cpus, PREEMPT_RT, w/o patch:
# perf stat -r 10 --table --null -- crun run test
    0.07584 +- 0.00440 seconds time elapsed  ( +-  5.80% )
- QEMU x86_64, 8cpus, PREEMPT_RT, w/ patch:
# perf stat -r 10 --table --null -- crun run test
    0.01421 +- 0.00387 seconds time elapsed  ( +- 27.26% )

I will add that to the commit message.

> I had to look up what rcupdate.rcu_normal_after_boot=1 is. Docs claim it
> makes everyone use normal grace-periods, which explains the difference.
> But without that one is left to wonder if perhaps there is a perf bug in
> RCU instead where this is taking longer than it should despite the
> option. Thus I would also denote how the delay shows up.

I tried the test above while trying to force expedited RCU on the
cmdline with:
    rcupdate.rcu_normal_after_boot=0 rcupdate.rcu_expedited=1

Unfortunately, rcupdate.rcu_normal_after_boot=0 has no effect and
rcupdate_announce_bootup_oddness() reports:
[    0.015251] 	No expedited grace period (rcu_normal_after_boot).

Which yielded similar results:
- QEMU x86_64, 8cpus, PREEMPT_RT, w/o patch:
# perf stat -r 10 --table --null -- crun run test
    0.07838 +- 0.00322 seconds time elapsed  ( +-  4.11% )
- QEMU x86_64, 8cpus, PREEMPT_RT, w/ patch:
# perf stat -r 10 --table --null -- crun run test
    0.01582 +- 0.00353 seconds time elapsed  ( +- 22.30% )

I don't think rcupdate.rcu_expedited=1 had an effect, but I have not
confirmed that yet.

> v1 for reference:
> > v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/
Ian Kent April 10, 2025, 1:17 a.m. UTC | #10
On 9/4/25 18:37, Christian Brauner wrote:
> On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
>> Defer releasing the detached file-system when calling namespace_unlock()
>> during a lazy umount to return faster.
>>
>> When requesting MNT_DETACH, the caller does not expect the file-system
>> to be shut down upon returning from the syscall. Calling
>> synchronize_rcu_expedited() has a significant cost on RT kernel that
>> defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
>> mount in a separate list and put it on a workqueue to run post RCU
>> grace-period.
>>
>> w/o patch, 6.15-rc1 PREEMPT_RT:
>> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>>      0.02455 +- 0.00107 seconds time elapsed  ( +-  4.36% )
>> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
>>      0.02555 +- 0.00114 seconds time elapsed  ( +-  4.46% )
>>
>> w/ patch, 6.15-rc1 PREEMPT_RT:
>> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>>      0.026311 +- 0.000869 seconds time elapsed  ( +-  3.30% )
>> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
>>      0.003194 +- 0.000160 seconds time elapsed  ( +-  5.01% )
>>
>> Signed-off-by: Alexander Larsson <alexl@redhat.com>
>> Signed-off-by: Lucas Karpinski <lkarpins@redhat.com>
>> Signed-off-by: Eric Chanudet <echanude@redhat.com>
>> ---
>>
>> Attempt to re-spin this series based on the feedback received in v3 that
>> pointed out the need to wait the grace-period in namespace_unlock()
>> before calling the deferred mntput().
> I still hate this with a passion because it adds another special-sauce
> path into the unlock path. I've folded the following diff into it so it
> at least doesn't start passing that pointless boolean and doesn't
> introduce __namespace_unlock(). Just use a global variable and pick the
> value off of it just as we do with the lists. Testing this now:

Yeah, it's painful that's for sure.

I also agree with you about the parameter, changing the call signature
always rubbed me the wrong way but I didn't push back on it mostly because
we needed to find a way to do it sensibly and it sounds like that's still
the case.

AFAICT what's needed is a way to synchronize umount with the lockless path
walk. Now umount detaches the mounts concerned, calls the rcu synchronize
(essentially sleeps) to ensure that any lockless path walks see the umount
before completing. But that rcu sync. is, as we can see, really wasteful so
we do need to find a viable way to synchronize this.

Strictly speaking the synchronization problem exists for normal and detached
umounts but if we can find a sound solution for detached mounts perhaps we
can extend later (but now that seems like a stretch) ...

I'm not sure why, perhaps it's just me, I don't know, but with this we don't
seem to be working well together to find a solution, I hope we can change
that this time around.

I was thinking of using a completion for this synchronization but even that
would be messy because of possible multiple processes doing walks at the
time which doesn't lend cleanly to using a completion.

Do you have any ideas on how this could be done yourself?

Ian

>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index e5b0b920dd97..25599428706c 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -82,8 +82,9 @@ static struct hlist_head *mount_hashtable __ro_after_init;
>   static struct hlist_head *mountpoint_hashtable __ro_after_init;
>   static struct kmem_cache *mnt_cache __ro_after_init;
>   static DECLARE_RWSEM(namespace_sem);
> -static HLIST_HEAD(unmounted);  /* protected by namespace_sem */
> -static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> +static bool unmounted_lazily;          /* protected by namespace_sem */
> +static HLIST_HEAD(unmounted);          /* protected by namespace_sem */
> +static LIST_HEAD(ex_mountpoints);      /* protected by namespace_sem */
>   static DEFINE_SEQLOCK(mnt_ns_tree_lock);
>
>   #ifdef CONFIG_FSNOTIFY
> @@ -1807,17 +1808,18 @@ static void free_mounts(struct hlist_head *mount_list)
>
>   static void defer_free_mounts(struct work_struct *work)
>   {
> -       struct deferred_free_mounts *d = container_of(
> -               to_rcu_work(work), struct deferred_free_mounts, rwork);
> +       struct deferred_free_mounts *d;
>
> +       d = container_of(to_rcu_work(work), struct deferred_free_mounts, rwork);
>          free_mounts(&d->release_list);
>          kfree(d);
>   }
>
> -static void __namespace_unlock(bool lazy)
> +static void namespace_unlock(void)
>   {
>          HLIST_HEAD(head);
>          LIST_HEAD(list);
> +       bool defer = unmounted_lazily;
>
>          hlist_move_list(&unmounted, &head);
>          list_splice_init(&ex_mountpoints, &list);
> @@ -1840,29 +1842,21 @@ static void __namespace_unlock(bool lazy)
>          if (likely(hlist_empty(&head)))
>                  return;
>
> -       if (lazy) {
> -               struct deferred_free_mounts *d =
> -                       kmalloc(sizeof(*d), GFP_KERNEL);
> +       if (defer) {
> +               struct deferred_free_mounts *d;
>
> -               if (unlikely(!d))
> -                       goto out;
> -
> -               hlist_move_list(&head, &d->release_list);
> -               INIT_RCU_WORK(&d->rwork, defer_free_mounts);
> -               queue_rcu_work(system_wq, &d->rwork);
> -               return;
> +               d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL);
> +               if (d) {
> +                       hlist_move_list(&head, &d->release_list);
> +                       INIT_RCU_WORK(&d->rwork, defer_free_mounts);
> +                       queue_rcu_work(system_wq, &d->rwork);
> +                       return;
> +               }
>          }
> -
> -out:
>          synchronize_rcu_expedited();
>          free_mounts(&head);
>   }
>
> -static inline void namespace_unlock(void)
> -{
> -       __namespace_unlock(false);
> -}
> -
>   static inline void namespace_lock(void)
>   {
>          down_write(&namespace_sem);
> @@ -2094,7 +2088,7 @@ static int do_umount(struct mount *mnt, int flags)
>          }
>   out:
>          unlock_mount_hash();
> -       __namespace_unlock(flags & MNT_DETACH);
> +       namespace_unlock();
>          return retval;
>   }
>
>
>> v4:
>> - Use queue_rcu_work() to defer free_mounts() for lazy umounts
>> - Drop lazy_unlock global and refactor using a helper
>> v3: https://lore.kernel.org/all/20240626201129.272750-2-lkarpins@redhat.com/
>> - Removed unneeded code for lazy umount case.
>> - Don't block within interrupt context.
>> v2: https://lore.kernel.org/all/20240426195429.28547-1-lkarpins@redhat.com/
>> - Only defer releasing umount'ed filesystems for lazy umounts
>> v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/
>>
>>   fs/namespace.c | 52 +++++++++++++++++++++++++++++++++++++++++++-------
>>   1 file changed, 45 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index 14935a0500a2..e5b0b920dd97 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init;
>>   static unsigned int mp_hash_mask __ro_after_init;
>>   static unsigned int mp_hash_shift __ro_after_init;
>>   
>> +struct deferred_free_mounts {
>> +	struct rcu_work rwork;
>> +	struct hlist_head release_list;
>> +};
>> +
>>   static __initdata unsigned long mhash_entries;
>>   static int __init set_mhash_entries(char *str)
>>   {
>> @@ -1789,11 +1794,29 @@ static bool need_notify_mnt_list(void)
>>   }
>>   #endif
>>   
>> -static void namespace_unlock(void)
>> +static void free_mounts(struct hlist_head *mount_list)
>>   {
>> -	struct hlist_head head;
>>   	struct hlist_node *p;
>>   	struct mount *m;
>> +
>> +	hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
>> +		hlist_del(&m->mnt_umount);
>> +		mntput(&m->mnt);
>> +	}
>> +}
>> +
>> +static void defer_free_mounts(struct work_struct *work)
>> +{
>> +	struct deferred_free_mounts *d = container_of(
>> +		to_rcu_work(work), struct deferred_free_mounts, rwork);
>> +
>> +	free_mounts(&d->release_list);
>> +	kfree(d);
>> +}
>> +
>> +static void __namespace_unlock(bool lazy)
>> +{
>> +	HLIST_HEAD(head);
>>   	LIST_HEAD(list);
>>   
>>   	hlist_move_list(&unmounted, &head);
>> @@ -1817,12 +1840,27 @@ static void namespace_unlock(void)
>>   	if (likely(hlist_empty(&head)))
>>   		return;
>>   
>> -	synchronize_rcu_expedited();
>> +	if (lazy) {
>> +		struct deferred_free_mounts *d =
>> +			kmalloc(sizeof(*d), GFP_KERNEL);
>>   
>> -	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
>> -		hlist_del(&m->mnt_umount);
>> -		mntput(&m->mnt);
>> +		if (unlikely(!d))
>> +			goto out;
>> +
>> +		hlist_move_list(&head, &d->release_list);
>> +		INIT_RCU_WORK(&d->rwork, defer_free_mounts);
>> +		queue_rcu_work(system_wq, &d->rwork);
>> +		return;
>>   	}
>> +
>> +out:
>> +	synchronize_rcu_expedited();
>> +	free_mounts(&head);
>> +}
>> +
>> +static inline void namespace_unlock(void)
>> +{
>> +	__namespace_unlock(false);
>>   }
>>   
>>   static inline void namespace_lock(void)
>> @@ -2056,7 +2094,7 @@ static int do_umount(struct mount *mnt, int flags)
>>   	}
>>   out:
>>   	unlock_mount_hash();
>> -	namespace_unlock();
>> +	__namespace_unlock(flags & MNT_DETACH);
>>   	return retval;
>>   }
>>   
>> -- 
>> 2.49.0
>>
Ian Kent April 10, 2025, 3:04 a.m. UTC | #11
On 10/4/25 00:04, Christian Brauner wrote:
> On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote:
>> On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
>>> On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
>>>> One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
>>>> them all via queue_rcu_work()?
>>>> If so, couldn't we have make deferred_free_mounts global and have two
>>>> release_list, say release_list and release_list_next_gp? The first one
>>>> will be used if queue_rcu_work() returns true, otherwise the second.
>>>> Then once defer_free_mounts() is done and release_list_next_gp not
>>>> empty, it would move release_list_next_gp -> release_list and invoke
>>>> queue_rcu_work().
>>>> This would avoid the kmalloc, synchronize_rcu_expedited() and the
>>>> special-sauce.
>>>>
>>> To my understanding it was preferred for non-lazy unmount consumers to
>>> wait until the mntput before returning.
>>>
>>> That aside if I understood your approach it would de facto serialize all
>>> of these?
>>>
>>> As in with the posted patches you can have different worker threads
>>> progress in parallel as they all get a private list to iterate.
>>>
>>> With your proposal only one can do any work.
>>>
>>> One has to assume with sufficient mount/unmount traffic this can
>>> eventually get into trouble.
>> Right, it would serialize them within the same worker thread. With one
>> worker for each put you would schedule multiple worker from the RCU
>> callback. Given the system_wq you will schedule them all on the CPU
>> which invokes the RCU callback. This kind of serializes it, too.
>>
>> The mntput() callback uses spinlock_t for locking and then it frees
>> resources. It does not look like it waits for something nor takes ages.
>> So it might not be needed to split each put into its own worker on a
>> different CPU… One busy bee might be enough ;)
> Unmounting can trigger very large number of mounts to be unmounted. If
> you're on a container heavy system or services that all propagate to
> each other in different mount namespaces mount propagation will generate
> a ton of umounts. So this cannot be underestimated.

Indeed yes, or shutting down autofs when it's using a direct mount map
with a few hundred (or thousand) entries.

For my part it's things like the targeted mount information and mounted
mounts listing system calls (done again recently by Miklos) and this slow
umount that are, IMHO, really important.

Ian

>
> If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to
> umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts
> during the unmount.
>
> If a concurrent path lookup calls legitimize_mnt() on such a mount and
> sees that MNT_SYNC_UMOUNT is set it will discount as it know that the
> concurrent unmounter hold the last reference and it __legitimize_mnt()
> can thus simply drop the reference count. The final mntput() will be
> done by the umounter.
>
> The synchronize_rcu() call in namespace_unlock() takes care that the
> last mntput() doesn't happen until path walking has dropped out of RCU
> mode.
>
> Without it it's possible that a non-MNT_DETACH umounter gets a spurious
> EBUSY error because a concurrent lazy path walk will suddenly put the
> last reference via mntput().
>
> I'm unclear how that's handled in whatever it is you're proposing.
>
Sebastian Andrzej Siewior April 10, 2025, 8:28 a.m. UTC | #12
On 2025-04-09 18:04:21 [+0200], Christian Brauner wrote:
> On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
> > > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
> > > > them all via queue_rcu_work()?
> > > > If so, couldn't we have make deferred_free_mounts global and have two
> > > > release_list, say release_list and release_list_next_gp? The first one
> > > > will be used if queue_rcu_work() returns true, otherwise the second.
> > > > Then once defer_free_mounts() is done and release_list_next_gp not
> > > > empty, it would move release_list_next_gp -> release_list and invoke
> > > > queue_rcu_work().
> > > > This would avoid the kmalloc, synchronize_rcu_expedited() and the
> > > > special-sauce.
> > > > 
> > > 
> > > To my understanding it was preferred for non-lazy unmount consumers to
> > > wait until the mntput before returning.
> > > 
> > > That aside if I understood your approach it would de facto serialize all
> > > of these?
> > > 
> > > As in with the posted patches you can have different worker threads
> > > progress in parallel as they all get a private list to iterate.
> > > 
> > > With your proposal only one can do any work.
> > > 
> > > One has to assume with sufficient mount/unmount traffic this can
> > > eventually get into trouble.
> > 
> > Right, it would serialize them within the same worker thread. With one
> > worker for each put you would schedule multiple worker from the RCU
> > callback. Given the system_wq you will schedule them all on the CPU
> > which invokes the RCU callback. This kind of serializes it, too.
> > 
> > The mntput() callback uses spinlock_t for locking and then it frees
> > resources. It does not look like it waits for something nor takes ages.
> > So it might not be needed to split each put into its own worker on a
> > different CPU… One busy bee might be enough ;)
> 
> Unmounting can trigger very large number of mounts to be unmounted. If
> you're on a container heavy system or services that all propagate to
> each other in different mount namespaces mount propagation will generate
> a ton of umounts. So this cannot be underestimated.

So you want to have two of these unmounts in two workers so you can split
them on two CPUs in the best case. As of today, in order to get through with
umounts asap you accelerate the grace period. And after the wake-up you may
utilize more than one CPU.

> If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to
> umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts
> during the unmount.
> 
> If a concurrent path lookup calls legitimize_mnt() on such a mount and
> sees that MNT_SYNC_UMOUNT is set it will discount as it know that the
> concurrent unmounter hold the last reference and it __legitimize_mnt()
> can thus simply drop the reference count. The final mntput() will be
> done by the umounter.
> 
> The synchronize_rcu() call in namespace_unlock() takes care that the
> last mntput() doesn't happen until path walking has dropped out of RCU
> mode.
> 
> Without it it's possible that a non-MNT_DETACH umounter gets a spurious
> EBUSY error because a concurrent lazy path walk will suddenly put the
> last reference via mntput().
> 
> I'm unclear how that's handled in whatever it is you're proposing.

Okay. So we can't do this for UMOUNT_SYNC callers, thank you for the
explanation. We could avoid the memory allocation and have one worker
take care of them all, but you are worried about what this would mean for
huge containers. Understandable. The s/system_wq/system_unbound_wq/
change would make sense.

Sebastian
Christian Brauner April 10, 2025, 10:48 a.m. UTC | #13
On Thu, Apr 10, 2025 at 10:28:33AM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-04-09 18:04:21 [+0200], Christian Brauner wrote:
> > On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote:
> > > On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
> > > > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > > > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
> > > > > them all via queue_rcu_work()?
> > > > > If so, couldn't we have make deferred_free_mounts global and have two
> > > > > release_list, say release_list and release_list_next_gp? The first one
> > > > > will be used if queue_rcu_work() returns true, otherwise the second.
> > > > > Then once defer_free_mounts() is done and release_list_next_gp not
> > > > > empty, it would move release_list_next_gp -> release_list and invoke
> > > > > queue_rcu_work().
> > > > > This would avoid the kmalloc, synchronize_rcu_expedited() and the
> > > > > special-sauce.
> > > > > 
> > > > 
> > > > To my understanding it was preferred for non-lazy unmount consumers to
> > > > wait until the mntput before returning.
> > > > 
> > > > That aside if I understood your approach it would de facto serialize all
> > > > of these?
> > > > 
> > > > As in with the posted patches you can have different worker threads
> > > > progress in parallel as they all get a private list to iterate.
> > > > 
> > > > With your proposal only one can do any work.
> > > > 
> > > > One has to assume with sufficient mount/unmount traffic this can
> > > > eventually get into trouble.
> > > 
> > > Right, it would serialize them within the same worker thread. With one
> > > worker for each put you would schedule multiple worker from the RCU
> > > callback. Given the system_wq you will schedule them all on the CPU
> > > which invokes the RCU callback. This kind of serializes it, too.
> > > 
> > > The mntput() callback uses spinlock_t for locking and then it frees
> > > resources. It does not look like it waits for something nor takes ages.
> > > So it might not be needed to split each put into its own worker on a
> > > different CPU… One busy bee might be enough ;)
> > 
> > Unmounting can trigger very large number of mounts to be unmounted. If
> > you're on a container heavy system or services that all propagate to
> > each other in different mount namespaces mount propagation will generate
> > a ton of umounts. So this cannot be underestimated.
> 
> So you want to have two of these unmounts in two worker so you can split
> them on two CPUs in best case. As of today, in order to get through with
> umounts asap you accelerate the grace period. And after the wake up may
> utilize more than one CPU.
> 
> > If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to
> > umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts
> > during the unmount.
> > 
> > If a concurrent path lookup calls legitimize_mnt() on such a mount and
> > sees that MNT_SYNC_UMOUNT is set it will discount as it know that the
> > concurrent unmounter hold the last reference and it __legitimize_mnt()
> > can thus simply drop the reference count. The final mntput() will be
> > done by the umounter.
> > 
> > The synchronize_rcu() call in namespace_unlock() takes care that the
> > last mntput() doesn't happen until path walking has dropped out of RCU
> > mode.
> > 
> > Without it it's possible that a non-MNT_DETACH umounter gets a spurious
> > EBUSY error because a concurrent lazy path walk will suddenly put the
> > last reference via mntput().
> > 
> > I'm unclear how that's handled in whatever it is you're proposing.
> 
> Okay. So we can't do this for UMOUNT_SYNC callers, thank you for the
> explanation. We could avoid the memory allocation and have one worker to
> take care of them all but you are afraid what this would mean to huge
> container. Understandable. The s/system_wq/system_unbound_wq/ would make
> sense.

Don't get me wrong if you have a clever idea here I'm all ears.
Ian Kent April 10, 2025, 1:58 p.m. UTC | #14
On 10/4/25 00:04, Christian Brauner wrote:
> On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote:
>> On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
>>> On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote:
>>>> One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle
>>>> them all via queue_rcu_work()?
>>>> If so, couldn't we have make deferred_free_mounts global and have two
>>>> release_list, say release_list and release_list_next_gp? The first one
>>>> will be used if queue_rcu_work() returns true, otherwise the second.
>>>> Then once defer_free_mounts() is done and release_list_next_gp not
>>>> empty, it would move release_list_next_gp -> release_list and invoke
>>>> queue_rcu_work().
>>>> This would avoid the kmalloc, synchronize_rcu_expedited() and the
>>>> special-sauce.
>>>>
>>> To my understanding it was preferred for non-lazy unmount consumers to
>>> wait until the mntput before returning.
>>>
>>> That aside if I understood your approach it would de facto serialize all
>>> of these?
>>>
>>> As in with the posted patches you can have different worker threads
>>> progress in parallel as they all get a private list to iterate.
>>>
>>> With your proposal only one can do any work.
>>>
>>> One has to assume with sufficient mount/unmount traffic this can
>>> eventually get into trouble.
>> Right, it would serialize them within the same worker thread. With one
>> worker for each put you would schedule multiple worker from the RCU
>> callback. Given the system_wq you will schedule them all on the CPU
>> which invokes the RCU callback. This kind of serializes it, too.
>>
>> The mntput() callback uses spinlock_t for locking and then it frees
>> resources. It does not look like it waits for something nor takes ages.
>> So it might not be needed to split each put into its own worker on a
>> different CPU… One busy bee might be enough ;)
> Unmounting can trigger very large number of mounts to be unmounted. If
> you're on a container heavy system or services that all propagate to
> each other in different mount namespaces mount propagation will generate
> a ton of umounts. So this cannot be underestimated.
>
> If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to
> umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts
> during the unmount.
>
> If a concurrent path lookup calls legitimize_mnt() on such a mount and
> sees that MNT_SYNC_UMOUNT is set it will discount as it know that the
> concurrent unmounter hold the last reference and it __legitimize_mnt()
> can thus simply drop the reference count. The final mntput() will be
> done by the umounter.

In umount_tree() it looks like the unmounted mount remains hashed (ie.
disconnect_mount() returns false) so can't it still race with an rcu-walk
regardless of the synchronize_rcu().

Surely I'm missing something ...

Ian

>
> The synchronize_rcu() call in namespace_unlock() takes care that the
> last mntput() doesn't happen until path walking has dropped out of RCU
> mode.
>
> Without it it's possible that a non-MNT_DETACH umounter gets a spurious
> EBUSY error because a concurrent lazy path walk will suddenly put the
> last reference via mntput().
>
> I'm unclear how that's handled in whatever it is you're proposing.
>
Ian Kent April 11, 2025, 2:36 a.m. UTC | #15
On 10/4/25 21:58, Ian Kent wrote:
>
> On 10/4/25 00:04, Christian Brauner wrote:
>> On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior 
>> wrote:
>>> On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote:
>>>> On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior 
>>>> wrote:
>>>>> One question: Do we need this lazy/ MNT_DETACH case? Couldn't we 
>>>>> handle
>>>>> them all via queue_rcu_work()?
>>>>> If so, couldn't we have make deferred_free_mounts global and have two
>>>>> release_list, say release_list and release_list_next_gp? The first 
>>>>> one
>>>>> will be used if queue_rcu_work() returns true, otherwise the second.
>>>>> Then once defer_free_mounts() is done and release_list_next_gp not
>>>>> empty, it would move release_list_next_gp -> release_list and invoke
>>>>> queue_rcu_work().
>>>>> This would avoid the kmalloc, synchronize_rcu_expedited() and the
>>>>> special-sauce.
>>>>>
>>>> To my understanding it was preferred for non-lazy unmount consumers to
>>>> wait until the mntput before returning.
>>>>
>>>> That aside if I understood your approach it would de facto 
>>>> serialize all
>>>> of these?
>>>>
>>>> As in with the posted patches you can have different worker threads
>>>> progress in parallel as they all get a private list to iterate.
>>>>
>>>> With your proposal only one can do any work.
>>>>
>>>> One has to assume with sufficient mount/unmount traffic this can
>>>> eventually get into trouble.
>>> Right, it would serialize them within the same worker thread. With one
>>> worker for each put you would schedule multiple worker from the RCU
>>> callback. Given the system_wq you will schedule them all on the CPU
>>> which invokes the RCU callback. This kind of serializes it, too.
>>>
>>> The mntput() callback uses spinlock_t for locking and then it frees
>>> resources. It does not look like it waits for something nor takes ages.
>>> So it might not be needed to split each put into its own worker on a
>>> different CPU… One busy bee might be enough ;)
>> Unmounting can trigger very large number of mounts to be unmounted. If
>> you're on a container heavy system or services that all propagate to
>> each other in different mount namespaces mount propagation will generate
>> a ton of umounts. So this cannot be underestimated.
>>
>> If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to
>> umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts
>> during the unmount.
>>
>> If a concurrent path lookup calls legitimize_mnt() on such a mount and
>> sees that MNT_SYNC_UMOUNT is set it will discount as it know that the
>> concurrent unmounter hold the last reference and it __legitimize_mnt()
>> can thus simply drop the reference count. The final mntput() will be
>> done by the umounter.
>
> In umount_tree() it looks like the unmounted mount remains hashed (ie.
>
> disconnect_mount() returns false) so can't it still race with an rcu-walk
>
> regardless of the sybcronsize_rcu().
>
>
> Surely I'm missing something ...

And I am, please ignore this.

I misread the return of the mnt->mnt_parent->mnt.mnt_flags check in
disconnect_mount(), my bad.

>
>
> Ian
>
>>
>> The synchronize_rcu() call in namespace_unlock() takes care that the
>> last mntput() doesn't happen until path walking has dropped out of RCU
>> mode.
>>
>> Without it it's possible that a non-MNT_DETACH umounter gets a spurious
>> EBUSY error because a concurrent lazy path walk will suddenly put the
>> last reference via mntput().
>>
>> I'm unclear how that's handled in whatever it is you're proposing.
>>
>
Christian Brauner April 11, 2025, 3:17 p.m. UTC | #16
> With this, I have applied your patch for the following discussion and
> down thread. Happy to send a v5, should this patch be deemed worth
> pursuing.

I'll just switch to system_unbound_wq and there's no need to resend for
now. If the numbers somehow change significantly due to that change just
mention that. Thanks.
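
In the queued-work path of the patch below, that switch would presumably
amount to no more than the workqueue argument, i.e. something like:

-		queue_rcu_work(system_wq, &d->rwork);
+		queue_rcu_work(system_unbound_wq, &d->rwork);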
Eric Chanudet April 11, 2025, 6:30 p.m. UTC | #17
On Fri, Apr 11, 2025 at 05:17:25PM +0200, Christian Brauner wrote:
> > With this, I have applied your patch for the following discussion and
> > down thread. Happy to send a v5, should this patch be deemed worth
> > pursuing.
> 
> I'll just switch to system_unbound_wq and there's no need to resend for
> now. If the numbers somehow change significantly due to that change just
> mention that. Thanks.
> 

Thank you, I do not see a significant change in the numbers with
system_unbound_wq compared to system_wq.

Best,

Patch

diff --git a/fs/namespace.c b/fs/namespace.c
index 14935a0500a2..e5b0b920dd97 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -45,6 +45,11 @@  static unsigned int m_hash_shift __ro_after_init;
 static unsigned int mp_hash_mask __ro_after_init;
 static unsigned int mp_hash_shift __ro_after_init;
 
+struct deferred_free_mounts {
+	struct rcu_work rwork;
+	struct hlist_head release_list;
+};
+
 static __initdata unsigned long mhash_entries;
 static int __init set_mhash_entries(char *str)
 {
@@ -1789,11 +1794,29 @@  static bool need_notify_mnt_list(void)
 }
 #endif
 
-static void namespace_unlock(void)
+static void free_mounts(struct hlist_head *mount_list)
 {
-	struct hlist_head head;
 	struct hlist_node *p;
 	struct mount *m;
+
+	hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
+		hlist_del(&m->mnt_umount);
+		mntput(&m->mnt);
+	}
+}
+
+static void defer_free_mounts(struct work_struct *work)
+{
+	struct deferred_free_mounts *d = container_of(
+		to_rcu_work(work), struct deferred_free_mounts, rwork);
+
+	free_mounts(&d->release_list);
+	kfree(d);
+}
+
+static void __namespace_unlock(bool lazy)
+{
+	HLIST_HEAD(head);
 	LIST_HEAD(list);
 
 	hlist_move_list(&unmounted, &head);
@@ -1817,12 +1840,27 @@  static void namespace_unlock(void)
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu_expedited();
+	if (lazy) {
+		struct deferred_free_mounts *d =
+			kmalloc(sizeof(*d), GFP_KERNEL);
 
-	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
-		hlist_del(&m->mnt_umount);
-		mntput(&m->mnt);
+		if (unlikely(!d))
+			goto out;
+
+		hlist_move_list(&head, &d->release_list);
+		INIT_RCU_WORK(&d->rwork, defer_free_mounts);
+		queue_rcu_work(system_wq, &d->rwork);
+		return;
 	}
+
+out:
+	synchronize_rcu_expedited();
+	free_mounts(&head);
+}
+
+static inline void namespace_unlock(void)
+{
+	__namespace_unlock(false);
 }
 
 static inline void namespace_lock(void)
@@ -2056,7 +2094,7 @@  static int do_umount(struct mount *mnt, int flags)
 	}
 out:
 	unlock_mount_hash();
-	namespace_unlock();
+	__namespace_unlock(flags & MNT_DETACH);
 	return retval;
 }