Message ID | 20250408210350.749901-12-echanude@redhat.com (mailing list archive)
---|---
State | New
Series | [v4] fs/namespace: defer RCU sync for MNT_DETACH umount
On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote: > Defer releasing the detached file-system when calling namespace_unlock() > during a lazy umount to return faster. > > When requesting MNT_DETACH, the caller does not expect the file-system > to be shut down upon returning from the syscall. Calling > synchronize_rcu_expedited() has a significant cost on RT kernel that > defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct > mount in a separate list and put it on a workqueue to run post RCU > grace-period. > > w/o patch, 6.15-rc1 PREEMPT_RT: > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt > 0.02455 +- 0.00107 seconds time elapsed ( +- 4.36% ) > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt > 0.02555 +- 0.00114 seconds time elapsed ( +- 4.46% ) > > w/ patch, 6.15-rc1 PREEMPT_RT: > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt > 0.026311 +- 0.000869 seconds time elapsed ( +- 3.30% ) > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt > 0.003194 +- 0.000160 seconds time elapsed ( +- 5.01% ) > > Signed-off-by: Alexander Larsson <alexl@redhat.com> > Signed-off-by: Lucas Karpinski <lkarpins@redhat.com> > Signed-off-by: Eric Chanudet <echanude@redhat.com> > --- > > Attempt to re-spin this series based on the feedback received in v3 that > pointed out the need to wait the grace-period in namespace_unlock() > before calling the deferred mntput(). I still hate this with a passion because it adds another special-sauce path into the unlock path. I've folded the following diff into it so it at least doesn't start passing that pointless boolean and doesn't introduce __namespace_unlock(). Just use a global variable and pick the value off of it just as we do with the lists. 
Testing this now:

diff --git a/fs/namespace.c b/fs/namespace.c
index e5b0b920dd97..25599428706c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -82,8 +82,9 @@ static struct hlist_head *mount_hashtable __ro_after_init;
 static struct hlist_head *mountpoint_hashtable __ro_after_init;
 static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
-static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
-static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static bool unmounted_lazily;	/* protected by namespace_sem */
+static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
+static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 static DEFINE_SEQLOCK(mnt_ns_tree_lock);
 
 #ifdef CONFIG_FSNOTIFY
@@ -1807,17 +1808,18 @@ static void free_mounts(struct hlist_head *mount_list)
 
 static void defer_free_mounts(struct work_struct *work)
 {
-	struct deferred_free_mounts *d = container_of(
-		to_rcu_work(work), struct deferred_free_mounts, rwork);
+	struct deferred_free_mounts *d;
 
+	d = container_of(to_rcu_work(work), struct deferred_free_mounts, rwork);
 	free_mounts(&d->release_list);
 	kfree(d);
 }
 
-static void __namespace_unlock(bool lazy)
+static void namespace_unlock(void)
 {
 	HLIST_HEAD(head);
 	LIST_HEAD(list);
+	bool defer = unmounted_lazily;
 
 	hlist_move_list(&unmounted, &head);
 	list_splice_init(&ex_mountpoints, &list);
@@ -1840,29 +1842,21 @@ static void __namespace_unlock(bool lazy)
 	if (likely(hlist_empty(&head)))
 		return;
 
-	if (lazy) {
-		struct deferred_free_mounts *d =
-			kmalloc(sizeof(*d), GFP_KERNEL);
+	if (defer) {
+		struct deferred_free_mounts *d;
 
-		if (unlikely(!d))
-			goto out;
-
-		hlist_move_list(&head, &d->release_list);
-		INIT_RCU_WORK(&d->rwork, defer_free_mounts);
-		queue_rcu_work(system_wq, &d->rwork);
-		return;
+		d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL);
+		if (d) {
+			hlist_move_list(&head, &d->release_list);
+			INIT_RCU_WORK(&d->rwork, defer_free_mounts);
+			queue_rcu_work(system_wq, &d->rwork);
+			return;
+		}
 	}
-
-out:
 	synchronize_rcu_expedited();
 	free_mounts(&head);
 }
 
-static inline void namespace_unlock(void)
-{
-	__namespace_unlock(false);
-}
-
 static inline void namespace_lock(void)
 {
 	down_write(&namespace_sem);
@@ -2094,7 +2088,7 @@ static int do_umount(struct mount *mnt, int flags)
 	}
 out:
 	unlock_mount_hash();
-	__namespace_unlock(flags & MNT_DETACH);
+	namespace_unlock();
 	return retval;
 }
 

> 
> v4:
> - Use queue_rcu_work() to defer free_mounts() for lazy umounts
> - Drop lazy_unlock global and refactor using a helper
> v3: https://lore.kernel.org/all/20240626201129.272750-2-lkarpins@redhat.com/
> - Removed unneeded code for lazy umount case.
> - Don't block within interrupt context.
> v2: https://lore.kernel.org/all/20240426195429.28547-1-lkarpins@redhat.com/ > - Only defer releasing umount'ed filesystems for lazy umounts > v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/ > > fs/namespace.c | 52 +++++++++++++++++++++++++++++++++++++++++++------- > 1 file changed, 45 insertions(+), 7 deletions(-) > > diff --git a/fs/namespace.c b/fs/namespace.c > index 14935a0500a2..e5b0b920dd97 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init; > static unsigned int mp_hash_mask __ro_after_init; > static unsigned int mp_hash_shift __ro_after_init; > > +struct deferred_free_mounts { > + struct rcu_work rwork; > + struct hlist_head release_list; > +}; > + > static __initdata unsigned long mhash_entries; > static int __init set_mhash_entries(char *str) > { > @@ -1789,11 +1794,29 @@ static bool need_notify_mnt_list(void) > } > #endif > > -static void namespace_unlock(void) > +static void free_mounts(struct hlist_head *mount_list) > { > - struct hlist_head head; > struct hlist_node *p; > struct mount *m; > + > + hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) { > + hlist_del(&m->mnt_umount); > + mntput(&m->mnt); > + } > +} > + > +static void defer_free_mounts(struct work_struct *work) > +{ > + struct deferred_free_mounts *d = container_of( > + to_rcu_work(work), struct deferred_free_mounts, rwork); > + > + free_mounts(&d->release_list); > + kfree(d); > +} > + > +static void __namespace_unlock(bool lazy) > +{ > + HLIST_HEAD(head); > LIST_HEAD(list); > > hlist_move_list(&unmounted, &head); > @@ -1817,12 +1840,27 @@ static void namespace_unlock(void) > if (likely(hlist_empty(&head))) > return; > > - synchronize_rcu_expedited(); > + if (lazy) { > + struct deferred_free_mounts *d = > + kmalloc(sizeof(*d), GFP_KERNEL); > > - hlist_for_each_entry_safe(m, p, &head, mnt_umount) { > - hlist_del(&m->mnt_umount); > - mntput(&m->mnt); > + if (unlikely(!d)) > + goto out; > + > + hlist_move_list(&head, &d->release_list); > + INIT_RCU_WORK(&d->rwork, defer_free_mounts); > + queue_rcu_work(system_wq, &d->rwork); > + return; > } > + > +out: > + synchronize_rcu_expedited(); > + free_mounts(&head); > +} > + > +static inline void namespace_unlock(void) > +{ > + __namespace_unlock(false); > } > > static inline void namespace_lock(void) > @@ -2056,7 +2094,7 @@ static int do_umount(struct mount *mnt, int flags) > } > out: > unlock_mount_hash(); > - namespace_unlock(); > + __namespace_unlock(flags & MNT_DETACH); > return retval; > } > > -- > 2.49.0 >
On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote: > Defer releasing the detached file-system when calling namespace_unlock() > during a lazy umount to return faster. > > When requesting MNT_DETACH, the caller does not expect the file-system > to be shut down upon returning from the syscall. Calling > synchronize_rcu_expedited() has a significant cost on RT kernel that > defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct > mount in a separate list and put it on a workqueue to run post RCU > grace-period. > > w/o patch, 6.15-rc1 PREEMPT_RT: > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt > 0.02455 +- 0.00107 seconds time elapsed ( +- 4.36% ) > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt > 0.02555 +- 0.00114 seconds time elapsed ( +- 4.46% ) > > w/ patch, 6.15-rc1 PREEMPT_RT: > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt > 0.026311 +- 0.000869 seconds time elapsed ( +- 3.30% ) > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt > 0.003194 +- 0.000160 seconds time elapsed ( +- 5.01% ) > Christian wants the patch done differently and posted his diff, so I'm not going to comment on it. I do have some feedback about the commit message though. In v1 it points out a real user which runs into it, while this one does not. So I would rewrite this and put in bench results from the actual consumer -- as it is one is left to wonder why patching up lazy unmount is of any significance. I had to look up what rcupdate.rcu_normal_after_boot=1 is. Docs claim it makes everyone use normal grace-periods, which explains the difference. But without that one is left to wonder if perhaps there is a perf bug in RCU instead where this is taking longer than it should despite the option. Thus I would also denote how the delay shows up. v1 for reference: > v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/
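For reference, the behaviour behind that option can be pictured with the rough sketch below (illustrative only, not the actual kernel/rcu source): once expedited grace periods are suppressed, an "expedited" request degrades into waiting for a full normal grace period, which is what dominates the umount latency on PREEMPT_RT.

/* Illustrative sketch only -- not the real synchronize_rcu_expedited(). */
void synchronize_rcu_expedited_sketch(void)
{
        /* true once rcupdate.rcu_normal_after_boot=1 has taken effect */
        if (rcu_gp_is_normal()) {
                synchronize_rcu();      /* wait for a full normal grace period */
                return;
        }
        /* ... otherwise run the expedited machinery (IPIs etc.) ... */
}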
On 2025-04-09 12:37:06 [+0200], Christian Brauner wrote: > I still hate this with a passion because it adds another special-sauce > path into the unlock path. I've folded the following diff into it so it > at least doesn't start passing that pointless boolean and doesn't > introduce __namespace_unlock(). Just use a global variable and pick the > value off of it just as we do with the lists. Testing this now: I tried to apply this on top of the previous one but it all chunks failed. One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle them all via queue_rcu_work()? If so, couldn't we have make deferred_free_mounts global and have two release_list, say release_list and release_list_next_gp? The first one will be used if queue_rcu_work() returns true, otherwise the second. Then once defer_free_mounts() is done and release_list_next_gp not empty, it would move release_list_next_gp -> release_list and invoke queue_rcu_work(). This would avoid the kmalloc, synchronize_rcu_expedited() and the special-sauce. > diff --git a/fs/namespace.c b/fs/namespace.c > index e5b0b920dd97..25599428706c 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -1840,29 +1842,21 @@ static void __namespace_unlock(bool lazy) … > + d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL); > + if (d) { > + hlist_move_list(&head, &d->release_list); > + INIT_RCU_WORK(&d->rwork, defer_free_mounts); > + queue_rcu_work(system_wq, &d->rwork); Couldn't we do system_unbound_wq? Sebastian
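For concreteness, the single-worker/two-list idea could be sketched roughly as below (untested; release_lock, gp_pending, defer_mounts() and hlist_append() are names invented here, free_mounts() is the helper from the patch, and a pending flag under a spinlock is used instead of relying solely on queue_rcu_work()'s return value, so that entries are only freed after a grace period that started after they were queued):

static HLIST_HEAD(release_list);         /* covered by the grace period in flight */
static HLIST_HEAD(release_list_next_gp); /* arrived too late, needs the next one */
static DEFINE_SPINLOCK(release_lock);    /* protects both lists and gp_pending */
static bool gp_pending;
static struct rcu_work release_rwork;    /* INIT_RCU_WORK()'d with defer_free_mounts */

/* hlist has no splice helper, so move the entries one by one. */
static void hlist_append(struct hlist_head *to, struct hlist_head *from)
{
        while (!hlist_empty(from)) {
                struct hlist_node *n = from->first;

                hlist_del(n);
                hlist_add_head(n, to);
        }
}

static void defer_free_mounts(struct work_struct *work)
{
        HLIST_HEAD(head);

        spin_lock(&release_lock);
        hlist_move_list(&release_list, &head);
        if (!hlist_empty(&release_list_next_gp)) {
                /* start a fresh grace period for the late arrivals */
                hlist_move_list(&release_list_next_gp, &release_list);
                queue_rcu_work(system_unbound_wq, &release_rwork);
        } else {
                gp_pending = false;
        }
        spin_unlock(&release_lock);

        free_mounts(&head);     /* their grace period has already elapsed */
}

/* Called from namespace_unlock() instead of synchronize_rcu_expedited(). */
static void defer_mounts(struct hlist_head *head)
{
        spin_lock(&release_lock);
        if (!gp_pending) {
                gp_pending = true;
                hlist_append(&release_list, head);
                queue_rcu_work(system_unbound_wq, &release_rwork);
        } else {
                hlist_append(&release_list_next_gp, head);
        }
        spin_unlock(&release_lock);
}

Whether a single worker like this keeps up under heavy container churn is exactly the concern raised in the follow-ups below.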
On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote: > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle > them all via queue_rcu_work()? > If so, couldn't we have make deferred_free_mounts global and have two > release_list, say release_list and release_list_next_gp? The first one > will be used if queue_rcu_work() returns true, otherwise the second. > Then once defer_free_mounts() is done and release_list_next_gp not > empty, it would move release_list_next_gp -> release_list and invoke > queue_rcu_work(). > This would avoid the kmalloc, synchronize_rcu_expedited() and the > special-sauce. > To my understanding it was preferred for non-lazy unmount consumers to wait until the mntput before returning. That aside if I understood your approach it would de facto serialize all of these? As in with the posted patches you can have different worker threads progress in parallel as they all get a private list to iterate. With your proposal only one can do any work. One has to assume with sufficient mount/unmount traffic this can eventually get into trouble.
On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote: > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote: > > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle > > them all via queue_rcu_work()? > > If so, couldn't we have make deferred_free_mounts global and have two > > release_list, say release_list and release_list_next_gp? The first one > > will be used if queue_rcu_work() returns true, otherwise the second. > > Then once defer_free_mounts() is done and release_list_next_gp not > > empty, it would move release_list_next_gp -> release_list and invoke > > queue_rcu_work(). > > This would avoid the kmalloc, synchronize_rcu_expedited() and the > > special-sauce. > > > > To my understanding it was preferred for non-lazy unmount consumers to > wait until the mntput before returning. > > That aside if I understood your approach it would de facto serialize all > of these? > > As in with the posted patches you can have different worker threads > progress in parallel as they all get a private list to iterate. > > With your proposal only one can do any work. > > One has to assume with sufficient mount/unmount traffic this can > eventually get into trouble. Right, it would serialize them within the same worker thread. With one worker for each put you would schedule multiple worker from the RCU callback. Given the system_wq you will schedule them all on the CPU which invokes the RCU callback. This kind of serializes it, too. The mntput() callback uses spinlock_t for locking and then it frees resources. It does not look like it waits for something nor takes ages. So it might not be needed to split each put into its own worker on a different CPU… One busy bee might be enough ;) Sebastian
On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote: > On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote: > > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote: > > > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle > > > them all via queue_rcu_work()? > > > If so, couldn't we have make deferred_free_mounts global and have two > > > release_list, say release_list and release_list_next_gp? The first one > > > will be used if queue_rcu_work() returns true, otherwise the second. > > > Then once defer_free_mounts() is done and release_list_next_gp not > > > empty, it would move release_list_next_gp -> release_list and invoke > > > queue_rcu_work(). > > > This would avoid the kmalloc, synchronize_rcu_expedited() and the > > > special-sauce. > > > > > > > To my understanding it was preferred for non-lazy unmount consumers to > > wait until the mntput before returning. > > > > That aside if I understood your approach it would de facto serialize all > > of these? > > > > As in with the posted patches you can have different worker threads > > progress in parallel as they all get a private list to iterate. > > > > With your proposal only one can do any work. > > > > One has to assume with sufficient mount/unmount traffic this can > > eventually get into trouble. > > Right, it would serialize them within the same worker thread. With one > worker for each put you would schedule multiple worker from the RCU > callback. Given the system_wq you will schedule them all on the CPU > which invokes the RCU callback. This kind of serializes it, too. > > The mntput() callback uses spinlock_t for locking and then it frees > resources. It does not look like it waits for something nor takes ages. > So it might not be needed to split each put into its own worker on a > different CPU… One busy bee might be enough ;) Unmounting can trigger very large number of mounts to be unmounted. If you're on a container heavy system or services that all propagate to each other in different mount namespaces mount propagation will generate a ton of umounts. So this cannot be underestimated. If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts during the unmount. If a concurrent path lookup calls legitimize_mnt() on such a mount and sees that MNT_SYNC_UMOUNT is set it will discount as it know that the concurrent unmounter hold the last reference and it __legitimize_mnt() can thus simply drop the reference count. The final mntput() will be done by the umounter. The synchronize_rcu() call in namespace_unlock() takes care that the last mntput() doesn't happen until path walking has dropped out of RCU mode. Without it it's possible that a non-MNT_DETACH umounter gets a spurious EBUSY error because a concurrent lazy path walk will suddenly put the last reference via mntput(). I'm unclear how that's handled in whatever it is you're proposing.
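The reader-side check being described can be paraphrased as the sketch below (not the literal __legitimize_mnt() source, just the shape of the MNT_SYNC_UMOUNT back-off):

/* Paraphrased sketch of the RCU path-walk back-off described above. */
static int legitimize_mnt_sketch(struct vfsmount *m, unsigned int seq)
{
        struct mount *mnt = real_mount(m);

        mnt_add_count(mnt, 1);                  /* optimistic grab */
        if (likely(!read_seqretry(&mount_lock, seq)))
                return 0;                       /* mount still good, keep the reference */
        if (m->mnt_flags & MNT_SYNC_UMOUNT) {
                /* a UMOUNT_SYNC unmounter holds the last reference */
                mnt_add_count(mnt, -1);         /* drop our count again... */
                return 1;                       /* ...the umounter does the final mntput() */
        }
        /* otherwise fall back to the locked slow path */
        return -1;
}

The grace period in namespace_unlock() is what guarantees that no such walker is still inside its RCU section when the unmounter finally drops the last reference.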
> > > On Wed, Apr 09, 2025 at 12:37:06PM +0200, Christian Brauner wrote: > > > > On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote: > > > > > Attempt to re-spin this series based on the feedback received in v3 that > > > > > pointed out the need to wait the grace-period in namespace_unlock() > > > > > before calling the deferred mntput(). > > > > > > > > I still hate this with a passion because it adds another special-sauce > > > > path into the unlock path. I've folded the following diff into it so it > > > > at least doesn't start passing that pointless boolean and doesn't > > > > introduce __namespace_unlock(). Just use a global variable and pick the > > > > value off of it just as we do with the lists. Testing this now: My apologies, I went with the feedback from v3[1] and failed to parse the context surrounding it. [1] https://lore.kernel.org/all/Znx-WGU5Wx6RaJyD@casper.infradead.org/ > > > > @@ -2094,7 +2088,7 @@ static int do_umount(struct mount *mnt, int flags) > > > > } > > > > out: > > > > unlock_mount_hash(); > > > > - __namespace_unlock(flags & MNT_DETACH); > > > > + namespace_unlock(); > > > > return retval; > > > > } > > > > > > > > I believe you skipped setting unmounted_lazily in this hunk? With this, I have applied your patch for the following discussion and down thread. Happy to send a v5, should this patch be deemed worth pursuing. On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote: > On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote: > > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote: > > > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle > > > them all via queue_rcu_work()? > > > If so, couldn't we have make deferred_free_mounts global and have two > > > release_list, say release_list and release_list_next_gp? The first one > > > will be used if queue_rcu_work() returns true, otherwise the second. > > > Then once defer_free_mounts() is done and release_list_next_gp not > > > empty, it would move release_list_next_gp -> release_list and invoke > > > queue_rcu_work(). > > > This would avoid the kmalloc, synchronize_rcu_expedited() and the > > > special-sauce. > > > > > > > To my understanding it was preferred for non-lazy unmount consumers to > > wait until the mntput before returning. Unless I misunderstand the statement, and from the previous thread[2], this is a requirement of the user API. [2] https://lore.kernel.org/all/Y8m+M%2FffIEEWbfmv@ZenIV/ > > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote: > > > On 2025-04-09 12:37:06 [+0200], Christian Brauner wrote: > > > > diff --git a/fs/namespace.c b/fs/namespace.c > > > > index e5b0b920dd97..25599428706c 100644 > > > > --- a/fs/namespace.c > > > > +++ b/fs/namespace.c > > > > @@ -1840,29 +1842,21 @@ static void __namespace_unlock(bool lazy) > > > … > > > > + d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL); > > > > + if (d) { > > > > + hlist_move_list(&head, &d->release_list); > > > > + INIT_RCU_WORK(&d->rwork, defer_free_mounts); > > > > + queue_rcu_work(system_wq, &d->rwork); > > > > > > Couldn't we do system_unbound_wq? I think we can, afaict we don't need locality? I'll run some tests with system_unbound_wq. Thanks,
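For what it's worth, the missing piece presumably looks something like the fragment below (hypothetical -- the exact hunk that ends up in the folded patch is unconfirmed): the flag has to be set while namespace_sem is still held, and namespace_unlock() would presumably also clear it after sampling it into the local defer variable.

/* Hypothetical sketch of the hunk that appears to be missing. */
static int do_umount(struct mount *mnt, int flags)
{
        /* ... existing code, namespace_sem held ... */
out:
        unlock_mount_hash();
        if (flags & MNT_DETACH)
                unmounted_lazily = true;        /* still under namespace_sem */
        namespace_unlock();
        return retval;
}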
On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote: > On 2025-04-09 12:37:06 [+0200], Christian Brauner wrote: > > I still hate this with a passion because it adds another special-sauce > > path into the unlock path. I've folded the following diff into it so it > > at least doesn't start passing that pointless boolean and doesn't > > introduce __namespace_unlock(). Just use a global variable and pick the > > value off of it just as we do with the lists. Testing this now: > > I tried to apply this on top of the previous one but it all chunks > failed. See https://web.git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git/log/?h=vfs-6.16.mount
On Wed, Apr 09, 2025 at 03:04:43PM +0200, Mateusz Guzik wrote: > On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote: > > Defer releasing the detached file-system when calling namespace_unlock() > > during a lazy umount to return faster. > > > > When requesting MNT_DETACH, the caller does not expect the file-system > > to be shut down upon returning from the syscall. Calling > > synchronize_rcu_expedited() has a significant cost on RT kernel that > > defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct > > mount in a separate list and put it on a workqueue to run post RCU > > grace-period. > > > > w/o patch, 6.15-rc1 PREEMPT_RT: > > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt > > 0.02455 +- 0.00107 seconds time elapsed ( +- 4.36% ) > > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt > > 0.02555 +- 0.00114 seconds time elapsed ( +- 4.46% ) > > > > w/ patch, 6.15-rc1 PREEMPT_RT: > > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt > > 0.026311 +- 0.000869 seconds time elapsed ( +- 3.30% ) > > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt > > 0.003194 +- 0.000160 seconds time elapsed ( +- 5.01% ) > > > > Christian wants the patch done differently and posted his diff, so I'm > not going to comment on it. > > I do have some feedback about the commit message though. > > In v1 it points out a real user which runs into it, while this one does > not. So I would rewrite this and put in bench results from the actual > consumer -- as it is one is left to wonder why patching up lazy unmount > is of any significance. Certainly. Doing the test mentioned in v1 again with v4+Christian's suggested changes: - QEMU x86_64, 8cpus, PREEMPT_RT, w/o patch: # perf stat -r 10 --table --null -- crun run test 0.07584 +- 0.00440 seconds time elapsed ( +- 5.80% ) - QEMU x86_64, 8cpus, PREEMPT_RT, w/ patch: # perf stat -r 10 --table --null -- crun run test 0.01421 +- 0.00387 seconds time elapsed ( +- 27.26% ) I will add that to the commit message. > I had to look up what rcupdate.rcu_normal_after_boot=1 is. Docs claim it > makes everyone use normal grace-periods, which explains the difference. > But without that one is left to wonder if perhaps there is a perf bug in > RCU instead where this is taking longer than it should despite the > option. Thus I would also denote how the delay shows up. I tried the test above while trying to force expedited RCU on the cmdline with: rcupdate.rcu_normal_after_boot=0 rcupdate.rcu_expedited=1 Unfortunately, rcupdate.rcu_normal_after_boot=0 has no effect and rcupdate_announce_bootup_oddness() reports: [ 0.015251] No expedited grace period (rcu_normal_after_boot). Which yielded similar results: - QEMU x86_64, 8cpus, PREEMPT_RT, w/o patch: # perf stat -r 10 --table --null -- crun run test 0.07838 +- 0.00322 seconds time elapsed ( +- 4.11% ) - QEMU x86_64, 8cpus, PREEMPT_RT, w/ patch: # perf stat -r 10 --table --null -- crun run test 0.01582 +- 0.00353 seconds time elapsed ( +- 22.30% ) I don't think rcupdate.rcu_expedited=1 had an effect, but I have not confirmed that yet. > v1 for reference: > > v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/
On 9/4/25 18:37, Christian Brauner wrote: > On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote: >> Defer releasing the detached file-system when calling namespace_unlock() >> during a lazy umount to return faster. >> >> When requesting MNT_DETACH, the caller does not expect the file-system >> to be shut down upon returning from the syscall. Calling >> synchronize_rcu_expedited() has a significant cost on RT kernel that >> defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct >> mount in a separate list and put it on a workqueue to run post RCU >> grace-period. >> >> w/o patch, 6.15-rc1 PREEMPT_RT: >> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt >> 0.02455 +- 0.00107 seconds time elapsed ( +- 4.36% ) >> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt >> 0.02555 +- 0.00114 seconds time elapsed ( +- 4.46% ) >> >> w/ patch, 6.15-rc1 PREEMPT_RT: >> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt >> 0.026311 +- 0.000869 seconds time elapsed ( +- 3.30% ) >> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt >> 0.003194 +- 0.000160 seconds time elapsed ( +- 5.01% ) >> >> Signed-off-by: Alexander Larsson <alexl@redhat.com> >> Signed-off-by: Lucas Karpinski <lkarpins@redhat.com> >> Signed-off-by: Eric Chanudet <echanude@redhat.com> >> --- >> >> Attempt to re-spin this series based on the feedback received in v3 that >> pointed out the need to wait the grace-period in namespace_unlock() >> before calling the deferred mntput(). > I still hate this with a passion because it adds another special-sauce > path into the unlock path. I've folded the following diff into it so it > at least doesn't start passing that pointless boolean and doesn't > introduce __namespace_unlock(). Just use a global variable and pick the > value off of it just as we do with the lists. Testing this now: Yeah, it's painful that's for sure. I also agree with you about the parameter, changing the call signature always rubbed me the wrong way but I didn't push back on it mostly because we needed to find a way to do it sensibly and it sounds like that's still the case. AFAICT what's needed is a way to synchronize umount with the lockless path walk. Now umount detaches the mounts concerned, calls the rcu synchronize (essentially sleeps) to ensure that any lockless path walks see the umount before completing. But that rcu sync. is, as we can see, really wasteful so we do need to find a viable way to synchronize this. Strictly speaking the synchronization problem exists for normal and detached umounts but if we can find a sound solution for detached mounts perhaps we can extend later (but now that seems like a stretch) ... I'm not sure why, perhaps it's just me, I don't know, but with this we don't seem to be working well together to find a solution, I hope we can change that this time around. I was thinking of using a completion for this synchronization but even that would be messy because of possible multiple processes doing walks at the time which doesn't lend cleanly to using a completion. Do you have any ideas on how this could be done yourself? 
Ian > > diff --git a/fs/namespace.c b/fs/namespace.c > index e5b0b920dd97..25599428706c 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -82,8 +82,9 @@ static struct hlist_head *mount_hashtable __ro_after_init; > static struct hlist_head *mountpoint_hashtable __ro_after_init; > static struct kmem_cache *mnt_cache __ro_after_init; > static DECLARE_RWSEM(namespace_sem); > -static HLIST_HEAD(unmounted); /* protected by namespace_sem */ > -static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ > +static bool unmounted_lazily; /* protected by namespace_sem */ > +static HLIST_HEAD(unmounted); /* protected by namespace_sem */ > +static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ > static DEFINE_SEQLOCK(mnt_ns_tree_lock); > > #ifdef CONFIG_FSNOTIFY > @@ -1807,17 +1808,18 @@ static void free_mounts(struct hlist_head *mount_list) > > static void defer_free_mounts(struct work_struct *work) > { > - struct deferred_free_mounts *d = container_of( > - to_rcu_work(work), struct deferred_free_mounts, rwork); > + struct deferred_free_mounts *d; > > + d = container_of(to_rcu_work(work), struct deferred_free_mounts, rwork); > free_mounts(&d->release_list); > kfree(d); > } > > -static void __namespace_unlock(bool lazy) > +static void namespace_unlock(void) > { > HLIST_HEAD(head); > LIST_HEAD(list); > + bool defer = unmounted_lazily; > > hlist_move_list(&unmounted, &head); > list_splice_init(&ex_mountpoints, &list); > @@ -1840,29 +1842,21 @@ static void __namespace_unlock(bool lazy) > if (likely(hlist_empty(&head))) > return; > > - if (lazy) { > - struct deferred_free_mounts *d = > - kmalloc(sizeof(*d), GFP_KERNEL); > + if (defer) { > + struct deferred_free_mounts *d; > > - if (unlikely(!d)) > - goto out; > - > - hlist_move_list(&head, &d->release_list); > - INIT_RCU_WORK(&d->rwork, defer_free_mounts); > - queue_rcu_work(system_wq, &d->rwork); > - return; > + d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL); > + if (d) { > + hlist_move_list(&head, &d->release_list); > + INIT_RCU_WORK(&d->rwork, defer_free_mounts); > + queue_rcu_work(system_wq, &d->rwork); > + return; > + } > } > - > -out: > synchronize_rcu_expedited(); > free_mounts(&head); > } > > -static inline void namespace_unlock(void) > -{ > - __namespace_unlock(false); > -} > - > static inline void namespace_lock(void) > { > down_write(&namespace_sem); > @@ -2094,7 +2088,7 @@ static int do_umount(struct mount *mnt, int flags) > } > out: > unlock_mount_hash(); > - __namespace_unlock(flags & MNT_DETACH); > + namespace_unlock(); > return retval; > } > > >> v4: >> - Use queue_rcu_work() to defer free_mounts() for lazy umounts >> - Drop lazy_unlock global and refactor using a helper >> v3: https://lore.kernel.org/all/20240626201129.272750-2-lkarpins@redhat.com/ >> - Removed unneeded code for lazy umount case. >> - Don't block within interrupt context. 
>> v2: https://lore.kernel.org/all/20240426195429.28547-1-lkarpins@redhat.com/ >> - Only defer releasing umount'ed filesystems for lazy umounts >> v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/ >> >> fs/namespace.c | 52 +++++++++++++++++++++++++++++++++++++++++++------- >> 1 file changed, 45 insertions(+), 7 deletions(-) >> >> diff --git a/fs/namespace.c b/fs/namespace.c >> index 14935a0500a2..e5b0b920dd97 100644 >> --- a/fs/namespace.c >> +++ b/fs/namespace.c >> @@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init; >> static unsigned int mp_hash_mask __ro_after_init; >> static unsigned int mp_hash_shift __ro_after_init; >> >> +struct deferred_free_mounts { >> + struct rcu_work rwork; >> + struct hlist_head release_list; >> +}; >> + >> static __initdata unsigned long mhash_entries; >> static int __init set_mhash_entries(char *str) >> { >> @@ -1789,11 +1794,29 @@ static bool need_notify_mnt_list(void) >> } >> #endif >> >> -static void namespace_unlock(void) >> +static void free_mounts(struct hlist_head *mount_list) >> { >> - struct hlist_head head; >> struct hlist_node *p; >> struct mount *m; >> + >> + hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) { >> + hlist_del(&m->mnt_umount); >> + mntput(&m->mnt); >> + } >> +} >> + >> +static void defer_free_mounts(struct work_struct *work) >> +{ >> + struct deferred_free_mounts *d = container_of( >> + to_rcu_work(work), struct deferred_free_mounts, rwork); >> + >> + free_mounts(&d->release_list); >> + kfree(d); >> +} >> + >> +static void __namespace_unlock(bool lazy) >> +{ >> + HLIST_HEAD(head); >> LIST_HEAD(list); >> >> hlist_move_list(&unmounted, &head); >> @@ -1817,12 +1840,27 @@ static void namespace_unlock(void) >> if (likely(hlist_empty(&head))) >> return; >> >> - synchronize_rcu_expedited(); >> + if (lazy) { >> + struct deferred_free_mounts *d = >> + kmalloc(sizeof(*d), GFP_KERNEL); >> >> - hlist_for_each_entry_safe(m, p, &head, mnt_umount) { >> - hlist_del(&m->mnt_umount); >> - mntput(&m->mnt); >> + if (unlikely(!d)) >> + goto out; >> + >> + hlist_move_list(&head, &d->release_list); >> + INIT_RCU_WORK(&d->rwork, defer_free_mounts); >> + queue_rcu_work(system_wq, &d->rwork); >> + return; >> } >> + >> +out: >> + synchronize_rcu_expedited(); >> + free_mounts(&head); >> +} >> + >> +static inline void namespace_unlock(void) >> +{ >> + __namespace_unlock(false); >> } >> >> static inline void namespace_lock(void) >> @@ -2056,7 +2094,7 @@ static int do_umount(struct mount *mnt, int flags) >> } >> out: >> unlock_mount_hash(); >> - namespace_unlock(); >> + __namespace_unlock(flags & MNT_DETACH); >> return retval; >> } >> >> -- >> 2.49.0 >>
On 10/4/25 00:04, Christian Brauner wrote: > On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote: >> On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote: >>> On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote: >>>> One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle >>>> them all via queue_rcu_work()? >>>> If so, couldn't we have make deferred_free_mounts global and have two >>>> release_list, say release_list and release_list_next_gp? The first one >>>> will be used if queue_rcu_work() returns true, otherwise the second. >>>> Then once defer_free_mounts() is done and release_list_next_gp not >>>> empty, it would move release_list_next_gp -> release_list and invoke >>>> queue_rcu_work(). >>>> This would avoid the kmalloc, synchronize_rcu_expedited() and the >>>> special-sauce. >>>> >>> To my understanding it was preferred for non-lazy unmount consumers to >>> wait until the mntput before returning. >>> >>> That aside if I understood your approach it would de facto serialize all >>> of these? >>> >>> As in with the posted patches you can have different worker threads >>> progress in parallel as they all get a private list to iterate. >>> >>> With your proposal only one can do any work. >>> >>> One has to assume with sufficient mount/unmount traffic this can >>> eventually get into trouble. >> Right, it would serialize them within the same worker thread. With one >> worker for each put you would schedule multiple worker from the RCU >> callback. Given the system_wq you will schedule them all on the CPU >> which invokes the RCU callback. This kind of serializes it, too. >> >> The mntput() callback uses spinlock_t for locking and then it frees >> resources. It does not look like it waits for something nor takes ages. >> So it might not be needed to split each put into its own worker on a >> different CPU… One busy bee might be enough ;) > Unmounting can trigger very large number of mounts to be unmounted. If > you're on a container heavy system or services that all propagate to > each other in different mount namespaces mount propagation will generate > a ton of umounts. So this cannot be underestimated. Indeed yes, or shutting down autofs when it's using a direct mount map with a few hundred (or thousand) entries. Foe my part it's things like the targeted mount information and mounted mounts listing system calls (done again recently by Miklos) and this slow umount that are, IMHO, really important. Ian > > If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to > umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts > during the unmount. > > If a concurrent path lookup calls legitimize_mnt() on such a mount and > sees that MNT_SYNC_UMOUNT is set it will discount as it know that the > concurrent unmounter hold the last reference and it __legitimize_mnt() > can thus simply drop the reference count. The final mntput() will be > done by the umounter. > > The synchronize_rcu() call in namespace_unlock() takes care that the > last mntput() doesn't happen until path walking has dropped out of RCU > mode. > > Without it it's possible that a non-MNT_DETACH umounter gets a spurious > EBUSY error because a concurrent lazy path walk will suddenly put the > last reference via mntput(). > > I'm unclear how that's handled in whatever it is you're proposing. >
On 2025-04-09 18:04:21 [+0200], Christian Brauner wrote: > On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote: > > On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote: > > > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote: > > > > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle > > > > them all via queue_rcu_work()? > > > > If so, couldn't we have make deferred_free_mounts global and have two > > > > release_list, say release_list and release_list_next_gp? The first one > > > > will be used if queue_rcu_work() returns true, otherwise the second. > > > > Then once defer_free_mounts() is done and release_list_next_gp not > > > > empty, it would move release_list_next_gp -> release_list and invoke > > > > queue_rcu_work(). > > > > This would avoid the kmalloc, synchronize_rcu_expedited() and the > > > > special-sauce. > > > > > > > > > > To my understanding it was preferred for non-lazy unmount consumers to > > > wait until the mntput before returning. > > > > > > That aside if I understood your approach it would de facto serialize all > > > of these? > > > > > > As in with the posted patches you can have different worker threads > > > progress in parallel as they all get a private list to iterate. > > > > > > With your proposal only one can do any work. > > > > > > One has to assume with sufficient mount/unmount traffic this can > > > eventually get into trouble. > > > > Right, it would serialize them within the same worker thread. With one > > worker for each put you would schedule multiple worker from the RCU > > callback. Given the system_wq you will schedule them all on the CPU > > which invokes the RCU callback. This kind of serializes it, too. > > > > The mntput() callback uses spinlock_t for locking and then it frees > > resources. It does not look like it waits for something nor takes ages. > > So it might not be needed to split each put into its own worker on a > > different CPU… One busy bee might be enough ;) > > Unmounting can trigger very large number of mounts to be unmounted. If > you're on a container heavy system or services that all propagate to > each other in different mount namespaces mount propagation will generate > a ton of umounts. So this cannot be underestimated. So you want to have two of these unmounts in two worker so you can split them on two CPUs in best case. As of today, in order to get through with umounts asap you accelerate the grace period. And after the wake up may utilize more than one CPU. > If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to > umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts > during the unmount. > > If a concurrent path lookup calls legitimize_mnt() on such a mount and > sees that MNT_SYNC_UMOUNT is set it will discount as it know that the > concurrent unmounter hold the last reference and it __legitimize_mnt() > can thus simply drop the reference count. The final mntput() will be > done by the umounter. > > The synchronize_rcu() call in namespace_unlock() takes care that the > last mntput() doesn't happen until path walking has dropped out of RCU > mode. > > Without it it's possible that a non-MNT_DETACH umounter gets a spurious > EBUSY error because a concurrent lazy path walk will suddenly put the > last reference via mntput(). > > I'm unclear how that's handled in whatever it is you're proposing. Okay. So we can't do this for UMOUNT_SYNC callers, thank you for the explanation. 
We could avoid the memory allocation and have one worker to take care of them all but you are afraid what this would mean to huge container. Understandable. The s/system_wq/system_unbound_wq/ would make sense. Sebastian
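The swap being discussed amounts to a one-line change to the folded diff, roughly (sketched here, not the applied commit): an unbound worker is not pinned to the CPU that happened to run the RCU callback, so the deferred mntput() batches do not all pile up on one CPU.

-		queue_rcu_work(system_wq, &d->rwork);
+		queue_rcu_work(system_unbound_wq, &d->rwork);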
On Thu, Apr 10, 2025 at 10:28:33AM +0200, Sebastian Andrzej Siewior wrote: > On 2025-04-09 18:04:21 [+0200], Christian Brauner wrote: > > On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote: > > > On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote: > > > > On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote: > > > > > One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle > > > > > them all via queue_rcu_work()? > > > > > If so, couldn't we have make deferred_free_mounts global and have two > > > > > release_list, say release_list and release_list_next_gp? The first one > > > > > will be used if queue_rcu_work() returns true, otherwise the second. > > > > > Then once defer_free_mounts() is done and release_list_next_gp not > > > > > empty, it would move release_list_next_gp -> release_list and invoke > > > > > queue_rcu_work(). > > > > > This would avoid the kmalloc, synchronize_rcu_expedited() and the > > > > > special-sauce. > > > > > > > > > > > > > To my understanding it was preferred for non-lazy unmount consumers to > > > > wait until the mntput before returning. > > > > > > > > That aside if I understood your approach it would de facto serialize all > > > > of these? > > > > > > > > As in with the posted patches you can have different worker threads > > > > progress in parallel as they all get a private list to iterate. > > > > > > > > With your proposal only one can do any work. > > > > > > > > One has to assume with sufficient mount/unmount traffic this can > > > > eventually get into trouble. > > > > > > Right, it would serialize them within the same worker thread. With one > > > worker for each put you would schedule multiple worker from the RCU > > > callback. Given the system_wq you will schedule them all on the CPU > > > which invokes the RCU callback. This kind of serializes it, too. > > > > > > The mntput() callback uses spinlock_t for locking and then it frees > > > resources. It does not look like it waits for something nor takes ages. > > > So it might not be needed to split each put into its own worker on a > > > different CPU… One busy bee might be enough ;) > > > > Unmounting can trigger very large number of mounts to be unmounted. If > > you're on a container heavy system or services that all propagate to > > each other in different mount namespaces mount propagation will generate > > a ton of umounts. So this cannot be underestimated. > > So you want to have two of these unmounts in two worker so you can split > them on two CPUs in best case. As of today, in order to get through with > umounts asap you accelerate the grace period. And after the wake up may > utilize more than one CPU. > > > If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to > > umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts > > during the unmount. > > > > If a concurrent path lookup calls legitimize_mnt() on such a mount and > > sees that MNT_SYNC_UMOUNT is set it will discount as it know that the > > concurrent unmounter hold the last reference and it __legitimize_mnt() > > can thus simply drop the reference count. The final mntput() will be > > done by the umounter. > > > > The synchronize_rcu() call in namespace_unlock() takes care that the > > last mntput() doesn't happen until path walking has dropped out of RCU > > mode. 
> > > > Without it it's possible that a non-MNT_DETACH umounter gets a spurious > > EBUSY error because a concurrent lazy path walk will suddenly put the > > last reference via mntput(). > > > > I'm unclear how that's handled in whatever it is you're proposing. > > Okay. So we can't do this for UMOUNT_SYNC callers, thank you for the > explanation. We could avoid the memory allocation and have one worker to > take care of them all but you are afraid what this would mean to huge > container. Understandable. The s/system_wq/system_unbound_wq/ would make > sense. Don't get me wrong if you have a clever idea here I'm all ears.
On 10/4/25 00:04, Christian Brauner wrote: > On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior wrote: >> On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote: >>> On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior wrote: >>>> One question: Do we need this lazy/ MNT_DETACH case? Couldn't we handle >>>> them all via queue_rcu_work()? >>>> If so, couldn't we have make deferred_free_mounts global and have two >>>> release_list, say release_list and release_list_next_gp? The first one >>>> will be used if queue_rcu_work() returns true, otherwise the second. >>>> Then once defer_free_mounts() is done and release_list_next_gp not >>>> empty, it would move release_list_next_gp -> release_list and invoke >>>> queue_rcu_work(). >>>> This would avoid the kmalloc, synchronize_rcu_expedited() and the >>>> special-sauce. >>>> >>> To my understanding it was preferred for non-lazy unmount consumers to >>> wait until the mntput before returning. >>> >>> That aside if I understood your approach it would de facto serialize all >>> of these? >>> >>> As in with the posted patches you can have different worker threads >>> progress in parallel as they all get a private list to iterate. >>> >>> With your proposal only one can do any work. >>> >>> One has to assume with sufficient mount/unmount traffic this can >>> eventually get into trouble. >> Right, it would serialize them within the same worker thread. With one >> worker for each put you would schedule multiple worker from the RCU >> callback. Given the system_wq you will schedule them all on the CPU >> which invokes the RCU callback. This kind of serializes it, too. >> >> The mntput() callback uses spinlock_t for locking and then it frees >> resources. It does not look like it waits for something nor takes ages. >> So it might not be needed to split each put into its own worker on a >> different CPU… One busy bee might be enough ;) > Unmounting can trigger very large number of mounts to be unmounted. If > you're on a container heavy system or services that all propagate to > each other in different mount namespaces mount propagation will generate > a ton of umounts. So this cannot be underestimated. > > If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to > umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts > during the unmount. > > If a concurrent path lookup calls legitimize_mnt() on such a mount and > sees that MNT_SYNC_UMOUNT is set it will discount as it know that the > concurrent unmounter hold the last reference and it __legitimize_mnt() > can thus simply drop the reference count. The final mntput() will be > done by the umounter. In umount_tree() it looks like the unmounted mount remains hashed (ie. disconnect_mount() returns false) so can't it still race with an rcu-walk regardless of the sybcronsize_rcu(). Surely I'm missing something ... Ian > > The synchronize_rcu() call in namespace_unlock() takes care that the > last mntput() doesn't happen until path walking has dropped out of RCU > mode. > > Without it it's possible that a non-MNT_DETACH umounter gets a spurious > EBUSY error because a concurrent lazy path walk will suddenly put the > last reference via mntput(). > > I'm unclear how that's handled in whatever it is you're proposing. >
On 10/4/25 21:58, Ian Kent wrote: > > On 10/4/25 00:04, Christian Brauner wrote: >> On Wed, Apr 09, 2025 at 04:25:10PM +0200, Sebastian Andrzej Siewior >> wrote: >>> On 2025-04-09 16:02:29 [+0200], Mateusz Guzik wrote: >>>> On Wed, Apr 09, 2025 at 03:14:44PM +0200, Sebastian Andrzej Siewior >>>> wrote: >>>>> One question: Do we need this lazy/ MNT_DETACH case? Couldn't we >>>>> handle >>>>> them all via queue_rcu_work()? >>>>> If so, couldn't we have make deferred_free_mounts global and have two >>>>> release_list, say release_list and release_list_next_gp? The first >>>>> one >>>>> will be used if queue_rcu_work() returns true, otherwise the second. >>>>> Then once defer_free_mounts() is done and release_list_next_gp not >>>>> empty, it would move release_list_next_gp -> release_list and invoke >>>>> queue_rcu_work(). >>>>> This would avoid the kmalloc, synchronize_rcu_expedited() and the >>>>> special-sauce. >>>>> >>>> To my understanding it was preferred for non-lazy unmount consumers to >>>> wait until the mntput before returning. >>>> >>>> That aside if I understood your approach it would de facto >>>> serialize all >>>> of these? >>>> >>>> As in with the posted patches you can have different worker threads >>>> progress in parallel as they all get a private list to iterate. >>>> >>>> With your proposal only one can do any work. >>>> >>>> One has to assume with sufficient mount/unmount traffic this can >>>> eventually get into trouble. >>> Right, it would serialize them within the same worker thread. With one >>> worker for each put you would schedule multiple worker from the RCU >>> callback. Given the system_wq you will schedule them all on the CPU >>> which invokes the RCU callback. This kind of serializes it, too. >>> >>> The mntput() callback uses spinlock_t for locking and then it frees >>> resources. It does not look like it waits for something nor takes ages. >>> So it might not be needed to split each put into its own worker on a >>> different CPU… One busy bee might be enough ;) >> Unmounting can trigger very large number of mounts to be unmounted. If >> you're on a container heavy system or services that all propagate to >> each other in different mount namespaces mount propagation will generate >> a ton of umounts. So this cannot be underestimated. >> >> If a mount tree is wasted without MNT_DETACH it will pass UMOUNT_SYNC to >> umount_tree(). That'll cause MNT_SYNC_UMOUNT to be raised on all mounts >> during the unmount. >> >> If a concurrent path lookup calls legitimize_mnt() on such a mount and >> sees that MNT_SYNC_UMOUNT is set it will discount as it know that the >> concurrent unmounter hold the last reference and it __legitimize_mnt() >> can thus simply drop the reference count. The final mntput() will be >> done by the umounter. > > In umount_tree() it looks like the unmounted mount remains hashed (ie. > > disconnect_mount() returns false) so can't it still race with an rcu-walk > > regardless of the sybcronsize_rcu(). > > > Surely I'm missing something ... Ans I am, please ignore this. I miss-read the return of the mnt->mnt_parent->mnt.mnt_flags check in disconnect_mount(), my bad. > > > Ian > >> >> The synchronize_rcu() call in namespace_unlock() takes care that the >> last mntput() doesn't happen until path walking has dropped out of RCU >> mode. >> >> Without it it's possible that a non-MNT_DETACH umounter gets a spurious >> EBUSY error because a concurrent lazy path walk will suddenly put the >> last reference via mntput(). 
>> >> I'm unclear how that's handled in whatever it is you're proposing. >> >
> With this, I have applied your patch for the following discussion and > down thread. Happy to send a v5, should this patch be deemed worth > pursuing. I'll just switch to system_unbound_wq and there's no need to resend for now. If the numbers somehow change significantly due to that change just mention that. Thanks.
On Fri, Apr 11, 2025 at 05:17:25PM +0200, Christian Brauner wrote: > > With this, I have applied your patch for the following discussion and > > down thread. Happy to send a v5, should this patch be deemed worth > > pursuing. > > I'll just switch to system_unbound_wq and there's no need to resend for > now. If the numbers somehow change significantly due to that change just > mention that. Thanks. > Thank you, I do not see a significant change in the numbers with system_unbound_wq compared to system_wq. Best,
diff --git a/fs/namespace.c b/fs/namespace.c
index 14935a0500a2..e5b0b920dd97 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init;
 static unsigned int mp_hash_mask __ro_after_init;
 static unsigned int mp_hash_shift __ro_after_init;
 
+struct deferred_free_mounts {
+	struct rcu_work rwork;
+	struct hlist_head release_list;
+};
+
 static __initdata unsigned long mhash_entries;
 static int __init set_mhash_entries(char *str)
 {
@@ -1789,11 +1794,29 @@ static bool need_notify_mnt_list(void)
 }
 #endif
 
-static void namespace_unlock(void)
+static void free_mounts(struct hlist_head *mount_list)
 {
-	struct hlist_head head;
 	struct hlist_node *p;
 	struct mount *m;
+
+	hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
+		hlist_del(&m->mnt_umount);
+		mntput(&m->mnt);
+	}
+}
+
+static void defer_free_mounts(struct work_struct *work)
+{
+	struct deferred_free_mounts *d = container_of(
+		to_rcu_work(work), struct deferred_free_mounts, rwork);
+
+	free_mounts(&d->release_list);
+	kfree(d);
+}
+
+static void __namespace_unlock(bool lazy)
+{
+	HLIST_HEAD(head);
 	LIST_HEAD(list);
 
 	hlist_move_list(&unmounted, &head);
@@ -1817,12 +1840,27 @@ static void namespace_unlock(void)
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu_expedited();
+	if (lazy) {
+		struct deferred_free_mounts *d =
+			kmalloc(sizeof(*d), GFP_KERNEL);
 
-	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
-		hlist_del(&m->mnt_umount);
-		mntput(&m->mnt);
+		if (unlikely(!d))
+			goto out;
+
+		hlist_move_list(&head, &d->release_list);
+		INIT_RCU_WORK(&d->rwork, defer_free_mounts);
+		queue_rcu_work(system_wq, &d->rwork);
+		return;
 	}
+
+out:
+	synchronize_rcu_expedited();
+	free_mounts(&head);
+}
+
+static inline void namespace_unlock(void)
+{
+	__namespace_unlock(false);
 }
 
 static inline void namespace_lock(void)
 {
@@ -2056,7 +2094,7 @@ static int do_umount(struct mount *mnt, int flags)
 	}
 out:
 	unlock_mount_hash();
-	namespace_unlock();
+	__namespace_unlock(flags & MNT_DETACH);
 	return retval;
 }
 