diff mbox series

[RFC,v3,1/1] fs/namespace: remove RCU sync for MNT_DETACH umount

Message ID 20240626201129.272750-3-lkarpins@redhat.com (mailing list archive)
State New
Headers show
Series fs/namespace: defer RCU sync for MNT_DETACH umount | expand

Commit Message

Lucas Karpinski June 26, 2024, 8:07 p.m. UTC
When detaching (MNT_DETACH) a filesystem, it should not be necessary to
wait for the RCU grace period before completing the syscall. The
expectation that the filesystem is fully shut down by the time the
syscall returns does not apply in this case. The
synchronize_rcu_expedited() call is not needed for a lazy umount, so
skip it.

Without patch, on 6.10-rc2-rt kernel:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
        0.07333 +- 0.00615 seconds time elapsed  ( +-  8.38% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
        0.07229 +- 0.00672 seconds time elapsed  ( +-  9.29% )

With patch, on 6.10-rc2-rt kernel:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
        0.02834 +- 0.00419 seconds time elapsed  ( +- 14.78% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
        0.0029830 +- 0.0000767 seconds time elapsed  ( +-  2.57% )

Signed-off-by: Alexander Larsson <alexl@redhat.com>
Signed-off-by: Eric Chanudet <echanude@redhat.com>
Signed-off-by: Lucas Karpinski <lkarpins@redhat.com>
Suggested-by: Ian Kent <ikent@redhat.com>
---
 fs/namespace.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

Comments

Matthew Wilcox June 26, 2024, 8:47 p.m. UTC | #1
On Wed, Jun 26, 2024 at 04:07:49PM -0400, Lucas Karpinski wrote:
> +++ b/fs/namespace.c
> @@ -78,6 +78,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;
>  static DECLARE_RWSEM(namespace_sem);
>  static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
>  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> +static bool lazy_unlock = false; /* protected by namespace_sem */

That's a pretty ugly way of doing it.  How about this?

+++ b/fs/namespace.c
@@ -1553,7 +1553,7 @@ int may_umount(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(may_umount);
 
-static void namespace_unlock(void)
+static void __namespace_unlock(bool lazy)
 {
        struct hlist_head head;
        struct hlist_node *p;
@@ -1570,7 +1570,8 @@ static void namespace_unlock(void)
        if (likely(hlist_empty(&head)))
                return;
 
-       synchronize_rcu_expedited();
+       if (!lazy)
+               synchronize_rcu_expedited();
 
        hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
                hlist_del(&m->mnt_umount);
@@ -1578,6 +1579,11 @@ static void namespace_unlock(void)
        }
 }
 
+static inline void namespace_unlock(void)
+{
+       __namespace_unlock(false);
+}
+
 static inline void namespace_lock(void)
 {
        down_write(&namespace_sem);
@@ -1798,7 +1804,7 @@ static int do_umount(struct mount *mnt, int flags)
        }
 out:
        unlock_mount_hash();
-       namespace_unlock();
+       __namespace_unlock(flags & MNT_DETACH);
        return retval;
 }
 

(other variants on this theme might be to pass the flags to
__namespace_unlock() and check MNT_DETACH there)
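
For illustration, a rough sketch of that flag-passing variant, pieced
together from the hunks above (untested, details elided; do_umount()
would then call __namespace_unlock(flags) directly):

static void __namespace_unlock(int flags)
{
	struct hlist_head head;
	struct hlist_node *p;
	struct mount *m;
	LIST_HEAD(list);

	hlist_move_list(&unmounted, &head);
	list_splice_init(&ex_mountpoints, &list);

	up_write(&namespace_sem);

	shrink_dentry_list(&list);

	if (likely(hlist_empty(&head)))
		return;

	/* Lazy (MNT_DETACH) umounts skip the expedited grace period. */
	if (!(flags & MNT_DETACH))
		synchronize_rcu_expedited();

	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
		hlist_del(&m->mnt_umount);
		mntput(&m->mnt);
	}
}

static inline void namespace_unlock(void)
{
	__namespace_unlock(0);
}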
Ian Kent June 27, 2024, 1:11 a.m. UTC | #2
On 27/6/24 04:47, Matthew Wilcox wrote:
> On Wed, Jun 26, 2024 at 04:07:49PM -0400, Lucas Karpinski wrote:
>> +++ b/fs/namespace.c
>> @@ -78,6 +78,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;
>>   static DECLARE_RWSEM(namespace_sem);
>>   static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
>>   static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
>> +static bool lazy_unlock = false; /* protected by namespace_sem */
> That's a pretty ugly way of doing it.  How about this?

Ha!

That was my original thought but I also didn't much like changing all
the callers.

I don't really like the proliferation of these small helper functions
either but if everyone is happy to do this I think it's a great idea.

Ian

>
> +++ b/fs/namespace.c
> @@ -1553,7 +1553,7 @@ int may_umount(struct vfsmount *mnt)
>   
>   EXPORT_SYMBOL(may_umount);
>   
> -static void namespace_unlock(void)
> +static void __namespace_unlock(bool lazy)
>   {
>          struct hlist_head head;
>          struct hlist_node *p;
> @@ -1570,7 +1570,8 @@ static void namespace_unlock(void)
>          if (likely(hlist_empty(&head)))
>                  return;
>   
> -       synchronize_rcu_expedited();
> +       if (!lazy)
> +               synchronize_rcu_expedited();
>   
>          hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
>                  hlist_del(&m->mnt_umount);
> @@ -1578,6 +1579,11 @@ static void namespace_unlock(void)
>          }
>   }
>   
> +static inline void namespace_unlock(void)
> +{
> +       __namespace_unlock(false);
> +}
> +
>   static inline void namespace_lock(void)
>   {
>          down_write(&namespace_sem);
> @@ -1798,7 +1804,7 @@ static int do_umount(struct mount *mnt, int flags)
>          }
>   out:
>          unlock_mount_hash();
> -       namespace_unlock();
> +       __namespace_unlock(flags & MNT_DETACH);
>          return retval;
>   }
>   
>
> (other variants on this theme might be to pass the flags to
> __namespace_unlock() and check MNT_DETACH there)
>
Jan Kara June 27, 2024, 11:54 a.m. UTC | #3
On Thu 27-06-24 09:11:14, Ian Kent wrote:
> On 27/6/24 04:47, Matthew Wilcox wrote:
> > On Wed, Jun 26, 2024 at 04:07:49PM -0400, Lucas Karpinski wrote:
> > > +++ b/fs/namespace.c
> > > @@ -78,6 +78,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;
> > >   static DECLARE_RWSEM(namespace_sem);
> > >   static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
> > >   static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> > > +static bool lazy_unlock = false; /* protected by namespace_sem */
> > That's a pretty ugly way of doing it.  How about this?
> 
> Ha!
> 
> That was my original thought but I also didn't much like changing all the
> callers.
> 
> I don't really like the proliferation of these small helper functions either
> but if everyone
> 
> is happy to do this I think it's a great idea.

So I know you've suggested removing synchronize_rcu_expedited() call in
your comment to v2. But I wonder why is it safe? I *thought*
synchronize_rcu_expedited() is there to synchronize the dropping of the
last mnt reference (and maybe something else) - see the comment at the
beginning of mntput_no_expire() - and this change would break that?

								Honza
Christian Brauner June 27, 2024, 3:16 p.m. UTC | #4
On Thu, Jun 27, 2024 at 01:54:18PM GMT, Jan Kara wrote:
> On Thu 27-06-24 09:11:14, Ian Kent wrote:
> > On 27/6/24 04:47, Matthew Wilcox wrote:
> > > On Wed, Jun 26, 2024 at 04:07:49PM -0400, Lucas Karpinski wrote:
> > > > +++ b/fs/namespace.c
> > > > @@ -78,6 +78,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;
> > > >   static DECLARE_RWSEM(namespace_sem);
> > > >   static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
> > > >   static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> > > > +static bool lazy_unlock = false; /* protected by namespace_sem */
> > > That's a pretty ugly way of doing it.  How about this?
> > 
> > Ha!
> > 
> > That was my original thought but I also didn't much like changing all the
> > callers.
> > 
> > I don't really like the proliferation of these small helper functions either
> > but if everyone
> > 
> > is happy to do this I think it's a great idea.
> 
> So I know you've suggested removing synchronize_rcu_expedited() call in
> your comment to v2. But I wonder why is it safe? I *thought*
> synchronize_rcu_expedited() is there to synchronize the dropping of the
> last mnt reference (and maybe something else) - see the comment at the
> beginning of mntput_no_expire() - and this change would break that?

Yes. During umount mnt->mnt_ns will be set to NULL with namespace_sem
and the mount seqlock held. mntput() doesn't acquire namespace_sem as
that would get rather problematic during path lookup. It also elides
lock_mount_hash() by looking at mnt->mnt_ns because that's set to NULL
when a mount is actually unmounted.

So iirc synchronize_rcu_expedited() will ensure that it is actually the
system call that shuts down all the mounts it put on the umounted list
and not some other task that also called mntput() as that would cause
pretty blatant EBUSY issues.

So callers that come before mnt->mnt_ns = NULL simply return of course
but callers that come after mnt->mnt_ns = NULL will acquire
lock_mount_hash() _under_ rcu_read_lock(). These callers see an elevated
reference count and thus simply return while namespace_lock()'s
synchronize_rcu_expedited() prevents the system call from making
progress.

But I also don't see it working without risk even with MNT_DETACH. It
still has potential to cause issues in userspace. Any program that
always passes MNT_DETACH simply to ensure that even in the very rare
case that a mount might still be busy is unmounted might now end up
seeing increased EBUSY failures for mounts that didn't actually need to
be unmounted with MNT_DETACH. In other words, this is only innocuous if
userspace only uses MNT_DETACH for stuff they actually know is busy when
they're trying to unmount. And I don't think that's the case.
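
For reference, a heavily trimmed sketch of the mntput_no_expire() paths
being described here (a paraphrase of the logic, not the verbatim
kernel source):

static void mntput_no_expire(struct mount *mnt)
{
	rcu_read_lock();
	if (likely(READ_ONCE(mnt->mnt_ns))) {
		/* Still attached: drop the reference without taking
		 * lock_mount_hash() and return. */
		mnt_add_count(mnt, -1);
		rcu_read_unlock();
		return;
	}
	/* ->mnt_ns is NULL: the mount sits on the umounted list. */
	lock_mount_hash();
	smp_mb();
	mnt_add_count(mnt, -1);
	if (mnt_get_count(mnt)) {
		/* Someone else, normally the umounting syscall still held
		 * up in synchronize_rcu_expedited(), owns the remaining
		 * reference and does the final teardown. */
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	/* Last reference: tear the mount down here. */
	...
}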
Ian Kent June 28, 2024, 2:58 a.m. UTC | #5
On 27/6/24 19:54, Jan Kara wrote:
> On Thu 27-06-24 09:11:14, Ian Kent wrote:
>> On 27/6/24 04:47, Matthew Wilcox wrote:
>>> On Wed, Jun 26, 2024 at 04:07:49PM -0400, Lucas Karpinski wrote:
>>>> +++ b/fs/namespace.c
>>>> @@ -78,6 +78,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;
>>>>    static DECLARE_RWSEM(namespace_sem);
>>>>    static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
>>>>    static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
>>>> +static bool lazy_unlock = false; /* protected by namespace_sem */
>>> That's a pretty ugly way of doing it.  How about this?
>> Ha!
>>
>> That was my original thought but I also didn't much like changing all the
>> callers.
>>
>> I don't really like the proliferation of these small helper functions either
>> but if everyone
>>
>> is happy to do this I think it's a great idea.
> So I know you've suggested removing synchronize_rcu_expedited() call in
> your comment to v2. But I wonder why is it safe? I *thought*
> synchronize_rcu_expedited() is there to synchronize the dropping of the
> last mnt reference (and maybe something else) - see the comment at the
> beginning of mntput_no_expire() - and this change would break that?

Interesting, because of the definition of lazy umount I didn't look
closely enough at that.

But I wonder, how exactly would that race occur, is holding the rcu read
lock sufficient since the rcu'd mount free won't be done until it's
released (at least I think that's how rcu works).

In this case, when lazy is true, the mount will have been detached in
umount_tree() and mnt->mnt_ns set to NULL under the namespace sem write
lock. So that condition in mntput_no_expire() won't be true and the
mount will no longer be found by the VFS.

I guess the question then becomes will any outstanding lockless path
walks race with this with only the rcu read lock to protect it,
Christian?

Ian
Ian Kent June 28, 2024, 3:17 a.m. UTC | #6
On 27/6/24 23:16, Christian Brauner wrote:
> On Thu, Jun 27, 2024 at 01:54:18PM GMT, Jan Kara wrote:
>> On Thu 27-06-24 09:11:14, Ian Kent wrote:
>>> On 27/6/24 04:47, Matthew Wilcox wrote:
>>>> On Wed, Jun 26, 2024 at 04:07:49PM -0400, Lucas Karpinski wrote:
>>>>> +++ b/fs/namespace.c
>>>>> @@ -78,6 +78,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;
>>>>>    static DECLARE_RWSEM(namespace_sem);
>>>>>    static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
>>>>>    static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
>>>>> +static bool lazy_unlock = false; /* protected by namespace_sem */
>>>> That's a pretty ugly way of doing it.  How about this?
>>> Ha!
>>>
>>> That was my original thought but I also didn't much like changing all the
>>> callers.
>>>
>>> I don't really like the proliferation of these small helper functions either
>>> but if everyone
>>>
>>> is happy to do this I think it's a great idea.
>> So I know you've suggested removing synchronize_rcu_expedited() call in
>> your comment to v2. But I wonder why is it safe? I *thought*
>> synchronize_rcu_expedited() is there to synchronize the dropping of the
>> last mnt reference (and maybe something else) - see the comment at the
>> beginning of mntput_no_expire() - and this change would break that?
> Yes. During umount mnt->mnt_ns will be set to NULL with namespace_sem
> and the mount seqlock held. mntput() doesn't acquire namespace_sem as
> that would get rather problematic during path lookup. It also elides
> lock_mount_hash() by looking at mnt->mnt_ns because that's set to NULL
> when a mount is actually unmounted.
>
> So iirc synchronize_rcu_expedited() will ensure that it is actually the
> system call that shuts down all the mounts it put on the umounted list
> and not some other task that also called mntput() as that would cause
> pretty blatant EBUSY issues.
>
> So callers that come before mnt->mnt_ns = NULL simply return of course
> but callers that come after mnt->mnt_ns = NULL will acquire
> lock_mount_hash() _under_ rcu_read_lock(). These callers see an elevated
> reference count and thus simply return while namespace_lock()'s
> synchronize_rcu_expedited() prevents the system call from making
> progress.
>
> But I also don't see it working without risk even with MNT_DETACH. It
> still has potential to cause issues in userspace. Any program that
> always passes MNT_DETACH simply to ensure that even in the very rare
> case that a mount might still be busy is unmounted might now end up
> seeing increased EBUSY failures for mounts that didn't actually need to
> be unmounted with MNT_DETACH. In other words, this is only innocuous if
> userspace only uses MNT_DETACH for stuff they actually know is busy when
> they're trying to unmount. And I don't think that's the case.
>
I'm sorry but how does an MNT_DETACH umount system call return EBUSY, I
can't see how that can happen?

I have used lazy umount a lot over the years and I haven't had problems
with it. There is a tendency to think there might be problems using it
but I've never been able to spot them.

Ian
Jan Kara June 28, 2024, 11:13 a.m. UTC | #7
On Fri 28-06-24 10:58:54, Ian Kent wrote:
> 
> On 27/6/24 19:54, Jan Kara wrote:
> > On Thu 27-06-24 09:11:14, Ian Kent wrote:
> > > On 27/6/24 04:47, Matthew Wilcox wrote:
> > > > On Wed, Jun 26, 2024 at 04:07:49PM -0400, Lucas Karpinski wrote:
> > > > > +++ b/fs/namespace.c
> > > > > @@ -78,6 +78,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;
> > > > >    static DECLARE_RWSEM(namespace_sem);
> > > > >    static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
> > > > >    static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> > > > > +static bool lazy_unlock = false; /* protected by namespace_sem */
> > > > That's a pretty ugly way of doing it.  How about this?
> > > Ha!
> > > 
> > > That was my original thought but I also didn't much like changing all the
> > > callers.
> > > 
> > > I don't really like the proliferation of these small helper functions either
> > > but if everyone
> > > 
> > > is happy to do this I think it's a great idea.
> > So I know you've suggested removing synchronize_rcu_expedited() call in
> > your comment to v2. But I wonder why is it safe? I *thought*
> > synchronize_rcu_expedited() is there to synchronize the dropping of the
> > last mnt reference (and maybe something else) - see the comment at the
> > beginning of mntput_no_expire() - and this change would break that?
> 
> Interesting, because of the definition of lazy umount I didn't look closely
> enough at that.
> 
> But I wonder, how exactly would that race occur, is holding the rcu read
> lock sufficient since the rcu'd mount free won't be done until it's
> released (at least I think that's how rcu works).

I'm concerned about a race like:

[path lookup]				[umount -l]
...
path_put()
  mntput(mnt)
    mntput_no_expire(m)
      rcu_read_lock();
      if (likely(READ_ONCE(mnt->mnt_ns))) {
					do_umount()
					  umount_tree()
					    ...
					    mnt->mnt_ns = NULL;
					    ...
					  namespace_unlock()
					    mntput(&m->mnt)
					      mntput_no_expire(mnt)
				              smp_mb();
					      mnt_add_count(mnt, -1);
					      count = mnt_get_count(mnt);
					      if (count != 0) {
						...
						return;
        mnt_add_count(mnt, -1);
        rcu_read_unlock();
        return;
-> KABOOM, mnt->mnt_count dropped to 0 but nobody cleaned up the mount!
      }

And this scenario is exactly prevented by synchronize_rcu() in
namespace_unlock().


								Honza
Christian Brauner June 28, 2024, 12:54 p.m. UTC | #8
On Fri, Jun 28, 2024 at 11:17:43AM GMT, Ian Kent wrote:
> 
> On 27/6/24 23:16, Christian Brauner wrote:
> > On Thu, Jun 27, 2024 at 01:54:18PM GMT, Jan Kara wrote:
> > > On Thu 27-06-24 09:11:14, Ian Kent wrote:
> > > > On 27/6/24 04:47, Matthew Wilcox wrote:
> > > > > On Wed, Jun 26, 2024 at 04:07:49PM -0400, Lucas Karpinski wrote:
> > > > > > +++ b/fs/namespace.c
> > > > > > @@ -78,6 +78,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;
> > > > > >    static DECLARE_RWSEM(namespace_sem);
> > > > > >    static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
> > > > > >    static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> > > > > > +static bool lazy_unlock = false; /* protected by namespace_sem */
> > > > > That's a pretty ugly way of doing it.  How about this?
> > > > Ha!
> > > > 
> > > > That was my original thought but I also didn't much like changing all the
> > > > callers.
> > > > 
> > > > I don't really like the proliferation of these small helper functions either
> > > > but if everyone
> > > > 
> > > > is happy to do this I think it's a great idea.
> > > So I know you've suggested removing synchronize_rcu_expedited() call in
> > > your comment to v2. But I wonder why is it safe? I *thought*
> > > synchronize_rcu_expedited() is there to synchronize the dropping of the
> > > last mnt reference (and maybe something else) - see the comment at the
> > > beginning of mntput_no_expire() - and this change would break that?
> > Yes. During umount mnt->mnt_ns will be set to NULL with namespace_sem
> > and the mount seqlock held. mntput() doesn't acquire namespace_sem as
> > that would get rather problematic during path lookup. It also elides
> > lock_mount_hash() by looking at mnt->mnt_ns because that's set to NULL
> > when a mount is actually unmounted.
> > 
> > So iirc synchronize_rcu_expedited() will ensure that it is actually the
> > system call that shuts down all the mounts it put on the umounted list
> > and not some other task that also called mntput() as that would cause
> > pretty blatant EBUSY issues.
> > 
> > So callers that come before mnt->mnt_ns = NULL simply return of course
> > but callers that come after mnt->mnt_ns = NULL will acquire
> > lock_mount_hash() _under_ rcu_read_lock(). These callers see an elevated
> > reference count and thus simply return while namespace_lock()'s
> > synchronize_rcu_expedited() prevents the system call from making
> > progress.
> > 
> > But I also don't see it working without risk even with MNT_DETACH. It
> > still has potential to cause issues in userspace. Any program that
> > always passes MNT_DETACH simply to ensure that even in the very rare
> > case that a mount might still be busy is unmounted might now end up
> > seeing increased EBUSY failures for mounts that didn't actually need to
> > be unmounted with MNT_DETACH. In other words, this is only inocuous if
> > userspace only uses MNT_DETACH for stuff they actually know is busy when
> > they're trying to unmount. And I don't think that's the case.
> > 
> I'm sorry but how does an MNT_DETACH umount system call return EBUSY, I
> can't
> 
> see how that can happen?

The umount() call itself is not the problem. Say you have the following
sequence:

(1) mount(ext4-device, /mnt)
    umount(/mnt, 0)
    mount(ext4-device, /mnt)

If that ext4 filesystem isn't in use anymore then umount() will succeed.
The same task can immediately issue a second mount() call on the same
device and it must succeed.

Today the behavior for this is the same whether or not the caller uses
MNT_DETACH. So:

(2) mount(ext4-device, /mnt)
    umount(/mnt, MNT_DETACH)
    mount(ext4-device, /mnt)

All that MNT_DETACH does is to skip the check for busy mounts otherwise
it's identical to a regular umount. So (1) and (2) will behave the same
as long as the filesystem isn't used anymore.

But afaict with your changes this wouldn't be true anymore. If someone
uses (2) on a filesystem that isn't busy then they might end up getting
EBUSY on the second mount. And if I'm right then that's potentially a
rather visible change.
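
Spelled out as a minimal userspace reproducer for sequence (2)
(illustrative only: the device path is a placeholder and error handling
is trimmed):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* (2): mount, lazily detach, then immediately remount the device. */
	if (mount("/dev/vdb", "/mnt", "ext4", 0, NULL))
		perror("mount #1");
	if (umount2("/mnt", MNT_DETACH))
		perror("umount2");
	/* If the second mount can run before the detached mount is fully
	 * torn down, it may fail with EBUSY even though the filesystem was
	 * idle, which is the behavior change described above. */
	if (mount("/dev/vdb", "/mnt", "ext4", 0, NULL))
		perror("mount #2");
	return 0;
}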
Alexander Larsson June 28, 2024, 3:13 p.m. UTC | #9
On Fri, Jun 28, 2024 at 2:54 PM Christian Brauner <brauner@kernel.org> wrote:
>
> On Fri, Jun 28, 2024 at 11:17:43AM GMT, Ian Kent wrote:
> >
> > On 27/6/24 23:16, Christian Brauner wrote:
> > > On Thu, Jun 27, 2024 at 01:54:18PM GMT, Jan Kara wrote:
> > > > On Thu 27-06-24 09:11:14, Ian Kent wrote:
> > > > > On 27/6/24 04:47, Matthew Wilcox wrote:
> > > > > > On Wed, Jun 26, 2024 at 04:07:49PM -0400, Lucas Karpinski wrote:
> > > > > > > +++ b/fs/namespace.c
> > > > > > > @@ -78,6 +78,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;
> > > > > > >    static DECLARE_RWSEM(namespace_sem);
> > > > > > >    static HLIST_HEAD(unmounted);    /* protected by namespace_sem */
> > > > > > >    static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> > > > > > > +static bool lazy_unlock = false; /* protected by namespace_sem */
> > > > > > That's a pretty ugly way of doing it.  How about this?
> > > > > Ha!
> > > > >
> > > > > That was my original thought but I also didn't much like changing all the
> > > > > callers.
> > > > >
> > > > > I don't really like the proliferation of these small helper functions either
> > > > > but if everyone
> > > > >
> > > > > is happy to do this I think it's a great idea.
> > > > So I know you've suggested removing synchronize_rcu_expedited() call in
> > > > your comment to v2. But I wonder why is it safe? I *thought*
> > > > synchronize_rcu_expedited() is there to synchronize the dropping of the
> > > > last mnt reference (and maybe something else) - see the comment at the
> > > > beginning of mntput_no_expire() - and this change would break that?
> > > Yes. During umount mnt->mnt_ns will be set to NULL with namespace_sem
> > > and the mount seqlock held. mntput() doesn't acquire namespace_sem as
> > > that would get rather problematic during path lookup. It also elides
> > > lock_mount_hash() by looking at mnt->mnt_ns because that's set to NULL
> > > when a mount is actually unmounted.
> > >
> > > So iirc synchronize_rcu_expedited() will ensure that it is actually the
> > > system call that shuts down all the mounts it put on the umounted list
> > > and not some other task that also called mntput() as that would cause
> > > pretty blatant EBUSY issues.
> > >
> > > So callers that come before mnt->mnt_ns = NULL simply return of course
> > > but callers that come after mnt->mnt_ns = NULL will acquire
> > > lock_mount_hash() _under_ rcu_read_lock(). These callers see an elevated
> > > reference count and thus simply return while namespace_lock()'s
> > > synchronize_rcu_expedited() prevents the system call from making
> > > progress.
> > >
> > > But I also don't see it working without risk even with MNT_DETACH. It
> > > still has potential to cause issues in userspace. Any program that
> > > always passes MNT_DETACH simply to ensure that even in the very rare
> > > case that a mount might still be busy is unmounted might now end up
> > > seeing increased EBUSY failures for mounts that didn't actually need to
> > > be unmounted with MNT_DETACH. In other words, this is only innocuous if
> > > userspace only uses MNT_DETACH for stuff they actually know is busy when
> > > they're trying to unmount. And I don't think that's the case.
> > >
> > I'm sorry but how does an MNT_DETACH umount system call return EBUSY, I
> > can't
> >
> > see how that can happen?
>
> The umount() call itself is not the problem. Say you have the following
> sequence:
>
> (1) mount(ext4-device, /mnt)
>     umount(/mnt, 0)
>     mount(ext4-device, /mnt)
>
> If that ext4 filesystem isn't in use anymore then umount() will succeed.
> The same task can immediately issue a second mount() call on the same
> device and it must succeed.
>
> Today the behavior for this is the same whether or not the caller uses
> MNT_DETACH. So:
>
> (2) mount(ext4-device, /mnt)
>     umount(/mnt, MNT_DETACH)
>     mount(ext4-device, /mnt)
>
> All that MNT_DETACH does is to skip the check for busy mounts otherwise
> it's identical to a regular umount. So (1) and (2) will behave the same
> as long as the filesystem isn't used anymore.
>
> But afaict with your changes this wouldn't be true anymore. If someone
> uses (2) on a filesystem that isn't busy then they might end up getting
> EBUSY on the second mount. And if I'm right then that's potentially a
> rather visible change.

This is rather unfortunate, as the synchronize_rcu call is quite
expensive. In particular on a real-time kernel where there are no
expedited RCUs. This is causing container startup to be slow, as there
are several umount(MNT_DETACH) happening during container setup (after
the pivot_root, etc).

Maybe we can add a umount flag for users that don't need the current
behaviour wrt EBUSY? In the container usecase the important part is
that the old mounts are disconnected from the child namespace and not
really what the mount busy state is (typically it is still mounted in
the parent namespace anyway).
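
To make the idea concrete, a purely hypothetical sketch building on the
__namespace_unlock(bool lazy) helper suggested earlier in the thread
(the flag name and value are invented for illustration and are not an
existing API; the umount syscall entry point would also have to accept
the new bit):

/* Hypothetical umount2() flag: the caller promises it does not depend
 * on the mount being fully shut down when the syscall returns. */
#define MNT_NO_RCU_WAIT	0x00000010

static int do_umount(struct mount *mnt, int flags)
{
	...
out:
	unlock_mount_hash();
	/* Defer the RCU grace period only when userspace explicitly
	 * opted in. */
	__namespace_unlock(flags & MNT_NO_RCU_WAIT);
	return retval;
}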
diff mbox series

Patch

diff --git a/fs/namespace.c b/fs/namespace.c
index 5a51315c6678..5d889e05dd14 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -78,6 +78,7 @@  static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static bool lazy_unlock = false; /* protected by namespace_sem */
 
 struct mount_kattr {
 	unsigned int attr_set;
@@ -1555,6 +1556,7 @@  EXPORT_SYMBOL(may_umount);
 
 static void namespace_unlock(void)
 {
+	bool lazy;
 	struct hlist_head head;
 	struct hlist_node *p;
 	struct mount *m;
@@ -1563,6 +1565,9 @@  static void namespace_unlock(void)
 	hlist_move_list(&unmounted, &head);
 	list_splice_init(&ex_mountpoints, &list);
 
+	lazy = lazy_unlock;
+	lazy_unlock = false;
+
 	up_write(&namespace_sem);
 
 	shrink_dentry_list(&list);
@@ -1570,7 +1575,8 @@  static void namespace_unlock(void)
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu_expedited();
+	if (!lazy)
+		synchronize_rcu_expedited();
 
 	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
 		hlist_del(&m->mnt_umount);
@@ -1798,6 +1804,7 @@  static int do_umount(struct mount *mnt, int flags)
 	}
 out:
 	unlock_mount_hash();
+	lazy_unlock = flags & MNT_DETACH ? true : false;
 	namespace_unlock();
 	return retval;
 }