diff mbox series

[03/34] teach move_mount(2) to work with OPEN_TREE_CLONE [ver #12]

Message ID 153754743491.17872.12115848333103740766.stgit@warthog.procyon.org.uk (mailing list archive)
State New, archived
Headers show
Series VFS: Introduce filesystem context [ver #12] | expand

Commit Message

David Howells Sept. 21, 2018, 4:30 p.m. UTC
From: Al Viro <viro@zeniv.linux.org.uk>

Allow a detached tree created by open_tree(..., OPEN_TREE_CLONE) to be
attached by move_mount(2).

If by the time of final fput() of OPEN_TREE_CLONE-opened file its tree is
not detached anymore, it won't be dissolved.  move_mount(2) is adjusted
to handle detached source.

That gives us equivalents of mount --bind and mount --rbind.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David Howells <dhowells@redhat.com>
---

 fs/namespace.c |   26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

Comments

Alan Jenkins Oct. 5, 2018, 6:24 p.m. UTC | #1
On 21/09/2018 17:30, David Howells wrote:
> From: Al Viro <viro@zeniv.linux.org.uk>
>
> Allow a detached tree created by open_tree(..., OPEN_TREE_CLONE) to be
> attached by move_mount(2).
>
> If by the time of final fput() of OPEN_TREE_CLONE-opened file its tree is
> not detached anymore, it won't be dissolved.  move_mount(2) is adjusted
> to handle detached source.
>
> That gives us equivalents of mount --bind and mount --rbind.
>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
>
>   fs/namespace.c |   26 ++++++++++++++++++++------
>   1 file changed, 20 insertions(+), 6 deletions(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index dd38141b1723..caf5c55ef555 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1785,8 +1785,10 @@ void dissolve_on_fput(struct vfsmount *mnt)
>   {
>   	namespace_lock();
>   	lock_mount_hash();
> -	mntget(mnt);
> -	umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
> +	if (!real_mount(mnt)->mnt_ns) {
> +		mntget(mnt);
> +		umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
> +	}
>   	unlock_mount_hash();
>   	namespace_unlock();
>   }
> @@ -2393,6 +2395,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>   	struct mount *old;
>   	struct mountpoint *mp;
>   	int err;
> +	bool attached;
>   
>   	mp = lock_mount(new_path);
>   	err = PTR_ERR(mp);
> @@ -2403,10 +2406,19 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>   	p = real_mount(new_path->mnt);
>   
>   	err = -EINVAL;
> -	if (!check_mnt(p) || !check_mnt(old))
> +	/* The mountpoint must be in our namespace. */
> +	if (!check_mnt(p))
> +		goto out1;
> +	/* The thing moved should be either ours or completely unattached. */
> +	if (old->mnt_ns && !check_mnt(old))
>   		goto out1;
>   
> -	if (!mnt_has_parent(old))
> +	attached = mnt_has_parent(old);
> +	/*
> +	 * We need to allow open_tree(OPEN_TREE_CLONE) followed by
> +	 * move_mount(), but mustn't allow "/" to be moved.
> +	 */
> +	if (old->mnt_ns && !attached)
>   		goto out1;
>   
>   	if (old->mnt.mnt_flags & MNT_LOCKED)

Hi

I replied last time to wonder about the MNT_UMOUNT mnt_flag. So I've 
tested it now :-), on David's current tree (commit 5581f4935add).

The modified do_move_mount() allows re-attaching something that was 
lazy-unmounted. But the lazy unmount sets MNT_UMOUNT. And this flag is 
not cleared when the mount is re-attached.

I wasn't sure what effect this would have. Luckily it showed up straight 
away, when I tried to unmount again. It causes a soft lockup.

Debug printk:

diff --git a/fs/namespace.c b/fs/namespace.c
index 4dfe7e23b7ee..ac8de9191cfe 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2472,6 +2472,10 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
  	if (old->mnt.mnt_flags & MNT_LOCKED)
  		goto out1;
  
+	pr_info("mnt_flags=%x umount=%x\n",
+	        (unsigned) old->mnt.mnt_flags,
+	        (unsigned) !!(old->mnt.mnt_flags & MNT_UMOUNT);
+
  	if (old_path->dentry != old_path->mnt->mnt_root)
  		goto out1;

Testing:

# mount -ttmpfs tmp /mnt
# cd /mnt
# umount .
umount: /mnt: target is busy.
# umount -l .
# mount --move . /mnt
[  577.773804] mnt_flags=8000020 umount=1

Double-check the flags after the mount is re-attached:

# mount --move . /mnt
[  610.891311] mnt_flags=8000020 umount=1
mount: /mnt: mount(2) system call failed: Too many levels of symbolic links.

The bug:

# cd
# umount /mnt
[  656.229099] watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [umount:1457]
[  656.230231] Modules linked in: xt_CHECKSUM(E) ipt_MASQUERADE(E) tun(E) bridge(E) stp(E) llc(E) ip6t_rpfilter(E) ip6t_REJECT(E) nf_reject_ipv6(E) xt_conntrack(E) devlink(E) ip6table_nat(E) nf_nat_ipv6(E) ip6table_mangle(E) ip6table_raw(E) ip6table_security(E) iptable_nat(E) nf_nat_ipv4(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) libcrc32c(E) nf_defrag_ipv4(E) iptable_mangle(E) iptable_raw(E) iptable_security(E) ip6table_filter(E) ip6_tables(E) snd_hda_codec_generic(E) snd_hda_intel(E) snd_hda_codec(E) snd_hwdep(E) snd_hda_core(E) snd_seq(E) snd_seq_device(E) snd_pcm(E) joydev(E) crc32_pclmul(E) snd_timer(E) snd(E) ghash_clmulni_intel(E) crct10dif_pclmul(E) virtio_balloon(E) soundcore(E) serio_raw(E) crc32c_intel(E) qxl(E) virtio_console(E) virtio_net(E) net_failover(E) failover(E) drm_kms_helper(E)
[  656.242150]  ttm(E) drm(E) qemu_fw_cfg(E) pata_acpi(E) ata_generic(E)
[  656.243333] CPU: 0 PID: 1457 Comm: umount Tainted: G            E     4.19.0-rc3+ #7
[  656.244767] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 04/01/2014
[  656.247038] RIP: 0010:pin_kill+0x128/0x140
[  656.247789] Code: f2 5a 00 48 8b 44 24 20 48 39 c5 0f 84 6f ff ff ff 48 89 df e8 e9 4a 5b 00 8b 43 18 85 c0 7e b3 c6 03 00 fb 66 0f 1f 44 00 00 <e9> 51 ff ff ff e8 be 11 dd ff 0f 1f 40 00 66 2e 0f 1f 84 00 00 00
[  656.250738] RSP: 0018:ffffa58040f93e30 EFLAGS: 00000202 ORIG_RAX: ffffffffffffff13
[  656.251984] RAX: 0000000000000000 RBX: ffff971a6b16dc30 RCX: dead000000000200
[  656.253183] RDX: 0000000000000001 RSI: ffffa58040f93dd0 RDI: ffff971a6b16dc30
[  656.254484] RBP: ffffa58040f93e50 R08: 000000000000067d R09: 000000000000067d
[  656.255838] R10: 0000000000000000 R11: 0000000000000000 R12: ffff971a6b2b1800
[  656.257181] R13: ffff971a6b16db88 R14: 0000000000000000 R15: ffff971a6b16db50
[  656.258530] FS:  00007fc7bac88fc0(0000) GS:ffff971ad9600000(0000) knlGS:0000000000000000
[  656.260079] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  656.261165] CR2: 00007fc7ba8704c7 CR3: 000000002d22c001 CR4: 00000000003606f0
[  656.262506] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  656.263690] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  656.265329] Call Trace:
[  656.267958]  ? finish_wait+0x80/0x80
[  656.269083]  group_pin_kill+0x1a/0x30
[  656.269989]  namespace_unlock+0x6f/0x80
[  656.270652]  ksys_umount+0x220/0x420
[  656.271393]  __x64_sys_umount+0x12/0x20
[  656.272249]  do_syscall_64+0x5b/0x160
[  656.272988]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  656.273942] RIP: 0033:0x7fc7b9cd9117
[  656.274630] Code: ed 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 31 f6 e9 09 00 00 00 66 0f 1f 84 00 00 00 00 00 b8 a6 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 59 ed 2b 00 f7 d8 64 89 01 48
[  656.278886] RSP: 002b:00007ffe0a557498 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
[  656.281518] RAX: ffffffffffffffda RBX: 0000556bab8bd420 RCX: 00007fc7b9cd9117
[  656.283138] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000556bab8bd600
[  656.284757] RBP: 0000000000000000 R08: 0000556bab8bd620 R09: 00007ffe0a555d00
[  656.286367] R10: 0000000000000000 R11: 0000000000000246 R12: 0000556bab8bd600
[  656.288408] R13: 00007fc7baa7f1a4 R14: 0000000000000000 R15: 00007ffe0a557708
Alan Jenkins Oct. 7, 2018, 10:48 a.m. UTC | #2
On 05/10/2018 19:24, Alan Jenkins wrote:
> On 21/09/2018 17:30, David Howells wrote:
>> From: Al Viro <viro@zeniv.linux.org.uk>
>>
>> Allow a detached tree created by open_tree(..., OPEN_TREE_CLONE) to be
>> attached by move_mount(2).
>>
>> If by the time of final fput() of OPEN_TREE_CLONE-opened file its 
>> tree is
>> not detached anymore, it won't be dissolved.  move_mount(2) is adjusted
>> to handle detached source.
>>
>> That gives us equivalents of mount --bind and mount --rbind.
>>
>> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
>> Signed-off-by: David Howells <dhowells@redhat.com>
>> ---
>>
>>   fs/namespace.c |   26 ++++++++++++++++++++------
>>   1 file changed, 20 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index dd38141b1723..caf5c55ef555 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -1785,8 +1785,10 @@ void dissolve_on_fput(struct vfsmount *mnt)
>>   {
>>       namespace_lock();
>>       lock_mount_hash();
>> -    mntget(mnt);
>> -    umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
>> +    if (!real_mount(mnt)->mnt_ns) {
>> +        mntget(mnt);
>> +        umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
>> +    }
>>       unlock_mount_hash();
>>       namespace_unlock();
>>   }
>> @@ -2393,6 +2395,7 @@ static int do_move_mount(struct path *old_path, 
>> struct path *new_path)
>>       struct mount *old;
>>       struct mountpoint *mp;
>>       int err;
>> +    bool attached;
>>         mp = lock_mount(new_path);
>>       err = PTR_ERR(mp);
>> @@ -2403,10 +2406,19 @@ static int do_move_mount(struct path 
>> *old_path, struct path *new_path)
>>       p = real_mount(new_path->mnt);
>>         err = -EINVAL;
>> -    if (!check_mnt(p) || !check_mnt(old))
>> +    /* The mountpoint must be in our namespace. */
>> +    if (!check_mnt(p))
>> +        goto out1;
>> +    /* The thing moved should be either ours or completely 
>> unattached. */
>> +    if (old->mnt_ns && !check_mnt(old))
>>           goto out1;
>>   -    if (!mnt_has_parent(old))
>> +    attached = mnt_has_parent(old);
>> +    /*
>> +     * We need to allow open_tree(OPEN_TREE_CLONE) followed by
>> +     * move_mount(), but mustn't allow "/" to be moved.
>> +     */
>> +    if (old->mnt_ns && !attached)
>>           goto out1;
>>         if (old->mnt.mnt_flags & MNT_LOCKED)
>
> Hi
>
> I replied last time to wonder about the MNT_UMOUNT mnt_flag. So I've 
> tested it now :-), on David's current tree (commit 5581f4935add).
>
> The modified do_move_mount() allows re-attaching something that was 
> lazy-unmounted. But the lazy unmount sets MNT_UMOUNT. And this flag is 
> not cleared when the mount is re-attached.
>
> I wasn't sure what effect this would have. Luckily it showed up 
> straight away, when I tried to unmount again. It causes a soft lockup.
>
> Debug printk:
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 4dfe7e23b7ee..ac8de9191cfe 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2472,6 +2472,10 @@ static int do_move_mount(struct path *old_path, 
> struct path *new_path)
>      if (old->mnt.mnt_flags & MNT_LOCKED)
>          goto out1;
>
> +    pr_info("mnt_flags=%x umount=%x\n",
> +            (unsigned) old->mnt.mnt_flags,
> +            (unsigned) !!(old->mnt.mnt_flags & MNT_UMOUNT);
> +
>      if (old_path->dentry != old_path->mnt->mnt_root)
>          goto out1;

The lockup seems to be a general problem with the cleanup code. Even if 
I use this as advertised, i.e. for a simple bind mount.

(I was suspicious that being able to pass around detached trees as an 
FD, and re-attach them in any namespace, allows leaking memory by 
creating a namespace loop.  I.e. maybe it gives you enough rope to skip 
the test in mnt_ns_loop().  But I didn't get that far).

I converted test-fsmount.c for my own purposes:

diff --git a/samples/vfs/test-fsmount.c b/samples/vfs/test-fsmount.c
index 74124025ade0..da6e3fbf0513 100644
--- a/samples/vfs/test-fsmount.c
+++ b/samples/vfs/test-fsmount.c
@@ -83,6 +83,11 @@ static inline int move_mount(int from_dfd, const char *from_pathname,
  		       to_dfd, to_pathname, flags);
  }
  
+static inline int open_tree(int dfd, const char *pathname, unsigned flags)
+{
+	return syscall(__NR_open_tree, dfd, pathname, flags);
+}
+
  #define E_fsconfig(fd, cmd, key, val, aux)				\
  	do {								\
  		if (fsconfig(fd, cmd, key, val, aux) == -1)		\
@@ -93,6 +98,7 @@ int main(int argc, char *argv[])
  {
  	int fsfd, mfd;
  
+#if 0
  	/* Mount a publically available AFS filesystem */
  	fsfd = fsopen("afs", 0);
  	if (fsfd == -1) {
@@ -115,4 +121,9 @@ int main(int argc, char *argv[])
  
  	E(close(mfd));
  	exit(0);
+#endif
+
+	E( mfd = open_tree(-1, "/mnt", OPEN_TREE_CLONE) );
+	E( fchdir(mfd) );
+	E( execl("/bin/bash", "/bin/bash", NULL) );
  }

If I close() the mount FD "mfd", and then do "mount --move . /mnt", my 
printk() shows MNT_UMOUNT has been set. ( I guess fchdir() works more 
like openat(... , O_PATH) than dup() ). Then unmounting /mnt hangs, as I 
would expect from my previous test.

If I instead do the mount+unmount first, and close the FD as a second 
step, I think there's a lockup in the close().  The lockup happens in 
the same place as the unmount lockup from before. (Except there's a line 
"Code: Bad RIP value", I don't know why that happens).

# unshare --mount
# test-fsmount
# mount --move . /mnt
[  270.859542] umount=0 mnt_flags=20

Check the flags are still the same:

# mount --move /mnt /mnt
[  305./mnt: mount(2) system call failed: Too many levels of symbolic links.
[  313.737030] umount=0 mnt_flags=20

Clean up the bind mount, and then the inherited mount FD.

# cd
# umount /mnt
# exit

[  351.898629] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [bash:1483]
[  351.899841] Modules linked in: xt_CHECKSUM(E) ipt_MASQUERADE(E) tun(E) bridge(E) stp(E) llc(E) ip6t_rpfilter(E) ip6t_REJECT(E) nf_reject_ipv6(E) xt_conntrack(E) ip6table_nat(E) nf_nat_ipv6(E) devlink(E) ip6table_mangle(E) ip6table_raw(E) ip6table_security(E) iptable_nat(E) nf_nat_ipv4(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) libcrc32c(E) nf_defrag_ipv4(E) iptable_mangle(E) iptable_raw(E) iptable_security(E) ip6table_filter(E) ip6_tables(E) snd_hda_codec_generic(E) snd_hda_intel(E) snd_hda_codec(E) snd_hwdep(E) snd_hda_core(E) snd_seq(E) snd_seq_device(E) snd_pcm(E) joydev(E) crc32_pclmul(E) snd_timer(E) ghash_clmulni_intel(E) snd(E) crct10dif_pclmul(E) virtio_balloon(E) serio_raw(E) soundcore(E) crc32c_intel(E) qxl(E) drm_kms_helper(E) virtio_console(E) ttm(E) virtio_net(E) net_failover(E)
[  351.912077]  failover(E) drm(E) qemu_fw_cfg(E) pata_acpi(E) ata_generic(E)
[  351.912888] CPU: 0 PID: 1483 Comm: bash Tainted: G            E     4.19.0-rc3+ #7
[  351.914221] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 04/01/2014
[  351.916582] RIP: 0010:pin_kill+0x128/0x140
[  351.917369] Code: f2 5a 00 48 8b 44 24 20 48 39 c5 0f 84 6f ff ff ff 48 89 df e8 e9 4a 5b 00 8b 43 18 85 c0 7e b3 c6 03 00 fb 66 0f 1f 44 00 00 <e9> 51 ff ff ff e8 be 11 dd ff 0f 1f 40 00 66 2e 0f 1f 84 00 00 00
[  351.920729] RSP: 0018:ffffa1b381be3d88 EFLAGS: 00000202 ORIG_RAX: ffffffffffffff13
[  351.921801] RAX: 0000000000000000 RBX: ffff909cf2ea68b0 RCX: dead000000000200
[  351.922807] RDX: 0000000000000001 RSI: ffffa1b381be3d28 RDI: ffff909cf2ea68b0
[  351.923811] RBP: ffffa1b381be3da8 R08: ffff909d59621760 R09: 0000000000000000
[  351.924813] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000010000000
[  351.925818] R13: ffff909cf5db9a38 R14: ffff909cf2ea67a0 R15: ffff909cedc07300
[  351.926824] FS:  00007f1eb90ac740(0000) GS:ffff909d59600000(0000) knlGS:0000000000000000
[  351.927957] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  351.928772] CR2: 00007f1eabedb180 CR3: 000000000f20a003 CR4: 00000000003606f0
[  351.929779] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  351.930785] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  351.931791] Call Trace:
[  351.932160]  ? finish_wait+0x80/0x80
[  351.932684]  group_pin_kill+0x1a/0x30
[  351.933207]  namespace_unlock+0x6f/0x80
[  351.933766]  __fput+0x239/0x240
[  351.934217]  task_work_run+0x84/0xa0
[  351.934743]  do_exit+0x2d3/0xae0
[  351.935206]  ? __do_page_fault+0x263/0x4e0
[  351.935799]  do_group_exit+0x3a/0xa0
[  351.936307]  __x64_sys_exit_group+0x14/0x20
[  351.936911]  do_syscall_64+0x5b/0x160
[  351.937436]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  351.938164] RIP: 0033:0x7f1eb877adb6
[  351.938688] Code: Bad RIP value.
[  351.939149] RSP: 002b:00007ffd56e019d8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
[  351.940216] RAX: ffffffffffffffda RBX: 00007f1eb8a69740 RCX: 00007f1eb877adb6
[  351.941222] RDX: 0000000000000000 RSI: 000000000000003c RDI: 0000000000000000
[  351.942229] RBP: 0000000000000000 R08: 00000000000000e7 R09: ffffffffffffff80
[  351.943236] R10: 00007ffd56e0188a R11: 0000000000000246 R12: 00007f1eb8a69740
[  351.944242] R13: 0000000000000001 R14: 00007f1eb8a72708 R15: 0000000000000000
Alan Jenkins Oct. 7, 2018, 7:20 p.m. UTC | #3
On 07/10/2018 11:48, Alan Jenkins wrote:
> On 05/10/2018 19:24, Alan Jenkins wrote:
>> On 21/09/2018 17:30, David Howells wrote:
>>> From: Al Viro <viro@zeniv.linux.org.uk>
>>>
>>> Allow a detached tree created by open_tree(..., OPEN_TREE_CLONE) to be
>>> attached by move_mount(2).
>>>
>>> If by the time of final fput() of OPEN_TREE_CLONE-opened file its 
>>> tree is
>>> not detached anymore, it won't be dissolved.  move_mount(2) is adjusted
>>> to handle detached source.
>>>
>>> That gives us equivalents of mount --bind and mount --rbind.
>>>
>>> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
>>> Signed-off-by: David Howells <dhowells@redhat.com>
>>> ---
>>>
>>>   fs/namespace.c |   26 ++++++++++++++++++++------
>>>   1 file changed, 20 insertions(+), 6 deletions(-)

>>> The lockup seems to be a general problem with the cleanup code. Even 
>>> if I use this as advertised, i.e. for a simple bind mount.

Ah, I see.  The problem is you were expecting me to use the FD from 
open_tree() directly.  But I did fchdir() into the FD, and then "mount 
--move . /mnt" :-).

If I use the FD directly, it avoids the hang.  I used two separate C 
programs (attached, to avoid my MUA damage)...

> (I was suspicious that being able to pass around detached trees as an 
> FD, and re-attach them in any namespace, allows leaking memory by 
> creating a namespace loop.  I.e. maybe it gives you enough rope to 
> skip the test in mnt_ns_loop().

...so here's the memory leak.

# open_tree --help
usage: open_tree 3</source/path FD_NUMBER COMMAND...
# move_mount --help
usage: move_mount 3</from/path 4</to/path

Create a child namespace:

# mount --make-shared /tmp
# cd /tmp
# mkdir private_mnt
# mount -t tmpfs tmp private_mnt
# mount --make-private private_mnt
# touch private_mnt/child_ns
# unshare --mount=private_mnt/child_ns --propagation=shared ls -l /proc/self/ns/mnt
lrwxrwxrwx. 1 root root 0 Oct  7 19:23 /proc/self/ns/mnt -> 'mnt:[4026532334]'
# findmnt | grep /tmp
├─/tmp                                tmpfs                  tmpfs      rw,nosuid,nodev,seclabel,size=1247640k,nr_inodes=311910
│ └─/tmp/private_mnt                  tmp                    tmpfs      rw,relatime,seclabel,uid=1000,gid=1000
│   └─/tmp/private_mnt/child_ns       nsfs[mnt:[4026532334]] nsfs       rw,seclabel


Create a reference cycle:

# ~/test-open_tree 3</tmp/private_mnt 3 \
     nsenter --mount=/tmp/private_mnt/child_ns \
     sh -c '~/test-move_mount 4</mnt'

Attach 10MB of memory to the cycle:

# grep Shmem: /proc/meminfo
Shmem:              1464 kB
# dd if=/dev/zero of=/tmp/private_mnt/bigfile bs=1M count=10
10+0 records in
10+0 records out
10485760 bytes (10 MB, 10 MiB) copied, 0.00976358 s, 1.1 GB/s
# grep Shmem: /proc/meminfo
Shmem:             11704 kB

Detach the cycle, and leak all the memory:

# umount -l /tmp/private_mnt/
# grep Shmem: /proc/meminfo
Shmem:             11704 kB
diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile
index 4ac9690fb3c4..13a32e125a74 100644
--- a/samples/vfs/Makefile
+++ b/samples/vfs/Makefile
@@ -1,10 +1,14 @@
 # List of programs to build
 hostprogs-$(CONFIG_SAMPLE_VFS) := \
 	test-fsmount \
+	open_tree \
+	move_mount \
 	test-statx
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 
 HOSTCFLAGS_test-fsmount.o += -I$(objtree)/usr/include
+HOSTCFLAGS_open_tree.o += -I$(objtree)/usr/include
+HOSTCFLAGS_move_mount.o += -I$(objtree)/usr/include
 HOSTCFLAGS_test-statx.o += -I$(objtree)/usr/include
diff --git a/samples/vfs/open_tree.c b/samples/vfs/open_tree.c
new file mode 100644
index 000000000000..6222e69048f9
--- /dev/null
+++ b/samples/vfs/open_tree.c
@@ -0,0 +1,54 @@
+/* fd-based mount test.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <linux/fs.h>
+#include <linux/unistd.h>
+
+#ifndef AT_RECURSIVE
+#define AT_RECURSIVE 0x8000
+#endif
+
+#define E(x) do { if ((x) == -1) { perror(#x); exit(1); } } while(0)
+
+static inline int open_tree(int dfd, const char *pathname, unsigned flags)
+{
+	return syscall(__NR_open_tree, dfd, pathname, flags);
+}
+
+int main(int argc, char *argv[])
+{
+	int fd_number;
+	char **command;
+	int mfd;
+
+	if (argc < 3 || !isdigit(argv[1][0])) {
+		fprintf(stderr, "usage: open_tree 3</source/path FD_NUMBER COMMAND...\n");
+		exit(2);
+	}
+	fd_number = atoi(argv[1]);
+	command = argv + 2;
+
+	E( mfd = open_tree(3, "", AT_EMPTY_PATH | OPEN_TREE_CLONE | AT_RECURSIVE) );
+	if (fd_number != mfd) {
+		E( dup2(mfd, fd_number) );
+		E( close(mfd) );
+	}
+	E( execvp(command[0], command) );
+}
diff --git a/samples/vfs/move_mount.c b/samples/vfs/move_mount.c
new file mode 100644
index 000000000000..1bd2122245e2
--- /dev/null
+++ b/samples/vfs/move_mount.c
@@ -0,0 +1,47 @@
+/* fd-based mount test.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <linux/fs.h>
+#include <linux/unistd.h>
+
+#define E(x) do { if ((x) == -1) { perror(#x); exit(1); } } while(0)
+
+static inline int move_mount(int from_dfd, const char *from_pathname,
+			     int to_dfd, const char *to_pathname,
+			     unsigned int flags)
+{
+	return syscall(__NR_move_mount,
+		       from_dfd, from_pathname,
+		       to_dfd, to_pathname, flags);
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc != 1) {
+		fprintf(stderr, "usage: move_mount 3</from/path 4</to/path\n");
+		exit(2);
+	}
+
+	if (move_mount(3, "", 4, "", MOVE_MOUNT_F_EMPTY_PATH |
+		                     MOVE_MOUNT_T_EMPTY_PATH) < 0) {
+		perror("move_mount");
+		exit(1);
+	}
+
+	exit(0);
+}
David Howells Oct. 10, 2018, 11:56 a.m. UTC | #4
Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:

> I replied last time to wonder about the MNT_UMOUNT mnt_flag. So I've tested it
> now :-), on David's current tree (commit 5581f4935add).
> 
> The modified do_move_mount() allows re-attaching something that was
> lazy-unmounted. But the lazy unmount sets MNT_UMOUNT. And this flag is not
> cleared when the mount is re-attached.

Sorry, yes.  I'm not sure what the best way to deal with this is.  Should it
just return -EPERM or -ESTALE if MNT_UMOUNT is set?

David
David Howells Oct. 10, 2018, 12:31 p.m. UTC | #5
Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:

> # mount --move . /mnt

is this calling move_mount(2) on your system?

David
David Howells Oct. 10, 2018, 12:36 p.m. UTC | #6
Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:

> + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
> + * Written by David Howells (dhowells@redhat.com)

Do you want to update that and I can take them into my patchset?

David
Alan Jenkins Oct. 10, 2018, 12:39 p.m. UTC | #7
On 10/10/2018 13:31, David Howells wrote:
> Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:
>
>> # mount --move . /mnt
> is this calling move_mount(2) on your system?
>
> David

No. That was an unpatched mount program, from util-linux.

Alan
David Howells Oct. 10, 2018, 12:50 p.m. UTC | #8
Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:

>  +	pr_info("mnt_flags=%x umount=%x\n",
> +	        (unsigned) old->mnt.mnt_flags,
> +	        (unsigned) !!(old->mnt.mnt_flags & MNT_UMOUNT);
> +

Note that this doesn't actually compile, for want of a bracket.

David
David Howells Oct. 10, 2018, 1:02 p.m. UTC | #9
The attached change seems to fix the lazy-umount problem.

David
---
diff --git a/fs/namespace.c b/fs/namespace.c
index 5adeeea2a4d9..d43f0fa152e9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2472,7 +2472,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
 	if (old->mnt_ns && !attached)
 		goto out1;
 
-	if (old->mnt.mnt_flags & MNT_LOCKED)
+	if (old->mnt.mnt_flags & (MNT_LOCKED | MNT_UMOUNT))
 		goto out1;
 
 	if (old_path->dentry != old_path->mnt->mnt_root)
Alan Jenkins Oct. 10, 2018, 1:06 p.m. UTC | #10
On 10/10/2018 14:02, David Howells wrote:
> The attached change seems to fix the lazy-umount problem.
>
> David
> ---
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 5adeeea2a4d9..d43f0fa152e9 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2472,7 +2472,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>   	if (old->mnt_ns && !attached)
>   		goto out1;
>   
> -	if (old->mnt.mnt_flags & MNT_LOCKED)
> +	if (old->mnt.mnt_flags & (MNT_LOCKED | MNT_UMOUNT))
>   		goto out1;
>   
>   	if (old_path->dentry != old_path->mnt->mnt_root)


I can't test any more at the moment, as my laptop died today :). But I 
have no objection to this.

It would be more fun if there was a way to support it :), but I don't 
have a genuine reason to want it.  And you couldn't use it for fully 
general purposes anyway, because umount2( , MNT_DETACH) is defined as 
separating all the child mounts.

P.S. Regarding the issue with the namespace loop.  My strawman solution 
would be for graft_tree() to silently detach any NS file mounts that 
have a sequence number less than or equal to the namespace they are 
being mounted into.
David Howells Oct. 11, 2018, 9:17 a.m. UTC | #11
Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:

> # unshare --mount=private_mnt/child_ns --propagation=shared ls -l /proc/self/ns/mnt

I think the problem is that the mount of the nsfs object done by unshare here
pins the new mount namespace - but doesn't add the namespace's contents into
the mount tree, so the mount struct cycle-detection code is bypassed.

I think it's fine for all other namespaces, just not the mount namespace.

It looks like this bug might theoretically exist upstream also, though I don't
think there's any way to actually effect it given that mount() doesn't take a
dirfd argument.

The reason that you can do this with open_tree()/move_mount() is that it
allows you to create a mount tree (OPEN_TREE_CLONE) that has no namespace
assignment, pass it through the namespace switch and then attach it inside the
child namespace.  The cross-namespace checks in do_move_mount() are bypassed
because the root of the newly-cloned mount tree doesn't have one.

Unfortunately, just searching the newly-cloned mount tree for a conflicting
nsfs mount doesn't help because the potential loop could be hidden several
levels deep.

I think the simplest solution is to either reject a request for
open_tree(OPEN_TREE_CLONE) if there are any nsfs objects in the source tree,
or to just not copy said objects.

David
---

Test script:

	mount -t tmpfs none /a
	mount --make-shared /a
	cd /a
	mkdir private_mnt
	mount -t tmpfs xxx private_mnt
	mount --make-private private_mnt
	touch private_mnt/child_ns
	unshare --mount=private_mnt/child_ns --propagation=shared \
	    ls -l /proc/self/ns/mnt
	findmnt

	~/open_tree 3</a/private_mnt 3 \
	    nsenter --mount=/a/private_mnt/child_ns \
	    sh -c '~/move_mount 4</mnt'

	grep Shmem: /proc/meminfo
	dd if=/dev/zero of=/a/private_mnt/bigfile bs=1M count=10

	umount -l /a/private_mnt/
	grep Shmem: /proc/meminfo
Alan Jenkins Oct. 11, 2018, 11:48 a.m. UTC | #12
On 11/10/2018 10:17, David Howells wrote:
> Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:
>
>> # unshare --mount=private_mnt/child_ns --propagation=shared ls -l /proc/self/ns/mnt
> I think the problem is that the mount of the nsfs object done by unshare here
> pins the new mount namespace - but doesn't add the namespace's contents into
> the mount tree, so the mount struct cycle-detection code is bypassed.
>
> I think it's fine for all other namespaces, just not the mount namespace.
>
> It looks like this bug might theoretically exist upstream also, though I don't
> think there's any way to actually effect it given that mount() doesn't take a
> dirfd argument.
>
> The reason that you can do this with open_tree()/move_mount() is that it
> allows you to create a mount tree (OPEN_TREE_CLONE) that has no namespace
> assignment, pass it through the namespace switch and then attach it inside the
> child namespace.  The cross-namespace checks in do_move_mount() are bypassed
> because the root of the newly-cloned mount tree doesn't have one.
>
> Unfortunately, just searching the newly-cloned mount tree for a conflicting
> nsfs mount doesn't help because the potential loop could be hidden several
> levels deep.
>
> I think the simplest solution is to either reject a request for
> open_tree(OPEN_TREE_CLONE) if there are any nsfs objects in the source tree,
> or to just not copy said objects.
>
> David

Very clearly written, thank you.  Hum, your solution would mean 
open_tree(OPEN_TREE_CLONE) + move_mount() is not equivalent to the 
current `mount --rbind` :-(.  That does not fit the current patch 
description.

It sounds like you're under-estimating how we can use mnt_ns->seq (as is 
currently used in mnt_ns_loop()).  Or maybe I am over-estimating it :).

In principle, it should suffice for attach_recursive_mount() to check 
the NS sequence numbers of the NS files which are mounted. You can't 
hide the loop at a deeper level inside the NS, because of the existing 
mnt_ns_loop() check.

I think mnt_ns_loop() works 100% correctly upstream, and there is no 
memory leak bug there.  You can pass a mount NS fd between processes in 
arbitrary namespaces, and you can mount it with "mount --no-canonicalize 
--bind /proc/self/fd/3 /other_ns".  But mnt_ns_loop() will only allow 
the mount when the other NS is newer than your own mount namespace.

Upstream also covers mount propagation (and CLONE_NEWNS), by simply not 
propagating mounts of mount NS files.  ( See commit 4ce5d2b1a8fd "vfs: 
Don't copy mount bind mounts of /proc/<pid>/ns/mnt between namespaces" / 
https://unix.stackexchange.com/questions/473717/what-code-prevents-mount-namespace-loops-in-a-more-complex-case-involving-mount-propagation 
)

I think it is more a question of taste :-).  Would it be acceptable to 
prune the tree (or fail?) in move_mount() (and also `mount --move`, if 
you [ab]use it like I did) ?

I suspect we should prefer your solution.  It is clearly simpler, and I 
don't know that anyone really uses `mount --rbind` to clone trees of 
mount NS files.

Either way, I suggest we take care to say whether `mount --rbind` and 
`mount --bind` can be implemented using open_tree() + move_mount(), or 
whether we think it might be undesirable.  (E.g. because someone might 
read the current commit message, and desire to implement `mount 
--bind,ro` atomically, if/when we also have mount_setattr() ).

Regards

Alan


> ---
>
> Test script:
>
> 	mount -t tmpfs none /a
> 	mount --make-shared /a
> 	cd /a
> 	mkdir private_mnt
> 	mount -t tmpfs xxx private_mnt
> 	mount --make-private private_mnt
> 	touch private_mnt/child_ns
> 	unshare --mount=private_mnt/child_ns --propagation=shared \
> 	    ls -l /proc/self/ns/mnt
> 	findmnt
>
> 	~/open_tree 3</a/private_mnt 3 \
> 	    nsenter --mount=/a/private_mnt/child_ns \
> 	    sh -c '~/move_mount 4</mnt'
>
> 	grep Shmem: /proc/meminfo
> 	dd if=/dev/zero of=/a/private_mnt/bigfile bs=1M count=10
>
> 	umount -l /a/private_mnt/
> 	grep Shmem: /proc/meminfo
David Howells Oct. 11, 2018, 12:14 p.m. UTC | #13
David Howells <dhowells@redhat.com> wrote:

> The reason that you can do this with open_tree()/move_mount() is that it
> allows you to create a mount tree (OPEN_TREE_CLONE) that has no namespace
> assignment, pass it through the namespace switch and then attach it inside the
> child namespace.  The cross-namespace checks in do_move_mount() are bypassed
> because the root of the newly-cloned mount tree doesn't have one.

It's worse than that.  The apparently disconnected tree given you by
open_tree(OPEN_TREE_CLONE) is still subject to modification by outside
forces.  All it takes is one shared object within that tree.

So I do wonder if it's possible to form a ring, even in an upstream kernel, by
using the propagation mechanism to push through an nsfs mount into itself,
possibly with a layer of indirection (ie. having two mutually-referential
namespaces).

David
Alan Jenkins Oct. 11, 2018, 12:23 p.m. UTC | #14
On 11/10/2018 13:14, David Howells wrote:
> David Howells <dhowells@redhat.com> wrote:
>
>> The reason that you can do this with open_tree()/move_mount() is that it
>> allows you to create a mount tree (OPEN_TREE_CLONE) that has no namespace
>> assignment, pass it through the namespace switch and then attach it inside the
>> child namespace.  The cross-namespace checks in do_move_mount() are bypassed
>> because the root of the newly-cloned mount tree doesn't have one.
> It's worse than that.  The apparently disconnected tree given you by
> open_tree(OPEN_TREE_CLONE) is still subject to modification by outside
> forces.  All it takes is one shared object within that tree.
>
> So I do wonder if it's possible to form a ring, even in an upstream kernel, by
> using the propagation mechanism to push through an nsfs mount into itself,
> possibly with a layer of indirection (ie. having two mutually-referential
> namespaces).
>
> David

Upstream does cover the mount propagation case, by simply never 
propagating mounts of mount NS files.  See commit 4ce5d2b1a8fd "vfs: 
Don't copy mount bind mounts of /proc/<pid>/ns/mnt between namespaces" / 
https://unix.stackexchange.com/questions/473717/what-code-prevents-mount-namespace-loops-in-a-more-complex-case-involving-mount-propagation
David Howells Oct. 11, 2018, 1:10 p.m. UTC | #15
Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:

> It sounds like you're under-estimating how we can use mnt_ns->seq (as is
> currently used in mnt_ns_loop()).  Or maybe I am over-estimating it :).

I don't see how it helps.  The duplication and attachment of the nsfs object
is already done by open_tree(), but as it's a detached tree, there are no
namespace assignments on the objects therein.  move_mount() is attaching the
subtree as a whole.

I modified my example to put everything under /a, setting up initially on /a/x
and then moving to /a/y within the namespace.  Then I made it print the mount
tree in more places.  So after setup, I see:

	[root@andromeda x]# findmnt -R /a
	TARGET                          SOURCE
	/a                              none
	\_/a/x                          none
	  \_/a/x/private_mnt            xxx
	    \_/a/x/private_mnt/child_ns nsfs[mnt:[4026532272]]

this looks fine.  Then I do:

	~/open_tree 3</a/x/private_mnt 3 \
	    nsenter --mount=/a/x/private_mnt/child_ns \
	    sh -c 'findmnt -R /a; ~/move_mount 4</a/y; findmnt -R /a'

and I see:

	TARGET               SOURCE
	/a                   none
	\_/a/x               none
	  \_/a/x/private_mnt xxx
	TARGET               SOURCE
	/a                   none
	|_/a/x               none
	| \_/a/x/private_mnt xxx
	\_/a/y               xxx
	  \_/a/y/child_ns    nsfs[mnt:[4026532272]]

in which /a/x/private_mnt got cloned and the clone mounted on "/a/y".

So, you're right, it's nothing to do with propagation.  But I'm not sure how I
check this.  Reject it in move_mount() if there's an nsfs?  I'm not sure if
the seq number is actually useful here.

David
David Howells Oct. 11, 2018, 3:33 p.m. UTC | #16
Okay, this appears to fix the cycle-creation problem.

It could probably be improved by comparing sequence numbers as Alan suggests,
but I need to work out how to get at that.

David
---
commit 069c3376f7849044117c866aeafbb1a525f84926
Author: David Howells <dhowells@redhat.com>
Date:   Thu Oct 4 23:18:59 2018 +0100

    fixes

diff --git a/fs/internal.h b/fs/internal.h
index 17029b30e196..47a6c80c3c51 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -172,6 +172,7 @@ extern void mnt_pin_kill(struct mount *m);
  * fs/nsfs.c
  */
 extern const struct dentry_operations ns_dentry_operations;
+extern struct file_system_type nsfs;
 
 /*
  * fs/ioctl.c
diff --git a/fs/namespace.c b/fs/namespace.c
index e969ded7d54b..25ecd8b3c76b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2388,6 +2388,27 @@ static inline int tree_contains_unbindable(struct mount *mnt)
 	return 0;
 }
 
+/*
+ * Object if there are any nsfs mounts in the specified subtree.  These can act
+ * as pins for mount namespaces that aren't checked by the mount-cycle checking
+ * code, thereby allowing cycles to be made.
+ */
+static bool check_for_nsfs_mounts(struct mount *subtree)
+{
+	struct mount *p;
+	bool ret = false;
+
+	lock_mount_hash();
+	for (p = subtree; p; p = next_mnt(p, subtree))
+		if (p->mnt.mnt_sb->s_type == &nsfs)
+			goto out;
+
+	ret = true;
+out:
+	unlock_mount_hash();
+	return ret;
+}
+
 static int do_move_mount(struct path *old_path, struct path *new_path)
 {
 	struct path parent_path = {.mnt = NULL, .dentry = NULL};
@@ -2442,6 +2463,8 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
 	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
 		goto out1;
 	err = -ELOOP;
+	if (!check_for_nsfs_mounts(old))
+		goto out1;
 	for (; mnt_has_parent(p); p = p->mnt_parent)
 		if (p == old)
 			goto out1;
diff --git a/fs/nsfs.c b/fs/nsfs.c
index f069eb6495b0..d3abcd5c2a23 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -269,7 +269,7 @@ static struct dentry *nsfs_mount(struct file_system_type *fs_type,
 	return mount_pseudo(fs_type, "nsfs:", &nsfs_ops,
 			&ns_dentry_operations, NSFS_MAGIC);
 }
-static struct file_system_type nsfs = {
+struct file_system_type nsfs = {
 	.name = "nsfs",
 	.mount = nsfs_mount,
 	.kill_sb = kill_anon_super,
Eric W. Biederman Oct. 11, 2018, 6:38 p.m. UTC | #17
David Howells <dhowells@redhat.com> writes:

> Okay, this appears to fix the cycle-creation problem.
>
> It could probably be improved by comparing sequence numbers as Alan suggests,
> but I need to work out how to get at that.

It should just be a matter of replacing the test
"if (p->mnt.mnt_sb->s_type == &nsfs)" with "if mnt_ns_loop(p->mnt.mnt_root)"

That would allow reusing 100% of the existing logic, and remove the need
to export file_system_type nsfs;

As your test exists below it will reject a lot more than mount namespace
file descriptors.  It will reject file descriptors for every other
namespace as well.

Eric

> ---
> commit 069c3376f7849044117c866aeafbb1a525f84926
> Author: David Howells <dhowells@redhat.com>
> Date:   Thu Oct 4 23:18:59 2018 +0100
>
>     fixes
>
> diff --git a/fs/internal.h b/fs/internal.h
> index 17029b30e196..47a6c80c3c51 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -172,6 +172,7 @@ extern void mnt_pin_kill(struct mount *m);
>   * fs/nsfs.c
>   */
>  extern const struct dentry_operations ns_dentry_operations;
> +extern struct file_system_type nsfs;
>  
>  /*
>   * fs/ioctl.c
> diff --git a/fs/namespace.c b/fs/namespace.c
> index e969ded7d54b..25ecd8b3c76b 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2388,6 +2388,27 @@ static inline int tree_contains_unbindable(struct mount *mnt)
>  	return 0;
>  }
>  
> +/*
> + * Object if there are any nsfs mounts in the specified subtree.  These can act
> + * as pins for mount namespaces that aren't checked by the mount-cycle checking
> + * code, thereby allowing cycles to be made.
> + */
> +static bool check_for_nsfs_mounts(struct mount *subtree)
> +{
> +	struct mount *p;
> +	bool ret = false;
> +
> +	lock_mount_hash();
> +	for (p = subtree; p; p = next_mnt(p, subtree))
> +		if (p->mnt.mnt_sb->s_type == &nsfs)
> +			goto out;
> +
> +	ret = true;
> +out:
> +	unlock_mount_hash();
> +	return ret;
> +}
> +
>  static int do_move_mount(struct path *old_path, struct path *new_path)
>  {
>  	struct path parent_path = {.mnt = NULL, .dentry = NULL};
> @@ -2442,6 +2463,8 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>  	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
>  		goto out1;
>  	err = -ELOOP;
> +	if (!check_for_nsfs_mounts(old))
> +		goto out1;
>  	for (; mnt_has_parent(p); p = p->mnt_parent)
>  		if (p == old)
>  			goto out1;
> diff --git a/fs/nsfs.c b/fs/nsfs.c
> index f069eb6495b0..d3abcd5c2a23 100644
> --- a/fs/nsfs.c
> +++ b/fs/nsfs.c
> @@ -269,7 +269,7 @@ static struct dentry *nsfs_mount(struct file_system_type *fs_type,
>  	return mount_pseudo(fs_type, "nsfs:", &nsfs_ops,
>  			&ns_dentry_operations, NSFS_MAGIC);
>  }
> -static struct file_system_type nsfs = {
> +struct file_system_type nsfs = {
>  	.name = "nsfs",
>  	.mount = nsfs_mount,
>  	.kill_sb = kill_anon_super,
David Howells Oct. 11, 2018, 8:17 p.m. UTC | #18
Eric W. Biederman <ebiederm@xmission.com> wrote:

> It should just be a matter of replacing the test
> "if (p->mnt.mnt_sb->s_type == &nsfs)" with "if mnt_ns_loop(p->mnt.mnt_root)"

Okay, the attached seems to work.

Thanks,
David
---
diff --git a/fs/namespace.c b/fs/namespace.c
index e969ded7d54b..5548fb9b7de2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2388,6 +2388,27 @@ static inline int tree_contains_unbindable(struct mount *mnt)
 	return 0;
 }
 
+/*
+ * Object if there are any nsfs mounts in the specified subtree.  These can act
+ * as pins for mount namespaces that aren't checked by the mount-cycle checking
+ * code, thereby allowing cycles to be made.
+ */
+static bool check_for_nsfs_mounts(struct mount *subtree)
+{
+	struct mount *p;
+	bool ret = false;
+
+	lock_mount_hash();
+	for (p = subtree; p; p = next_mnt(p, subtree))
+		if (mnt_ns_loop(p->mnt.mnt_root))
+			goto out;
+
+	ret = true;
+out:
+	unlock_mount_hash();
+	return ret;
+}
+
 static int do_move_mount(struct path *old_path, struct path *new_path)
 {
 	struct path parent_path = {.mnt = NULL, .dentry = NULL};
@@ -2442,6 +2463,8 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
 	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
 		goto out1;
 	err = -ELOOP;
+	if (!check_for_nsfs_mounts(old))
+		goto out1;
 	for (; mnt_has_parent(p); p = p->mnt_parent)
 		if (p == old)
 			goto out1;
Alan Jenkins Oct. 12, 2018, 2:22 p.m. UTC | #19
On 10/10/2018 13:36, David Howells wrote:
> Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:
>
>> + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
>> + * Written by David Howells (dhowells@redhat.com)
> Do you want to update that and I can take them into my patchset?
>
> David


Sure :).  I've attached a slightly updated version.

Thanks

Alan
From d9cbfa3398fe9e6269cc76429220d1fc1990b474 Mon Sep 17 00:00:00 2001
From: Alan Jenkins <alan.christopher.jenkins@gmail.com>
Date: Fri, 12 Oct 2018 11:11:13 +0100
Subject: [PATCH] vfs: tiny sample programs for open_tree() and move_mount()

A couple of utility commands for fd-based mounts.  They were useful to
reproduce three different issues, in the original implementation of the
system calls.

Also add .gitignore for all the vfs samples.

Signed-off-by: Alan Jenkins <alan.christopher.jenkins@gmail.com>
---
 samples/vfs/.gitignore        |  4 +++
 samples/vfs/Makefile          |  4 +++
 samples/vfs/move_mount.c      | 42 ++++++++++++++++++++++
 samples/vfs/open_tree_clone.c | 65 +++++++++++++++++++++++++++++++++++
 4 files changed, 115 insertions(+)
 create mode 100644 samples/vfs/.gitignore
 create mode 100644 samples/vfs/move_mount.c
 create mode 100644 samples/vfs/open_tree_clone.c

diff --git a/samples/vfs/.gitignore b/samples/vfs/.gitignore
new file mode 100644
index 000000000000..242ed23f90d2
--- /dev/null
+++ b/samples/vfs/.gitignore
@@ -0,0 +1,4 @@
+open_tree_clone
+move_mount
+test-fsmount
+test-statx
diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile
index 4ac9690fb3c4..3a5bb48d21a0 100644
--- a/samples/vfs/Makefile
+++ b/samples/vfs/Makefile
@@ -1,10 +1,14 @@
 # List of programs to build
 hostprogs-$(CONFIG_SAMPLE_VFS) := \
+	open_tree_clone \
+	move_mount \
 	test-fsmount \
 	test-statx
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 
+HOSTCFLAGS_open_tree_clone.o += -I$(objtree)/usr/include
+HOSTCFLAGS_move_mount.o += -I$(objtree)/usr/include
 HOSTCFLAGS_test-fsmount.o += -I$(objtree)/usr/include
 HOSTCFLAGS_test-statx.o += -I$(objtree)/usr/include
diff --git a/samples/vfs/move_mount.c b/samples/vfs/move_mount.c
new file mode 100644
index 000000000000..2cc9d876078a
--- /dev/null
+++ b/samples/vfs/move_mount.c
@@ -0,0 +1,42 @@
+/* fd-based mount utility.
+ *
+ * Copyright (C) 2018 Alan Jenkins (alan.christopher.jenkins@gmail.com).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fs.h>
+#include <linux/unistd.h>
+
+static inline int move_mount(int from_dfd, const char *from_pathname,
+			     int to_dfd, const char *to_pathname,
+			     unsigned int flags)
+{
+	return syscall(__NR_move_mount,
+		       from_dfd, from_pathname,
+		       to_dfd, to_pathname, flags);
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc != 1) {
+		fprintf(stderr, "Usage: move_mount 3</from/path 4</to/path\n");
+		exit(2);
+	}
+
+	if (move_mount(3, "", 4, "", MOVE_MOUNT_F_EMPTY_PATH |
+		                     MOVE_MOUNT_T_EMPTY_PATH) < 0) {
+		perror("move_mount");
+		exit(1);
+	}
+
+	exit(0);
+}
diff --git a/samples/vfs/open_tree_clone.c b/samples/vfs/open_tree_clone.c
new file mode 100644
index 000000000000..3025d101addc
--- /dev/null
+++ b/samples/vfs/open_tree_clone.c
@@ -0,0 +1,65 @@
+/* fd-based mount utility.
+ *
+ * Inspired by http://skarnet.org/software/execline/redirfd.html etc.
+ *
+ * Copyright (C) 2018 Alan Jenkins (alan.christopher.jenkins@gmail.com).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fs.h>
+#include <linux/unistd.h>
+
+#ifndef AT_RECURSIVE
+#define AT_RECURSIVE 0x8000
+#endif
+
+#define E(x) do { if ((x) == -1) { perror(#x); exit(1); } } while(0)
+
+static inline int open_tree(int dfd, const char *pathname, unsigned flags)
+{
+	return syscall(__NR_open_tree, dfd, pathname, flags);
+}
+
+int main(int argc, char *argv[])
+{
+	int fd_number;
+	char **command;
+	int mfd;
+
+	if (argc < 3 || !isdigit(argv[1][0])) {
+		fprintf(stderr, "Usage: 3</from/path open_tree_clone FD_NUMBER COMMAND...\n");
+		fprintf(stderr, "Clone the tree mounted at /from/path.\n");
+		fprintf(stderr, "Then execute COMMAND, passing the cloned tree as file FD_NUMBER.\n");
+		fprintf(stderr, "\n");
+		fprintf(stderr, "hint:  3</from/path 4</to/path open_tree_clone 3 move_mount\n");
+		exit(2);
+	}
+	fd_number = atoi(argv[1]);
+	command = argv + 2;
+
+	mfd = open_tree(3, "", AT_EMPTY_PATH | OPEN_TREE_CLONE | AT_RECURSIVE);
+	if (mfd == -1) {
+		perror("open_tree");
+		exit(1);
+	}
+
+	if (fd_number != mfd) {
+		E( dup2(mfd, fd_number) );
+		E( close(mfd) );
+	}
+
+	execvp(command[0], command);
+	perror("exec");
+	exit(1);
+}
David Howells Oct. 12, 2018, 2:54 p.m. UTC | #20
Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:

> +	open_tree_clone \
> +	move_mount \

I'll rename them to test-XXX if you're okay with that.

David
Alan Jenkins Oct. 12, 2018, 2:57 p.m. UTC | #21
On 12/10/2018 15:54, David Howells wrote:
> Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:
>
>> +	open_tree_clone \
>> +	move_mount \
> I'll rename them to test-XXX if you're okay with that.
>
> David


Yes, that's fine.

Feel free to make adaptations you like.  I don't have anything planned 
for them myself, outside of testing the patch series.

Alan
Al Viro Oct. 13, 2018, 6:06 a.m. UTC | #22
On Thu, Oct 11, 2018 at 09:17:54PM +0100, David Howells wrote:
> +/*
> + * Object if there are any nsfs mounts in the specified subtree.  These can act
> + * as pins for mount namespaces that aren't checked by the mount-cycle checking
> + * code, thereby allowing cycles to be made.
> + */
> +static bool check_for_nsfs_mounts(struct mount *subtree)
> +{
> +	struct mount *p;
> +	bool ret = false;
> +
> +	lock_mount_hash();
> +	for (p = subtree; p; p = next_mnt(p, subtree))
> +		if (mnt_ns_loop(p->mnt.mnt_root))
> +			goto out;
> +
> +	ret = true;
> +out:
> +	unlock_mount_hash();
> +	return ret;
> +}

Umm...  The comment doesn't match the behaviour - you are
accepting references to later namespaces.  Behaviour is
not a problem, the comment is.
Alan Jenkins Oct. 17, 2018, 5:45 p.m. UTC | #23
Hi David.  I think there's an outstanding point below, have you been 
thinking about it?

On 07/10/2018 11:48, Alan Jenkins wrote:
> On 05/10/2018 19:24, Alan Jenkins wrote:
>> On 21/09/2018 17:30, David Howells wrote:
>>> From: Al Viro <viro@zeniv.linux.org.uk>
>>>
>>> Allow a detached tree created by open_tree(..., OPEN_TREE_CLONE) to be
>>> attached by move_mount(2).
>>>
>>> If by the time of final fput() of OPEN_TREE_CLONE-opened file its 
>>> tree is
>>> not detached anymore, it won't be dissolved.  move_mount(2) is adjusted
>>> to handle detached source.
>>>
>>> That gives us equivalents of mount --bind and mount --rbind.
>>>
>>> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
>>> Signed-off-by: David Howells <dhowells@redhat.com>
>>> ---
>>>
>>>   fs/namespace.c |   26 ++++++++++++++++++++------
>>>   1 file changed, 20 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/fs/namespace.c b/fs/namespace.c
>>> index dd38141b1723..caf5c55ef555 100644
>>> --- a/fs/namespace.c
>>> +++ b/fs/namespace.c
>>> @@ -1785,8 +1785,10 @@ void dissolve_on_fput(struct vfsmount *mnt)
>>>   {
>>>       namespace_lock();
>>>       lock_mount_hash();
>>> -    mntget(mnt);
>>> -    umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
>>> +    if (!real_mount(mnt)->mnt_ns) {
>>> +        mntget(mnt);
>>> +        umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
>>> +    }
>>>       unlock_mount_hash();
>>>       namespace_unlock();
>>>   }
>>> @@ -2393,6 +2395,7 @@ static int do_move_mount(struct path 
>>> *old_path, struct path *new_path)
>>>       struct mount *old;
>>>       struct mountpoint *mp;
>>>       int err;
>>> +    bool attached;
>>>         mp = lock_mount(new_path);
>>>       err = PTR_ERR(mp);
>>> @@ -2403,10 +2406,19 @@ static int do_move_mount(struct path 
>>> *old_path, struct path *new_path)
>>>       p = real_mount(new_path->mnt);
>>>         err = -EINVAL;
>>> -    if (!check_mnt(p) || !check_mnt(old))
>>> +    /* The mountpoint must be in our namespace. */
>>> +    if (!check_mnt(p))
>>> +        goto out1;
>>> +    /* The thing moved should be either ours or completely 
>>> unattached. */
>>> +    if (old->mnt_ns && !check_mnt(old))
>>>           goto out1;
>>>   -    if (!mnt_has_parent(old))
>>> +    attached = mnt_has_parent(old);
>>> +    /*
>>> +     * We need to allow open_tree(OPEN_TREE_CLONE) followed by
>>> +     * move_mount(), but mustn't allow "/" to be moved.
>>> +     */
>>> +    if (old->mnt_ns && !attached)
>>>           goto out1;
>>>         if (old->mnt.mnt_flags & MNT_LOCKED)
>>
>> Hi
>>
>> I replied last time to wonder about the MNT_UMOUNT mnt_flag. So I've 
>> tested it now :-), on David's current tree (commit 5581f4935add).
>>
>> The modified do_move_mount() allows re-attaching something that was 
>> lazy-unmounted. But the lazy unmount sets MNT_UMOUNT. And this flag 
>> is not cleared when the mount is re-attached.
>>
>> I wasn't sure what effect this would have. Luckily it showed up 
>> straight away, when I tried to unmount again. It causes a soft lockup.
>>
>> Debug printk:
>>
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index 4dfe7e23b7ee..ac8de9191cfe 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -2472,6 +2472,10 @@ static int do_move_mount(struct path 
>> *old_path, struct path *new_path)
>>      if (old->mnt.mnt_flags & MNT_LOCKED)
>>          goto out1;
>>
>> +    pr_info("mnt_flags=%x umount=%x\n",
>> +            (unsigned) old->mnt.mnt_flags,
>> +            (unsigned) !!(old->mnt.mnt_flags & MNT_UMOUNT);
>> +
>>      if (old_path->dentry != old_path->mnt->mnt_root)
>>          goto out1;
>
> The lockup seems to be a general problem with the cleanup code. Even 
> if I use this as advertised, i.e. for a simple bind mount.
>
> (I was suspicious that being able to pass around detached trees as an 
> FD, and re-attach them in any namespace, allows leaking memory by 
> creating a namespace loop.  I.e. maybe it gives you enough rope to 
> skip the test in mnt_ns_loop().  But I didn't get that far).
>
> I converted test-fsmount.c for my own purposes:
>
> diff --git a/samples/vfs/test-fsmount.c b/samples/vfs/test-fsmount.c
> index 74124025ade0..da6e3fbf0513 100644
> --- a/samples/vfs/test-fsmount.c
> +++ b/samples/vfs/test-fsmount.c
> @@ -83,6 +83,11 @@ static inline int move_mount(int from_dfd, const 
> char *from_pathname,
>                 to_dfd, to_pathname, flags);
>  }
>
> +static inline int open_tree(int dfd, const char *pathname, unsigned 
> flags)
> +{
> +    return syscall(__NR_open_tree, dfd, pathname, flags);
> +}
> +
>  #define E_fsconfig(fd, cmd, key, val, aux)                \
>      do {                                \
>          if (fsconfig(fd, cmd, key, val, aux) == -1)        \
> @@ -93,6 +98,7 @@ int main(int argc, char *argv[])
>  {
>      int fsfd, mfd;
>
> +#if 0
>      /* Mount a publically available AFS filesystem */
>      fsfd = fsopen("afs", 0);
>      if (fsfd == -1) {
> @@ -115,4 +121,9 @@ int main(int argc, char *argv[])
>
>      E(close(mfd));
>      exit(0);
> +#endif
> +
> +    E( mfd = open_tree(-1, "/mnt", OPEN_TREE_CLONE) );
> +    E( fchdir(mfd) );
> +    E( execl("/bin/bash", "/bin/bash", NULL) );
>  }
>
> If I close() the mount FD "mfd", and then do "mount --move . /mnt", my 
> printk() shows MNT_UMOUNT has been set. ( I guess fchdir() works more 
> like openat(... , O_PATH) than dup() ). Then unmounting /mnt hangs, as 
> I would expect from my previous test.


^ You posted a diff that would solve this problem


>
>
> If I instead do the mount+unmount first, and close the FD as a second 
> step, I think there's a lockup in the close().  The lockup happens in 
> the same place as the unmount lockup from before.


^ but I don't think you have addressed this problem in your replies so far.

Thanks

Alan


> (Except there's a line "Code: Bad RIP value", I don't know why that 
> happens).
>
> # unshare --mount
> # test-fsmount
> # mount --move . /mnt
> [  270.859542] umount=0 mnt_flags=20
>
> Check the flags are still the same:
>
> # mount --move /mnt /mnt
> [  305./mnt: mount(2) system call failed: Too many levels of symbolic 
> links.
> [  313.737030] umount=0 mnt_flags=20
>
> Clean up the bind mount, and then the inherited mount FD.
>
> # cd
> # umount /mnt
> # exit
>
> [  351.898629] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! 
> [bash:1483]
> [  351.899841] Modules linked in: xt_CHECKSUM(E) ipt_MASQUERADE(E) 
> tun(E) bridge(E) stp(E) llc(E) ip6t_rpfilter(E) ip6t_REJECT(E) 
> nf_reject_ipv6(E) xt_conntrack(E) ip6table_nat(E) nf_nat_ipv6(E) 
> devlink(E) ip6table_mangle(E) ip6table_raw(E) ip6table_security(E) 
> iptable_nat(E) nf_nat_ipv4(E) nf_nat(E) nf_conntrack(E) 
> nf_defrag_ipv6(E) libcrc32c(E) nf_defrag_ipv4(E) iptable_mangle(E) 
> iptable_raw(E) iptable_security(E) ip6table_filter(E) ip6_tables(E) 
> snd_hda_codec_generic(E) snd_hda_intel(E) snd_hda_codec(E) 
> snd_hwdep(E) snd_hda_core(E) snd_seq(E) snd_seq_device(E) snd_pcm(E) 
> joydev(E) crc32_pclmul(E) snd_timer(E) ghash_clmulni_intel(E) snd(E) 
> crct10dif_pclmul(E) virtio_balloon(E) serio_raw(E) soundcore(E) 
> crc32c_intel(E) qxl(E) drm_kms_helper(E) virtio_console(E) ttm(E) 
> virtio_net(E) net_failover(E)
> [  351.912077]  failover(E) drm(E) qemu_fw_cfg(E) pata_acpi(E) 
> ata_generic(E)
> [  351.912888] CPU: 0 PID: 1483 Comm: bash Tainted: G E     
> 4.19.0-rc3+ #7
> [  351.914221] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
> BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 
> 04/01/2014
> [  351.916582] RIP: 0010:pin_kill+0x128/0x140
> [  351.917369] Code: f2 5a 00 48 8b 44 24 20 48 39 c5 0f 84 6f ff ff 
> ff 48 89 df e8 e9 4a 5b 00 8b 43 18 85 c0 7e b3 c6 03 00 fb 66 0f 1f 
> 44 00 00 <e9> 51 ff ff ff e8 be 11 dd ff 0f 1f 40 00 66 2e 0f 1f 84 00 
> 00 00
> [  351.920729] RSP: 0018:ffffa1b381be3d88 EFLAGS: 00000202 ORIG_RAX: 
> ffffffffffffff13
> [  351.921801] RAX: 0000000000000000 RBX: ffff909cf2ea68b0 RCX: 
> dead000000000200
> [  351.922807] RDX: 0000000000000001 RSI: ffffa1b381be3d28 RDI: 
> ffff909cf2ea68b0
> [  351.923811] RBP: ffffa1b381be3da8 R08: ffff909d59621760 R09: 
> 0000000000000000
> [  351.924813] R10: 0000000000000000 R11: 0000000000000000 R12: 
> 0000000010000000
> [  351.925818] R13: ffff909cf5db9a38 R14: ffff909cf2ea67a0 R15: 
> ffff909cedc07300
> [  351.926824] FS:  00007f1eb90ac740(0000) GS:ffff909d59600000(0000) 
> knlGS:0000000000000000
> [  351.927957] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  351.928772] CR2: 00007f1eabedb180 CR3: 000000000f20a003 CR4: 
> 00000000003606f0
> [  351.929779] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
> 0000000000000000
> [  351.930785] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
> 0000000000000400
> [  351.931791] Call Trace:
> [  351.932160]  ? finish_wait+0x80/0x80
> [  351.932684]  group_pin_kill+0x1a/0x30
> [  351.933207]  namespace_unlock+0x6f/0x80
> [  351.933766]  __fput+0x239/0x240
> [  351.934217]  task_work_run+0x84/0xa0
> [  351.934743]  do_exit+0x2d3/0xae0
> [  351.935206]  ? __do_page_fault+0x263/0x4e0
> [  351.935799]  do_group_exit+0x3a/0xa0
> [  351.936307]  __x64_sys_exit_group+0x14/0x20
> [  351.936911]  do_syscall_64+0x5b/0x160
> [  351.937436]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [  351.938164] RIP: 0033:0x7f1eb877adb6
> [  351.938688] Code: Bad RIP value.
> [  351.939149] RSP: 002b:00007ffd56e019d8 EFLAGS: 00000246 ORIG_RAX: 
> 00000000000000e7
> [  351.940216] RAX: ffffffffffffffda RBX: 00007f1eb8a69740 RCX: 
> 00007f1eb877adb6
> [  351.941222] RDX: 0000000000000000 RSI: 000000000000003c RDI: 
> 0000000000000000
> [  351.942229] RBP: 0000000000000000 R08: 00000000000000e7 R09: 
> ffffffffffffff80
> [  351.943236] R10: 00007ffd56e0188a R11: 0000000000000246 R12: 
> 00007f1eb8a69740
> [  351.944242] R13: 0000000000000001 R14: 00007f1eb8a72708 R15: 
> 0000000000000000
>
>
David Howells Oct. 18, 2018, 8:09 p.m. UTC | #24
Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:

> If I instead do the mount+unmount first, and close the FD as a second step, I
> think there's a lockup in the close().  The lockup happens in the same place
> as the unmount lockup from before. (Except there's a line "Code: Bad RIP
> value", I don't know why that happens).

Sorry, which FD are we talking about?

I presume you're talking about a command sequence like this:

	# unshare --mount
	# test-fsmount
	# mount --move . /mnt
	# mount --move /mnt /mnt
	# cd
	# umount /mnt
	# exit

but this fails on your modified test-fsmount with:

	shell-init: error retrieving current directory: getcwd: cannot access
	parent directories: No such file or directory

David
David Howells Oct. 18, 2018, 8:58 p.m. UTC | #25
David Howells <dhowells@redhat.com> wrote:

> but this fails on your modified test-fsmount with:
>
> 	shell-init: error retrieving current directory: getcwd: cannot access
> 	parent directories: No such file or directory

Actually, it doesn't fail at this point, and I do see a splat later in
fsnotify_first_mark().

	static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp)
	{
		struct fsnotify_mark_connector *conn;
		struct hlist_node *node = NULL;

		conn = srcu_dereference(*connp, &fsnotify_mark_srcu);

conn here is 6b6b6b6b6b6b6b6b.

	RIP: 0010:fsnotify_first_mark+0x5f/0xbb

	Call Trace:
	 fsnotify+0x115/0x344
	 ? __fput+0xac/0x1c1
	 __fput+0xac/0x1c1
	 task_work_run+0x78/0x9f
	 do_exit+0x525/0xa05
	 do_group_exit+0xb2/0xb2
	 __x64_sys_exit_group+0x14/0x14
	 do_syscall_64+0x7d/0x1a0
	 entry_SYSCALL_64_after_hwframe+0x49/0xbe

The line in fsnotify is:

			fsnotify_first_mark(&mnt->mnt_fsnotify_marks);

and fsnotify() is called from fsnotify_close().

David
David Howells Oct. 19, 2018, 11:57 a.m. UTC | #26
Okay, I put in a tracepoint (patch attached) and got a trace from the life of
the offending mount object.  I've cropped non-useful information out of the
lines, inserted a blank line every time the usage count goes down to 2 and
dropped most of the lines generated by fsnotify.

Most of the lines are irrelevant.  You can see system calls being issued and
commands being run that make no difference on balance.

What matters are the first four lines, the two mounts and the umount.  You can
see it go splat on the last line when the usage count has become poisoned.

            bash-3597  M=93785f8a u=1 ALC sp=clone_mnt+0x31/0x30a
            bash-3597  M=93785f8a u=2 GET sp=do_dentry_open+0x24/0x2d3
            bash-3597  M=93785f8a u=1 PUT sp=__se_sys_open_tree+0x195/0x1da

^--- These three lines look like the open_tree() syscall done by test-fsmount.

            bash-3597  M=93785f8a u=2 GET sp=set_fs_pwd+0x37/0xdb

^--- And this the fchdir() syscall from test-fsmount.  At this point u=2 would
     seem correct as the subtree isn't actually mounted anywhere (1 for pwd, 1
     for fd).

v--- test-fsmount then called execl() on bash, which did some stat'ing to find
     the executable...

            bash-3597  M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50
            bash-3597  M=93785f8a u=2 PUT sp=vfs_statx+0x95/0xcc

            bash-3597  M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50
            bash-3597  M=93785f8a u=2 PUT sp=vfs_statx+0x95/0xcc

v--- and then exec'd it.

            bash-3597  M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50
            bash-3597  M=93785f8a u=4 GET sp=do_dentry_open+0x24/0x2d3
            bash-3597  M=93785f8a u=3 PUT sp=terminate_walk+0x20/0xda
            bash-3597  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
            bash-3597  M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
            bash-3597  M=93785f8a u=2 PUT sp=__fput+0x180/0x1c1

v--- bash then did stuff:

            bash-3597  M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50
            bash-3597  M=93785f8a u=2 PUT sp=vfs_statx+0x95/0xcc

            bash-3597  M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
     grepconf.sh-3598  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3598  M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
     grepconf.sh-3598  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3598  M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
     grepconf.sh-3598  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3598  M=93785f8a u=5 GET sp=do_dentry_open+0x24/0x2d3
     grepconf.sh-3598  M=93785f8a u=4 PUT sp=terminate_walk+0x20/0xda
     grepconf.sh-3598  M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3598  M=93785f8a u=4 PUT sp=vfs_statx+0x95/0xcc
     grepconf.sh-3598  M=93785f8a u=3 PUT sp=__fput+0x180/0x1c1
     grepconf.sh-3598  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3598  M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
     grepconf.sh-3598  M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde
            grep-3599  M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e
     grepconf.sh-3598  M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

            bash-3597  M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
            bash-3600  M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde
             tty-3601  M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e
            bash-3600  M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde
            tput-3602  M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e
            bash-3600  M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

            bash-3597  M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
            bash-3603  M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde
       dircolors-3604  M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e
            bash-3603  M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

            bash-3597  M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
            grep-3605  M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

            bash-3597  M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
     grepconf.sh-3606  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3606  M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
     grepconf.sh-3606  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3606  M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
     grepconf.sh-3606  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3606  M=93785f8a u=5 GET sp=do_dentry_open+0x24/0x2d3
     grepconf.sh-3606  M=93785f8a u=4 PUT sp=terminate_walk+0x20/0xda
     grepconf.sh-3606  M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3606  M=93785f8a u=4 PUT sp=vfs_statx+0x95/0xcc
     grepconf.sh-3606  M=93785f8a u=3 PUT sp=__fput+0x180/0x1c1
     grepconf.sh-3606  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3606  M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
     grepconf.sh-3606  M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde
            grep-3607  M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e
     grepconf.sh-3606  M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

            bash-3597  M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
     grepconf.sh-3608  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3608  M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
     grepconf.sh-3608  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3608  M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
     grepconf.sh-3608  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3608  M=93785f8a u=5 GET sp=do_dentry_open+0x24/0x2d3
     grepconf.sh-3608  M=93785f8a u=4 PUT sp=terminate_walk+0x20/0xda
     grepconf.sh-3608  M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3608  M=93785f8a u=4 PUT sp=vfs_statx+0x95/0xcc
     grepconf.sh-3608  M=93785f8a u=3 PUT sp=__fput+0x180/0x1c1
     grepconf.sh-3608  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
     grepconf.sh-3608  M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
     grepconf.sh-3608  M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde
            grep-3609  M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e
     grepconf.sh-3608  M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

            bash-3597  M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50
            bash-3597  M=93785f8a u=2 PUT sp=vfs_statx+0x95/0xcc

I ran "mount --move . /mnt":

            bash-3597  M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
           mount-3610  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
           mount-3610  M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
           mount-3610  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
           mount-3610  M=93785f8a u=5 GET sp=do_dentry_open+0x24/0x2d3
           mount-3610  M=93785f8a u=4 PUT sp=terminate_walk+0x20/0xda
           mount-3610  M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50
           mount-3610  M=93785f8a u=4 PUT sp=vfs_statx+0x95/0xcc
           mount-3610  M=93785f8a u=3 PUT sp=__fput+0x180/0x1c1
           mount-3610  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
           mount-3610  M=93785f8a u=3 PUT sp=do_mount+0x715/0x929
           mount-3610  M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

which worked.  Herein lieth the problem.  The usage count should be 3 now (1
for pwd, 1 for fd, 1 for mount) - but how does VFS know that this mount object
isn't mounted yet?

I ran "mount --move /mnt /mnt":

            bash-3597  M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde
           mount-3611  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
           mount-3611  M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc
           mount-3611  M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50
           mount-3611  M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50
           mount-3611  M=93785f8a u=4 PUT sp=do_mount+0x715/0x929
           mount-3611  M=93785f8a u=3 PUT sp=do_mount+0x1cf/0x929
           mount-3611  M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e

which failed with ELOOP.

I ran "cd":

            bash-3597  M=93785f8a u=1 PUT sp=set_fs_pwd+0xb8/0xdb

I ran "umount /mnt":

          umount-3612  M=93785f8a u=2 GET sp=legitimize_path.isra.7+0x16/0x50
          umount-3612  M=93785f8a u=1 PUT sp=vfs_statx+0x95/0xcc
          umount-3612  M=93785f8a u=2 GET sp=legitimize_path.isra.7+0x16/0x50
          umount-3612  M=93785f8a u=1 PUT sp=vfs_statx+0x95/0xcc
          umount-3612  M=93785f8a u=2 GET sp=legitimize_path.isra.7+0x16/0x50
          umount-3612  M=93785f8a u=1 PUT sp=user_statfs+0x61/0x98
          umount-3612  M=93785f8a u=2 GET sp=legitimize_mnt+0x12/0x108
          umount-3612  M=93785f8a u=1 PUT sp=pin_kill+0x11c/0x325
          umount-3612  M=93785f8a u=0 PUT sp=ksys_umount+0x3e8/0x40e
          umount-3612  M=93785f8a u=0 FRE sp=cleanup_mnt+0x4d/0x5e

And finally, I exited the shell, which then tried to release the fd inherited
from open_tree():

            bash-3597  M=93785f8a u=1802201963 NFY sp=__fput+0xac/0x1c1

Note that the subtree attached to the fd has not at this point been directly
mounted by move_mount(), but implicitly mounted by fchdir() into it and then
using mount(MS_MOVE) of "." to "/mnt".

Note also that if I run the commands all as one go rather than pasting them
individually, a crash occurs in pin_kill() instead.

So, I'm not sure how to proceed from here.  There's no easy way to remove the
FMODE_NEED_UNMOUNT flag left by open_tree().

David
---
commit e7c8e6590aa0dd3bf2e10450b8992d496c44be8b
Author: David Howells <dhowells@redhat.com>
Date:   Fri Oct 19 10:38:35 2018 +0100

    mnt_count trace

diff --git a/fs/mount.h b/fs/mount.h
index f39bc9da4d73..124a6dd73936 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -20,7 +20,7 @@ struct mnt_namespace {
 } __randomize_layout;
 
 struct mnt_pcp {
-	int mnt_count;
+	int mnt_countxxx;
 	int mnt_writers;
 };
 
@@ -46,6 +46,7 @@ struct mount {
 	int mnt_count;
 	int mnt_writers;
 #endif
+	atomic_t mnt_count;
 	struct list_head mnt_mounts;	/* list of children, anchored here */
 	struct list_head mnt_child;	/* and going through their mnt_child */
 	struct list_head mnt_instance;	/* mount instance on sb->s_mounts */
diff --git a/fs/namei.c b/fs/namei.c
index fb913148d4d1..da1489f6925c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -460,32 +460,6 @@ int inode_permission(struct inode *inode, int mask)
 }
 EXPORT_SYMBOL(inode_permission);
 
-/**
- * path_get - get a reference to a path
- * @path: path to get the reference to
- *
- * Given a path increment the reference count to the dentry and the vfsmount.
- */
-void path_get(const struct path *path)
-{
-	mntget(path->mnt);
-	dget(path->dentry);
-}
-EXPORT_SYMBOL(path_get);
-
-/**
- * path_put - put a reference to a path
- * @path: path to put the reference to
- *
- * Given a path decrement the reference count to the dentry and the vfsmount.
- */
-void path_put(const struct path *path)
-{
-	dput(path->dentry);
-	mntput(path->mnt);
-}
-EXPORT_SYMBOL(path_put);
-
 #define EMBEDDED_LEVELS 2
 struct nameidata {
 	struct path	path;
diff --git a/fs/namespace.c b/fs/namespace.c
index 6370bfabec99..389e806e1a65 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -29,6 +29,8 @@
 #include <linux/sched/task.h>
 #include <uapi/linux/mount.h>
 #include <linux/fs_context.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/mnt.h>
 
 #include "pnode.h"
 #include "internal.h"
@@ -109,8 +111,10 @@ static int mnt_alloc_id(struct mount *mnt)
 	return 0;
 }
 
-static void mnt_free_id(struct mount *mnt)
+static noinline void mnt_free_id(struct mount *mnt)
 {
+	trace_mnt_count(mnt, mnt->mnt_id, atomic_read(&mnt->mnt_count), 99,
+			__builtin_return_address(0));
 	ida_free(&mnt_id_ida, mnt->mnt_id);
 }
 
@@ -141,6 +145,9 @@ void mnt_release_group_id(struct mount *mnt)
  */
 static inline void mnt_add_count(struct mount *mnt, int n)
 {
+	int u;
+
+#if 0
 #ifdef CONFIG_SMP
 	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
 #else
@@ -148,6 +155,9 @@ static inline void mnt_add_count(struct mount *mnt, int n)
 	mnt->mnt_count += n;
 	preempt_enable();
 #endif
+#endif
+	u = atomic_add_return(n, &mnt->mnt_count);
+	trace_mnt_count(mnt, mnt->mnt_id, u, n, __builtin_return_address(0));
 }
 
 /*
@@ -155,6 +165,7 @@ static inline void mnt_add_count(struct mount *mnt, int n)
  */
 unsigned int mnt_get_count(struct mount *mnt)
 {
+#if 0
 #ifdef CONFIG_SMP
 	unsigned int count = 0;
 	int cpu;
@@ -167,6 +178,8 @@ unsigned int mnt_get_count(struct mount *mnt)
 #else
 	return mnt->mnt_count;
 #endif
+#endif
+	return atomic_read(&mnt->mnt_count);
 }
 
 static void drop_mountpoint(struct fs_pin *p)
@@ -198,11 +211,15 @@ static struct mount *alloc_vfsmnt(const char *name)
 		if (!mnt->mnt_pcp)
 			goto out_free_devname;
 
+#if 0
 		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
+#endif
 #else
 		mnt->mnt_count = 1;
 		mnt->mnt_writers = 0;
 #endif
+		atomic_set(&mnt->mnt_count, 1);
+		trace_mnt_count(mnt, mnt->mnt_id, 1, 0, __builtin_return_address(0));
 
 		INIT_HLIST_NODE(&mnt->mnt_hash);
 		INIT_LIST_HEAD(&mnt->mnt_child);
@@ -1128,7 +1145,7 @@ static void mntput_no_expire(struct mount *mnt)
 	cleanup_mnt(mnt);
 }
 
-void mntput(struct vfsmount *mnt)
+inline void mntput(struct vfsmount *mnt)
 {
 	if (mnt) {
 		struct mount *m = real_mount(mnt);
@@ -1140,7 +1157,7 @@ void mntput(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL(mntput);
 
-struct vfsmount *mntget(struct vfsmount *mnt)
+inline struct vfsmount *mntget(struct vfsmount *mnt)
 {
 	if (mnt)
 		mnt_add_count(real_mount(mnt), 1);
@@ -3970,3 +3987,29 @@ const struct proc_ns_operations mntns_operations = {
 	.install	= mntns_install,
 	.owner		= mntns_owner,
 };
+
+/**
+ * path_get - get a reference to a path
+ * @path: path to get the reference to
+ *
+ * Given a path increment the reference count to the dentry and the vfsmount.
+ */
+void path_get(const struct path *path)
+{
+	mntget(path->mnt);
+	dget(path->dentry);
+}
+EXPORT_SYMBOL(path_get);
+
+/**
+ * path_put - put a reference to a path
+ * @path: path to put the reference to
+ *
+ * Given a path decrement the reference count to the dentry and the vfsmount.
+ */
+void path_put(const struct path *path)
+{
+	dput(path->dentry);
+	mntput(path->mnt);
+}
+EXPORT_SYMBOL(path_put);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ababdbfab537..aaef44d6204c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/srcu.h>
+#include <trace/events/mnt.h>
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
@@ -324,10 +325,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
 	/* global tests shouldn't care about events on child only the specific event */
 	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
-	if (data_is == FSNOTIFY_EVENT_PATH)
+	if (data_is == FSNOTIFY_EVENT_PATH) {
 		mnt = real_mount(((const struct path *)data)->mnt);
-	else
+		trace_mnt_count(mnt, mnt->mnt_id, atomic_read(&mnt->mnt_count), 88,
+				__builtin_return_address(0));
+	} else {
 		mnt = NULL;
+	}
 
 	/*
 	 * Optimization: srcu_read_lock() has a memory barrier which can
diff --git a/include/trace/events/mnt.h b/include/trace/events/mnt.h
new file mode 100644
index 000000000000..da1a981f1a61
--- /dev/null
+++ b/include/trace/events/mnt.h
@@ -0,0 +1,57 @@
+/* Mount tracepoints
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mnt
+
+#if !defined(_TRACE_MNT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MNT_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(mnt_count,
+	    TP_PROTO(const void *mnt, int mnt_id, int mnt_count,
+		     int delta, const void *where),
+
+	    TP_ARGS(mnt, mnt_id, mnt_count, delta, where),
+
+	    TP_STRUCT__entry(
+		    __field(int,			mnt_id		)
+		    __field(int,			mnt_count	)
+		    __field(int,			delta		)
+		    __field(const void *,		mnt		)
+		    __field(const void *,		where		)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->mnt_id = mnt_id;
+		    __entry->mnt_count = mnt_count;
+		    __entry->delta = delta;
+		    __entry->mnt = mnt;
+		    __entry->where = where;
+			   ),
+
+	    TP_printk("M=%p m=%08x u=%d %s sp=%pSR",
+		      __entry->mnt,
+		      __entry->mnt_id,
+		      __entry->mnt_count,
+		      __print_symbolic(__entry->delta,
+				       { 0, "ALC" },
+				       { 1, "GET" },
+				       { -1, "PUT" },
+				       { 88, "NFY" },
+				       { 99, "FRE" }),
+		      __entry->where)
+	    );
+
+#endif /* _TRACE_MNT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
David Howells Oct. 19, 2018, 1:37 p.m. UTC | #27
Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:

> If I close() the mount FD "mfd", and then do "mount --move . /mnt", my
> printk() shows MNT_UMOUNT has been set. ( I guess fchdir() works more like
> openat(... , O_PATH) than dup() ). Then unmounting /mnt hangs, as I would
> expect from my previous test.

Okay, I think the attached should fix it.

The issue being that do_move_mount() calls attach_recursive_mnt() with a NULL
parent_path, which means that the moved-mount doesn't get its refcount
incremented.

David
---
diff --git a/fs/namespace.c b/fs/namespace.c
index 6370bfabec99..ce9fff980549 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1935,7 +1935,8 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
 static int attach_recursive_mnt(struct mount *source_mnt,
 			struct mount *dest_mnt,
 			struct mountpoint *dest_mp,
-			struct path *parent_path)
+			struct path *parent_path,
+			bool moving)
 {
 	HLIST_HEAD(tree_list);
 	struct mnt_namespace *ns = dest_mnt->mnt_ns;
@@ -1976,6 +1977,8 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		attach_mnt(source_mnt, dest_mnt, dest_mp);
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
+		if (moving)
+			mnt_add_count(source_mnt, 1);
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
 		commit_tree(source_mnt);
 	}
@@ -2062,7 +2065,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
 	      d_is_dir(mnt->mnt.mnt_root))
 		return -ENOTDIR;
 
-	return attach_recursive_mnt(mnt, p, mp, NULL);
+	return attach_recursive_mnt(mnt, p, mp, NULL, false);
 }
 
 /*
@@ -2522,7 +2525,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
 			goto out1;
 
 	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
-				   attached ? &parent_path : NULL);
+				   attached ? &parent_path : NULL, true);
 	if (err)
 		goto out1;
Alan Jenkins Oct. 19, 2018, 5:35 p.m. UTC | #28
On 19/10/2018 14:37, David Howells wrote:
> Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:
>
>> If I close() the mount FD "mfd", and then do "mount --move . /mnt", my
>> printk() shows MNT_UMOUNT has been set. ( I guess fchdir() works more like
>> openat(... , O_PATH) than dup() ). Then unmounting /mnt hangs, as I would
>> expect from my previous test.
> Okay, I think the attached should fix it.
>
> The issue being that do_move_mount() calls attach_recursive_mnt() with a NULL
> parent_path, which means that the moved-mount doesn't get its refcount
> incremented.
>
> David
> ---
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 6370bfabec99..ce9fff980549 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1935,7 +1935,8 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
>   static int attach_recursive_mnt(struct mount *source_mnt,
>   			struct mount *dest_mnt,
>   			struct mountpoint *dest_mp,
> -			struct path *parent_path)
> +			struct path *parent_path,
> +			bool moving)
>   {
>   	HLIST_HEAD(tree_list);
>   	struct mnt_namespace *ns = dest_mnt->mnt_ns;
> @@ -1976,6 +1977,8 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>   		attach_mnt(source_mnt, dest_mnt, dest_mp);
>   		touch_mnt_namespace(source_mnt->mnt_ns);
>   	} else {
> +		if (moving)
> +			mnt_add_count(source_mnt, 1);
>   		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
>   		commit_tree(source_mnt);
>   	}
> @@ -2062,7 +2065,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
>   	      d_is_dir(mnt->mnt.mnt_root))
>   		return -ENOTDIR;
>   
> -	return attach_recursive_mnt(mnt, p, mp, NULL);
> +	return attach_recursive_mnt(mnt, p, mp, NULL, false);
>   }
>   
>   /*
> @@ -2522,7 +2525,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>   			goto out1;
>   
>   	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
> -				   attached ? &parent_path : NULL);
> +				   attached ? &parent_path : NULL, true);
>   	if (err)
>   		goto out1;
>   

I guess this tries to fix the second of the two sequences I mentioned - 
mount+unmount, then close the FD.  It doesn't seem to work.

# open_tree_clone 3</mnt 3 sh
# cd /proc/self/fd/3
# mount --move . /mnt
[   41.747831] mnt_flags=1020 umount=0
# cd /
# umount /mnt
umount: /mnt: target is busy

^ a newly introduced bug? I do not remember having this problem before.

# umount -l /mnt
# exec 3<&-  # close FD 3
[   95.984094] watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [sh:1423]
...
[   96.000032] RIP: 0010:pin_kill+0x128/0x140

And the first sequence I mentioned - close the FD, then mount+unmount - 
seems to be unchanged.

# open_tree_clone 3</mnt 3 sh
# cd /proc/self/fd/3
# exec 3<&-  # close FD 3
# mount --move . /mnt
[   76.175127] mnt_flags=8000020 umount=1
# cd /
# umount /mnt
watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [umount:1472]
...
RIP: 0010:pin_kill+0x128/0x140

The close-then-mount test seemed to be solved by the diff you suggested 
earlier.

diff --git a/fs/namespace.c b/fs/namespace.c
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2469,7 +2469,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
  	if (old->mnt_ns && !attached)
  		goto out1;
  
-	if (old->mnt.mnt_flags & MNT_LOCKED)
+	if (old->mnt.mnt_flags & (MNT_LOCKED | MNT_UMOUNT))
  		goto out1;
  
  	if (old_path->dentry != old_path->mnt->mnt_root)

If we can do that, then is it possible to solve mount-unmount-close the 
same way?

@@ -1763,7 +1763,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
  {
  	namespace_lock();
  	lock_mount_hash();
-	if (!real_mount(mnt)->mnt_ns) {
+	if (!real_mount(mnt)->mnt_ns && !(mnt->mnt_flags & MNT_UMOUNT)) {
  		mntget(mnt);
  		umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
  	}

Regards

Alan
David Howells Oct. 19, 2018, 9:35 p.m. UTC | #29
Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:

> And the first sequence I mentioned - close the FD, then mount+unmount - 
> seems to be unchanged.

Unchanged in what sense?  Still breaks?  I thought I'd fixed that - or are we
talking about a different first sequence?

Sorry, I'm losing track of how many different ways of breaking open_tree() and
move_mount() you've posted.  I don't suppose you could post a checklist?

> I guess this tries to fix the second of the two sequences I mentioned - 
> mount+unmount, then close the FD.  It doesn't seem to work.
> 
> # open_tree_clone 3</mnt 3 sh
> # cd /proc/self/fd/3
> # mount --move . /mnt
> [   41.747831] mnt_flags=1020 umount=0
> # cd /
> # umount /mnt
> umount: /mnt: target is busy
> 
> ^ a newly introduced bug? I do not remember having this problem before.
> 
> # umount -l /mnt

Sigh, so I see.  I have the attached trace from this sequence.

David
----
Command "open_tree_clone 3</mnt 3 sh"

              sh-3614  M=421a9872 u=1 ALC sp=clone_mnt+0x31/0x30a
              sh-3614  M=421a9872 u=2 GET sp=do_dentry_open+0x24/0x2d3
              sh-3614  M=421a9872 u=1 PUT sp=__se_sys_open_tree+0x195/0x1da
              sh-3614  M=421a9872 u=2 GET sp=proc_fd_link+0x106/0x124
              sh-3614  M=421a9872 u=1 PUT sp=vfs_statx+0x95/0xcc

Command "cd /proc/self/fd/3":

              sh-3614  M=421a9872 u=2 GET sp=proc_fd_link+0x106/0x124
              sh-3614  M=421a9872 u=3 GET sp=set_fs_pwd+0x37/0xdb
              sh-3614  M=421a9872 u=2 PUT sp=ksys_chdir+0x88/0xbd

              sh-3614  M=421a9872 u=3 GET sp=legitimize_path.isra.7+0x16/0x50
              sh-3614  M=421a9872 u=2 PUT sp=vfs_statx+0x95/0xcc

Command "mount --move . /mnt":

              sh-3614  M=421a9872 u=3 GET sp=copy_fs_struct+0xcc/0xde
           mount-3615  M=421a9872 u=4 GET sp=legitimize_path.isra.7+0x16/0x50
           mount-3615  M=421a9872 u=3 PUT sp=vfs_statx+0x95/0xcc
           mount-3615  M=421a9872 u=4 GET sp=legitimize_path.isra.7+0x16/0x50
           mount-3615  M=421a9872 u=5 GET sp=do_dentry_open+0x24/0x2d3
           mount-3615  M=421a9872 u=4 PUT sp=terminate_walk+0x20/0xda
           mount-3615  M=421a9872 u=5 GET sp=legitimize_path.isra.7+0x16/0x50
           mount-3615  M=421a9872 u=4 PUT sp=vfs_statx+0x95/0xcc
           mount-3615  M=421a9872 u=3 PUT sp=__fput+0x180/0x1c1
           mount-3615  M=421a9872 u=4 GET sp=legitimize_path.isra.7+0x16/0x50
           mount-3615  M=421a9872 u=4 0x4e sp=          (null)
           mount-3615  M=421a9872 u=5 GET sp=do_move_mount+0x216/0x298
           mount-3615  M=421a9872 u=4 PUT sp=do_mount+0x715/0x929
           mount-3615  M=421a9872 u=3 PUT sp=free_fs_struct+0x1e/0x2e

Command "cd /":

              sh-3614  M=421a9872 u=2 PUT sp=set_fs_pwd+0xb8/0xdb

Command "umount /mnt":

          umount-3616  M=421a9872 u=3 GET sp=legitimize_path.isra.7+0x16/0x50
          umount-3616  M=421a9872 u=2 PUT sp=vfs_statx+0x95/0xcc
          umount-3616  M=421a9872 u=3 GET sp=legitimize_path.isra.7+0x16/0x50
          umount-3616  M=421a9872 u=2 PUT sp=vfs_statx+0x95/0xcc
          umount-3616  M=421a9872 u=3 GET sp=legitimize_path.isra.7+0x16/0x50
          umount-3616  M=421a9872 u=2 PUT sp=user_statfs+0x61/0x98
          umount-3616  M=421a9872 u=3 GET sp=legitimize_mnt+0x12/0x108
          umount-3616  M=421a9872 u=2 PUT sp=ksys_umount+0x3e8/0x40e

(Fails, -EBUSY).

Command "umount -l /mnt":

          umount-3617  M=421a9872 u=3 GET sp=legitimize_mnt+0x12/0x108
          umount-3617  M=421a9872 u=2 PUT sp=pin_kill+0x11c/0x325
          umount-3617  M=421a9872 u=1 PUT sp=ksys_umount+0x3e8/0x40e

Command "exec 3<&-":

(Goes weird: bash still responds, but trying to run a command locks up that
shell; can still log in with ssh, but can't then run commands).

David
David Howells Oct. 19, 2018, 9:40 p.m. UTC | #30
Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:

> I guess this tries to fix the second of the two sequences I mentioned - 
> mount+unmount, then close the FD.  It doesn't seem to work.

It fixes this:

    unshare --mount
    /root/test-fsmount
    mount --move . /mnt
    mount --move /mnt /mnt
    cd
    umount /mnt
    exit

Which usually gets a GPF in fsnotify.

David
David Howells Oct. 19, 2018, 10:36 p.m. UTC | #31
Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:

> # open_tree_clone 3</mnt 3 sh
> # cd /proc/self/fd/3
> # mount --move . /mnt
> [   41.747831] mnt_flags=1020 umount=0
> # cd /
> # umount /mnt
> umount: /mnt: target is busy
> 
> ^ a newly introduced bug? I do not remember having this problem before.

The reason EBUSY is returned is because propagate_mount_busy() is called by
do_umount() with refcnt == 2, but mnt_count == 3:

          umount-3577  M=f8898a34 u=3 0x555 sp=__x64_sys_umount+0x12/0x15

the trace line being added here:

		if (!propagate_mount_busy(mnt, 2)) {
			if (!list_empty(&mnt->mnt_list))
				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
			retval = 0;
		} else {
			trace_mnt_count(mnt, mnt->mnt_id,
					atomic_read(&mnt->mnt_count),
					0x555, __builtin_return_address(0));
		}

The busy evaluation is a result of this check:

	if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt))

in propagate_mount_busy().


The problem apparently being that mnt_count counts both refs from mountings
and refs from other sources, such as file descriptors or pathwalk.

David
Al Viro Oct. 20, 2018, 5:25 a.m. UTC | #32
On Fri, Oct 19, 2018 at 11:36:19PM +0100, David Howells wrote:
> Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:
> 
> > # open_tree_clone 3</mnt 3 sh
> > # cd /proc/self/fd/3
> > # mount --move . /mnt
> > [   41.747831] mnt_flags=1020 umount=0
> > # cd /
> > # umount /mnt
> > umount: /mnt: target is busy
> > 
> > ^ a newly introduced bug? I do not remember having this problem before.
> 
> The reason EBUSY is returned is because propagate_mount_busy() is called by
> do_umount() with refcnt == 2, but mnt_count == 3:
> 
>           umount-3577  M=f8898a34 u=3 0x555 sp=__x64_sys_umount+0x12/0x15
> 
> the trace line being added here:
> 
> 		if (!propagate_mount_busy(mnt, 2)) {
> 			if (!list_empty(&mnt->mnt_list))
> 				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
> 			retval = 0;
> 		} else {
> 			trace_mnt_count(mnt, mnt->mnt_id,
> 					atomic_read(&mnt->mnt_count),
> 					0x555, __builtin_return_address(0));
> 		}
> 
> The busy evaluation is a result of this check:
> 
> 	if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt))
> 
> in propagate_mount_busy().
> 
> 
> The problem apparently being that mnt_count counts both refs from mountings
> and refs from other sources, such as file descriptors or pathwalk.

As it bloody well should.  Once the tree has been attached, that open_ctree()
descriptor is refering to vfsmount of /mnt (what else could it be?)

IOW, it *is* genuinely busy.  The livelock on umount -l you've mentioned is
a different story - that's definitely a bug, but this -EBUSY is correct.
Alan Jenkins Oct. 20, 2018, 11:06 a.m. UTC | #33
On 19/10/2018 23:36, David Howells wrote:
> Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:
>
>> # open_tree_clone 3</mnt 3 sh
>> # cd /proc/self/fd/3
>> # mount --move . /mnt
>> [   41.747831] mnt_flags=1020 umount=0
>> # cd /
>> # umount /mnt
>> umount: /mnt: target is busy
>>
>> ^ a newly introduced bug? I do not remember having this problem before.
> The reason EBUSY is returned is because propagate_mount_busy() is called by
> do_umount() with refcnt == 2, but mnt_count == 3:
>
>            umount-3577  M=f8898a34 u=3 0x555 sp=__x64_sys_umount+0x12/0x15
>
> the trace line being added here:
>
> 		if (!propagate_mount_busy(mnt, 2)) {
> 			if (!list_empty(&mnt->mnt_list))
> 				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
> 			retval = 0;
> 		} else {
> 			trace_mnt_count(mnt, mnt->mnt_id,
> 					atomic_read(&mnt->mnt_count),
> 					0x555, __builtin_return_address(0));
> 		}
>
> The busy evaluation is a result of this check:
>
> 	if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt))
>
> in propagate_mount_busy().
>
>
> The problem apparently being that mnt_count counts both refs from mountings
> and refs from other sources, such as file descriptors or pathwalk.
>
> David

Sorry for wasting your time on the EBUSY.  The EBUSY error is not new, 
it is correct, and I was doing the wrong thing.  I cannot "umount /mnt" 
if I still have an FD which points inside /mnt.

I was trying to provide a nice clearer overview, but it was still too 
sloppy to understand.  I've written a second attempt to rephrase it (and 
remove my mistake about EBUSY).  This all seems consistent with what Al 
just said, so if you got the picture from Al's message, you can ignore 
this one :-).

~

The patch series [ver #12] has a problem.  OPEN_TREE_CLONE creates an 
open file, marked with FMODE_NEED_UNMOUNT for cleanup. Users are 
expected to move_mount() directly from that file.

However, it is also possible to use openat() on the open file, which 
gives you a second open file.  This raises questions about the cleanup 
handling.  The second open file is *not* marked FMODE_NEED_UNMOUNT.  
What happens if we clean up the first open file and then move_mount() 
from the second one?  And what happens if you consume the second open 
file using move_mount(), and then cleanup up the first open file?

When I test the patch series [ver #12], it seems I can trigger the same 
bug for either case.  The two reproducers use the same commands, but in 
a different order.

"close-then-mount"

# open_tree_clone 3</mnt 3 sh
# cd /proc/self/fd/3
# exec 3<&-  # close FD 3
# mount --move . /mnt && cd /
# umount -l /mnt
watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [umount:1472]
...
RIP: 0010:pin_kill+0x128/0x140
...
  Call Trace:
  pin_kill+0x5a/0x140
  ? finish_wait+0x80/0x80
  group_pin_kill+0x1a/0x30
  namespace_unlock+0x6f/0x80
  ksys_umount+0x220/0x420
  __x64_sys_umount+0x12/0x20
  do_syscall_64+0x5b/0x160
  entry_SYSCALL_64_after_hwframe+0x44/0xa9


"mount-then-close"

# open_tree_clone 3</mnt 3 sh
# cd /proc/self/fd/3
# mount --move . /mnt && cd /
# umount -l /mnt
# exec 3<&-  # close FD 3
watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [sh:1423]
...
RIP: 0010:pin_kill+0x128/0x140
...
Call Trace:
  ? finish_wait+0x80/0x80
  group_pin_kill+0x1a/0x30
  namespace_unlock+0x6f/0x80
  __fput+0x239/0x240
  task_work_run+0x84/0xa0
  exit_to_usermode_loop+0xb4/0xc0
  do_syscall_64+0x14d/0x160
  entry_SYSCALL_64_after_hwframe+0x44/0xa9

When I debug the kernel and reproduce "close-then-mount", I can see 
something is wrong even before the last command.  The mount command 
attaches a mount into the mount namespace which is still marked as 
MNT_UMOUNT.  This contradicts a comment in the predicate function, 
disconnect_mount():

	/* Because the reference counting rules change when mounts are
* unmounted and connected, umounted mounts may not be
* connected to mounted mounts.
*/
	if  (!(mnt 
<https://elixir.bootlin.com/linux/latest/ident/mnt>->mnt_parent->mnt 
<https://elixir.bootlin.com/linux/latest/ident/mnt>.mnt_flags  &  MNT_UMOUNT <https://elixir.bootlin.com/linux/latest/ident/MNT_UMOUNT>))
		return  true;

We could ask if there is a procedure to safely clear MNT_UMOUNT on a 
detached tree, but we don't have a specific reason to. You suggested a 
one-line diff, to deny the problematic mount command in "close-then-mount".

@@ -2469,7 +2469,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
      if (old->mnt_ns && !attached)
          goto out1;
  
-    if (old->mnt.mnt_flags & MNT_LOCKED)
+    if (old->mnt.mnt_flags & (MNT_LOCKED | MNT_UMOUNT))
          goto out1;
  
      if (old_path->dentry != old_path->mnt->mnt_root)

It sounds plausible, and it worked as suggested.  But it feels 
incomplete.  If my two reproducer sequences are really symmetric, we 
need to fix the code path in move_mount() *and* the code path in 
close().  I asked if we can add this on top:

@@ -1763,7 +1763,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
  {
      namespace_lock();
      lock_mount_hash();
-    if (!real_mount(mnt)->mnt_ns) {
+    if (!real_mount(mnt)->mnt_ns && !(mnt->mnt_flags & MNT_UMOUNT)) {
          mntget(mnt);
          umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
      }

(To apply without whitespace damage, see the attachment).  I tested now 
and this seems to allow "mount-then-close"; there is no immediate 
softlockup or error message.

You mentioned when you tested, you can get a GPF in fsnotify instead, 
depending on the timing of the commands.  I have been entering my 
commands one at a time, and I have not seen the GPF so far.

You posted an analysis of a GPF, where you showed the reference count 
was clearly one less than it should have been.  You narrowed this down 
to a step where you connected an unmounted mount (MNT_UMOUNT) to a 
mounted mount.  So your analysis is consistent with the comment in 
disconnect_mount(), which says 1) you're not allowed to do that, 2) the 
reason is because of different reference-counting rules.  AFAICT, the 
GPF you analyzed would be prevented by the fix in do_move_mount(), 
checking for MNT_UMOUNT.

I have been trying to understand MNT_UMOUNT by reading the patch series 
that added it.  Now I'm getting the impression the different 
ref-counting rules pre-date MNT_UMOUNT.  I *think* the added check in 
dissolve_on_fput() makes things right, but I don't understand enough to 
be sure.

Alan
diff --git a/fs/namespace.c b/fs/namespace.c
index 4dfe7e23b7ee..e8d61d5f581d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1763,7 +1763,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
 {
 	namespace_lock();
 	lock_mount_hash();
-	if (!real_mount(mnt)->mnt_ns) {
+	if (!real_mount(mnt)->mnt_ns && !(mnt->mnt_flags & MNT_UMOUNT)) {
 		mntget(mnt);
 		umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
 	}
@@ -2469,7 +2469,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
 	if (old->mnt_ns && !attached)
 		goto out1;
 
-	if (old->mnt.mnt_flags & MNT_LOCKED)
+	if (old->mnt.mnt_flags & (MNT_LOCKED | MNT_UMOUNT))
 		goto out1;
 
 	if (old_path->dentry != old_path->mnt->mnt_root)
Al Viro Oct. 20, 2018, 11:48 a.m. UTC | #34
On Sat, Oct 20, 2018 at 12:06:32PM +0100, Alan Jenkins wrote:

> You posted an analysis of a GPF, where you showed the reference count was
> clearly one less than it should have been.  You narrowed this down to a step
> where you connected an unmounted mount (MNT_UMOUNT) to a mounted mount.  So
> your analysis is consistent with the comment in disconnect_mount(), which
> says 1) you're not allowed to do that, 2) the reason is because of different
> reference-counting rules.  AFAICT, the GPF you analyzed would be prevented
> by the fix in do_move_mount(), checking for MNT_UMOUNT.

Not just refcounting; it's that fs_pin is really intended to have ->kill()
triggered only once.  If you look at the pin_kill() (which is where the
livelock happened) you'll see what's going on - anyone hitting it between
the first call and freeing of the object will be sleeping until ->kill()
from the first call gets through pin_remove(), at which point they bugger
off (being very careful with accessing the sucker to avoid use-after-free).

MNT_UMOUNT means that there's no way back.

> pre-date MNT_UMOUNT.  I *think* the added check in dissolve_on_fput() makes
> things right, but I don't understand enough to be sure.

That, plus making sure that do_move_mount() grabs a reference in case
of successfully attaching a tree.  I hate passing bool argument, BTW -
better just do mnt_add_count() either before attach_recursive_mnt()
and decrement on failure, or, better yet, just do it on success.  Note
that namespace_sem is held, so the damn thing *can't* disappear under
us - nobody will be able to detach it until we drop namespace_lock.

> diff --git a/fs/namespace.c b/fs/namespace.c
> index 4dfe7e23b7ee..e8d61d5f581d 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1763,7 +1763,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
>  {
>  	namespace_lock();
>  	lock_mount_hash();
> -	if (!real_mount(mnt)->mnt_ns) {
> +	if (!real_mount(mnt)->mnt_ns && !(mnt->mnt_flags & MNT_UMOUNT)) {
>  		mntget(mnt);
>  		umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
>  	}
> @@ -2469,7 +2469,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>  	if (old->mnt_ns && !attached)
>  		goto out1;
>  
> -	if (old->mnt.mnt_flags & MNT_LOCKED)
> +	if (old->mnt.mnt_flags & (MNT_LOCKED | MNT_UMOUNT))
>  		goto out1;
>  
>  	if (old_path->dentry != old_path->mnt->mnt_root)
Al Viro Oct. 20, 2018, 12:26 p.m. UTC | #35
On Sat, Oct 20, 2018 at 12:48:26PM +0100, Al Viro wrote:

> Not just refcounting; it's that fs_pin is really intended to have ->kill()
> triggered only once.  If you look at the pin_kill() (which is where the
> livelock happened)

More specifically, it's group_pin_kill() assuming that by the time pin_kill()
returns it either will have called to pin_remove() or will have waited for
one to complete.  Either way, the object will be gone from the list, so we
do get progress.  Livelock comes since the object has already been through
pin_remove() once and then got reinserted into the list.  Now pin_kill()
returns immediately and we keep spinning on the element that doesn't go
away.
David Howells Oct. 21, 2018, 12:40 a.m. UTC | #36
Alan Jenkins <alan.christopher.jenkins@gmail.com> wrote:

> diff --git a/fs/namespace.c b/fs/namespace.c
> index 4dfe7e23b7ee..e8d61d5f581d 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1763,7 +1763,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
>  {
>  	namespace_lock();
>  	lock_mount_hash();
> -	if (!real_mount(mnt)->mnt_ns) {
> +	if (!real_mount(mnt)->mnt_ns && !(mnt->mnt_flags & MNT_UMOUNT)) {
>  		mntget(mnt);
>  		umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
>  	}
> @@ -2469,7 +2469,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>  	if (old->mnt_ns && !attached)
>  		goto out1;
>  
> -	if (old->mnt.mnt_flags & MNT_LOCKED)
> +	if (old->mnt.mnt_flags & (MNT_LOCKED | MNT_UMOUNT))
>  		goto out1;
>  
>  	if (old_path->dentry != old_path->mnt->mnt_root)

I've already got one of these; I'll fold in the other also.

David
Eric W. Biederman Oct. 21, 2018, 4:57 p.m. UTC | #37
David Howells <dhowells@redhat.com> writes:

> From: Al Viro <viro@zeniv.linux.org.uk>
>
> Allow a detached tree created by open_tree(..., OPEN_TREE_CLONE) to be
> attached by move_mount(2).
>
> If by the time of final fput() of OPEN_TREE_CLONE-opened file its tree is
> not detached anymore, it won't be dissolved.  move_mount(2) is adjusted
> to handle detached source.
>
> That gives us equivalents of mount --bind and mount --rbind.

In light of recent conversations about double umount_tree.

Do we want to simply limit ourselves to attaching file->f_path of
a FMODE_NEED_UMOUNT file descriptor and clearing FMODE_NEED_UMOUNT
when it is attached?

Either that or perhaps move the logic into mntput on when to perform the
umount_tree?

Otherwise I believe we start running into surprising situations:

This works:
	ofd = open_tree(...);
	dup_fd = openat(ofd, "", O_PATH);
	mount_move(dup_fd, ...);
	close(ofd);
	close(dup_fd);

This should fail:
	ofd = open_tree(...);
	dup_fd = openat(ofd, "", O_PATH);
	close(ofd);
	mount_move(dup_fd, ...);
	close(dup_fd);

Allowing any file descriptor that points to mnt_ns == NULL without
MNT_UMOUNT set seems like it is adding flexibility for no reason.


> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
>
>  fs/namespace.c |   26 ++++++++++++++++++++------
>  1 file changed, 20 insertions(+), 6 deletions(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index dd38141b1723..caf5c55ef555 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1785,8 +1785,10 @@ void dissolve_on_fput(struct vfsmount *mnt)
>  {
>  	namespace_lock();
>  	lock_mount_hash();
> -	mntget(mnt);
> -	umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
> +	if (!real_mount(mnt)->mnt_ns) {
> +		mntget(mnt);
> +		umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
> +	}

^^^^^^ This change should be unnecessary.

>  	unlock_mount_hash();
>  	namespace_unlock();
>  }
> @@ -2393,6 +2395,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>  	struct mount *old;
>  	struct mountpoint *mp;
>  	int err;
> +	bool attached;
>  
>  	mp = lock_mount(new_path);
>  	err = PTR_ERR(mp);
> @@ -2403,10 +2406,19 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>  	p = real_mount(new_path->mnt);
>  
>  	err = -EINVAL;
> -	if (!check_mnt(p) || !check_mnt(old))
> +	/* The mountpoint must be in our namespace. */
> +	if (!check_mnt(p))
> +		goto out1;
> +	/* The thing moved should be either ours or completely unattached. */
> +	if (old->mnt_ns && !check_mnt(old))
>  		goto out1;

^^^^^^^^^^^^^^^^^^^^^^^

!old->mnt_ns  should only be allowed when it comes from a file
 descriptor with FMODE_NEED_UMOUNT.


> -	if (!mnt_has_parent(old))
> +	attached = mnt_has_parent(old);
> +	/*
> +	 * We need to allow open_tree(OPEN_TREE_CLONE) followed by
> +	 * move_mount(), but mustn't allow "/" to be moved.
> +	 */
> +	if (old->mnt_ns && !attached)
>  		goto out1;
>  
>  	if (old->mnt.mnt_flags & MNT_LOCKED)
> @@ -2421,7 +2433,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>  	/*
>  	 * Don't move a mount residing in a shared parent.
>  	 */
> -	if (IS_MNT_SHARED(old->mnt_parent))
> +	if (attached && IS_MNT_SHARED(old->mnt_parent))
>  		goto out1;
>  	/*
>  	 * Don't move a mount tree containing unbindable mounts to a destination
> @@ -2435,7 +2447,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>  			goto out1;
>  
>  	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
> -				   &parent_path);
> +				   attached ? &parent_path : NULL);
>  	if (err)
>  		goto out1;

^^^^^^^^^^^^^^^^^^^
Somewhere around here we should have code that clears FMODE_NEED_UMOUNT.


> @@ -3121,6 +3133,8 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
>  
>  /*
>   * Move a mount from one place to another.
> + * In combination with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be
> + * used to copy a mount subtree.
>   *
>   * Note the flags value is a combination of MOVE_MOUNT_* flags.
>   */
Alan Jenkins Oct. 23, 2018, 11:19 a.m. UTC | #38
On 21/09/2018 17:30, David Howells wrote:
> From: Al Viro <viro@zeniv.linux.org.uk>
> 
> Allow a detached tree created by open_tree(..., OPEN_TREE_CLONE) to be
> attached by move_mount(2).
> 
> If by the time of final fput() of OPEN_TREE_CLONE-opened file its tree is
> not detached anymore, it won't be dissolved.  move_mount(2) is adjusted
> to handle detached source.
> 
> That gives us equivalents of mount --bind and mount --rbind.
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
> 
>   fs/namespace.c |   26 ++++++++++++++++++++------
>   1 file changed, 20 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index dd38141b1723..caf5c55ef555 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1785,8 +1785,10 @@ void dissolve_on_fput(struct vfsmount *mnt)
>   {
>   	namespace_lock();
>   	lock_mount_hash();
> -	mntget(mnt);
> -	umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
> +	if (!real_mount(mnt)->mnt_ns) {
> +		mntget(mnt);
> +		umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
> +	}
>   	unlock_mount_hash();
>   	namespace_unlock();
>   }
> @@ -2393,6 +2395,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>   	struct mount *old;
>   	struct mountpoint *mp;
>   	int err;
> +	bool attached;
>   
>   	mp = lock_mount(new_path);
>   	err = PTR_ERR(mp);
> @@ -2403,10 +2406,19 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>   	p = real_mount(new_path->mnt);
>   
>   	err = -EINVAL;
> -	if (!check_mnt(p) || !check_mnt(old))
> +	/* The mountpoint must be in our namespace. */
> +	if (!check_mnt(p))
> +		goto out1;
> +	/* The thing moved should be either ours or completely unattached. */
> +	if (old->mnt_ns && !check_mnt(old))
>   		goto out1;
>   
> -	if (!mnt_has_parent(old))
> +	attached = mnt_has_parent(old);
> +	/*
> +	 * We need to allow open_tree(OPEN_TREE_CLONE) followed by
> +	 * move_mount(), but mustn't allow "/" to be moved.
> +	 */
> +	if (old->mnt_ns && !attached)
>   		goto out1;
>   
>   	if (old->mnt.mnt_flags & MNT_LOCKED)
> @@ -2421,7 +2433,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>   	/*
>   	 * Don't move a mount residing in a shared parent.
>   	 */
> -	if (IS_MNT_SHARED(old->mnt_parent))
> +	if (attached && IS_MNT_SHARED(old->mnt_parent))
>   		goto out1;
>   	/*
>   	 * Don't move a mount tree containing unbindable mounts to a destination
> @@ -2435,7 +2447,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>   			goto out1;
>   
>   	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
> -				   &parent_path);
> +				   attached ? &parent_path : NULL);
>   	if (err)
>   		goto out1;
>   

I think there's another small hole.  It is possible to move a sub-mount 
from a detached tree (instead of moving the root of the tree).  Then 
do_move_mount() calls attach_recursive_mnt() with a non-NULL parent_path.

This causes it to skip count_mounts().  So, it doesn't count the number 
of attached mounts, and it allows you to exceed sysctl_mount_max.

Regards
Alan

(I've tested to confirm the code lets you move a sub-mount.  I didn't 
test whether it allows exceeding sysctl_mount_max.

# unshare -m --propagation private
# mkdir -p /tmp/mnt
# mount --bind /tmp/mnt /tmp/mnt
# open_tree_clone 3</tmp 3 sh
# cd /proc/self/fd/3
# mount --move mnt /mnt
# exit
exit
# exit
logout
#
Al Viro Oct. 23, 2018, 4:22 p.m. UTC | #39
On Tue, Oct 23, 2018 at 12:19:35PM +0100, Alan Jenkins wrote:

> I think there's another small hole.  It is possible to move a sub-mount from
> a detached tree (instead of moving the root of the tree).  Then
> do_move_mount() calls attach_recursive_mnt() with a non-NULL parent_path.
> 
> This causes it to skip count_mounts().  So, it doesn't count the number of
> attached mounts, and it allows you to exceed sysctl_mount_max.

That's trivial to repair, fortunately - we just need to check source_mnt->mnt_ns
instead of parent_path.
diff mbox series

Patch

diff --git a/fs/namespace.c b/fs/namespace.c
index dd38141b1723..caf5c55ef555 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1785,8 +1785,10 @@  void dissolve_on_fput(struct vfsmount *mnt)
 {
 	namespace_lock();
 	lock_mount_hash();
-	mntget(mnt);
-	umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
+	if (!real_mount(mnt)->mnt_ns) {
+		mntget(mnt);
+		umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
+	}
 	unlock_mount_hash();
 	namespace_unlock();
 }
@@ -2393,6 +2395,7 @@  static int do_move_mount(struct path *old_path, struct path *new_path)
 	struct mount *old;
 	struct mountpoint *mp;
 	int err;
+	bool attached;
 
 	mp = lock_mount(new_path);
 	err = PTR_ERR(mp);
@@ -2403,10 +2406,19 @@  static int do_move_mount(struct path *old_path, struct path *new_path)
 	p = real_mount(new_path->mnt);
 
 	err = -EINVAL;
-	if (!check_mnt(p) || !check_mnt(old))
+	/* The mountpoint must be in our namespace. */
+	if (!check_mnt(p))
+		goto out1;
+	/* The thing moved should be either ours or completely unattached. */
+	if (old->mnt_ns && !check_mnt(old))
 		goto out1;
 
-	if (!mnt_has_parent(old))
+	attached = mnt_has_parent(old);
+	/*
+	 * We need to allow open_tree(OPEN_TREE_CLONE) followed by
+	 * move_mount(), but mustn't allow "/" to be moved.
+	 */
+	if (old->mnt_ns && !attached)
 		goto out1;
 
 	if (old->mnt.mnt_flags & MNT_LOCKED)
@@ -2421,7 +2433,7 @@  static int do_move_mount(struct path *old_path, struct path *new_path)
 	/*
 	 * Don't move a mount residing in a shared parent.
 	 */
-	if (IS_MNT_SHARED(old->mnt_parent))
+	if (attached && IS_MNT_SHARED(old->mnt_parent))
 		goto out1;
 	/*
 	 * Don't move a mount tree containing unbindable mounts to a destination
@@ -2435,7 +2447,7 @@  static int do_move_mount(struct path *old_path, struct path *new_path)
 			goto out1;
 
 	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
-				   &parent_path);
+				   attached ? &parent_path : NULL);
 	if (err)
 		goto out1;
 
@@ -3121,6 +3133,8 @@  SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 
 /*
  * Move a mount from one place to another.
+ * In combination with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be
+ * used to copy a mount subtree.
  *
  * Note the flags value is a combination of MOVE_MOUNT_* flags.
  */