diff mbox

[RFC,16/35] ovl: readd lsattr/chattr support

Message ID 20180412150826.20988-17-mszeredi@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Miklos Szeredi April 12, 2018, 3:08 p.m. UTC
Implement FS_IOC_GETFLAGS and FS_IOC_SETFLAGS.

Needs vfs_ioctl() exported to modules.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/internal.h       |  1 -
 fs/ioctl.c          |  1 +
 fs/overlayfs/file.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h  |  2 ++
 4 files changed, 62 insertions(+), 1 deletion(-)

Comments

Amir Goldstein April 13, 2018, 2:48 p.m. UTC | #1
On Thu, Apr 12, 2018 at 6:08 PM, Miklos Szeredi <mszeredi@redhat.com> wrote:
> Implement FS_IOC_GETFLAGS and FS_IOC_SETFLAGS.
>
> Needs vfs_ioctl() exported to modules.
>
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  fs/internal.h       |  1 -
>  fs/ioctl.c          |  1 +
>  fs/overlayfs/file.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/fs.h  |  2 ++
>  4 files changed, 62 insertions(+), 1 deletion(-)
>
> diff --git a/fs/internal.h b/fs/internal.h
> index 3319bf39e339..d5108d9c6a2f 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -176,7 +176,6 @@ extern const struct dentry_operations ns_dentry_operations;
>   */
>  extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
>                     unsigned long arg);
> -extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
>
>  /*
>   * iomap support:
> diff --git a/fs/ioctl.c b/fs/ioctl.c
> index 5ace7efb0d04..696f4c46a868 100644
> --- a/fs/ioctl.c
> +++ b/fs/ioctl.c
> @@ -49,6 +49,7 @@ long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>   out:
>         return error;
>  }
> +EXPORT_SYMBOL(vfs_ioctl);
>
>  static int ioctl_fibmap(struct file *filp, int __user *p)
>  {
> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> index 05e3e2f80b89..cc004ff1b05b 100644
> --- a/fs/overlayfs/file.c
> +++ b/fs/overlayfs/file.c
> @@ -8,6 +8,7 @@
>
>  #include <linux/cred.h>
>  #include <linux/file.h>
> +#include <linux/mount.h>
>  #include <linux/xattr.h>
>  #include <linux/uio.h>
>  #include "overlayfs.h"
> @@ -291,6 +292,63 @@ long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>         return ret;
>  }
>
> +static long ovl_real_ioctl(struct file *file, unsigned int cmd,
> +                          unsigned long arg)
> +{
> +       struct fd real;
> +       const struct cred *old_cred;
> +       long ret;
> +
> +       ret = ovl_real_file(file, &real);
> +       if (ret)
> +               return ret;
> +
> +       old_cred = ovl_override_creds(file_inode(file)->i_sb);
> +       ret = vfs_ioctl(real.file, cmd, arg);
> +       revert_creds(old_cred);
> +
> +       fdput(real);
> +
> +       return ret;
> +}
> +
> +long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> +{
> +       long ret;
> +       struct inode *inode = file_inode(file);
> +
> +       switch (cmd) {
> +       case FS_IOC_GETFLAGS:
> +               ret = ovl_real_ioctl(file, cmd, arg);
> +               break;
> +
> +       case FS_IOC_SETFLAGS:
> +               if (!inode_owner_or_capable(inode))
> +                       return -EACCES;
> +
> +               ret = mnt_want_write_file(file);
> +               if (ret)
> +                       return ret;
> +
> +               ret = ovl_copy_up(file_dentry(file));
> +               if (!ret) {
> +                       ret = ovl_real_ioctl(file, cmd, arg);
> +
> +                       inode_lock(inode);
> +                       ovl_copyflags(ovl_inode_real(inode), inode);
> +                       inode_unlock(inode);
> +               }
> +
> +               mnt_drop_write_file(file);
> +               break;
> +
> +       default:
> +               ret = -ENOTTY;

I am wondering out loud.
This is a change of behavior that fs specific ioctls cannot be executed
on overlay file - arguably a good change of behavior, but still a change
that applications may got dependent on.

Would it have been better to opt-in for this change by a more generic
config/mount options, for example "consistent_fd" , instead of
"copy_up_shared" and then we can choose whether or not to
pass though unknown ioctls to real file.

I know we removed the want_write_file() protection from VFS, but
still, pass through of ioctls was the legacy behavior. Thoughts?
I don't mind to wait and see if someone shouts.

Thanks,
Amir.
Amir Goldstein April 17, 2018, 7:51 p.m. UTC | #2
On Thu, Apr 12, 2018 at 6:08 PM, Miklos Szeredi <mszeredi@redhat.com> wrote:
> Implement FS_IOC_GETFLAGS and FS_IOC_SETFLAGS.
>
> Needs vfs_ioctl() exported to modules.
>
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  fs/internal.h       |  1 -
>  fs/ioctl.c          |  1 +
>  fs/overlayfs/file.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/fs.h  |  2 ++
>  4 files changed, 62 insertions(+), 1 deletion(-)
>
> diff --git a/fs/internal.h b/fs/internal.h
> index 3319bf39e339..d5108d9c6a2f 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -176,7 +176,6 @@ extern const struct dentry_operations ns_dentry_operations;
>   */
>  extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
>                     unsigned long arg);
> -extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
>
>  /*
>   * iomap support:
> diff --git a/fs/ioctl.c b/fs/ioctl.c
> index 5ace7efb0d04..696f4c46a868 100644
> --- a/fs/ioctl.c
> +++ b/fs/ioctl.c
> @@ -49,6 +49,7 @@ long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>   out:
>         return error;
>  }
> +EXPORT_SYMBOL(vfs_ioctl);
>
>  static int ioctl_fibmap(struct file *filp, int __user *p)
>  {
> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> index 05e3e2f80b89..cc004ff1b05b 100644
> --- a/fs/overlayfs/file.c
> +++ b/fs/overlayfs/file.c
> @@ -8,6 +8,7 @@
>
>  #include <linux/cred.h>
>  #include <linux/file.h>
> +#include <linux/mount.h>
>  #include <linux/xattr.h>
>  #include <linux/uio.h>
>  #include "overlayfs.h"
> @@ -291,6 +292,63 @@ long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>         return ret;
>  }
>
> +static long ovl_real_ioctl(struct file *file, unsigned int cmd,
> +                          unsigned long arg)
> +{
> +       struct fd real;
> +       const struct cred *old_cred;
> +       long ret;
> +
> +       ret = ovl_real_file(file, &real);
> +       if (ret)
> +               return ret;
> +
> +       old_cred = ovl_override_creds(file_inode(file)->i_sb);
> +       ret = vfs_ioctl(real.file, cmd, arg);
> +       revert_creds(old_cred);
> +
> +       fdput(real);
> +
> +       return ret;
> +}
> +
> +long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> +{
> +       long ret;
> +       struct inode *inode = file_inode(file);
> +
> +       switch (cmd) {
> +       case FS_IOC_GETFLAGS:
> +               ret = ovl_real_ioctl(file, cmd, arg);
> +               break;
> +
> +       case FS_IOC_SETFLAGS:
> +               if (!inode_owner_or_capable(inode))
> +                       return -EACCES;
> +
> +               ret = mnt_want_write_file(file);
> +               if (ret)
> +                       return ret;
> +
> +               ret = ovl_copy_up(file_dentry(file));
> +               if (!ret) {
> +                       ret = ovl_real_ioctl(file, cmd, arg);
> +

I got this lockdep splat with overlayfs-rorw and overlay/040, but I don't
see the problem in the patch:

overlay/040 [19:29:01][    7.414427]
[    7.415349] ============================================
[    7.417863] WARNING: possible recursive locking detected
[    7.419652] 4.16.0-rc7-xfstests-00054-g1b18a246f764-dirty #3233 Not tainted
[    7.421517] --------------------------------------------
[    7.422736] chattr/2376 is trying to acquire lock:
[    7.423843]  (sb_writers#10){.+.+}, at: [<000000003170ac81>]
mnt_want_write_file+0x21/0x4a
[    7.425683]
[    7.425683] but task is already holding lock:
[    7.427397]  (sb_writers#10){.+.+}, at: [<000000003170ac81>]
mnt_want_write_file+0x21/0x4a
[    7.430180]
[    7.430180] other info that might help us debug this:
[    7.432511]  Possible unsafe locking scenario:
[    7.432511]
[    7.433860]        CPU0
[    7.434424]        ----
[    7.434987]   lock(sb_writers#10);
[    7.435768]   lock(sb_writers#10);
[    7.436692]
[    7.436692]  *** DEADLOCK ***
[    7.436692]
[    7.438460]  May be due to missing lock nesting notation
[    7.438460]
[    7.440477] 1 lock held by chattr/2376:
[    7.441876]  #0:  (sb_writers#10){.+.+}, at: [<000000003170ac81>]
mnt_want_write_file+0x21/0x4a
[    7.444537]
[    7.444537] stack backtrace:
[    7.445881] CPU: 1 PID: 2376 Comm: chattr Not tainted
4.16.0-rc7-xfstests-00054-g1b18a246f764-dirty #3233
[    7.449594] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[    7.453121] Call Trace:
[    7.453945]  dump_stack+0x7c/0xb2
[    7.454957]  validate_chain.isra.24+0x6da/0x8af
[    7.456341]  __lock_acquire+0x5e6/0x67b
[    7.457643]  ? __lock_acquire+0x5e6/0x67b
[    7.458879]  lock_acquire+0x139/0x1dd
[    7.459988]  ? mnt_want_write_file+0x21/0x4a
[    7.461317]  __sb_start_write+0x91/0x163
[    7.462500]  ? mnt_want_write_file+0x21/0x4a
[    7.463822]  mnt_want_write_file+0x21/0x4a
[    7.465091]  xfs_ioc_setxflags+0x70/0xe5
[    7.466266]  xfs_file_ioctl+0x4a7/0xa90
[    7.467452]  ? __lock_acquire+0x5e6/0x67b
[    7.468681]  ? terminate_walk+0x20/0xd9
[    7.469835]  ? __lock_is_held+0x40/0x71
[    7.471012]  vfs_ioctl+0x1e/0x2b
[    7.472008]  ovl_real_ioctl+0x45/0x71
[    7.473122]  ovl_ioctl+0x9a/0xf2
[    7.474114]  vfs_ioctl+0x1e/0x2b
[    7.475149]  do_vfs_ioctl+0x579/0x5f1
[    7.476341]  ? entry_SYSCALL_64_after_hwframe+0x52/0xb7
[    7.477765]  SyS_ioctl+0x52/0x74
[    7.479041]  do_syscall_64+0x76/0x182
[    7.480419]  entry_SYSCALL_64_after_hwframe+0x42/0xb7
[    7.482536] RIP: 0033:0x7fde8d270dd7
[    7.483919] RSP: 002b:00007ffde2ae7e98 EFLAGS: 00000246 ORIG_RAX:
0000000000000010
[    7.486862] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007fde8d270dd7
[    7.489554] RDX: 00007ffde2ae7eac RSI: 0000000040086602 RDI: 0000000000000003
[    7.492227] RBP: 0000000000000010 R08: 0000000000000000 R09: 0000000000000001
[    7.494922] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
[    7.497642] R13: 00007ffde2ae81a0 R14: 00007ffde2ae8188 R15: 0000000000000000
[    7.540886] XFS (vdf): Mounting V5 Filesystem
[    7.563795] XFS (vdf): Ending clean mount

Thanks,
Amir.
Amir Goldstein April 22, 2018, 8:35 a.m. UTC | #3
On Tue, Apr 17, 2018 at 10:51 PM, Amir Goldstein <amir73il@gmail.com> wrote:
> On Thu, Apr 12, 2018 at 6:08 PM, Miklos Szeredi <mszeredi@redhat.com> wrote:
>> Implement FS_IOC_GETFLAGS and FS_IOC_SETFLAGS.
>>
...
>> +long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>> +{
>> +       long ret;
>> +       struct inode *inode = file_inode(file);
>> +
>> +       switch (cmd) {
>> +       case FS_IOC_GETFLAGS:
>> +               ret = ovl_real_ioctl(file, cmd, arg);
>> +               break;
>> +
>> +       case FS_IOC_SETFLAGS:
>> +               if (!inode_owner_or_capable(inode))
>> +                       return -EACCES;
>> +
>> +               ret = mnt_want_write_file(file);
>> +               if (ret)
>> +                       return ret;
>> +
>> +               ret = ovl_copy_up(file_dentry(file));
>> +               if (!ret) {
>> +                       ret = ovl_real_ioctl(file, cmd, arg);
>> +
>
> I got this lockdep splat with overlayfs-rorw and overlay/040, but I don't
> see the problem in the patch:
>

Ouch! the problem is not with the patch. The patch is just bring to light
the fact that filesystems do mnt_want_write_file(file) on ioctls such as
FS_IOC_SETFLAGS and if that file happens to be an overlayfs file
then filesystems are getting write access to overlay mount and that was
not their intention. That can be a way to bypass filesystem ro mount
and freeze protection.

I couldn't reproduce ro/freeze protection with xfs and ext4 on upstream
kernel, but did reproduce freeze protection bypass with ext4 and the ro-rw
patches. ext4 also hits a WARN_ON with upstream kernel and with ro-rw:

root@kvm-xfstests:~# mount /vdf
root@kvm-xfstests:~# mkdir -p /vdf/ovl-lower /vdf/ovl-upper /vdf/ovl-work
root@kvm-xfstests:~# touch /vdf/ovl-upper/foo
root@kvm-xfstests:~# mount -t overlay none /mnt -o
lowerdir=/vdf/ovl-lower,upperdir=/vdf/ovl-upper,workdir=/vdf/ovl-work
root@kvm-xfstests:~# fsfreeze -f /vdf
root@kvm-xfstests:~# chattr +i /mnt/foo
root@kvm-xfstests:~# lsattr -l /mnt/foo
/mnt/scratch/foo             Immutable, Extents

[   53.478454] WARNING: CPU: 1 PID: 1415 at
/home/amir/build/src/linux/fs/ext4/ext4_jbd2.c:53
ext4_journal_check_start+0x48/0x82
[   53.482094] CPU: 1 PID: 1415 Comm: chattr Not tainted
4.17.0-rc1-xfstests-00086-g5a6426c9b720 #3255
[   53.484927] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[   53.487792] RIP: 0010:ext4_journal_check_start+0x48/0x82
[   53.489392] RSP: 0018:ffffc90000707b18 EFLAGS: 00010246
[   53.491070] RAX: 00000000ffffffe2 RBX: ffff88007c497000 RCX: 0000000000000000
[   53.493326] RDX: ffff880079284000 RSI: 000000000000002e RDI: ffffffff8208165c
[   53.494850] RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000001
[   53.496356] R10: 0000000000000001 R11: ffff880077c49250 R12: 0000000000000000
[   53.497857] R13: ffff88007ca61180 R14: 000000000000019c R15: 0000000000000000
[   53.499376] FS:  00007f447dd2d780(0000) GS:ffff88007f400000(0000)
knlGS:0000000000000000
[   53.501975] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   53.504280] CR2: 00007f447d1ff0e0 CR3: 00000000792fc000 CR4: 00000000000006e0
[   53.505855] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   53.507814] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   53.511278] Call Trace:
[   53.511861]  __ext4_journal_start_sb+0xe4/0x1a4
[   53.512860]  ? ext4_file_open+0xb6/0x189
[   53.513705]  ext4_file_open+0xb6/0x189
[   53.514547]  ? ext4_release_file+0x9f/0x9f
[   53.515426]  do_dentry_open+0x1af/0x2e6
[   53.516298]  path_open+0x5d/0x7e
[   53.516998]  ovl_open_realfile+0x6b/0xcb
[   53.517843]  ? ovl_pre_mmap+0x4c/0x4c
[   53.518659]  ovl_open+0x51/0x77
[   53.519343]  do_dentry_open+0x1af/0x2e6
[   53.520167]  do_last+0x520/0x5f5
[   53.520874]  path_openat+0x1f1/0x274
[   53.521648]  do_filp_open+0x4d/0xa3
[   53.522422]  ? __alloc_fd+0x2f/0x1b6
[   53.523193]  ? do_sys_open+0x13d/0x1c4
[   53.523997]  do_sys_open+0x13d/0x1c4
[   53.525056]  do_syscall_64+0x5d/0x167
[   53.526099]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   53.527620] RIP: 0033:0x7f447d1ff4b0
[   53.528693] RSP: 002b:00007fffaef09438 EFLAGS: 00000246 ORIG_RAX:
0000000000000002
[   53.530779] RAX: ffffffffffffffda RBX: 00007fffaef0adaf RCX: 00007f447d1ff4b0
[   53.532495] RDX: 00007fffaef09450 RSI: 0000000000000800 RDI: 00007fffaef0adaf
[   53.534053] RBP: 00007fffaef0adaf R08: 0000000000000000 R09: 0000000000000001
[   53.536649] R10: 00007f447d1a2ff0 R11: 0000000000000246 R12: 00007fffaef09528
[   53.539024] R13: 00007fffaef09740 R14: 00007fffaef09728 R15: 0000000000000000
[   53.541658] Code: 8b 93 60 07 00 00 b8 fb ff ff ff 48 8b 8a e8 03
00 00 80 e1 02 75 4c f6 43 50 01 b8 e2 ff ff ff 75 41 83 bb 28 03 00
00 04 75 02 <0f> 0b 48 8b 92 30 03 00 00 31 c0 48 85 d2 74 28 48 8b 02
83 e0
[   53.548190] irq event stamp: 0
[   53.549431] hardirqs last  enabled at (0): [<0000000000000000>]
      (null)
[   53.552361] hardirqs last disabled at (0): [<ffffffff810741a9>]
copy_process.part.9+0x654/0x1afb
[   53.555814] softirqs last  enabled at (0): [<ffffffff810741a9>]
copy_process.part.9+0x654/0x1afb
[   53.559164] softirqs last disabled at (0): [<0000000000000000>]
      (null)
[   53.562005] ---[ end trace adfe58189c7e6188 ]---


Upstream WARN_ON:

[  302.631228] WARNING: CPU: 0 PID: 1406 at
/home/amir/build/src/linux/fs/ext4/ext4_jbd2.c:53
ext4_journal_check_start+0x48/0x82
[  302.635440] CPU: 0 PID: 1406 Comm: chattr Not tainted
4.17.0-rc1-xfstests #3237
[  302.638200] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[  302.641172] RIP: 0010:ext4_journal_check_start+0x48/0x82
[  302.643154] RSP: 0018:ffffc9000076fbd8 EFLAGS: 00010246
[  302.644466] RAX: 00000000ffffffe2 RBX: ffff88007a77b000 RCX: 0000000000000000
[  302.646418] RDX: ffff88007c9df000 RSI: 00000000ffffffff RDI: 0000000000000246
[  302.648130] RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000006
[  302.649764] R10: 0000000000000001 R11: ffffffff82210708 R12: 0000000000000000
[  302.651719] R13: ffff88007c468180 R14: 000000000000019c R15: 0000000000000000
[  302.653437] FS:  00007f070a480780(0000) GS:ffff88007f200000(0000)
knlGS:0000000000000000
[  302.655711] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  302.657923] CR2: 0000564edb963008 CR3: 000000007a676000 CR4: 00000000000006f0
[  302.660718] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  302.663476] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  302.666179] Call Trace:
[  302.667071]  __ext4_journal_start_sb+0xe4/0x1a4
[  302.668763]  ? ext4_file_open+0xb6/0x189
[  302.670118]  ext4_file_open+0xb6/0x189
[  302.671528]  ? ext4_release_file+0x9f/0x9f
[  302.673211]  do_dentry_open+0x19e/0x2d5
[  302.674747]  ? ovl_inode_init_once+0xe/0xe
[  302.676398]  do_last+0x520/0x5f9
[  302.677668]  path_openat+0x1fa/0x26b
[  302.679100]  do_filp_open+0x4d/0xa3
[  302.680280]  ? __lock_acquire+0x5e6/0x67b
[  302.681567]  ? __alloc_fd+0x1a4/0x1b6
[  302.683051]  ? do_sys_open+0x13c/0x1c1
[  302.684170]  do_sys_open+0x13c/0x1c1
[  302.685361]  do_syscall_64+0x5d/0x167
[  302.686458]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  302.688029] RIP: 0033:0x7f07099524b0
[  302.689090] RSP: 002b:00007ffcfefc3178 EFLAGS: 00000246 ORIG_RAX:
0000000000000002
[  302.691309] RAX: ffffffffffffffda RBX: 00007ffcfefc3daf RCX: 00007f07099524b0
[  302.693346] RDX: 00007ffcfefc3190 RSI: 0000000000000800 RDI: 00007ffcfefc3daf
[  302.695389] RBP: 00007ffcfefc3daf R08: 0000000000000000 R09: 0000000000000001
[  302.697766] R10: 00007f07098f5ff0 R11: 0000000000000246 R12: 00007ffcfefc3268
[  302.699955] R13: 00007ffcfefc3480 R14: 00007ffcfefc3468 R15: 0000000000000000
[  302.702521] Code: 8b 93 60 07 00 00 b8 fb ff ff ff 48 8b 8a e8 03
00 00 80 e1 02 75 4c f6 43 50 01 b8 e2 ff ff ff 75 41 83 bb 28 03 00
00 04 75 02 <0f> 0b 48 8b 92 30 03 00 00 31 c0 48 85 d2 74 28 48 8b 02
83 e0
[  302.706996] irq event stamp: 2376
[  302.707868] hardirqs last  enabled at (2375): [<ffffffff811f8e72>]
prepend_path+0x205/0x449
[  302.709872] hardirqs last disabled at (2376): [<ffffffff81a0118f>]
error_entry+0x7f/0x100
[  302.712311] softirqs last  enabled at (1060): [<ffffffff81c0033b>]
__do_softirq+0x33b/0x433
[  302.714754] softirqs last disabled at (1041): [<ffffffff8107c541>]
irq_exit+0x59/0xa8
[  302.716465] ---[ end trace e891c35ae0c8bbe5 ]---

Is there a reason why the real file can't get the real path?
For current kernels, can you say what else can go wrong when filesystems
call mnt_want_write_file() on an overlay file on ioctl with filesystem
inode and why I couldn't reproduce readonly/freeze bypass?

Thanks,
Amir.
Amir Goldstein April 22, 2018, 3:18 p.m. UTC | #4
On Sun, Apr 22, 2018 at 11:35 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> On Tue, Apr 17, 2018 at 10:51 PM, Amir Goldstein <amir73il@gmail.com> wrote:
>> On Thu, Apr 12, 2018 at 6:08 PM, Miklos Szeredi <mszeredi@redhat.com> wrote:
>>> Implement FS_IOC_GETFLAGS and FS_IOC_SETFLAGS.
>>>
> ...
>>> +long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>>> +{
>>> +       long ret;
>>> +       struct inode *inode = file_inode(file);
>>> +
>>> +       switch (cmd) {
>>> +       case FS_IOC_GETFLAGS:
>>> +               ret = ovl_real_ioctl(file, cmd, arg);
>>> +               break;
>>> +
>>> +       case FS_IOC_SETFLAGS:
>>> +               if (!inode_owner_or_capable(inode))
>>> +                       return -EACCES;
>>> +
>>> +               ret = mnt_want_write_file(file);
>>> +               if (ret)
>>> +                       return ret;
>>> +
>>> +               ret = ovl_copy_up(file_dentry(file));
>>> +               if (!ret) {
>>> +                       ret = ovl_real_ioctl(file, cmd, arg);
>>> +
>>
>> I got this lockdep splat with overlayfs-rorw and overlay/040, but I don't
>> see the problem in the patch:
>>
>
> Ouch! the problem is not with the patch. The patch is just bring to light
> the fact that filesystems do mnt_want_write_file(file) on ioctls such as
> FS_IOC_SETFLAGS and if that file happens to be an overlayfs file
> then filesystems are getting write access to overlay mount and that was
> not their intention. That can be a way to bypass filesystem ro mount
> and freeze protection.
>
> I couldn't reproduce ro/freeze protection with xfs and ext4 on upstream
> kernel, but did reproduce freeze protection bypass with ext4 and the ro-rw
> patches. ext4 also hits a WARN_ON with upstream kernel and with ro-rw:
>
> root@kvm-xfstests:~# mount /vdf
> root@kvm-xfstests:~# mkdir -p /vdf/ovl-lower /vdf/ovl-upper /vdf/ovl-work
> root@kvm-xfstests:~# touch /vdf/ovl-upper/foo
> root@kvm-xfstests:~# mount -t overlay none /mnt -o
> lowerdir=/vdf/ovl-lower,upperdir=/vdf/ovl-upper,workdir=/vdf/ovl-work
> root@kvm-xfstests:~# fsfreeze -f /vdf
> root@kvm-xfstests:~# chattr +i /mnt/foo
> root@kvm-xfstests:~# lsattr -l /mnt/foo
> /mnt/scratch/foo             Immutable, Extents
>
[...]

> Upstream WARN_ON:
>
> [  302.631228] WARNING: CPU: 0 PID: 1406 at
> /home/amir/build/src/linux/fs/ext4/ext4_jbd2.c:53
> ext4_journal_check_start+0x48/0x82
> [  302.635440] CPU: 0 PID: 1406 Comm: chattr Not tainted
> 4.17.0-rc1-xfstests #3237
> [  302.638200] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
> [  302.641172] RIP: 0010:ext4_journal_check_start+0x48/0x82
> [  302.643154] RSP: 0018:ffffc9000076fbd8 EFLAGS: 00010246
> [  302.644466] RAX: 00000000ffffffe2 RBX: ffff88007a77b000 RCX: 0000000000000000
> [  302.646418] RDX: ffff88007c9df000 RSI: 00000000ffffffff RDI: 0000000000000246
> [  302.648130] RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000006
> [  302.649764] R10: 0000000000000001 R11: ffffffff82210708 R12: 0000000000000000
> [  302.651719] R13: ffff88007c468180 R14: 000000000000019c R15: 0000000000000000
> [  302.653437] FS:  00007f070a480780(0000) GS:ffff88007f200000(0000)
> knlGS:0000000000000000
> [  302.655711] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  302.657923] CR2: 0000564edb963008 CR3: 000000007a676000 CR4: 00000000000006f0
> [  302.660718] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  302.663476] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  302.666179] Call Trace:
> [  302.667071]  __ext4_journal_start_sb+0xe4/0x1a4
> [  302.668763]  ? ext4_file_open+0xb6/0x189
> [  302.670118]  ext4_file_open+0xb6/0x189
> [  302.671528]  ? ext4_release_file+0x9f/0x9f
> [  302.673211]  do_dentry_open+0x19e/0x2d5
> [  302.674747]  ? ovl_inode_init_once+0xe/0xe
> [  302.676398]  do_last+0x520/0x5f9
> [  302.677668]  path_openat+0x1fa/0x26b
> [  302.679100]  do_filp_open+0x4d/0xa3
> [  302.680280]  ? __lock_acquire+0x5e6/0x67b
> [  302.681567]  ? __alloc_fd+0x1a4/0x1b6
> [  302.683051]  ? do_sys_open+0x13c/0x1c1
> [  302.684170]  do_sys_open+0x13c/0x1c1
> [  302.685361]  do_syscall_64+0x5d/0x167
> [  302.686458]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [  302.688029] RIP: 0033:0x7f07099524b0
> [  302.689090] RSP: 002b:00007ffcfefc3178 EFLAGS: 00000246 ORIG_RAX:
> 0000000000000002
> [  302.691309] RAX: ffffffffffffffda RBX: 00007ffcfefc3daf RCX: 00007f07099524b0
> [  302.693346] RDX: 00007ffcfefc3190 RSI: 0000000000000800 RDI: 00007ffcfefc3daf
> [  302.695389] RBP: 00007ffcfefc3daf R08: 0000000000000000 R09: 0000000000000001
> [  302.697766] R10: 00007f07098f5ff0 R11: 0000000000000246 R12: 00007ffcfefc3268
> [  302.699955] R13: 00007ffcfefc3480 R14: 00007ffcfefc3468 R15: 0000000000000000
> [  302.702521] Code: 8b 93 60 07 00 00 b8 fb ff ff ff 48 8b 8a e8 03
> 00 00 80 e1 02 75 4c f6 43 50 01 b8 e2 ff ff ff 75 41 83 bb 28 03 00
> 00 04 75 02 <0f> 0b 48 8b 92 30 03 00 00 31 c0 48 85 d2 74 28 48 8b 02
> 83 e0
> [  302.706996] irq event stamp: 2376
> [  302.707868] hardirqs last  enabled at (2375): [<ffffffff811f8e72>]
> prepend_path+0x205/0x449
> [  302.709872] hardirqs last disabled at (2376): [<ffffffff81a0118f>]
> error_entry+0x7f/0x100
> [  302.712311] softirqs last  enabled at (1060): [<ffffffff81c0033b>]
> __do_softirq+0x33b/0x433
> [  302.714754] softirqs last disabled at (1041): [<ffffffff8107c541>]
> irq_exit+0x59/0xa8
> [  302.716465] ---[ end trace e891c35ae0c8bbe5 ]---
>

That's an ext4 bug unrelated to overlayfs.
reproduced by:
# mount /vdf/
# fsfreeze -f /vdf/
# cat /vdf/foo

> Is there a reason why the real file can't get the real path?
> For current kernels, can you say what else can go wrong when filesystems
> call mnt_want_write_file() on an overlay file on ioctl with filesystem
> inode and why I couldn't reproduce readonly/freeze bypass?
>

The reason is that commit 7c6893e3c9ab ("ovl: don't allow writing ioctl
on lower layer") also silently fixed "block writing ioctl on upper layer
of frozen fs".

Thanks,
Amir.
Ritesh Harjani April 23, 2018, 6:11 a.m. UTC | #5
On 4/12/2018 8:38 PM, Miklos Szeredi wrote:
> Implement FS_IOC_GETFLAGS and FS_IOC_SETFLAGS.
> 
> Needs vfs_ioctl() exported to modules.
Do you think if it is better to separate out ovl implementation and 
export of vfs_ioctl method ?
Probably there are other users which should be using vfs_ioctl method as 
well (like ecryptfs) ?

> 
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>   fs/internal.h       |  1 -
>   fs/ioctl.c          |  1 +
>   fs/overlayfs/file.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>   include/linux/fs.h  |  2 ++
>   4 files changed, 62 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/internal.h b/fs/internal.h
> index 3319bf39e339..d5108d9c6a2f 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -176,7 +176,6 @@ extern const struct dentry_operations ns_dentry_operations;
>    */
>   extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
>   		    unsigned long arg);
> -extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
>   
>   /*
>    * iomap support:
> diff --git a/fs/ioctl.c b/fs/ioctl.c
> index 5ace7efb0d04..696f4c46a868 100644
> --- a/fs/ioctl.c
> +++ b/fs/ioctl.c
> @@ -49,6 +49,7 @@ long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>    out:
>   	return error;
>   }
> +EXPORT_SYMBOL(vfs_ioctl);
>   
>   static int ioctl_fibmap(struct file *filp, int __user *p)
>   {
> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> index 05e3e2f80b89..cc004ff1b05b 100644
> --- a/fs/overlayfs/file.c
> +++ b/fs/overlayfs/file.c
> @@ -8,6 +8,7 @@
>   
>   #include <linux/cred.h>
>   #include <linux/file.h>
> +#include <linux/mount.h>
>   #include <linux/xattr.h>
>   #include <linux/uio.h>
>   #include "overlayfs.h"
> @@ -291,6 +292,63 @@ long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>   	return ret;
>   }
>   
> +static long ovl_real_ioctl(struct file *file, unsigned int cmd,
> +			   unsigned long arg)
> +{
> +	struct fd real;
> +	const struct cred *old_cred;
> +	long ret;
> +
> +	ret = ovl_real_file(file, &real);
> +	if (ret)
> +		return ret;
> +
> +	old_cred = ovl_override_creds(file_inode(file)->i_sb);
> +	ret = vfs_ioctl(real.file, cmd, arg);
> +	revert_creds(old_cred);
> +
> +	fdput(real);
> +
> +	return ret;
> +}
> +
> +long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> +{
> +	long ret;
> +	struct inode *inode = file_inode(file);
> +
> +	switch (cmd) {
> +	case FS_IOC_GETFLAGS:
> +		ret = ovl_real_ioctl(file, cmd, arg);
> +		break;
> +
> +	case FS_IOC_SETFLAGS:
> +		if (!inode_owner_or_capable(inode))
> +			return -EACCES;
> +
> +		ret = mnt_want_write_file(file);
> +		if (ret)
> +			return ret;
> +
> +		ret = ovl_copy_up(file_dentry(file));
> +		if (!ret) {
> +			ret = ovl_real_ioctl(file, cmd, arg);
> +
> +			inode_lock(inode);
> +			ovl_copyflags(ovl_inode_real(inode), inode);
> +			inode_unlock(inode);
> +		}
> +
> +		mnt_drop_write_file(file);
> +		break;
> +
> +	default:
> +		ret = -ENOTTY;
> +	}
> +
> +	return ret;
> +}
> +
>   const struct file_operations ovl_file_operations = {
>   	.open		= ovl_open,
>   	.release	= ovl_release,
> @@ -300,4 +358,5 @@ const struct file_operations ovl_file_operations = {
>   	.fsync		= ovl_fsync,
>   	.mmap		= ovl_mmap,
>   	.fallocate	= ovl_fallocate,
> +	.unlocked_ioctl	= ovl_ioctl,

What about compat_ioctl ? should that implementation also go in the same 
patch ?

>   };
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 0b215faa30ae..1add10f04b56 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1621,6 +1621,8 @@ int vfs_mkobj(struct dentry *, umode_t,
>   		int (*f)(struct dentry *, umode_t, void *),
>   		void *);
>   
> +extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
> +
>   /*
>    * VFS file helper functions.
>    */
>
Miklos Szeredi April 23, 2018, 10:21 a.m. UTC | #6
On Sun, Apr 22, 2018 at 10:35 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> On Tue, Apr 17, 2018 at 10:51 PM, Amir Goldstein <amir73il@gmail.com> wrote:

[snip]

> Is there a reason why the real file can't get the real path?

It could, except for vma->vm_file.

Now, we could have a separate realfile for mmap (with overlay path)
and one for everything else (with real path).  Maybe that's the way to
go, to minimize the chance of trouble caused by this irregularity.

> For current kernels, can you say what else can go wrong when filesystems
> call mnt_want_write_file() on an overlay file on ioctl with filesystem
> inode and why I couldn't reproduce readonly/freeze bypass?

mnt_want_write_file() is overlayfs-aware in current kernels.

We could fix it to use file_inode()->i_sb instead of
f_path.dentry->d_sb after reverting the overlay specific hack, and
that would fix the freeze bypass bug and would be correct for all
filesystems.   But I wonder how many such issues we have where
discrepancy between f_path.dentry and file_inode() matters.

Thanks,
Miklos
Miklos Szeredi April 23, 2018, 10:28 a.m. UTC | #7
On Mon, Apr 23, 2018 at 12:21 PM, Miklos Szeredi <miklos@szeredi.hu> wrote:

> We could fix it to use file_inode()->i_sb instead of
> f_path.dentry->d_sb after reverting the overlay specific hack, and
> that would fix the freeze bypass bug and would be correct for all
> filesystems.   But I wonder how many such issues we have where
> discrepancy between f_path.dentry and file_inode() matters.

OTOH, any such issues should already have surfaced.  Just need to be
careful when reverting VFS patches that they don't regress in the face
of f_path containing the overlay path.

So we are probably better to keep using overlay path in real file
universally, just to keep the code simpler.

Thanks,
Miklos
diff mbox

Patch

diff --git a/fs/internal.h b/fs/internal.h
index 3319bf39e339..d5108d9c6a2f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -176,7 +176,6 @@  extern const struct dentry_operations ns_dentry_operations;
  */
 extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
-extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 
 /*
  * iomap support:
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 5ace7efb0d04..696f4c46a868 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -49,6 +49,7 @@  long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
  out:
 	return error;
 }
+EXPORT_SYMBOL(vfs_ioctl);
 
 static int ioctl_fibmap(struct file *filp, int __user *p)
 {
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 05e3e2f80b89..cc004ff1b05b 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -8,6 +8,7 @@ 
 
 #include <linux/cred.h>
 #include <linux/file.h>
+#include <linux/mount.h>
 #include <linux/xattr.h>
 #include <linux/uio.h>
 #include "overlayfs.h"
@@ -291,6 +292,63 @@  long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	return ret;
 }
 
+static long ovl_real_ioctl(struct file *file, unsigned int cmd,
+			   unsigned long arg)
+{
+	struct fd real;
+	const struct cred *old_cred;
+	long ret;
+
+	ret = ovl_real_file(file, &real);
+	if (ret)
+		return ret;
+
+	old_cred = ovl_override_creds(file_inode(file)->i_sb);
+	ret = vfs_ioctl(real.file, cmd, arg);
+	revert_creds(old_cred);
+
+	fdput(real);
+
+	return ret;
+}
+
+long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long ret;
+	struct inode *inode = file_inode(file);
+
+	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		ret = ovl_real_ioctl(file, cmd, arg);
+		break;
+
+	case FS_IOC_SETFLAGS:
+		if (!inode_owner_or_capable(inode))
+			return -EACCES;
+
+		ret = mnt_want_write_file(file);
+		if (ret)
+			return ret;
+
+		ret = ovl_copy_up(file_dentry(file));
+		if (!ret) {
+			ret = ovl_real_ioctl(file, cmd, arg);
+
+			inode_lock(inode);
+			ovl_copyflags(ovl_inode_real(inode), inode);
+			inode_unlock(inode);
+		}
+
+		mnt_drop_write_file(file);
+		break;
+
+	default:
+		ret = -ENOTTY;
+	}
+
+	return ret;
+}
+
 const struct file_operations ovl_file_operations = {
 	.open		= ovl_open,
 	.release	= ovl_release,
@@ -300,4 +358,5 @@  const struct file_operations ovl_file_operations = {
 	.fsync		= ovl_fsync,
 	.mmap		= ovl_mmap,
 	.fallocate	= ovl_fallocate,
+	.unlocked_ioctl	= ovl_ioctl,
 };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0b215faa30ae..1add10f04b56 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1621,6 +1621,8 @@  int vfs_mkobj(struct dentry *, umode_t,
 		int (*f)(struct dentry *, umode_t, void *),
 		void *);
 
+extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
 /*
  * VFS file helper functions.
  */