diff mbox series

[05/14] fs: don't allow kernel reads and writes without iter ops

Message ID 20200903142242.925828-6-hch@lst.de (mailing list archive)
State New, archived
Headers show
Series [01/14] proc: remove a level of indentation in proc_get_inode | expand

Commit Message

Christoph Hellwig Sept. 3, 2020, 2:22 p.m. UTC
Don't allow calling ->read or ->write with set_fs as a preparation for
killing off set_fs.  All the instances that we use kernel_read/write on
are using the iter ops already.

If a file has both the regular ->read/->write methods and the iter
variants those could have different semantics for messed up enough
drivers.  Also fails the kernel access to them in that case.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
---
 fs/read_write.c | 67 +++++++++++++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 25 deletions(-)

Comments

Eric Biggers Oct. 1, 2020, 10:38 p.m. UTC | #1
Christoph, Al, and Linus:

On Thu, Sep 03, 2020 at 04:22:33PM +0200, Christoph Hellwig wrote:
> @@ -510,28 +524,31 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
>  /* caller is responsible for file_start_write/file_end_write */
>  ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
>  {
> -	mm_segment_t old_fs;
> -	const char __user *p;
> +	struct kvec iov = {
> +		.iov_base	= (void *)buf,
> +		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
> +	};
> +	struct kiocb kiocb;
> +	struct iov_iter iter;
>  	ssize_t ret;
>  
>  	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
>  		return -EBADF;
>  	if (!(file->f_mode & FMODE_CAN_WRITE))
>  		return -EINVAL;
> +	/*
> +	 * Also fail if ->write_iter and ->write are both wired up as that
> +	 * implies very convoluted semantics.
> +	 */
> +	if (unlikely(!file->f_op->write_iter || file->f_op->write))
> +		return warn_unsupported(file, "write");
>  
> -	old_fs = get_fs();
> -	set_fs(KERNEL_DS);
> -	p = (__force const char __user *)buf;
> -	if (count > MAX_RW_COUNT)
> -		count =  MAX_RW_COUNT;
> -	if (file->f_op->write)
> -		ret = file->f_op->write(file, p, count, pos);
> -	else if (file->f_op->write_iter)
> -		ret = new_sync_write(file, p, count, pos);
> -	else
> -		ret = -EINVAL;
> -	set_fs(old_fs);
> +	init_sync_kiocb(&kiocb, file);
> +	kiocb.ki_pos = *pos;
> +	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
> +	ret = file->f_op->write_iter(&kiocb, &iter);

next-20201001 crashes on boot for me because there is a bad interaction between
this commit in vfs/for-next:

	commit 4d03e3cc59828c82ee89ea6e27a2f3cdf95aaadf
	Author: Christoph Hellwig <hch@lst.de>
	Date:   Thu Sep 3 16:22:33 2020 +0200

	    fs: don't allow kernel reads and writes without iter ops

... and Linus's mainline commit:

	commit 90fb702791bf99b959006972e8ee7bb4609f441b
	Author: Linus Torvalds <torvalds@linux-foundation.org>
	Date:   Tue Sep 29 17:18:34 2020 -0700

	    autofs: use __kernel_write() for the autofs pipe writing

Linus's commit made autofs start passing pos=NULL to __kernel_write().
But, Christoph's commit made __kernel_write() no longer accept a NULL pos.

The following fixes it:

diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index 5ced859dac53..b04c528b19d3 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -53,7 +53,7 @@ static int autofs_write(struct autofs_sb_info *sbi,
 
 	mutex_lock(&sbi->pipe_mutex);
 	while (bytes) {
-		wr = __kernel_write(file, data, bytes, NULL);
+		wr = __kernel_write(file, data, bytes, &file->f_pos);
 		if (wr <= 0)
 			break;
 		data += wr;

I'm not sure what was intended, though.

Full stack trace below.

BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0 
Oops: 0000 [#1] PREEMPT SMP NOPTI
CPU: 12 PID: 383 Comm: systemd-binfmt Tainted: G                T 5.9.0-rc7-next-20201001 #2
Hardware name: Gigabyte Technology Co., Ltd. X399 AORUS Gaming 7/X399 AORUS Gaming 7, BIOS F2 08/31/2017
RIP: 0010:init_sync_kiocb include/linux/fs.h:2050 [inline]
RIP: 0010:__kernel_write+0x16c/0x2b0 fs/read_write.c:546
Code: 24 4a b9 01 00 00 00 0f 45 c7 be 01 00 00 00 48 8d 7c 24 10 48 c7 44 24 40 00 00 00 00 48 c7 44 24 58 00 00 00 00 89 44 24 4c <48> 8b 03 48 c7 44 24 60 00 00 00 00 48 89 44 24 50 4c 89 64 24 38
RSP: 0018:ffffa2fc0102f8b0 EFLAGS: 00010246
RAX: 0000000000020000 RBX: 0000000000000000 RCX: 0000000000000001
RDX: ffffa2fc0102f8b0 RSI: 0000000000000001 RDI: ffffa2fc0102f8c0
RBP: ffff8ad2927e2940 R08: 0000000000000130 R09: ffff8ad29a201800
R10: 000000000000000f R11: ffff8ad292547510 R12: ffff8ad2927e2940
R13: ffffa2fc0102f8e8 R14: ffff8ad29a951768 R15: 0000000000000130
FS:  00007f11023b9000(0000) GS:ffff8ad29ed00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000857b62000 CR4: 00000000003506e0
Call Trace:
 autofs_write fs/autofs/waitq.c:56 [inline]
 autofs_notify_daemon.constprop.0+0x115/0x260 fs/autofs/waitq.c:163
 autofs_wait+0x5b1/0x750 fs/autofs/waitq.c:465
 autofs_mount_wait+0x2d/0x60 fs/autofs/root.c:250
 autofs_d_automount+0xc4/0x1a0 fs/autofs/root.c:379
 follow_automount fs/namei.c:1198 [inline]
 __traverse_mounts+0x8d/0x230 fs/namei.c:1243
 traverse_mounts fs/namei.c:1272 [inline]
 handle_mounts fs/namei.c:1381 [inline]
 step_into+0x44e/0x730 fs/namei.c:1691
 walk_component+0x7e/0x1b0 fs/namei.c:1867
 link_path_walk+0x270/0x3b0 fs/namei.c:2184
 path_openat+0x90/0xe40 fs/namei.c:3365
 do_filp_open+0x98/0x140 fs/namei.c:3396
 do_sys_openat2+0xac/0x170 fs/open.c:1168
 do_sys_open fs/open.c:1184 [inline]
 __do_sys_openat fs/open.c:1200 [inline]
 __se_sys_openat fs/open.c:1195 [inline]
 __x64_sys_openat+0x51/0x90 fs/open.c:1195
 do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f11031d3c1b
Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00 00 00 85 c0 75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 91 00 00 00 48 8b 4c 24 28 64 48 2b 0c 25
RSP: 002b:00007ffda6c0a1c0 EFLAGS: 00000246 ORIG_RAX: 0000000000000101
RAX: ffffffffffffffda RBX: 0000000000000020 RCX: 00007f11031d3c1b
RDX: 0000000000080101 RSI: 000055f65e114278 RDI: 00000000ffffff9c
random: lvm: uninitialized urandom read (4 bytes read)
RBP: 000055f65e114278 R08: 00000000ffffffff R09: 000055f65ef055d8
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000080101
R13: 000055f65e11434a R14: 0000000000000000 R15: 0000000000000000
CR2: 0000000000000000
---[ end trace 166ad5429f22801b ]---
Al Viro Oct. 1, 2020, 10:40 p.m. UTC | #2
On Thu, Oct 01, 2020 at 03:38:52PM -0700, Eric Biggers wrote:

>  	mutex_lock(&sbi->pipe_mutex);
>  	while (bytes) {
> -		wr = __kernel_write(file, data, bytes, NULL);
> +		wr = __kernel_write(file, data, bytes, &file->f_pos);

Better
	loff_t dummy = 0;
...
		wr = __kernel_write(file, data, bytes, &dummy);
Linus Torvalds Oct. 2, 2020, 4:27 p.m. UTC | #3
On Thu, Oct 1, 2020 at 3:41 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> Better
>         loff_t dummy = 0;
> ...
>                 wr = __kernel_write(file, data, bytes, &dummy);

No, just fix __kernel_write() to work correctly.

The fact is, NULL _is_ the right pointer for ppos these days.

That commit by Christoph is buggy: it replaces new_sync_write() with a
buggy open-coded version.

Notice how new_sync_write does

        kiocb.ki_pos = (ppos ? *ppos : 0);
,,,
        if (ret > 0 && ppos)
                *ppos = kiocb.ki_pos;

but the open-coded version doesn't.

So just fix that in linux-next. The *last* thing we want is to have
different semantics for the "same" kernel functions.

               Linus
Eric Biggers Oct. 9, 2020, 10:06 p.m. UTC | #4
On Fri, Oct 02, 2020 at 09:27:09AM -0700, Linus Torvalds wrote:
> On Thu, Oct 1, 2020 at 3:41 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > Better
> >         loff_t dummy = 0;
> > ...
> >                 wr = __kernel_write(file, data, bytes, &dummy);
> 
> No, just fix __kernel_write() to work correctly.
> 
> The fact is, NULL _is_ the right pointer for ppos these days.
> 
> That commit by Christoph is buggy: it replaces new_sync_write() with a
> buggy open-coded version.
> 
> Notice how new_sync_write does
> 
>         kiocb.ki_pos = (ppos ? *ppos : 0);
> ,,,
>         if (ret > 0 && ppos)
>                 *ppos = kiocb.ki_pos;
> 
> but the open-coded version doesn't.
> 
> So just fix that in linux-next. The *last* thing we want is to have
> different semantics for the "same" kernel functions.

It's a bit unintuitive that ppos=NULL means "use pos 0", not "use file->f_pos".

Anyway, it works.  The important thing is, this is still broken in linux-next...

- Eric
Linus Torvalds Oct. 10, 2020, 1:03 a.m. UTC | #5
On Fri, Oct 9, 2020 at 3:06 PM Eric Biggers <ebiggers@kernel.org> wrote:
>
> It's a bit unintuitive that ppos=NULL means "use pos 0", not "use file->f_pos".

That's not at all what it means.

A NULL ppos means "this has no position at all", and is what we use
for FMODE_STREAM file descriptors (ie sockets, pipes, etc).

It also means that we don't do the locking for position updates.

The fact that "ki_pos" gets set to zero is just because it needs to be
_something_. It shouldn't actually ever be used for stream devices.

                  Linus
Eric Biggers Oct. 10, 2020, 1:19 a.m. UTC | #6
On Fri, Oct 09, 2020 at 06:03:31PM -0700, Linus Torvalds wrote:
> On Fri, Oct 9, 2020 at 3:06 PM Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > It's a bit unintuitive that ppos=NULL means "use pos 0", not "use file->f_pos".
> 
> That's not at all what it means.
> 
> A NULL ppos means "this has no position at all", and is what we use
> for FMODE_STREAM file descriptors (ie sockets, pipes, etc).
> 
> It also means that we don't do the locking for position updates.
> 
> The fact that "ki_pos" gets set to zero is just because it needs to be
> _something_. It shouldn't actually ever be used for stream devices.
> 

Okay, that makes more sense.  So the patchset from Matthew
https://lkml.kernel.org/linux-fsdevel/20201003025534.21045-1-willy@infradead.org/T/#u
isn't what you had in mind.

- Eric
Linus Torvalds Oct. 10, 2020, 1:29 a.m. UTC | #7
On Fri, Oct 9, 2020 at 6:19 PM Eric Biggers <ebiggers@kernel.org> wrote:
>
> Okay, that makes more sense.  So the patchset from Matthew
> https://lkml.kernel.org/linux-fsdevel/20201003025534.21045-1-willy@infradead.org/T/#u
> isn't what you had in mind.

No.

That first patch makes sense - it's just the "ppos can be NULL" patch.

But as mentioned, NULL isn't "shorthand for zero". It's just "pipes
don't _have_ a pos, trying to pass in some explicit position is
crazy".

So no, the other patches in that set are a bit odd, I think.

SOME of them look potentially fine - the bpfilter one seems to be
valid, for example, because it's literally about reading/writing a
pipe. And maybe the sysctl one is similarly sensible - I didn't check
the context of that one.

But no, NULL shouldn't mean "start at position zero, and we don't care
about the result".

              Linus
Alexander Viro Oct. 10, 2020, 1:55 a.m. UTC | #8
On Fri, Oct 09, 2020 at 06:29:13PM -0700, Linus Torvalds wrote:
> On Fri, Oct 9, 2020 at 6:19 PM Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > Okay, that makes more sense.  So the patchset from Matthew
> > https://lkml.kernel.org/linux-fsdevel/20201003025534.21045-1-willy@infradead.org/T/#u
> > isn't what you had in mind.
> 
> No.
> 
> That first patch makes sense - it's just the "ppos can be NULL" patch.
> 
> But as mentioned, NULL isn't "shorthand for zero". It's just "pipes
> don't _have_ a pos, trying to pass in some explicit position is
> crazy".
> 
> So no, the other patches in that set are a bit odd, I think.
> 
> SOME of them look potentially fine - the bpfilter one seems to be
> valid, for example, because it's literally about reading/writing a
> pipe. And maybe the sysctl one is similarly sensible - I didn't check
> the context of that one.

FWIW, I hadn't pushed that branch out (or merged it into #for-next yet);
for one thing, uml part (mconsole) is simply broken, for another...
IMO ##5--8 are asking for kernel_pread() and if you look at binfmt_elf.c,
you'll see elf_read() being pretty much that.  acct.c, keys and usermode
parts are asking for kernel_pwrite() as well.

I've got stuck looking through the drivers/target stuff - it would've
been another kernel_pwrite() candidate, but it smells like its use of
filp_open() is really asking for trouble, starting with symlink attacks.
Not sure - I'm not familiar with the area, but...
Christoph Hellwig Oct. 14, 2020, 5:51 a.m. UTC | #9
On Sat, Oct 10, 2020 at 01:55:24AM +0000, Alexander Viro wrote:
> FWIW, I hadn't pushed that branch out (or merged it into #for-next yet);
> for one thing, uml part (mconsole) is simply broken, for another...
> IMO ##5--8 are asking for kernel_pread() and if you look at binfmt_elf.c,
> you'll see elf_read() being pretty much that.  acct.c, keys and usermode
> parts are asking for kernel_pwrite() as well.
> 
> I've got stuck looking through the drivers/target stuff - it would've
> been another kernel_pwrite() candidate, but it smells like its use of
> filp_open() is really asking for trouble, starting with symlink attacks.
> Not sure - I'm not familiar with the area, but...

Can you just pull in the minimal fix so that the branch gets fixed
for this merge window?  All the cleanups can come later.
diff mbox series

Patch

diff --git a/fs/read_write.c b/fs/read_write.c
index 5db58b8c78d0dd..702c4301d9eb6b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -419,27 +419,41 @@  static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
 	return ret;
 }
 
+static int warn_unsupported(struct file *file, const char *op)
+{
+	pr_warn_ratelimited(
+		"kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
+		op, file, current->pid, current->comm);
+	return -EINVAL;
+}
+
 ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 {
-	mm_segment_t old_fs = get_fs();
+	struct kvec iov = {
+		.iov_base	= buf,
+		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
+	};
+	struct kiocb kiocb;
+	struct iov_iter iter;
 	ssize_t ret;
 
 	if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
 		return -EINVAL;
 	if (!(file->f_mode & FMODE_CAN_READ))
 		return -EINVAL;
+	/*
+	 * Also fail if ->read_iter and ->read are both wired up as that
+	 * implies very convoluted semantics.
+	 */
+	if (unlikely(!file->f_op->read_iter || file->f_op->read))
+		return warn_unsupported(file, "read");
 
-	if (count > MAX_RW_COUNT)
-		count =  MAX_RW_COUNT;
-	set_fs(KERNEL_DS);
-	if (file->f_op->read)
-		ret = file->f_op->read(file, (void __user *)buf, count, pos);
-	else if (file->f_op->read_iter)
-		ret = new_sync_read(file, (void __user *)buf, count, pos);
-	else
-		ret = -EINVAL;
-	set_fs(old_fs);
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *pos;
+	iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len);
+	ret = file->f_op->read_iter(&kiocb, &iter);
 	if (ret > 0) {
+		*pos = kiocb.ki_pos;
 		fsnotify_access(file);
 		add_rchar(current, ret);
 	}
@@ -510,28 +524,31 @@  static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
 /* caller is responsible for file_start_write/file_end_write */
 ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
 {
-	mm_segment_t old_fs;
-	const char __user *p;
+	struct kvec iov = {
+		.iov_base	= (void *)buf,
+		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
+	};
+	struct kiocb kiocb;
+	struct iov_iter iter;
 	ssize_t ret;
 
 	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
 		return -EBADF;
 	if (!(file->f_mode & FMODE_CAN_WRITE))
 		return -EINVAL;
+	/*
+	 * Also fail if ->write_iter and ->write are both wired up as that
+	 * implies very convoluted semantics.
+	 */
+	if (unlikely(!file->f_op->write_iter || file->f_op->write))
+		return warn_unsupported(file, "write");
 
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-	p = (__force const char __user *)buf;
-	if (count > MAX_RW_COUNT)
-		count =  MAX_RW_COUNT;
-	if (file->f_op->write)
-		ret = file->f_op->write(file, p, count, pos);
-	else if (file->f_op->write_iter)
-		ret = new_sync_write(file, p, count, pos);
-	else
-		ret = -EINVAL;
-	set_fs(old_fs);
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *pos;
+	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
+	ret = file->f_op->write_iter(&kiocb, &iter);
 	if (ret > 0) {
+		*pos = kiocb.ki_pos;
 		fsnotify_modify(file);
 		add_wchar(current, ret);
 	}