diff mbox series

[03/11] fs: add new read_uptr and write_uptr file operations

Message ID 20200624162901.1814136-4-hch@lst.de (mailing list archive)
State New, archived
Headers show
Series [01/11] uptr: add a new "universal pointer" type | expand

Commit Message

Christoph Hellwig June 24, 2020, 4:28 p.m. UTC
Add two new file operations that are identical to ->read and ->write
except that they can also safely take kernel pointers using the uptr_t
type.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/internal.h      |  4 ++--
 fs/read_write.c    | 18 ++++++++++++++----
 include/linux/fs.h |  3 +++
 3 files changed, 19 insertions(+), 6 deletions(-)

Comments

Linus Torvalds June 24, 2020, 5:19 p.m. UTC | #1
On Wed, Jun 24, 2020 at 9:29 AM Christoph Hellwig <hch@lst.de> wrote:
>
> Add two new file operations that are identical to ->read and ->write
> except that they can also safely take kernel pointers using the uptr_t
> type.

Honestly, I think this is the wrong way to go.

All of this new complexity and messiness, just to remove a few
unimportant final cases?

If somebody can't be bothered to convert a driver to
iter_read/iter_write, why would they be bothered to convert it to
read_uptr/write_uptr?

And this messiness will stay around for decades.

So let's not go down that path.

If you want to do "splice() and kernel_read() requires read_iter"
(with a warning so that we find any cases), then that's fine. But
let's not add yet _another_ read type.

Why did you care so much about sysctl, and why couldn't they use the iter ops?

                    Linus
Christoph Hellwig June 24, 2020, 5:55 p.m. UTC | #2
On Wed, Jun 24, 2020 at 10:19:16AM -0700, Linus Torvalds wrote:
> Honestly, I think this is the wrong way to go.
> 
> All of this new complexity and messiness, just to remove a few
> unimportant final cases?
> 
> If somebody can't be bothered to convert a driver to
> iter_read/iter_write, why would they be bothered to convert it to
> read_uptr/write_uptr?
> 
> And this messiness will stay around for decades.
> 
> So let's not go down that path.
> 
> If you want to do "splice() and kernel_read() requires read_iter"
> (with a warning so that we find any cases), then that's fine. But
> let's not add yet _another_ read type.
> 
> Why did you care so much about sysctl, and why couldn't they use the iter ops?

I don't care at all.  Based on our previous chat I assumed you
wanted something like this.  We might still need the uptr_t for
setsockopt, though.
Matthew Wilcox June 24, 2020, 5:56 p.m. UTC | #3
On Wed, Jun 24, 2020 at 10:19:16AM -0700, Linus Torvalds wrote:
> On Wed, Jun 24, 2020 at 9:29 AM Christoph Hellwig <hch@lst.de> wrote:
> >
> > Add two new file operations that are identical to ->read and ->write
> > except that they can also safely take kernel pointers using the uptr_t
> > type.
> 
> Honestly, I think this is the wrong way to go.
> 
> All of this new complexity and messiness, just to remove a few
> unimportant final cases?
> 
> If somebody can't be bothered to convert a driver to
> iter_read/iter_write, why would they be bothered to convert it to
> read_uptr/write_uptr?
> 
> And this messiness will stay around for decades.
> 
> So let's not go down that path.
> 
> If you want to do "splice() and kernel_read() requires read_iter"
> (with a warning so that we find any cases), then that's fine. But
> let's not add yet _another_ read type.
> 
> Why did you care so much about sysctl, and why couldn't they use the iter ops?

Heh, when I saw patch 4, I started working on that.  It doesn't seem all
that bad, except I've never used the iov_iter before, so I have no idea
if I did this right.  Also, this fixes a bug if 'count' is too large,
which I should split out and send separately.

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 42c5128c7d1c..7a8c474bc196 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -12,6 +12,7 @@
 #include <linux/cred.h>
 #include <linux/namei.h>
 #include <linux/mm.h>
+#include <linux/uio.h>
 #include <linux/module.h>
 #include <linux/bpf-cgroup.h>
 #include <linux/mount.h>
@@ -540,12 +541,13 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
 	return err;
 }
 
-static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
-		size_t count, loff_t *ppos, int write)
+static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
+		int write)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(iocb->ki_filp);
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	size_t count = iov_iter_count(iter);
 	void *kbuf;
 	ssize_t error;
 
@@ -566,35 +568,32 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
 		goto out;
 
 	/* don't even try if the size is too large */
+	error = -ENOMEM;
 	if (count > KMALLOC_MAX_SIZE)
-		return -ENOMEM;
+		goto out;
+	kbuf = kzalloc(count, GFP_KERNEL);
+	if (!kbuf)
+		goto out;
 
 	if (write) {
-		kbuf = memdup_user_nul(ubuf, count);
-		if (IS_ERR(kbuf)) {
-			error = PTR_ERR(kbuf);
-			goto out;
-		}
-	} else {
-		error = -ENOMEM;
-		kbuf = kzalloc(count, GFP_KERNEL);
-		if (!kbuf)
+		error = -EFAULT;
+		if (!copy_from_iter_full(kbuf, count, iter))
 			goto out;
 	}
 
 	error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count,
-					   ppos);
+					   &iocb->ki_pos);
 	if (error)
 		goto out_free_buf;
 
 	/* careful: calling conventions are nasty here */
-	error = table->proc_handler(table, write, kbuf, &count, ppos);
+	error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos);
 	if (error)
 		goto out_free_buf;
 
 	if (!write) {
 		error = -EFAULT;
-		if (copy_to_user(ubuf, kbuf, count))
+		if (copy_to_iter(kbuf, count, iter) < count)
 			goto out_free_buf;
 	}
 
@@ -607,16 +606,14 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
 	return error;
 }
 
-static ssize_t proc_sys_read(struct file *filp, char __user *buf,
-				size_t count, loff_t *ppos)
+static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter)
 {
-	return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0);
+	return proc_sys_call_handler(iocb, iter, 0);
 }
 
-static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
-				size_t count, loff_t *ppos)
+static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter)
 {
-	return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
+	return proc_sys_call_handler(iocb, iter, 1);
 }
 
 static int proc_sys_open(struct inode *inode, struct file *filp)
@@ -853,8 +850,8 @@ static int proc_sys_getattr(const struct path *path, struct kstat *stat,
 static const struct file_operations proc_sys_file_operations = {
 	.open		= proc_sys_open,
 	.poll		= proc_sys_poll,
-	.read		= proc_sys_read,
-	.write		= proc_sys_write,
+	.read_iter	= proc_sys_read,
+	.write_iter	= proc_sys_write,
 	.llseek		= default_llseek,
 };
Christoph Hellwig June 24, 2020, 5:59 p.m. UTC | #4
On Wed, Jun 24, 2020 at 06:56:44PM +0100, Matthew Wilcox wrote:
>  	/* don't even try if the size is too large */
> +	error = -ENOMEM;
>  	if (count > KMALLOC_MAX_SIZE)
> -		return -ENOMEM;
> +		goto out;
> +	kbuf = kzalloc(count, GFP_KERNEL);
> +	if (!kbuf)
> +		goto out;
>  
>  	if (write) {
> +		error = -EFAULT;
> +		if (!copy_from_iter_full(kbuf, count, iter))
>  			goto out;
>  	}

The nul-termination for the write cases seems to be lost here.
Linus Torvalds June 24, 2020, 6:11 p.m. UTC | #5
On Wed, Jun 24, 2020 at 10:55 AM Christoph Hellwig <hch@lst.de> wrote:
>
> I don't care at all.  Based on our previous chat I assumed you
> wanted something like this.  We might still need the uptr_t for
> setsockopt, though.

No.

What I meant was *not* something like uptr_t.

Just keep the existing "set_fs()". It's not harmful if it's only used
occasionally. We should rename it once it's rare enough, though.

Then, make the following changes:

 - all the normal user access functions stop caring. They use
TASK_SIZE_MAX and are done with it. They basically stop reacting to
set_fs().

 - then, we can have a few *very* specific cases (like setsockopt,
maybe some random read/write) that we teach to use the new set_fs()
thing.

So in *those* cases, we'd basically just do "oh, ok, we are supposed
to use a kernel pointer" based on the setfs value.

IOW, I mean to do something much more gradual. No new interfaces, no
new types, just a couple of (very clearly marked!) cases of the legacy
set_fs() behavior.

                Linus
Christoph Hellwig June 24, 2020, 6:14 p.m. UTC | #6
On Wed, Jun 24, 2020 at 11:11:50AM -0700, Linus Torvalds wrote:
> What I mean was *not* something like uptr_t.
> 
> Just keep the existing "set_fs()". It's not harmful if it's only used
> occasionally. We should rename it once it's rare enough, though.
> 
> Then, make the following changes:
> 
>  - all the normal user access functions stop caring. They use
> TASK_SIZE_MAX and are done with it. They basically stop reacting to
> set_fs().
> 
>  - then, we can have a few *very* specific cases (like setsockopt,
> maybe some random read/write) that we teach to use the new set_fs()
> thing.
> 
> So in *those* cases, we'd basically just do "oh, ok, we are supposed
> to use a kernel pointer" based on the setfs value.
> 
> IOW, I mean to do something much more gradual. No new interfaces, no
> new types, just a couple of (very clearly marked!) cases of the legacy
> set_fs() behavior.

So we'd need new user copy functions for just those cases, and make
sure everything below the potential get_fs-NG uses them.  But without
any kind of type safety to easily validate all users below actually
use them?  I just don't see how that makes sense.

FYI, I think the only users where we really need it are setsockopt
and a s390-specific driver from my audits so far.  Everything else
shouldn't need anything like that.
Linus Torvalds June 24, 2020, 6:15 p.m. UTC | #7
On Wed, Jun 24, 2020 at 11:11 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> So in *those* cases, we'd basically just do "oh, ok, we are supposed
> to use a kernel pointer" based on the setfs value.

The important part here is that we don't need to change any
interfaces, because we don't add that whole "carry the bit around with
the pointer".

I agree that that would be the nice clean interface _if_ we intended
for this to be something we care about. But we already _have_ that
interface in the "iter" code, I absolutely do not think we should
create a new one.

So that's why the (admittedly hacky) "just support the old model for
special cases" kind of approach. Make it really easy to convert some
very specific individual places that might care, and make it very
obvious what those places are.

Maybe in a year or two, there's only a couple such places, and we can
see if we can clean those up separately. But make it easy to do the
transition to the new model by _not_ changing the basic logic for now.

See what I'm aiming for?

In particular, for architectures that haven't been modified, this
results in zero changes what-so-ever.

            Linus
Linus Torvalds June 24, 2020, 6:20 p.m. UTC | #8
On Wed, Jun 24, 2020 at 11:14 AM Christoph Hellwig <hch@lst.de> wrote:
>
> So we'd need new user copy functions for just those cases

No. We'd open-code them. They'd look at "oh, I'm supposed to use a
kernel pointer" and just use those.

IOW, basically IN THE CODE that cares (and the whole argument is that
this code is one or two special cases) you do

    /* This has not been converted to the new world order */
    if (get_fs() == KERNEL_DS) memcpy(..) else copy_from_user();

You're overdesigning things. You're making them more complex than they
need to be.

Basically, I do *NOT* want to pollute the VFS layer with new
interfaces that shouldn't exist in the long run. I'd much rather make
the eventual goal be to get rid of 'read/write' entirely in favour of
the 'iter' things, but what I absolutely do *NOT* want to see is to
make a _third_ interface for reading and writing. Quite the reverse.
We should strive to make it a _single_ interface, not add a new one.

And I'd rather have a couple of ugly code details in odd places (that
we can hopefully fix up later) than have new VFS infrastructure that
will then hang around forever more.

                Linus
Christoph Hellwig June 24, 2020, 6:24 p.m. UTC | #9
On Wed, Jun 24, 2020 at 11:20:26AM -0700, Linus Torvalds wrote:
> On Wed, Jun 24, 2020 at 11:14 AM Christoph Hellwig <hch@lst.de> wrote:
> >
> > So we'd need new user copy functions for just those cases
> 
> No. We'd open-code them. They'd look at "oh, I'm supposed to use a
> kernel pointer" and just use those.
> 
> IOW, basically IN THE CODE that cares (and the whole argument is that
> this code is one or two special cases) you do
> 
>     /* This has not been converted to the new world order */
>     if (get_fs() == KERNEL_DS) memcpy(..) else copy_from_user();
> 
> You're overdesigning things. You're making them more complex than they
> need to be.

I wish it was so simple.  I really don't like overdesigns, trust me.

But please take a look at setsockopt and all the different instances
(count 90 .setsockopt wireups, and they then branch out into
various subroutines as well).  I really don't want to open code that
there, but we could do a helper specific to setsockopt.

Honestly my preference would be to say that no, eBPF isn't actually
a user API and just rip out the crap added to it, but I fear that
is not an option.  Because in that case we'd basically be done.

> Basically, I do *NOT* want to pollute the VFS layer with new
> interfaces that shouldn't exist in the long run. I'd much rather make
> the eventual goal be to get rid of 'read/write' entirely in favour of
> the 'iter' things, but what I absolutely do *NOT* want to see is to
> make a _third_ interface for reading and writing. Quite the reverse.
> We should strive to make it a _single_ interface, not add a new one.

Complete agreement on this.  I actually hate the new fops, and only
added them reluctantly as I mis-interpreted what you said.
Matthew Wilcox June 24, 2020, 6:29 p.m. UTC | #10
On Wed, Jun 24, 2020 at 08:24:37PM +0200, Christoph Hellwig wrote:
> On Wed, Jun 24, 2020 at 11:20:26AM -0700, Linus Torvalds wrote:
> > On Wed, Jun 24, 2020 at 11:14 AM Christoph Hellwig <hch@lst.de> wrote:
> > >
> > > So we'd need new user copy functions for just those cases
> > 
> > No. We'd open-code them. They'd look at "oh, I'm supposed to use a
> > kernel pointer" and just use those.
> > 
> > IOW, basically IN THE CODE that cares (and the whole argument is that
> > this code is one or two special cases) you do
> > 
> >     /* This has not been converted to the new world order */
> >     if (get_fs() == KERNEL_DS) memcpy(..) else copy_from_user();
> > 
> > You're overdesigning things. You're making them more complex than they
> > need to be.
> 
> I wish it was so simple.  I really don't like overdesigns, trust me.
> 
> But please take a look at setsockopt and all the different instances
> (count 90 .setsockopt wireups, and they then branch out into
> various subroutines as well).  I really don't want to open code that
> there, but we could do helper specific to setsockopt.

Can we do a setsockopt_iter() which replaces optval/optlen with an iov_iter?
Christoph Hellwig June 24, 2020, 6:31 p.m. UTC | #11
On Wed, Jun 24, 2020 at 07:29:44PM +0100, Matthew Wilcox wrote:
> On Wed, Jun 24, 2020 at 08:24:37PM +0200, Christoph Hellwig wrote:
> > On Wed, Jun 24, 2020 at 11:20:26AM -0700, Linus Torvalds wrote:
> > > On Wed, Jun 24, 2020 at 11:14 AM Christoph Hellwig <hch@lst.de> wrote:
> > > >
> > > > So we'd need new user copy functions for just those cases
> > > 
> > > No. We'd open-code them. They'd look at "oh, I'm supposed to use a
> > > kernel pointer" and just use those.
> > > 
> > > IOW, basically IN THE CODE that cares (and the whole argument is that
> > > this code is one or two special cases) you do
> > > 
> > >     /* This has not been converted to the new world order */
> > >     if (get_fs() == KERNEL_DS) memcpy(..) else copy_from_user();
> > > 
> > > You're overdesigning things. You're making them more complex than they
> > > need to be.
> > 
> > I wish it was so simple.  I really don't like overdesigns, trust me.
> > 
> > But please take a look at setsockopt and all the different instances
> > (count 90 .setsockopt wireups, and they then branch out into
> > various subroutines as well).  I really don't want to open code that
> > there, but we could do helper specific to setsockopt.
> 
> Can we do a setsockopt_iter() which replaces optval/optlen with an iov_iter?

We could.  The only downside is int-sized sockopts are common, and used
in the fast path of networking applications (e.g. cork,uncork) and this
might introduce enough overhead to be noticeable.
Christoph Hellwig June 24, 2020, 6:37 p.m. UTC | #12
On Wed, Jun 24, 2020 at 07:59:05PM +0200, Christoph Hellwig wrote:
> On Wed, Jun 24, 2020 at 06:56:44PM +0100, Matthew Wilcox wrote:
> >  	/* don't even try if the size is too large */
> > +	error = -ENOMEM;
> >  	if (count > KMALLOC_MAX_SIZE)
> > -		return -ENOMEM;
> > +		goto out;
> > +	kbuf = kzalloc(count, GFP_KERNEL);
> > +	if (!kbuf)
> > +		goto out;
> >  
> >  	if (write) {
> > +		error = -EFAULT;
> > +		if (!copy_from_iter_full(kbuf, count, iter))
> >  			goto out;
> >  	}
> 
> The nul-termination for the write cases seems to be lost here.

Version with the count and termination fixed below.  Can I get your
signoff?  If testing passes that means I can go back to my
kernel_read/write version from the set_fs removal tree with it.

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 42c5128c7d1c76..36ac7b0e4ba80d 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -12,6 +12,7 @@
 #include <linux/cred.h>
 #include <linux/namei.h>
 #include <linux/mm.h>
+#include <linux/uio.h>
 #include <linux/module.h>
 #include <linux/bpf-cgroup.h>
 #include <linux/mount.h>
@@ -540,12 +541,13 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
 	return err;
 }
 
-static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
-		size_t count, loff_t *ppos, int write)
+static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
+		int write)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(iocb->ki_filp);
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	size_t count = iov_iter_count(iter);
 	void *kbuf;
 	ssize_t error;
 
@@ -566,35 +568,33 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
 		goto out;
 
 	/* don't even try if the size is too large */
-	if (count > KMALLOC_MAX_SIZE)
-		return -ENOMEM;
+	error = -ENOMEM;
+	if (count + !!write > KMALLOC_MAX_SIZE)
+		goto out;
+	kbuf = kzalloc(count, GFP_KERNEL);
+	if (!kbuf)
+		goto out;
 
 	if (write) {
-		kbuf = memdup_user_nul(ubuf, count);
-		if (IS_ERR(kbuf)) {
-			error = PTR_ERR(kbuf);
-			goto out;
-		}
-	} else {
-		error = -ENOMEM;
-		kbuf = kzalloc(count, GFP_KERNEL);
-		if (!kbuf)
+		error = -EFAULT;
+		if (!copy_from_iter_full(kbuf, count, iter))
 			goto out;
+		((char *)kbuf)[count] = '\0';
 	}
 
 	error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count,
-					   ppos);
+					   &iocb->ki_pos);
 	if (error)
 		goto out_free_buf;
 
 	/* careful: calling conventions are nasty here */
-	error = table->proc_handler(table, write, kbuf, &count, ppos);
+	error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos);
 	if (error)
 		goto out_free_buf;
 
 	if (!write) {
 		error = -EFAULT;
-		if (copy_to_user(ubuf, kbuf, count))
+		if (copy_to_iter(kbuf, count, iter) < count)
 			goto out_free_buf;
 	}
 
@@ -607,16 +607,14 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
 	return error;
 }
 
-static ssize_t proc_sys_read(struct file *filp, char __user *buf,
-				size_t count, loff_t *ppos)
+static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter)
 {
-	return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0);
+	return proc_sys_call_handler(iocb, iter, 0);
 }
 
-static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
-				size_t count, loff_t *ppos)
+static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter)
 {
-	return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
+	return proc_sys_call_handler(iocb, iter, 1);
 }
 
 static int proc_sys_open(struct inode *inode, struct file *filp)
@@ -853,8 +851,8 @@ static int proc_sys_getattr(const struct path *path, struct kstat *stat,
 static const struct file_operations proc_sys_file_operations = {
 	.open		= proc_sys_open,
 	.poll		= proc_sys_poll,
-	.read		= proc_sys_read,
-	.write		= proc_sys_write,
+	.read_iter	= proc_sys_read,
+	.write_iter	= proc_sys_write,
 	.llseek		= default_llseek,
 };
Matthew Wilcox June 24, 2020, 6:43 p.m. UTC | #13
On Wed, Jun 24, 2020 at 08:37:43PM +0200, Christoph Hellwig wrote:
> On Wed, Jun 24, 2020 at 07:59:05PM +0200, Christoph Hellwig wrote:
> > On Wed, Jun 24, 2020 at 06:56:44PM +0100, Matthew Wilcox wrote:
> > >  	/* don't even try if the size is too large */
> > > +	error = -ENOMEM;
> > >  	if (count > KMALLOC_MAX_SIZE)
> > > -		return -ENOMEM;
> > > +		goto out;
> > > +	kbuf = kzalloc(count, GFP_KERNEL);
> > > +	if (!kbuf)
> > > +		goto out;
> > >  
> > >  	if (write) {
> > > +		error = -EFAULT;
> > > +		if (!copy_from_iter_full(kbuf, count, iter))
> > >  			goto out;
> > >  	}
> > 
> > The nul-termination for the write cases seems to be lost here.
> 
> Version with the count and termination fixed below.  Can I get your
> signoff?  If testing passes that means I can go back to my
> kernel_read/write version from the set_fs removal tree with it.

Sure!

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

I went with some slightly different fixes, but I'm having trouble
sending email (the new infradead.org is not quite set up right yet).
I've attached the two fixes to this email in case you want to
incorporate them in some way.
From 3c68e4efcc50192962a8cd18c67fb6fad2493713 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 24 Jun 2020 14:12:02 -0400
Subject: [PATCH 1/2] sysctl: Call sysctl_head_finish on error
To: hch@lst.de

This error path returned directly instead of calling sysctl_head_finish().

Fixes: ef9d965bc8b6 ("sysctl: reject gigantic reads/write to sysctl files")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 fs/proc/proc_sysctl.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 42c5128c7d1c..6c1166ccdaea 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -566,8 +566,9 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
 		goto out;
 
 	/* don't even try if the size is too large */
-	if (count > KMALLOC_MAX_SIZE)
-		return -ENOMEM;
+	error = -ENOMEM;
+	if (count >= KMALLOC_MAX_SIZE)
+		goto out;
 
 	if (write) {
 		kbuf = memdup_user_nul(ubuf, count);
@@ -576,7 +577,6 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
 			goto out;
 		}
 	} else {
-		error = -ENOMEM;
 		kbuf = kzalloc(count, GFP_KERNEL);
 		if (!kbuf)
 			goto out;
David Laight June 27, 2020, 10:49 a.m. UTC | #14
From: Linus Torvalds
> Sent: 24 June 2020 19:12
> On Wed, Jun 24, 2020 at 10:55 AM Christoph Hellwig <hch@lst.de> wrote:
> >
> > I don't care at all.  Based on our previous chat I assumed you
> > wanted something like this.  We might still need the uptr_t for
> > setsockopt, though.
> 
> No.
> 
> What I mean was *not* something like uptr_t.
> 
> Just keep the existing "set_fs()". It's not harmful if it's only used
> occasionally. We should rename it once it's rare enough, though.

Am I right in thinking that it just sets a flag in 'current' ?
Although I don't remember access_ok() doing a suitable check
(would need to be (address - base) < limit).

> Then, make the following changes:
> 
>  - all the normal user access functions stop caring. They use
> TASK_SIZE_MAX and are done with it. They basically stop reacting to
> set_fs().
> 
>  - then, we can have a few *very* specific cases (like setsockopt,
> maybe some random read/write) that we teach to use the new set_fs()
> thing.

Certainly there is a 'BPF' hook in the setsockopt() syscall handler
that can substitute a kernel buffer for any setsockopt() request.

If that is needed (I presume it was added for a purpose) then all
the socket option code needs to be able to handle kernel buffers.
(Actually given what some getsockopt() do, if there was a
requirement to 'adjust' setsockopt() then there should be a hook
in the getsockopt() code as well.)

If you are going to go through all the socket option code to change
the name of all the buffer access functions then it is probably
almost as easy to move the usercopies out into the wrappers.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Linus Torvalds June 27, 2020, 4:33 p.m. UTC | #15
On Sat, Jun 27, 2020 at 3:49 AM David Laight <David.Laight@aculab.com> wrote:
>
> > Just keep the existing "set_fs()". It's not harmful if it's only used
> > occasionally. We should rename it once it's rare enough, though.
>
> Am I right in thinking that it just sets a flag in 'current' ?

Basically, yes. That's what it has always done.

Well "always" is not true - it used to set the %fs segment register
originally (thus the name), but _conceptually_ it sets a flag for
"should user accesses be kernel accesses instead".

On x86 - and most other architectures where user space and kernel
space are in the same address space and accessed with the same
instructions, that has then been implemented as just a "what is the
limit for an access".

On other architectures - architectures that need different access
methods (or different flags to the load/store instruction) - it's an
actual flag that changes which access method you use.

> Although I don't remember access_ok() doing a suitable check
> (would need to be (address - base) < limit).

So again, on the architectures with a unified address space,
access_ok() is exactly that "address + access_size <= limit", although
often done with some inline asm just to get the overflow case done
efficiently.

On other architectures, there's no limit check, because _all_
addresses are either user space or kernel space addresses, and what
changes isn't the address limit, but the access itself.

So what I was suggesting is literally

 - keep this flag around as a flag

 - but make all _normal_ user accesses ignore it, and always do user
accesses (so on a unified address space architecture like x86 it
always checks the _fixed_ limit, and on something like sparc32 which
has separate kernel and user address spaces, it just always does a
user access with no conditionals at all)

 - then make the really odd and hopefully very rare cases check that
flag explicitly and manually, and do

        if (current->legacy_uptr_is_kernel)
                memcpy(...);
        else
                copy_to/from_user(...);

and my hope is that we'd have only a handful of cases (like the
setsockopt thing: one for each protocol or whatever) that actually
want this.

Note that the legacy behavior would still remain in architectures that
haven't been modified to remove the use of set_fs(), so I would
further suggest that the two approaches live side-by-side for at least
a while. But _generic_ code (and with Christoph's patches at least
x86) would make set_fs() cause a build error.

So we'd have a new

     set_force_kernel_pointers();
     ....
     clear_force_kernel_pointers();

that would set/clear that 'current->legacy_uptr_is_kernel' variable,
and we'd have a handful of places that would check it.

The naming above is all random, and I'm not claiming that any of this
is particularly _clean_. I'm also not claiming that it's really any
better than our current "set_fs()" mess conceptually.

The only thing that makes it better than our current "set_fs()" is

 - there would hopefully be very few cases of this

 - it would *not* affect random incidental user accesses that just
happen to be in the shadow of this thing.

That second point is the important one, I feel. The real problem with
"set_fs()" has been that we've occasionally had bugs where we ended up
running odd paths that we really didn't _intend_ to run with kernel
pointers. The classic example is the SCSI "write as ioctl" example,
where a write to a SCSI generic device would do various odd things and
follow pointers and what-not. Then you get into real trouble when
"splice()" ends up trying to write a kernel buffer, and because of
"set_fs()" suddenly the sg code started accessing kernel memory
willy-nilly.

So my suggestion was basically a new version of set_fs(), but one that
is just much more targeted, and doesn't affect all random user
accesses, only those very special ones that are then very *explicitly*
aware of the fact that "hey, I might be called in this situation where
I'm going to get a kernel address instead".

> If that is needed (I presume it was added for a purpose) then all
> the socket option code needs to be able to handle kernel buffers.

So that's very much what I'd like to avoid.

The plan would be that all the *normal* stuff would be handled by
either (a) always having the data come from user space, or (b) the
data has a known size (either fixed, or "optlen" or whatever) and then
being copied to a kernel buffer and then always handled as a kernel
field that bpf can then call with kernel data.

I thought there was just one very specific case of "oh, in certain
cases of setsockopt we don't know what size this address is and optlen
is ignored", so we have to just pass the pointer down to the protocol,
which is the point that knows how much of an address it wants..

Was that a misunderstanding on my part?

Because if there are tons and tons of places that want this "either
kernel or user" then we could still have a helper function for it, but
it means that the whole "limit the cases" advantage to some degree
goes away.

It would still fix the 99% of normal "copy/from/to_user()" cases,
though. They'd be fixed and "safe" and could never ever touch kernel
memory even if there was some confusion about things. So it would be
an improvement, but I was really hoping that the cases where there can
be confusion would be pretty rare.

             Linus
David Laight June 29, 2020, 8:21 a.m. UTC | #16
From: Linus Torvalds
> Sent: 27 June 2020 17:33
> On Sat, Jun 27, 2020 at 3:49 AM David Laight <David.Laight@aculab.com> wrote:
> >
> > > Just keep the existing "set_fs()". It's not harmful if it's only used
> > > occasionally. We should rename it once it's rare enough, though.
> >
> > Am I right in thinking that it just sets a flag in 'current' ?
> 
> Basically, yes. That's what it has always done.

I could check, but I suspect it sets what TASK_SIZE uses to ~0u
so that access_ok() can't fail.

> Well "always" is not true - it used to set the %fs segment register
> originally (thus the name), but _conceptually_ it sets a flag for
> "should user accesses be kernel accesses instead".
> 
> On x86 - and most other architectures where user space and kernel
> space are in the same address space and accessed with the same
> instructions, that has then been implemented as just a "what is the
> limit for an access".
> 
> On other architectures - architectures that need different access
> methods (or different flags to the load/store instruction) - it's an
> actual flag that changes which access method you use.
> 
> > Although I don't remember access_ok() doing a suitable check
> > (would need to be (address - base) < limit).
> 
> So again, on the architectures with a unified address space,
> access_ok() is exactly that "address + access_size <= limit", although
> often done with some inline asm just to get the overflow case done
> efficiently.

I realised afterwards that the 'kernel address is actually user'
check isn't really done on architectures like x86 until stac/clac.

I had another thought.
While setting up a full-blown scatter-gather 'iter' structure for
functions like [gs]etsockopt, ioctl and fcntl is OTT and probably
measurably expensive, a lightweight 'buffer' structure that just
contained address, length and user/kernel flag could be used.

Although the uses would need an extra level of indirection this
would be offset by reducing the number of parameters passed
through all the layers.

...
> I thought there was just one very specific case of "oh, in certain
> cases of setsockopt we don't know what size this address is and optlen
> is ignored", so we have to just pass the pointer down to the protocol,
> which is the point that knows how much of an address it wants..

I can't help feeling that userspace passes a suitable length but
the kernel doesn't verify it.

It is worse than that, one of the SCTP getsockopt() calls has to return
a length that is shorter than the buffer it wrote.

So any buffer descriptor length would have to be advisory.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Christoph Hellwig June 29, 2020, 3:29 p.m. UTC | #17
On Sat, Jun 27, 2020 at 09:33:03AM -0700, Linus Torvalds wrote:
> I thought there was just one very specific case of "oh, in certain
> cases of setsockopt we don't know what size this address is and optlen
> is ignored", so we have to just pass the pointer down to the protocol,
> which is the point that knows how much of an address it wants..

The setsock issue is a little more complicated.  Let me try to summarize
it:

 - setsock takes a (user) pointer and len
 - unfortunately while the designers of the BSD socket API designed the
   len to be correct some protocol implementations have been sloppy
   and just use a hardcoded len for the value plus some other funnies
 - unfortunately there is some BPF magic that can attach to a socket
   and be run, and that (and only that in the latest kernel) can cause
   a setsockopt to take a kernel buffer.  One that was copied from
   userspace earlier and had the BPF program run on it.
 - unfortunately we have about 90 ->setsockopt instances, and the BPF
   hook is not specific to one particular of them.  In fact the
   BPF program can run for options that don't even exist, and based on
   my previous discussion Facebook has setups that rely on that.

> Was that a misunderstanding on my part?
> 
> Because if there are tons and tons of places that want this "either
> kernel or user" then we could still have a helper function for it, but
> it means that the whole "limit the cases" advantage to some degree
> goes away.

But except for setsockopt we don't really have anything like that left.
There is some alpha arch code that would need to be duplicated for
user vs kernel pointers, but I suspect it will get cleaner by that,
and the messy s390 crypto driver which will be a bit of work, but all
internal to that driver.

So based on that I'd rather get away without our flag and tag the
kernel pointer case in setsockopt explicitly.
Linus Torvalds June 29, 2020, 5:02 p.m. UTC | #18
On Mon, Jun 29, 2020 at 8:29 AM Christoph Hellwig <hch@lst.de> wrote:
>
> So based on that I'd rather get away without our flag and tag the
> kernel pointer case in setsockopt explicitly.

Yeah, I'd be ok to pass that kind of flag around for setsockopt, in
ways I _don't_ want to do for some very core vfs thing like 'read()'.

That said, is there no practical limit on how big "optlen" can be?
Sure, I realize that a lot of setsockopt users may not use all of the
data, but let's say that "optlen" is 128, but the actual low-level
setsockopt operation only uses the first 16 bytes, maybe we could
always just copy the 128 bytes from user space into kernel space, and
just say "setsockopt() always gets a kernel pointer".

Then the bpf use is even simpler. It would just pass the kernel
pointer natively.

Because that seems to be what the BPF code really wants to do: it
takes the user optval, and munges it into a kernel optval, and then
(if that has been done) runs the low-level sock_setsockopt() under
KERNEL_DS.

Couldn't we switch things around instead, and just *always* copy
things from user space, and sock_setsockopt (and
sock->ops->setsockopt) _always_ get a kernel buffer?

And avoid the set_fs(KERNEL_DS) games entirely that way?

Attached is an RFC patch just for __sys_setsockopt() - note that it
does *not* change all the low-level setsockopt callers to just do the
kernel access instead, so this is completely broken, but you can kind
of see what I mean.

Wouldn't this work? In fact, wouldn't this simplify all the setsockopt
places that now don't need to do "get_user()" etc any more?

It would be better if we could limit "optlen" to something sane, but
right now it just does a kmalloc() of whatever the user claims the opt
len is..

                    Linus
Christoph Hellwig June 29, 2020, 6:07 p.m. UTC | #19
On Mon, Jun 29, 2020 at 10:02:48AM -0700, Linus Torvalds wrote:
> That said, is there no practical limit on how big "optlen" can be?

There are some pretty huge ones, like the sctp one that can take
a basically unlimited list of sockaddr structures.

> Sure, I realize that a lot of setsockopt users may not use all of the
> data, but let's say that "optlen" is 128, but the actual low-level
> setsockopt operation only uses the first 16 bytes, maybe we could
> always just copy the 128 bytes from user space into kernel space, and
> just say "setsockopt() always gets a kernel pointer".

One issue is that a lot of setsockopt calls are in the fast path, and
even have micro-optimizations like putting an int on stack for the
fast path to avoid the memory allocation.  While I don't know for
sure I fear that always doing a large allocation could end up having
a performance impact.  But otherwise I like that idea, and did in
fact start some prep work until I realized what I did was futile.

> Then the bpf use is even simpler. It would just pass the kernel
> pointer natively.
> 
> Because that seems to be what the BPF code really wants to do: it
> takes the user optval, and munges it into a kernel optval, and then
> (if that has been done) runs the low-level sock_setsockopt() under
> KERNEL_DS.
> 
> Couldn't we switch things around instead, and just *always* copy
> things from user space, and sock_setsockopt (and
> sock->ops->setsockopt) _always_ get a kernel buffer?
> 
> And avoid the set_fs(KERNEL_DS) games entirely that way?

I'd love to be able to do that.  And now that we went through this
whole mess the Nth time I have another idea:

 - we assume optlen is correct, which should cover about 90% of
   the protocols
 - but to override that a new setsockopt_len method is added that
   returns the correct length for all the messy ones.

Let me try if that works out.
Linus Torvalds June 29, 2020, 6:29 p.m. UTC | #20
On Mon, Jun 29, 2020 at 11:07 AM Christoph Hellwig <hch@lst.de> wrote:
>
> One issue is that a lot of setsockopt calls are in the fast path, and
> even have micro-optimizations like putting an int on stack for the
> fast path to avoid the memory allocation.

Yeah. And the RFC patch I posted could easily be updated to do exactly
that for small optlen values (say, avoid the kmalloc and use a stack
buffer for optlen smaller than 16 bytes or whatever).

Most of the setsockopt's I'm aware of are just a single integer, so if
that's the bulk of them, then we'd never actually need to do the
kmalloc() in those cases, and only fall back to the kmalloc for the
(hopefully quite unusual) bigger options..

> I'd love to be able to do that.  And now that we went through this
> whole mess the Nth time I have another idea:
>
>  - we assume optlen is correct, which should cover about 90% of
>    the protocols
>  - but to override that a new setsockopt_len method is added that
>    returns the correct length for all the messy ones.
>
> Let me try if that works out.

Doing a quick grep, there's about 100 different ".setsockopt" function
initializers, but a quarter of them are just setting it to
'sock_no_setsockopt'.

A number of others are using 'sock_common_setsockopt'.

Which leaves something like 50 different implementations of the
.setsockopt functions.  But I didn't go any deeper than that - maybe
they then have hundreds of different option cases each and this is all
a nightmare.

Looking at a couple of them, the "int val" situation does seem to be
the most common one by _far_, and is often handled by a common
"get_user()" thing, so converting them to just getting the thing as a
kernel pointer doesn't look _too_ nasty, because even when they have a
lot of subcases, the actual optval accesses are much fewer.

Which is not to say that it looks all that much fun, but it doesn't
look entirely undoable either.

The good news (I guess) is that any missed transformation will be
fairly obvious (ie somebody uses a "get_user()" on what is now a
kernel pointer, and returns -EFAULT). So it shouldn't cause any subtle
failures, and it shouldn't cause any security issues.

I didn't look at the compat cases, but if anything I'd expect those to
become simpler by having kernel pointers. And there doesn't actually
seem to be that many of them (possibly because the "int" case is so
common that it all ends up being the same?)

              Linus
Christoph Hellwig June 29, 2020, 6:36 p.m. UTC | #21
On Mon, Jun 29, 2020 at 11:29:22AM -0700, Linus Torvalds wrote:
> I didn't look at the compat cases, but if anything I'd expect those to
> become simpler by having kernel pointers. And there doesn't actually
> seem to be that many of them (possibly because the "int" case is so
> common that it all ends up being the same?)

Having resurrected my work there really are tons of int cases.  Which
makes me think that splitting out a setsockopt_int method which gets
passed a value instead of a pointer, then converting all the simple cases
to that first and then doing the real shit later sounds like a promising
idea.  Let me think a bit more about that.

And yes, a lot of the common methods have tons of cases and
sub-dispatchers and everything else you'd expect from an ioctl-like
interface..
Linus Torvalds June 29, 2020, 7:10 p.m. UTC | #22
On Mon, Jun 29, 2020 at 11:36 AM Christoph Hellwig <hch@lst.de> wrote:
>
> Having resurrected my work there really are tons of int cases.  Which
> makes me think that splitting out a setsockopt_int method which gets
> passed a value instead of a pointer, then converting all the simple cases
> to that first and then doing the real shit later sounds like a promising
> idea.

Try my hacky patch first, and just change the code that does

                if (get_user(val, (int __user *)optval)) {
                        err = -EFAULT;

to do

                val = *(int *)optval;

In fact, that pattern seems to be so common that you can probably
almost do it with a sed-script or something.

                   Linus
Christoph Hellwig June 30, 2020, 7:04 a.m. UTC | #23
Next fun one, in net/ipv6/ip6_flowlabel.c:ipv6_flowlabel_opt() we
have this gem toward the end:

		if (!freq->flr_label) {
			if (copy_to_user(&((struct in6_flowlabel_req __user *)optval)->flr_label,
					 &fl->label, sizeof(fl->label))) {
				/* Intentionally ignore fault. */

so it writes back to what was supposed to be the input parameter,
and only does it for a partial region.  Not sure how we could handle
that with any kind of copy to kernel in the caller scheme?
David Laight June 30, 2020, 7:51 a.m. UTC | #24
From: Linus Torvalds
> Sent: 29 June 2020 18:03
> On Mon, Jun 29, 2020 at 8:29 AM Christoph Hellwig <hch@lst.de> wrote:
> >
> > So based on that I'd rather get away without our flag and tag the
> > kernel pointer case in setsockopt explicitly.
> 
> Yeah, I'd be ok to pass that kind of flag around for setsockopt, in
> ways I _don't_ want to do for some very core vfs thing like 'read()'.
> 
> That said, is there no practical limit on how big "optlen" can be?
> Sure, I realize that a lot of setsockopt users may not use all of the
> data, but let's say that "optlen" is 128, but the actual low-level
> setsockopt operation only uses the first 16 bytes, maybe we could
> always just copy the 128 bytes from user space into kernel space, and
> just say "setsockopt() always gets a kernel pointer".
> 
> Then the bpf use is even simpler. It would just pass the kernel
> pointer natively.
> 
> Because that seems to be what the BPF code really wants to do: it
> takes the user optval, and munges it into a kernel optval, and then
> (if that has been done) runs the low-level sock_setsockopt() under
> KERNEL_DS.
> 
> Couldn't we switch things around instead, and just *always* copy
> things from user space, and sock_setsockopt (and
> sock->ops->setsockopt) _always_ get a kernel buffer?
> 
> And avoid the set_fs(KERNEL_DS) games entirely that way?

I did a patch for SCTP to do the copies in the protocol wrapper.
Apart from the issue of bad applications providing overlarge
buffers and effecting a local DoS attack there were some odd issues:

1) SCTP completely abuses both setsockopt and getsockopt
   to perform additional socket operations.
   I suspect the original implementation didn't want to
   add new system calls.
2) SCTP treats getsockopt as RMW on the user buffer.
   Mostly it only needs 4 bytes, but it can include
   a sockaddr_storage.
3) SCTP has one getsockopt that is really a setsockopt
   (ie changes things) but is implemented using getsockopt
   so that it can return a value.
4) One of the SCTP getsockopt calls has to return the
   'wrong' value to userspace (ie not the length of the
   transferred data) for compatibility with the original
   broken code.

I'm wondering if the [sg]etsockopt wrapper should actually
pass through a structure containing:
	Kernel buffer address (on stack if short)
	User buffer address (may be NULL)
	Length of buffer
	copy_to_user length (normally zero)
	flag: embedded pointers are user/kernel

Most code will just use the kernel buffer and return length/error.

Code that knows the supplied length is invalid can use the
user pointer - but only support direct user requests.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Luis Chamberlain July 8, 2020, 5:14 a.m. UTC | #25
On Sat, Jun 27, 2020 at 09:33:03AM -0700, Linus Torvalds wrote:
> The real problem with
> "set_fs()" has been that we've occasionally had bugs where we ended up
> running odd paths that we really didn't _intend_ to run with kernel
> pointers. The classic example is the SCSI "write as ioctl" example,
> where a write to a SCSI generic device would do various odd things and
> follow pointers and what-not. Then you get into real trouble when
> "splice()" ends up trying to write a kernel buffer, and because of
> "set_fs()" suddenly the sg code started accessing kernel memory
> willy-nilly.

So the semantics of this interface can create chaos fast if not used
carefully and conservatively.

Christoph, it would be great if your future series can include some
version of a verbiage for the motivation for the culling of set_fs().
Maybe it was just me, but the original motivation wasn't clear at first
and took some thread digging to get it.

  Luis
diff mbox series

Patch

diff --git a/fs/internal.h b/fs/internal.h
index 242f2845b3428b..b6777a47b05163 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -189,9 +189,9 @@  int do_statx(int dfd, const char __user *filename, unsigned flags,
 static inline void set_fmode_can_read_write(struct file *f)
 {
 	if ((f->f_mode & FMODE_READ) &&
-	    (f->f_op->read || f->f_op->read_iter))
+	    (f->f_op->read || f->f_op->read_uptr || f->f_op->read_iter))
 		f->f_mode |= FMODE_CAN_READ;
 	if ((f->f_mode & FMODE_WRITE) &&
-	    (f->f_op->write || f->f_op->write_iter))
+	    (f->f_op->write || f->f_op->write_uptr || f->f_op->write_iter))
 		f->f_mode |= FMODE_CAN_WRITE;
 }
diff --git a/fs/read_write.c b/fs/read_write.c
index e7f36b15683049..24ffbf3cbda243 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -430,7 +430,9 @@  ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 
 	if (count > MAX_RW_COUNT)
 		count =  MAX_RW_COUNT;
-	if (file->f_op->read) {
+	if (file->f_op->read_uptr) {
+		ret = file->f_op->read_uptr(file, KERNEL_UPTR(buf), count, pos);
+	} else if (file->f_op->read) {
 		mm_segment_t old_fs = get_fs();
 
 		set_fs(KERNEL_DS);
@@ -485,7 +487,9 @@  ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 	if (count > MAX_RW_COUNT)
 		count =  MAX_RW_COUNT;
 
-	if (file->f_op->read)
+	if (file->f_op->read_uptr)
+		ret = file->f_op->read_uptr(file, USER_UPTR(buf), count, pos);
+	else if (file->f_op->read)
 		ret = file->f_op->read(file, buf, count, pos);
 	else if (file->f_op->read_iter)
 		ret = new_sync_read(file, buf, count, pos);
@@ -530,7 +534,10 @@  ssize_t __kernel_write(struct file *file, const void *buf, size_t count,
 
 	if (count > MAX_RW_COUNT)
 		count =  MAX_RW_COUNT;
-	if (file->f_op->write) {
+	if (file->f_op->write_uptr) {
+		ret = file->f_op->write_uptr(file, KERNEL_UPTR((void *)buf),
+				count, pos);
+	} else if (file->f_op->write) {
 		mm_segment_t old_fs = get_fs();
 
 		set_fs(KERNEL_DS);
@@ -592,7 +599,10 @@  ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 	if (count > MAX_RW_COUNT)
 		count =  MAX_RW_COUNT;
 	file_start_write(file);
-	if (file->f_op->write)
+	if (file->f_op->write_uptr)
+		ret = file->f_op->write_uptr(file,
+				USER_UPTR((char __user *)buf), count, pos);
+	else if (file->f_op->write)
 		ret = file->f_op->write(file, buf, count, pos);
 	else if (file->f_op->write_iter)
 		ret = new_sync_write(file, buf, count, pos);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fac6aead402a98..d8fc3015f5a197 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -39,6 +39,7 @@ 
 #include <linux/fs_types.h>
 #include <linux/build_bug.h>
 #include <linux/stddef.h>
+#include <linux/uptr.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1830,6 +1831,8 @@  struct file_operations {
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+	ssize_t (*read_uptr) (struct file *, uptr_t, size_t, loff_t *);
+	ssize_t (*write_uptr) (struct file *, uptr_t, size_t, loff_t *);
 	int (*iopoll)(struct kiocb *kiocb, bool spin);
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);