[RFC,2/3] fs: add RWF_ENCODED for writing compressed data
diff mbox series

Message ID 230a76e65372a8fb3ec62ce167d9322e5e342810.1568875700.git.osandov@fb.com
State New
Headers show
Series
  • fs: interface for directly writing encoded (e.g., compressed) data
Related show

Commit Message

Omar Sandoval Sept. 19, 2019, 6:53 a.m. UTC
From: Omar Sandoval <osandov@fb.com>

Btrfs can transparently compress data written by the user. However, we'd
like to add an interface to write pre-compressed data directly to the
filesystem. This adds support for so-called "encoded writes" via
pwritev2().

A new RWF_ENCODED flags indicates that a write is "encoded". If this
flag is set, iov[0].iov_base points to a struct encoded_iov which
contains metadata about the write: namely, the compression algorithm and
the unencoded (i.e., decompressed) length of the extent. iov[0].iov_len
must be set to sizeof(struct encoded_iov), which can be used to extend
the interface in the future. The remaining iovecs contain the encoded
extent.

A similar interface for reading encoded data can be added to preadv2()
in the future.

Filesystems must indicate that they support encoded writes by setting
FMODE_ENCODED_IO in ->file_open().

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 include/linux/fs.h      | 13 +++++++
 include/uapi/linux/fs.h | 24 ++++++++++++-
 mm/filemap.c            | 75 ++++++++++++++++++++++++++++++++++-------
 3 files changed, 99 insertions(+), 13 deletions(-)

Comments

Jann Horn Sept. 19, 2019, 3:44 p.m. UTC | #1
On Thu, Sep 19, 2019 at 8:54 AM Omar Sandoval <osandov@osandov.com> wrote:
> Btrfs can transparently compress data written by the user. However, we'd
> like to add an interface to write pre-compressed data directly to the
> filesystem. This adds support for so-called "encoded writes" via
> pwritev2().
>
> A new RWF_ENCODED flags indicates that a write is "encoded". If this
> flag is set, iov[0].iov_base points to a struct encoded_iov which
> contains metadata about the write: namely, the compression algorithm and
> the unencoded (i.e., decompressed) length of the extent. iov[0].iov_len
> must be set to sizeof(struct encoded_iov), which can be used to extend
> the interface in the future. The remaining iovecs contain the encoded
> extent.
>
> A similar interface for reading encoded data can be added to preadv2()
> in the future.
>
> Filesystems must indicate that they support encoded writes by setting
> FMODE_ENCODED_IO in ->file_open().
[...]
> +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> +                        struct iov_iter *from)
> +{
> +       if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> +               return -EINVAL;
> +       if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> +               return -EFAULT;
> +       if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> +           encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE) {
> +               iocb->ki_flags &= ~IOCB_ENCODED;
> +               return 0;
> +       }
> +       if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> +           encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> +               return -EINVAL;
> +       if (!capable(CAP_SYS_ADMIN))
> +               return -EPERM;

How does this capable() check interact with io_uring? Without having
looked at this in detail, I suspect that when an encoded write is
requested through io_uring, the capable() check might be executed on
something like a workqueue worker thread, which is probably running
with a full capability set.
Jens Axboe Sept. 20, 2019, 4:25 p.m. UTC | #2
On 9/19/19 9:44 AM, Jann Horn wrote:
> On Thu, Sep 19, 2019 at 8:54 AM Omar Sandoval <osandov@osandov.com> wrote:
>> Btrfs can transparently compress data written by the user. However, we'd
>> like to add an interface to write pre-compressed data directly to the
>> filesystem. This adds support for so-called "encoded writes" via
>> pwritev2().
>>
>> A new RWF_ENCODED flags indicates that a write is "encoded". If this
>> flag is set, iov[0].iov_base points to a struct encoded_iov which
>> contains metadata about the write: namely, the compression algorithm and
>> the unencoded (i.e., decompressed) length of the extent. iov[0].iov_len
>> must be set to sizeof(struct encoded_iov), which can be used to extend
>> the interface in the future. The remaining iovecs contain the encoded
>> extent.
>>
>> A similar interface for reading encoded data can be added to preadv2()
>> in the future.
>>
>> Filesystems must indicate that they support encoded writes by setting
>> FMODE_ENCODED_IO in ->file_open().
> [...]
>> +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
>> +                        struct iov_iter *from)
>> +{
>> +       if (iov_iter_single_seg_count(from) != sizeof(*encoded))
>> +               return -EINVAL;
>> +       if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
>> +               return -EFAULT;
>> +       if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
>> +           encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE) {
>> +               iocb->ki_flags &= ~IOCB_ENCODED;
>> +               return 0;
>> +       }
>> +       if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
>> +           encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
>> +               return -EINVAL;
>> +       if (!capable(CAP_SYS_ADMIN))
>> +               return -EPERM;
> 
> How does this capable() check interact with io_uring? Without having
> looked at this in detail, I suspect that when an encoded write is
> requested through io_uring, the capable() check might be executed on
> something like a workqueue worker thread, which is probably running
> with a full capability set.

If we can hit -EAGAIN before doing the import in io_uring, then yes,
this will probably bypass the check as it'll only happen from the
worker.
Omar Sandoval Sept. 24, 2019, 5:15 p.m. UTC | #3
On Thu, Sep 19, 2019 at 05:44:12PM +0200, Jann Horn wrote:
> On Thu, Sep 19, 2019 at 8:54 AM Omar Sandoval <osandov@osandov.com> wrote:
> > Btrfs can transparently compress data written by the user. However, we'd
> > like to add an interface to write pre-compressed data directly to the
> > filesystem. This adds support for so-called "encoded writes" via
> > pwritev2().
> >
> > A new RWF_ENCODED flags indicates that a write is "encoded". If this
> > flag is set, iov[0].iov_base points to a struct encoded_iov which
> > contains metadata about the write: namely, the compression algorithm and
> > the unencoded (i.e., decompressed) length of the extent. iov[0].iov_len
> > must be set to sizeof(struct encoded_iov), which can be used to extend
> > the interface in the future. The remaining iovecs contain the encoded
> > extent.
> >
> > A similar interface for reading encoded data can be added to preadv2()
> > in the future.
> >
> > Filesystems must indicate that they support encoded writes by setting
> > FMODE_ENCODED_IO in ->file_open().
> [...]
> > +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> > +                        struct iov_iter *from)
> > +{
> > +       if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> > +               return -EINVAL;
> > +       if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> > +               return -EFAULT;
> > +       if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> > +           encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE) {
> > +               iocb->ki_flags &= ~IOCB_ENCODED;
> > +               return 0;
> > +       }
> > +       if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> > +           encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> > +               return -EINVAL;
> > +       if (!capable(CAP_SYS_ADMIN))
> > +               return -EPERM;
> 
> How does this capable() check interact with io_uring? Without having
> looked at this in detail, I suspect that when an encoded write is
> requested through io_uring, the capable() check might be executed on
> something like a workqueue worker thread, which is probably running
> with a full capability set.

I discussed this more with Jens. You're right, per-IO permission checks
aren't going to work. In fully-polled mode, we never get an opportunity
to check capabilities in right context. So, this will probably require a
new open flag.
Omar Sandoval Sept. 24, 2019, 7:35 p.m. UTC | #4
On Tue, Sep 24, 2019 at 10:15:13AM -0700, Omar Sandoval wrote:
> On Thu, Sep 19, 2019 at 05:44:12PM +0200, Jann Horn wrote:
> > On Thu, Sep 19, 2019 at 8:54 AM Omar Sandoval <osandov@osandov.com> wrote:
> > > Btrfs can transparently compress data written by the user. However, we'd
> > > like to add an interface to write pre-compressed data directly to the
> > > filesystem. This adds support for so-called "encoded writes" via
> > > pwritev2().
> > >
> > > A new RWF_ENCODED flags indicates that a write is "encoded". If this
> > > flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > contains metadata about the write: namely, the compression algorithm and
> > > the unencoded (i.e., decompressed) length of the extent. iov[0].iov_len
> > > must be set to sizeof(struct encoded_iov), which can be used to extend
> > > the interface in the future. The remaining iovecs contain the encoded
> > > extent.
> > >
> > > A similar interface for reading encoded data can be added to preadv2()
> > > in the future.
> > >
> > > Filesystems must indicate that they support encoded writes by setting
> > > FMODE_ENCODED_IO in ->file_open().
> > [...]
> > > +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> > > +                        struct iov_iter *from)
> > > +{
> > > +       if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> > > +               return -EINVAL;
> > > +       if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> > > +               return -EFAULT;
> > > +       if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> > > +           encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE) {
> > > +               iocb->ki_flags &= ~IOCB_ENCODED;
> > > +               return 0;
> > > +       }
> > > +       if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> > > +           encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> > > +               return -EINVAL;
> > > +       if (!capable(CAP_SYS_ADMIN))
> > > +               return -EPERM;
> > 
> > How does this capable() check interact with io_uring? Without having
> > looked at this in detail, I suspect that when an encoded write is
> > requested through io_uring, the capable() check might be executed on
> > something like a workqueue worker thread, which is probably running
> > with a full capability set.
> 
> I discussed this more with Jens. You're right, per-IO permission checks
> aren't going to work. In fully-polled mode, we never get an opportunity
> to check capabilities in right context. So, this will probably require a
> new open flag.

Actually, file_ns_capable() accomplishes the same thing without a new
open flag. Changing the capable() check to file_ns_capable() in
init_user_ns should be enough.
Jann Horn Sept. 24, 2019, 8:01 p.m. UTC | #5
On Tue, Sep 24, 2019 at 9:35 PM Omar Sandoval <osandov@osandov.com> wrote:
> On Tue, Sep 24, 2019 at 10:15:13AM -0700, Omar Sandoval wrote:
> > On Thu, Sep 19, 2019 at 05:44:12PM +0200, Jann Horn wrote:
> > > On Thu, Sep 19, 2019 at 8:54 AM Omar Sandoval <osandov@osandov.com> wrote:
> > > > Btrfs can transparently compress data written by the user. However, we'd
> > > > like to add an interface to write pre-compressed data directly to the
> > > > filesystem. This adds support for so-called "encoded writes" via
> > > > pwritev2().
> > > >
> > > > A new RWF_ENCODED flags indicates that a write is "encoded". If this
> > > > flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > contains metadata about the write: namely, the compression algorithm and
> > > > the unencoded (i.e., decompressed) length of the extent. iov[0].iov_len
> > > > must be set to sizeof(struct encoded_iov), which can be used to extend
> > > > the interface in the future. The remaining iovecs contain the encoded
> > > > extent.
> > > >
> > > > A similar interface for reading encoded data can be added to preadv2()
> > > > in the future.
> > > >
> > > > Filesystems must indicate that they support encoded writes by setting
> > > > FMODE_ENCODED_IO in ->file_open().
> > > [...]
> > > > +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> > > > +                        struct iov_iter *from)
> > > > +{
> > > > +       if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> > > > +               return -EINVAL;
> > > > +       if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> > > > +               return -EFAULT;
> > > > +       if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> > > > +           encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE) {
> > > > +               iocb->ki_flags &= ~IOCB_ENCODED;
> > > > +               return 0;
> > > > +       }
> > > > +       if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> > > > +           encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> > > > +               return -EINVAL;
> > > > +       if (!capable(CAP_SYS_ADMIN))
> > > > +               return -EPERM;
> > >
> > > How does this capable() check interact with io_uring? Without having
> > > looked at this in detail, I suspect that when an encoded write is
> > > requested through io_uring, the capable() check might be executed on
> > > something like a workqueue worker thread, which is probably running
> > > with a full capability set.
> >
> > I discussed this more with Jens. You're right, per-IO permission checks
> > aren't going to work. In fully-polled mode, we never get an opportunity
> > to check capabilities in right context. So, this will probably require a
> > new open flag.
>
> Actually, file_ns_capable() accomplishes the same thing without a new
> open flag. Changing the capable() check to file_ns_capable() in
> init_user_ns should be enough.

+Aleksa for openat2() and open() space

Mmh... but if the file descriptor has been passed through a privilege
boundary, it isn't really clear whether the original opener of the
file intended for this to be possible. For example, if (as a
hypothetical example) the init process opens a service's logfile with
root privileges, then passes the file descriptor to that logfile to
the service on execve(), that doesn't mean that the service should be
able to perform compressed writes into that file, I think.

I think that an open flag (as you already suggested) or an fcntl()
operation would do the job; but AFAIK the open() flag space has run
out, so if you hook it up that way, I think you might have to wait for
Aleksa Sarai to get something like his sys_openat2() suggestion
(https://lore.kernel.org/lkml/20190904201933.10736-12-cyphar@cyphar.com/)
merged?
Christian Brauner Sept. 24, 2019, 8:22 p.m. UTC | #6
On Tue, Sep 24, 2019 at 10:01:41PM +0200, Jann Horn wrote:
> On Tue, Sep 24, 2019 at 9:35 PM Omar Sandoval <osandov@osandov.com> wrote:
> > On Tue, Sep 24, 2019 at 10:15:13AM -0700, Omar Sandoval wrote:
> > > On Thu, Sep 19, 2019 at 05:44:12PM +0200, Jann Horn wrote:
> > > > On Thu, Sep 19, 2019 at 8:54 AM Omar Sandoval <osandov@osandov.com> wrote:
> > > > > Btrfs can transparently compress data written by the user. However, we'd
> > > > > like to add an interface to write pre-compressed data directly to the
> > > > > filesystem. This adds support for so-called "encoded writes" via
> > > > > pwritev2().
> > > > >
> > > > > A new RWF_ENCODED flags indicates that a write is "encoded". If this
> > > > > flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > > contains metadata about the write: namely, the compression algorithm and
> > > > > the unencoded (i.e., decompressed) length of the extent. iov[0].iov_len
> > > > > must be set to sizeof(struct encoded_iov), which can be used to extend
> > > > > the interface in the future. The remaining iovecs contain the encoded
> > > > > extent.
> > > > >
> > > > > A similar interface for reading encoded data can be added to preadv2()
> > > > > in the future.
> > > > >
> > > > > Filesystems must indicate that they support encoded writes by setting
> > > > > FMODE_ENCODED_IO in ->file_open().
> > > > [...]
> > > > > +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> > > > > +                        struct iov_iter *from)
> > > > > +{
> > > > > +       if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> > > > > +               return -EINVAL;
> > > > > +       if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> > > > > +               return -EFAULT;
> > > > > +       if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> > > > > +           encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE) {
> > > > > +               iocb->ki_flags &= ~IOCB_ENCODED;
> > > > > +               return 0;
> > > > > +       }
> > > > > +       if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> > > > > +           encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> > > > > +               return -EINVAL;
> > > > > +       if (!capable(CAP_SYS_ADMIN))
> > > > > +               return -EPERM;
> > > >
> > > > How does this capable() check interact with io_uring? Without having
> > > > looked at this in detail, I suspect that when an encoded write is
> > > > requested through io_uring, the capable() check might be executed on
> > > > something like a workqueue worker thread, which is probably running
> > > > with a full capability set.
> > >
> > > I discussed this more with Jens. You're right, per-IO permission checks
> > > aren't going to work. In fully-polled mode, we never get an opportunity
> > > to check capabilities in right context. So, this will probably require a
> > > new open flag.
> >
> > Actually, file_ns_capable() accomplishes the same thing without a new
> > open flag. Changing the capable() check to file_ns_capable() in
> > init_user_ns should be enough.
> 
> +Aleksa for openat2() and open() space
> 
> Mmh... but if the file descriptor has been passed through a privilege
> boundary, it isn't really clear whether the original opener of the
> file intended for this to be possible. For example, if (as a
> hypothetical example) the init process opens a service's logfile with
> root privileges, then passes the file descriptor to that logfile to
> the service on execve(), that doesn't mean that the service should be
> able to perform compressed writes into that file, I think.

I think we should even generalize this: for most new properties a given
file descriptor can carry we would want it to be explicitly enabled such
that passing the fd around amounts to passing that property around. At
least as soon as we consider it to be associated with some privilege
boundary. I don't think we have done this generally. But I would very
much support moving to such a model.

Christian
Omar Sandoval Sept. 24, 2019, 8:38 p.m. UTC | #7
On Tue, Sep 24, 2019 at 10:01:41PM +0200, Jann Horn wrote:
> On Tue, Sep 24, 2019 at 9:35 PM Omar Sandoval <osandov@osandov.com> wrote:
> > On Tue, Sep 24, 2019 at 10:15:13AM -0700, Omar Sandoval wrote:
> > > On Thu, Sep 19, 2019 at 05:44:12PM +0200, Jann Horn wrote:
> > > > On Thu, Sep 19, 2019 at 8:54 AM Omar Sandoval <osandov@osandov.com> wrote:
> > > > > Btrfs can transparently compress data written by the user. However, we'd
> > > > > like to add an interface to write pre-compressed data directly to the
> > > > > filesystem. This adds support for so-called "encoded writes" via
> > > > > pwritev2().
> > > > >
> > > > > A new RWF_ENCODED flags indicates that a write is "encoded". If this
> > > > > flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > > contains metadata about the write: namely, the compression algorithm and
> > > > > the unencoded (i.e., decompressed) length of the extent. iov[0].iov_len
> > > > > must be set to sizeof(struct encoded_iov), which can be used to extend
> > > > > the interface in the future. The remaining iovecs contain the encoded
> > > > > extent.
> > > > >
> > > > > A similar interface for reading encoded data can be added to preadv2()
> > > > > in the future.
> > > > >
> > > > > Filesystems must indicate that they support encoded writes by setting
> > > > > FMODE_ENCODED_IO in ->file_open().
> > > > [...]
> > > > > +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> > > > > +                        struct iov_iter *from)
> > > > > +{
> > > > > +       if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> > > > > +               return -EINVAL;
> > > > > +       if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> > > > > +               return -EFAULT;
> > > > > +       if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> > > > > +           encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE) {
> > > > > +               iocb->ki_flags &= ~IOCB_ENCODED;
> > > > > +               return 0;
> > > > > +       }
> > > > > +       if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> > > > > +           encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> > > > > +               return -EINVAL;
> > > > > +       if (!capable(CAP_SYS_ADMIN))
> > > > > +               return -EPERM;
> > > >
> > > > How does this capable() check interact with io_uring? Without having
> > > > looked at this in detail, I suspect that when an encoded write is
> > > > requested through io_uring, the capable() check might be executed on
> > > > something like a workqueue worker thread, which is probably running
> > > > with a full capability set.
> > >
> > > I discussed this more with Jens. You're right, per-IO permission checks
> > > aren't going to work. In fully-polled mode, we never get an opportunity
> > > to check capabilities in right context. So, this will probably require a
> > > new open flag.
> >
> > Actually, file_ns_capable() accomplishes the same thing without a new
> > open flag. Changing the capable() check to file_ns_capable() in
> > init_user_ns should be enough.
> 
> +Aleksa for openat2() and open() space
> 
> Mmh... but if the file descriptor has been passed through a privilege
> boundary, it isn't really clear whether the original opener of the
> file intended for this to be possible. For example, if (as a
> hypothetical example) the init process opens a service's logfile with
> root privileges, then passes the file descriptor to that logfile to
> the service on execve(), that doesn't mean that the service should be
> able to perform compressed writes into that file, I think.

Ahh, you're right.

> I think that an open flag (as you already suggested) or an fcntl()
> operation would do the job; but AFAIK the open() flag space has run
> out, so if you hook it up that way, I think you might have to wait for
> Aleksa Sarai to get something like his sys_openat2() suggestion
> (https://lore.kernel.org/lkml/20190904201933.10736-12-cyphar@cyphar.com/)
> merged?

If I counted correctly, there's still space for a new O_ flag. One of
the problems that Aleksa is solving is that unknown O_ flags are
silently ignored, which isn't an issue for an O_ENCODED flag. If the
kernel doesn't support it, it won't support RWF_ENCODED, either, so
you'll get EOPNOTSUPP from pwritev2(). So, open flag it is...
Matthew Wilcox Sept. 24, 2019, 8:50 p.m. UTC | #8
On Tue, Sep 24, 2019 at 10:22:29PM +0200, Christian Brauner wrote:
> On Tue, Sep 24, 2019 at 10:01:41PM +0200, Jann Horn wrote:
> > Mmh... but if the file descriptor has been passed through a privilege
> > boundary, it isn't really clear whether the original opener of the
> > file intended for this to be possible. For example, if (as a
> > hypothetical example) the init process opens a service's logfile with
> > root privileges, then passes the file descriptor to that logfile to
> > the service on execve(), that doesn't mean that the service should be
> > able to perform compressed writes into that file, I think.
> 
> I think we should even generalize this: for most new properties a given
> file descriptor can carry we would want it to be explicitly enabled such
> that passing the fd around amounts to passing that property around. At
> least as soon as we consider it to be associated with some privilege
> boundary. I don't think we have done this generally. But I would very
> much support moving to such a model.

I think you've got this right.  This needs to be an fcntl() flag, which
is only settable by root.  Now, should it be an O_ flag, modifiable by
F_SETFL, or should it be a new F_ flag?
Dave Chinner Sept. 25, 2019, 7:11 a.m. UTC | #9
On Tue, Sep 24, 2019 at 10:01:41PM +0200, Jann Horn wrote:
> On Tue, Sep 24, 2019 at 9:35 PM Omar Sandoval <osandov@osandov.com> wrote:
> > On Tue, Sep 24, 2019 at 10:15:13AM -0700, Omar Sandoval wrote:
> > > On Thu, Sep 19, 2019 at 05:44:12PM +0200, Jann Horn wrote:
> > > > On Thu, Sep 19, 2019 at 8:54 AM Omar Sandoval <osandov@osandov.com> wrote:
> > > > > Btrfs can transparently compress data written by the user. However, we'd
> > > > > like to add an interface to write pre-compressed data directly to the
> > > > > filesystem. This adds support for so-called "encoded writes" via
> > > > > pwritev2().
> > > > >
> > > > > A new RWF_ENCODED flags indicates that a write is "encoded". If this
> > > > > flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > > contains metadata about the write: namely, the compression algorithm and
> > > > > the unencoded (i.e., decompressed) length of the extent. iov[0].iov_len
> > > > > must be set to sizeof(struct encoded_iov), which can be used to extend
> > > > > the interface in the future. The remaining iovecs contain the encoded
> > > > > extent.
> > > > >
> > > > > A similar interface for reading encoded data can be added to preadv2()
> > > > > in the future.
> > > > >
> > > > > Filesystems must indicate that they support encoded writes by setting
> > > > > FMODE_ENCODED_IO in ->file_open().
> > > > [...]
> > > > > +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> > > > > +                        struct iov_iter *from)
> > > > > +{
> > > > > +       if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> > > > > +               return -EINVAL;
> > > > > +       if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> > > > > +               return -EFAULT;
> > > > > +       if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> > > > > +           encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE) {
> > > > > +               iocb->ki_flags &= ~IOCB_ENCODED;
> > > > > +               return 0;
> > > > > +       }
> > > > > +       if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> > > > > +           encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> > > > > +               return -EINVAL;
> > > > > +       if (!capable(CAP_SYS_ADMIN))
> > > > > +               return -EPERM;
> > > >
> > > > How does this capable() check interact with io_uring? Without having
> > > > looked at this in detail, I suspect that when an encoded write is
> > > > requested through io_uring, the capable() check might be executed on
> > > > something like a workqueue worker thread, which is probably running
> > > > with a full capability set.
> > >
> > > I discussed this more with Jens. You're right, per-IO permission checks
> > > aren't going to work. In fully-polled mode, we never get an opportunity
> > > to check capabilities in right context. So, this will probably require a
> > > new open flag.
> >
> > Actually, file_ns_capable() accomplishes the same thing without a new
> > open flag. Changing the capable() check to file_ns_capable() in
> > init_user_ns should be enough.
> 
> +Aleksa for openat2() and open() space
> 
> Mmh... but if the file descriptor has been passed through a privilege
> boundary, it isn't really clear whether the original opener of the
> file intended for this to be possible. For example, if (as a
> hypothetical example) the init process opens a service's logfile with
> root privileges, then passes the file descriptor to that logfile to
> the service on execve(), that doesn't mean that the service should be
> able to perform compressed writes into that file, I think.

Where's the privilege boundary that is being crossed?

We're talking about user data read/write access here, not some
special security capability. Access to the data has already been
permission checked, so why should the format that the data is
supplied to the kernel in suddenly require new privilege checks?

i.e. writing encoded data to a file requires exactly the same
access permissions as writing cleartext data to the file. The only
extra information here is whether the _filesystem_ supports encoded
data, and that doesn't change regardless of what the open file gets
passed to. Hence the capability is either there or it isn't, it
doesn't transform not matter what privilege boundary the file is
passed across. Similarly, we have permission to access the data
or we don't through the struct file, it doesn't transform either.

Hence I don't see why CAP_SYS_ADMIN or any special permissions are
needed for an application with access permissions to file data to
use these RWF_ENCODED IO interfaces. I am inclined to think the
permission check here is wrong and should be dropped, and then all
these issues go away.

Yes, the app that is going to use this needs root perms because it
accesses all data in the fs (it's a backup app!), but that doesn't
mean you can only use RWF_ENCODED if you have root perms.

Cheers,

Dave.
Colin Walters Sept. 25, 2019, 12:07 p.m. UTC | #10
On Wed, Sep 25, 2019, at 3:11 AM, Dave Chinner wrote:
>
> We're talking about user data read/write access here, not some
> special security capability. Access to the data has already been
> permission checked, so why should the format that the data is
> supplied to the kernel in suddenly require new privilege checks?

What happens with BTRFS today if userspace provides invalid compressed data via this interface?  Does that show up as filesystem corruption later?  If the data is verified at write time, wouldn't that be losing most of the speed advantages of providing pre-compressed data?

Ability for a user to cause fsck errors later would be a new thing that would argue for a privilege check I think.
Chris Mason Sept. 25, 2019, 2:56 p.m. UTC | #11
On 25 Sep 2019, at 8:07, Colin Walters wrote:

> On Wed, Sep 25, 2019, at 3:11 AM, Dave Chinner wrote:
>>
>> We're talking about user data read/write access here, not some
>> special security capability. Access to the data has already been
>> permission checked, so why should the format that the data is
>> supplied to the kernel in suddenly require new privilege checks?
>
> What happens with BTRFS today if userspace provides invalid compressed 
> data via this interface?  Does that show up as filesystem corruption 
> later?  If the data is verified at write time, wouldn't that be losing 
> most of the speed advantages of providing pre-compressed data?

The data is verified while being decompressed, but that's a fairly large 
fuzzing surface (all of zstd, zlib, and lzo).  A lot of people will 
correctly argue that we already have that fuzzing surface today, but I'd 
rather not make a really easy way to stuff arbitrary bytes through the 
kernel decompression code until all the projects involved sign off.

-chris
Theodore Y. Ts'o Sept. 25, 2019, 3:08 p.m. UTC | #12
On Wed, Sep 25, 2019 at 08:07:12AM -0400, Colin Walters wrote:
> 
> 
> On Wed, Sep 25, 2019, at 3:11 AM, Dave Chinner wrote:
> >
> > We're talking about user data read/write access here, not some
> > special security capability. Access to the data has already been
> > permission checked, so why should the format that the data is
> > supplied to the kernel in suddenly require new privilege checks?
> 
> What happens with BTRFS today if userspace provides invalid
> compressed data via this interface?  Does that show up as filesystem
> corruption later?  If the data is verified at write time, wouldn't
> that be losing most of the speed advantages of providing
> pre-compressed data?

Not necessarily, most compression algorithms are far more expensive to
compress than to decompress.

If there is a buggy decompressor, it's possible that invalid data
could result in a buffer overrun.  So that's an argument for verifying
the compressed code at write time.  OTOH, the verification could be
just as vulnerability to invalid data as the decompressor, so it
doesn't buy you that much.

> Ability for a user to cause fsck errors later would be a new thing
> that would argue for a privilege check I think.

Well, if it's only invalid data in a user file, there's no reason why
it should cause the kernel declare that the file system is corrupt; it
can just return EIO.

What fsck does is a different question, of course; it might be that
the fsck code isn't going to check compressed user data.  After all,
if all of the files on the file system are compressed, requiring fsck
to check all compressed data blocks is tantamount to requiring it to
read all of the blocks in the file system.  Much better would be some
kind of online scrub operation which validates data files while the
file system is mounted and the system can be in a serving state.

						- Ted
Dave Chinner Sept. 25, 2019, 10:52 p.m. UTC | #13
On Wed, Sep 25, 2019 at 08:07:12AM -0400, Colin Walters wrote:
> 
> 
> On Wed, Sep 25, 2019, at 3:11 AM, Dave Chinner wrote:
> >
> > We're talking about user data read/write access here, not some
> > special security capability. Access to the data has already been
> > permission checked, so why should the format that the data is
> > supplied to the kernel in suddenly require new privilege checks?
> 
> What happens with BTRFS today if userspace provides invalid compressed
> data via this interface?

Then the filesystem returns EIO or ENODATA on read because it can't
decompress it.

User can read it back in compressed format (i.e. same way they wrote
it), try to fix it themselves.

> Does that show up as filesystem corruption later?

Nope. Just bad user data.

> If the data is verified at write time, wouldn't that be losing most of the speed advantages of providing pre-compressed data?

It's a direct IO interface. User writes garbage, then they get
garbage back. The user can still retreive the compressed data
directly, so the data is not lost....

> Ability for a user to cause fsck errors later would be a new thing
> that would argue for a privilege check I think.

fsck doesn't validate the correctness of user data - it validates
the filesystem structure is consistent. i.e. user data in unreadable
format != corrupt filesystem structure.

Cheers,

Dave.
Omar Sandoval Sept. 26, 2019, 12:36 a.m. UTC | #14
On Wed, Sep 25, 2019 at 05:11:29PM +1000, Dave Chinner wrote:
> On Tue, Sep 24, 2019 at 10:01:41PM +0200, Jann Horn wrote:
> > On Tue, Sep 24, 2019 at 9:35 PM Omar Sandoval <osandov@osandov.com> wrote:
> > > On Tue, Sep 24, 2019 at 10:15:13AM -0700, Omar Sandoval wrote:
> > > > On Thu, Sep 19, 2019 at 05:44:12PM +0200, Jann Horn wrote:
> > > > > On Thu, Sep 19, 2019 at 8:54 AM Omar Sandoval <osandov@osandov.com> wrote:
> > > > > > Btrfs can transparently compress data written by the user. However, we'd
> > > > > > like to add an interface to write pre-compressed data directly to the
> > > > > > filesystem. This adds support for so-called "encoded writes" via
> > > > > > pwritev2().
> > > > > >
> > > > > > A new RWF_ENCODED flags indicates that a write is "encoded". If this
> > > > > > flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > > > contains metadata about the write: namely, the compression algorithm and
> > > > > > the unencoded (i.e., decompressed) length of the extent. iov[0].iov_len
> > > > > > must be set to sizeof(struct encoded_iov), which can be used to extend
> > > > > > the interface in the future. The remaining iovecs contain the encoded
> > > > > > extent.
> > > > > >
> > > > > > A similar interface for reading encoded data can be added to preadv2()
> > > > > > in the future.
> > > > > >
> > > > > > Filesystems must indicate that they support encoded writes by setting
> > > > > > FMODE_ENCODED_IO in ->file_open().
> > > > > [...]
> > > > > > +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> > > > > > +                        struct iov_iter *from)
> > > > > > +{
> > > > > > +       if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> > > > > > +               return -EINVAL;
> > > > > > +       if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> > > > > > +               return -EFAULT;
> > > > > > +       if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> > > > > > +           encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE) {
> > > > > > +               iocb->ki_flags &= ~IOCB_ENCODED;
> > > > > > +               return 0;
> > > > > > +       }
> > > > > > +       if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> > > > > > +           encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> > > > > > +               return -EINVAL;
> > > > > > +       if (!capable(CAP_SYS_ADMIN))
> > > > > > +               return -EPERM;
> > > > >
> > > > > How does this capable() check interact with io_uring? Without having
> > > > > looked at this in detail, I suspect that when an encoded write is
> > > > > requested through io_uring, the capable() check might be executed on
> > > > > something like a workqueue worker thread, which is probably running
> > > > > with a full capability set.
> > > >
> > > > I discussed this more with Jens. You're right, per-IO permission checks
> > > > aren't going to work. In fully-polled mode, we never get an opportunity
> > > > to check capabilities in right context. So, this will probably require a
> > > > new open flag.
> > >
> > > Actually, file_ns_capable() accomplishes the same thing without a new
> > > open flag. Changing the capable() check to file_ns_capable() in
> > > init_user_ns should be enough.
> > 
> > +Aleksa for openat2() and open() space
> > 
> > Mmh... but if the file descriptor has been passed through a privilege
> > boundary, it isn't really clear whether the original opener of the
> > file intended for this to be possible. For example, if (as a
> > hypothetical example) the init process opens a service's logfile with
> > root privileges, then passes the file descriptor to that logfile to
> > the service on execve(), that doesn't mean that the service should be
> > able to perform compressed writes into that file, I think.
> 
> Where's the privilege boundary that is being crossed?
> 
> We're talking about user data read/write access here, not some
> special security capability. Access to the data has already been
> permission checked, so why should the format that the data is
> supplied to the kernel in suddenly require new privilege checks?
> 
> i.e. writing encoded data to a file requires exactly the same
> access permissions as writing cleartext data to the file. The only
> extra information here is whether the _filesystem_ supports encoded
> data, and that doesn't change regardless of what the open file gets
> passed to. Hence the capability is either there or it isn't, it
> doesn't transform not matter what privilege boundary the file is
> passed across. Similarly, we have permission to access the data
> or we don't through the struct file, it doesn't transform either.
> 
> Hence I don't see why CAP_SYS_ADMIN or any special permissions are
> needed for an application with access permissions to file data to
> use these RWF_ENCODED IO interfaces. I am inclined to think the
> permission check here is wrong and should be dropped, and then all
> these issues go away.
> 
> Yes, the app that is going to use this needs root perms because it
> accesses all data in the fs (it's a backup app!), but that doesn't
> mean you can only use RWF_ENCODED if you have root perms.

For RWF_ENCODED writes, the risk here is that we'd be adding a way for
an unprivileged process to feed arbitrary data to zlib/lzo/zstd in the
kernel. From what I could find, this is a new attack surface for
unprivileged processes, and based on the search results for
"$compression_algorithm CVE", there are real bugs here.

For RWF_ENCODED reads, there's another potential issue that occurred to
me. There are a few operations for which we may need to chop up a
compressed extent: hole punch, truncate, reflink, and dedupe. Rather
than recompressing the data, Btrfs keeps the whole extent on disk and
updates the file metadata to refer to a piece of the extent. If we want
to support RWF_ENCODED reads for such extents (and I think we do), then
we need to return the entire original extent along with that metadata.
For an unprivileged reader, there's a security issue that we may be
returning data that the reader wasn't supposed to see. (A privileged
reader can go and read the block device anyways.)

So, in my opinion, both reads and writes should require privilege just
to be on the safe side.
Colin Walters Sept. 26, 2019, 12:17 p.m. UTC | #15
On Wed, Sep 25, 2019, at 10:56 AM, Chris Mason wrote:

> The data is verified while being decompressed, but that's a fairly large 
> fuzzing surface (all of zstd, zlib, and lzo).  A lot of people will 
> correctly argue that we already have that fuzzing surface today, but I'd 
> rather not make a really easy way to stuff arbitrary bytes through the 
> kernel decompression code until all the projects involved sign off.

Right.  So maybe have this start of as a BTRFS ioctl and require
privileges?   I assume that's sufficient for what Omar wants.

(Are there actually any other popular Linux filesystems that do transparent compression anyways?)
Omar Sandoval Sept. 26, 2019, 5:46 p.m. UTC | #16
On Thu, Sep 26, 2019 at 08:17:12AM -0400, Colin Walters wrote:
> 
> 
> On Wed, Sep 25, 2019, at 10:56 AM, Chris Mason wrote:
> 
> > The data is verified while being decompressed, but that's a fairly large 
> > fuzzing surface (all of zstd, zlib, and lzo).  A lot of people will 
> > correctly argue that we already have that fuzzing surface today, but I'd 
> > rather not make a really easy way to stuff arbitrary bytes through the 
> > kernel decompression code until all the projects involved sign off.
> 
> Right.  So maybe have this start of as a BTRFS ioctl and require
> privileges?   I assume that's sufficient for what Omar wants.

That was the first version of this series, but Dave requested that I
make it generic [1].

> (Are there actually any other popular Linux filesystems that do transparent compression anyways?)

A scan over the kernel tree shows that a few other filesystems do
compression:

- jffs2
- pstore (if you can call that a filesystem)
- ubifs
- cramfs (read-only)
- erofs (read-only)
- squashfs (read-only)

None of the "mainstream" general-purpose filesystems have support, but
that was also the case for reflink/dedupe before XFS added support.

1: https://lore.kernel.org/linux-fsdevel/20190905021012.GL7777@dread.disaster.area/

Patch
diff mbox series

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 75c4b7680385..ae3ac0312674 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -175,6 +175,9 @@  typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File does not contribute to nr_files count */
 #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
 
+/* File supports encoded IO */
+#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
+
 /*
  * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
  * that indicates that they should check the contents of the iovec are
@@ -314,6 +317,7 @@  enum rw_hint {
 #define IOCB_SYNC		(1 << 5)
 #define IOCB_WRITE		(1 << 6)
 #define IOCB_NOWAIT		(1 << 7)
+#define IOCB_ENCODED		(1 << 8)
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -3046,6 +3050,10 @@  extern int sb_min_blocksize(struct super_block *, int);
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
 extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
+struct encoded_iov;
+extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
+extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
+				struct iov_iter *);
 extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
 				loff_t *count, unsigned int remap_flags);
@@ -3364,6 +3372,11 @@  static inline int kiocb_set_rw_flags(int rw, struct kiocb *ki, rwf_t flags)
 			return -EOPNOTSUPP;
 		ki->ki_flags |= IOCB_NOWAIT;
 	}
+	if (flags & RWF_ENCODED) {
+		if (rw != WRITE || !(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
+			return -EOPNOTSUPP;
+		ki->ki_flags |= IOCB_ENCODED;
+	}
 	if (flags & RWF_HIPRI)
 		ki->ki_flags |= IOCB_HIPRI;
 	if (flags & RWF_DSYNC)
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index aad225b05be7..b775d9aea978 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -283,6 +283,25 @@  struct fsxattr {
 
 typedef int __bitwise __kernel_rwf_t;
 
+enum {
+	ENCODED_IOV_COMPRESSION_NONE,
+	ENCODED_IOV_COMPRESSION_ZLIB,
+	ENCODED_IOV_COMPRESSION_LZO,
+	ENCODED_IOV_COMPRESSION_ZSTD,
+	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
+};
+
+enum {
+	ENCODED_IOV_ENCRYPTION_NONE,
+	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
+};
+
+struct encoded_iov {
+	__u64 unencoded_len;
+	__u32 compression;
+	__u32 encryption;
+};
+
 /* high priority request, poll if possible */
 #define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
 
@@ -298,8 +317,11 @@  typedef int __bitwise __kernel_rwf_t;
 /* per-IO O_APPEND */
 #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
 
+/* encoded (e.g., compressed or encrypted) IO */
+#define RWF_ENCODED	((__force __kernel_rwf_t)0x00000020)
+
 /* mask of flags supported by the kernel */
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
-			 RWF_APPEND)
+			 RWF_APPEND | RWF_ENCODED)
 
 #endif /* _UAPI_LINUX_FS_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 40667c2f3383..3d2555364432 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2974,24 +2974,16 @@  static int generic_write_check_limits(struct file *file, loff_t pos,
 	return 0;
 }
 
-/*
- * Performs necessary checks before doing a write
- *
- * Can adjust writing position or amount of bytes to write.
- * Returns appropriate error code that caller should return or
- * zero in case that write should be allowed.
- */
-inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+static int generic_write_checks_common(struct kiocb *iocb, loff_t *count)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-	loff_t count;
 	int ret;
 
 	if (IS_SWAPFILE(inode))
 		return -ETXTBSY;
 
-	if (!iov_iter_count(from))
+	if (!*count)
 		return 0;
 
 	/* FIXME: this is for backwards compatibility with 2.4 */
@@ -3001,8 +2993,21 @@  inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
 		return -EINVAL;
 
-	count = iov_iter_count(from);
-	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
+}
+
+/*
+ * Performs necessary checks before doing a write
+ *
+ * Can adjust writing position or amount of bytes to write.
+ * Returns a negative errno or the new number of bytes to write.
+ */
+inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+	loff_t count = iov_iter_count(from);
+	int ret;
+
+	ret = generic_write_checks_common(iocb, &count);
 	if (ret)
 		return ret;
 
@@ -3011,6 +3016,52 @@  inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
 }
 EXPORT_SYMBOL(generic_write_checks);
 
+int generic_encoded_write_checks(struct kiocb *iocb,
+				 struct encoded_iov *encoded)
+{
+	loff_t count = encoded->unencoded_len;
+	int ret;
+
+	ret = generic_write_checks_common(iocb, &count);
+	if (ret)
+		return ret;
+
+	if (count != encoded->unencoded_len) {
+		/*
+		 * The write got truncated by generic_write_checks(). We can't
+		 * do a partial encoded write.
+		 */
+		return -EFBIG;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(generic_encoded_write_checks);
+
+/*
+ * If no encoding is set, this clears IOCB_ENCODED and the write should be
+ * treated as a normal write.
+ */
+int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
+			 struct iov_iter *from)
+{
+	if (iov_iter_single_seg_count(from) != sizeof(*encoded))
+		return -EINVAL;
+	if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
+		return -EFAULT;
+	if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
+	    encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE) {
+		iocb->ki_flags &= ~IOCB_ENCODED;
+		return 0;
+	}
+	if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
+	    encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
+		return -EINVAL;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	return 0;
+}
+EXPORT_SYMBOL(import_encoded_write);
+
 /*
  * Performs necessary checks before doing a clone.
  *