[RFC,v2,2/5] fs: add RWF_ENCODED for reading/writing compressed data
diff mbox series

Message ID 7f98cf5409cf2b583cd5b3451fc739fd3428873b.1571164762.git.osandov@fb.com
State New
Headers show
Series
  • fs: interface for directly reading/writing compressed data
Related show

Commit Message

Omar Sandoval Oct. 15, 2019, 6:42 p.m. UTC
From: Omar Sandoval <osandov@fb.com>

Btrfs supports transparent compression: data written by the user can be
compressed when written to disk and decompressed when read back.
However, we'd like to add an interface to write pre-compressed data
directly to the filesystem, and the matching interface to read
compressed data without decompressing it. This adds support for
so-called "encoded I/O" via preadv2() and pwritev2().

A new RWF_ENCODED flags indicates that a read or write is "encoded". If
this flag is set, iov[0].iov_base points to a struct encoded_iov which
is used for metadata: namely, the compression algorithm, unencoded
(i.e., decompressed) length, and what subrange of the unencoded data
should be used (needed for truncated or hole-punched extents and when
reading in the middle of an extent). For reads, the filesystem returns
this information; for writes, the caller provides it to the filesystem.
iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
used to extend the interface in the future. The remaining iovecs contain
the encoded extent.

Filesystems must indicate that they support encoded writes by setting
FMODE_ENCODED_IO in ->file_open().

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 include/linux/fs.h      | 14 +++++++
 include/uapi/linux/fs.h | 26 ++++++++++++-
 mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
 3 files changed, 108 insertions(+), 14 deletions(-)

Comments

Nikolay Borisov Oct. 16, 2019, 9:50 a.m. UTC | #1
On 15.10.19 г. 21:42 ч., Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> Btrfs supports transparent compression: data written by the user can be
> compressed when written to disk and decompressed when read back.
> However, we'd like to add an interface to write pre-compressed data
> directly to the filesystem, and the matching interface to read
> compressed data without decompressing it. This adds support for
> so-called "encoded I/O" via preadv2() and pwritev2().
> 
> A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> this flag is set, iov[0].iov_base points to a struct encoded_iov which
> is used for metadata: namely, the compression algorithm, unencoded
> (i.e., decompressed) length, and what subrange of the unencoded data

In the future when encryption is also supported. What should be the
mechanism to enforce ordering of encoding operations i.e. first compress
then encrypt => uncoded_len should be the resulting size after the
encrypt operation. To me (not being a cryptographer) this seems the
sensible thing to do since compression will be effective that way.
However, what if , for whatever reasons, a different filesystem wants to
support this interface but chooses to do it the other around -> encrypt,
then compress?

> should be used (needed for truncated or hole-punched extents and when
> reading in the middle of an extent). For reads, the filesystem returns
> this information; for writes, the caller provides it to the filesystem.
> iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> used to extend the interface in the future. The remaining iovecs contain
> the encoded extent.
> 
> Filesystems must indicate that they support encoded writes by setting
> FMODE_ENCODED_IO in ->file_open().
> 
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
>  include/linux/fs.h      | 14 +++++++
>  include/uapi/linux/fs.h | 26 ++++++++++++-
>  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
>  3 files changed, 108 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e0d909d35763..54681f21e05e 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
>  /* File does not contribute to nr_files count */
>  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
>  
> +/* File supports encoded IO */
> +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> +
>  /*
>   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
>   * that indicates that they should check the contents of the iovec are
> @@ -314,6 +317,7 @@ enum rw_hint {
>  #define IOCB_SYNC		(1 << 5)
>  #define IOCB_WRITE		(1 << 6)
>  #define IOCB_NOWAIT		(1 << 7)
> +#define IOCB_ENCODED		(1 << 8)
>  
>  struct kiocb {
>  	struct file		*ki_filp;
> @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
>  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
>  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
>  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> +struct encoded_iov;
> +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> +				struct iov_iter *);
>  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
>  				struct file *file_out, loff_t pos_out,
>  				loff_t *count, unsigned int remap_flags);
> @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
>  			return -EOPNOTSUPP;
>  		ki->ki_flags |= IOCB_NOWAIT;
>  	}
> +	if (flags & RWF_ENCODED) {
> +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> +			return -EOPNOTSUPP;
> +		ki->ki_flags |= IOCB_ENCODED;
> +	}
>  	if (flags & RWF_HIPRI)
>  		ki->ki_flags |= IOCB_HIPRI;
>  	if (flags & RWF_DSYNC)
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 379a612f8f1d..ed92a8a257cb 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -284,6 +284,27 @@ struct fsxattr {
>  
>  typedef int __bitwise __kernel_rwf_t;
>  
> +enum {
> +	ENCODED_IOV_COMPRESSION_NONE,
> +	ENCODED_IOV_COMPRESSION_ZLIB,
> +	ENCODED_IOV_COMPRESSION_LZO,
> +	ENCODED_IOV_COMPRESSION_ZSTD,
> +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> +};
> +
> +enum {
> +	ENCODED_IOV_ENCRYPTION_NONE,
> +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> +};
> +
> +struct encoded_iov {
> +	__u64 len;
> +	__u64 unencoded_len;
> +	__u64 unencoded_offset;
> +	__u32 compression;
> +	__u32 encryption;
> +};
> +
>  /* high priority request, poll if possible */
>  #define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
>  
> @@ -299,8 +320,11 @@ typedef int __bitwise __kernel_rwf_t;
>  /* per-IO O_APPEND */
>  #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
>  
> +/* encoded (e.g., compressed or encrypted) IO */

nit: s/or/and\/or/ or both are exclusive?

> +#define RWF_ENCODED	((__force __kernel_rwf_t)0x00000020)
> +
>  /* mask of flags supported by the kernel */
>  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
> -			 RWF_APPEND)
> +			 RWF_APPEND | RWF_ENCODED)
>  
>  #endif /* _UAPI_LINUX_FS_H */
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 1146fcfa3215..d2e6d9caf353 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2948,24 +2948,15 @@ static int generic_write_check_limits(struct file *file, loff_t pos,
>  	return 0;
>  }
>  
> -/*
> - * Performs necessary checks before doing a write
> - *
> - * Can adjust writing position or amount of bytes to write.
> - * Returns appropriate error code that caller should return or
> - * zero in case that write should be allowed.
> - */
> -inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> +static int generic_write_checks_common(struct kiocb *iocb, loff_t *count)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = file->f_mapping->host;
> -	loff_t count;
> -	int ret;
>  
>  	if (IS_SWAPFILE(inode))
>  		return -ETXTBSY;
>  
> -	if (!iov_iter_count(from))
> +	if (!*count)
>  		return 0;
>  
>  	/* FIXME: this is for backwards compatibility with 2.4 */
> @@ -2975,8 +2966,21 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
>  		return -EINVAL;
>  
> -	count = iov_iter_count(from);
> -	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
> +	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
> +}
> +
> +/*
> + * Performs necessary checks before doing a write
> + *
> + * Can adjust writing position or amount of bytes to write.
> + * Returns a negative errno or the new number of bytes to write.
> + */
> +inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	loff_t count = iov_iter_count(from);
> +	int ret;
> +
> +	ret = generic_write_checks_common(iocb, &count);
>  	if (ret)
>  		return ret;
>  
> @@ -2985,6 +2989,58 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  }
>  EXPORT_SYMBOL(generic_write_checks);
>  
> +int generic_encoded_write_checks(struct kiocb *iocb,
> +				 struct encoded_iov *encoded)
> +{
> +	loff_t count = encoded->unencoded_len;
> +	int ret;
> +
> +	ret = generic_write_checks_common(iocb, &count);

That's a bit confusing. You will only ever write encoded len bytes, yet
you check the unencoded len. Presumably that's to ensure the data can be
read back successfully? Still it feels a bit odd. IMO this warrants a
comment.

When you use this function in patch 5 all the checks are performed
against unencoded_len yet you do :

count = encoded.len;

> +	if (ret)
> +		return ret;
> +
> +	if (count != encoded->unencoded_len) {
> +		/*
> +		 * The write got truncated by generic_write_checks_common(). We
> +		 * can't do a partial encoded write.
> +		 */
> +		return -EFBIG;
> +	}
> +	return 0;
> +}
> +EXPORT_SYMBOL(generic_encoded_write_checks);
> +
> +ssize_t check_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
> +{
> +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> +		return -EPERM;
> +	if (iov_iter_single_seg_count(iter) != sizeof(struct encoded_iov))
> +		return -EINVAL;
> +	return iov_iter_count(iter) - sizeof(struct encoded_iov);
> +}
> +EXPORT_SYMBOL(check_encoded_read);
> +
> +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,

nit: This might be just me but 'import' doesn't sound right, how about
parse_encoded_write ?


> +			 struct iov_iter *from)
> +{
> +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> +		return -EPERM;
> +	if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> +		return -EINVAL;
> +	if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> +		return -EFAULT;
> +	if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> +	    encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE)
> +		return -EINVAL;
> +	if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> +	    encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> +		return -EINVAL;
> +	if (encoded->unencoded_offset >= encoded->unencoded_len)
> +		return -EINVAL;
> +	return 0;
> +}
> +EXPORT_SYMBOL(import_encoded_write);
> +
>  /*
>   * Performs necessary checks before doing a clone.
>   *
>
Omar Sandoval Oct. 18, 2019, 10:19 p.m. UTC | #2
On Wed, Oct 16, 2019 at 12:50:48PM +0300, Nikolay Borisov wrote:
> 
> 
> On 15.10.19 г. 21:42 ч., Omar Sandoval wrote:
> > From: Omar Sandoval <osandov@fb.com>
> > 
> > Btrfs supports transparent compression: data written by the user can be
> > compressed when written to disk and decompressed when read back.
> > However, we'd like to add an interface to write pre-compressed data
> > directly to the filesystem, and the matching interface to read
> > compressed data without decompressing it. This adds support for
> > so-called "encoded I/O" via preadv2() and pwritev2().
> > 
> > A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> > this flag is set, iov[0].iov_base points to a struct encoded_iov which
> > is used for metadata: namely, the compression algorithm, unencoded
> > (i.e., decompressed) length, and what subrange of the unencoded data
> 
> In the future when encryption is also supported. What should be the
> mechanism to enforce ordering of encoding operations i.e. first compress
> then encrypt => uncoded_len should be the resulting size after the
> encrypt operation. To me (not being a cryptographer) this seems the
> sensible thing to do since compression will be effective that way.
> However, what if , for whatever reasons, a different filesystem wants to
> support this interface but chooses to do it the other around -> encrypt,
> then compress?

Compress-then-encrypt is the only sane way to do it (because properly
encrypted data is indistinguishable from random data, which doesn't
compress very well). When we add encryption support, we can add a note
to the man page.

If someone _really_ wants encrypt-then-compress, they can add another
encoding field, compression_after_encryption.

> > should be used (needed for truncated or hole-punched extents and when
> > reading in the middle of an extent). For reads, the filesystem returns
> > this information; for writes, the caller provides it to the filesystem.
> > iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> > used to extend the interface in the future. The remaining iovecs contain
> > the encoded extent.
> > 
> > Filesystems must indicate that they support encoded writes by setting
> > FMODE_ENCODED_IO in ->file_open().
> > 
> > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > ---
> >  include/linux/fs.h      | 14 +++++++
> >  include/uapi/linux/fs.h | 26 ++++++++++++-
> >  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
> >  3 files changed, 108 insertions(+), 14 deletions(-)
> > 
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index e0d909d35763..54681f21e05e 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> >  /* File does not contribute to nr_files count */
> >  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
> >  
> > +/* File supports encoded IO */
> > +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> > +
> >  /*
> >   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
> >   * that indicates that they should check the contents of the iovec are
> > @@ -314,6 +317,7 @@ enum rw_hint {
> >  #define IOCB_SYNC		(1 << 5)
> >  #define IOCB_WRITE		(1 << 6)
> >  #define IOCB_NOWAIT		(1 << 7)
> > +#define IOCB_ENCODED		(1 << 8)
> >  
> >  struct kiocb {
> >  	struct file		*ki_filp;
> > @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
> >  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
> >  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
> >  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> > +struct encoded_iov;
> > +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> > +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> > +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> > +				struct iov_iter *);
> >  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
> >  				struct file *file_out, loff_t pos_out,
> >  				loff_t *count, unsigned int remap_flags);
> > @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
> >  			return -EOPNOTSUPP;
> >  		ki->ki_flags |= IOCB_NOWAIT;
> >  	}
> > +	if (flags & RWF_ENCODED) {
> > +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> > +			return -EOPNOTSUPP;
> > +		ki->ki_flags |= IOCB_ENCODED;
> > +	}
> >  	if (flags & RWF_HIPRI)
> >  		ki->ki_flags |= IOCB_HIPRI;
> >  	if (flags & RWF_DSYNC)
> > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > index 379a612f8f1d..ed92a8a257cb 100644
> > --- a/include/uapi/linux/fs.h
> > +++ b/include/uapi/linux/fs.h
> > @@ -284,6 +284,27 @@ struct fsxattr {
> >  
> >  typedef int __bitwise __kernel_rwf_t;
> >  
> > +enum {
> > +	ENCODED_IOV_COMPRESSION_NONE,
> > +	ENCODED_IOV_COMPRESSION_ZLIB,
> > +	ENCODED_IOV_COMPRESSION_LZO,
> > +	ENCODED_IOV_COMPRESSION_ZSTD,
> > +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> > +};
> > +
> > +enum {
> > +	ENCODED_IOV_ENCRYPTION_NONE,
> > +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> > +};
> > +
> > +struct encoded_iov {
> > +	__u64 len;
> > +	__u64 unencoded_len;
> > +	__u64 unencoded_offset;
> > +	__u32 compression;
> > +	__u32 encryption;
> > +};
> > +
> >  /* high priority request, poll if possible */
> >  #define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
> >  
> > @@ -299,8 +320,11 @@ typedef int __bitwise __kernel_rwf_t;
> >  /* per-IO O_APPEND */
> >  #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
> >  
> > +/* encoded (e.g., compressed or encrypted) IO */
> 
> nit: s/or/and\/or/ or both are exclusive?

Changed, thanks.

> > +#define RWF_ENCODED	((__force __kernel_rwf_t)0x00000020)
> > +
> >  /* mask of flags supported by the kernel */
> >  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
> > -			 RWF_APPEND)
> > +			 RWF_APPEND | RWF_ENCODED)
> >  
> >  #endif /* _UAPI_LINUX_FS_H */
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index 1146fcfa3215..d2e6d9caf353 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -2948,24 +2948,15 @@ static int generic_write_check_limits(struct file *file, loff_t pos,
> >  	return 0;
> >  }
> >  
> > -/*
> > - * Performs necessary checks before doing a write
> > - *
> > - * Can adjust writing position or amount of bytes to write.
> > - * Returns appropriate error code that caller should return or
> > - * zero in case that write should be allowed.
> > - */
> > -inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > +static int generic_write_checks_common(struct kiocb *iocb, loff_t *count)
> >  {
> >  	struct file *file = iocb->ki_filp;
> >  	struct inode *inode = file->f_mapping->host;
> > -	loff_t count;
> > -	int ret;
> >  
> >  	if (IS_SWAPFILE(inode))
> >  		return -ETXTBSY;
> >  
> > -	if (!iov_iter_count(from))
> > +	if (!*count)
> >  		return 0;
> >  
> >  	/* FIXME: this is for backwards compatibility with 2.4 */
> > @@ -2975,8 +2966,21 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> >  	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
> >  		return -EINVAL;
> >  
> > -	count = iov_iter_count(from);
> > -	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
> > +	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
> > +}
> > +
> > +/*
> > + * Performs necessary checks before doing a write
> > + *
> > + * Can adjust writing position or amount of bytes to write.
> > + * Returns a negative errno or the new number of bytes to write.
> > + */
> > +inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > +{
> > +	loff_t count = iov_iter_count(from);
> > +	int ret;
> > +
> > +	ret = generic_write_checks_common(iocb, &count);
> >  	if (ret)
> >  		return ret;
> >  
> > @@ -2985,6 +2989,58 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> >  }
> >  EXPORT_SYMBOL(generic_write_checks);
> >  
> > +int generic_encoded_write_checks(struct kiocb *iocb,
> > +				 struct encoded_iov *encoded)
> > +{
> > +	loff_t count = encoded->unencoded_len;
> > +	int ret;
> > +
> > +	ret = generic_write_checks_common(iocb, &count);
> 
> That's a bit confusing. You will only ever write encoded len bytes, yet
> you check the unencoded len. Presumably that's to ensure the data can be
> read back successfully? Still it feels a bit odd. IMO this warrants a
> comment.
> 
> When you use this function in patch 5 all the checks are performed
> against unencoded_len yet you do :
> 
> count = encoded.len;

Oops, this is supposed to check encoded->len. I forgot to update it when
I made the file length and unencoded length distinct fields. Good catch!

This needs to check the file length rather than the encoded length
because the checks in generic_write_check_limits() are concerned with
the file size (RLIMIT_FSIZE and s_maxbytes).

> > +	if (ret)
> > +		return ret;
> > +
> > +	if (count != encoded->unencoded_len) {
> > +		/*
> > +		 * The write got truncated by generic_write_checks_common(). We
> > +		 * can't do a partial encoded write.
> > +		 */
> > +		return -EFBIG;
> > +	}
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL(generic_encoded_write_checks);
> > +
> > +ssize_t check_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
> > +{
> > +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> > +		return -EPERM;
> > +	if (iov_iter_single_seg_count(iter) != sizeof(struct encoded_iov))
> > +		return -EINVAL;
> > +	return iov_iter_count(iter) - sizeof(struct encoded_iov);
> > +}
> > +EXPORT_SYMBOL(check_encoded_read);
> > +
> > +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> 
> nit: This might be just me but 'import' doesn't sound right, how about
> parse_encoded_write ?

The naming is borrowed from import_iovec(). IMO that's more descriptive
since we're not really parsing anything.

> > +			 struct iov_iter *from)
> > +{
> > +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> > +		return -EPERM;
> > +	if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> > +		return -EINVAL;
> > +	if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> > +		return -EFAULT;
> > +	if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> > +	    encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE)
> > +		return -EINVAL;
> > +	if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> > +	    encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> > +		return -EINVAL;
> > +	if (encoded->unencoded_offset >= encoded->unencoded_len)
> > +		return -EINVAL;
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL(import_encoded_write);
> > +
> >  /*
> >   * Performs necessary checks before doing a clone.
> >   *
> >
Aleksa Sarai Oct. 19, 2019, 5:01 a.m. UTC | #3
On 2019-10-15, Omar Sandoval <osandov@osandov.com> wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> Btrfs supports transparent compression: data written by the user can be
> compressed when written to disk and decompressed when read back.
> However, we'd like to add an interface to write pre-compressed data
> directly to the filesystem, and the matching interface to read
> compressed data without decompressing it. This adds support for
> so-called "encoded I/O" via preadv2() and pwritev2().
> 
> A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> this flag is set, iov[0].iov_base points to a struct encoded_iov which
> is used for metadata: namely, the compression algorithm, unencoded
> (i.e., decompressed) length, and what subrange of the unencoded data
> should be used (needed for truncated or hole-punched extents and when
> reading in the middle of an extent). For reads, the filesystem returns
> this information; for writes, the caller provides it to the filesystem.
> iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> used to extend the interface in the future. The remaining iovecs contain
> the encoded extent.
> 
> Filesystems must indicate that they support encoded writes by setting
> FMODE_ENCODED_IO in ->file_open().
> 
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
>  include/linux/fs.h      | 14 +++++++
>  include/uapi/linux/fs.h | 26 ++++++++++++-
>  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
>  3 files changed, 108 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e0d909d35763..54681f21e05e 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
>  /* File does not contribute to nr_files count */
>  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
>  
> +/* File supports encoded IO */
> +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> +
>  /*
>   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
>   * that indicates that they should check the contents of the iovec are
> @@ -314,6 +317,7 @@ enum rw_hint {
>  #define IOCB_SYNC		(1 << 5)
>  #define IOCB_WRITE		(1 << 6)
>  #define IOCB_NOWAIT		(1 << 7)
> +#define IOCB_ENCODED		(1 << 8)
>  
>  struct kiocb {
>  	struct file		*ki_filp;
> @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
>  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
>  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
>  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> +struct encoded_iov;
> +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> +				struct iov_iter *);
>  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
>  				struct file *file_out, loff_t pos_out,
>  				loff_t *count, unsigned int remap_flags);
> @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
>  			return -EOPNOTSUPP;
>  		ki->ki_flags |= IOCB_NOWAIT;
>  	}
> +	if (flags & RWF_ENCODED) {
> +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> +			return -EOPNOTSUPP;
> +		ki->ki_flags |= IOCB_ENCODED;
> +	}
>  	if (flags & RWF_HIPRI)
>  		ki->ki_flags |= IOCB_HIPRI;
>  	if (flags & RWF_DSYNC)
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 379a612f8f1d..ed92a8a257cb 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -284,6 +284,27 @@ struct fsxattr {
>  
>  typedef int __bitwise __kernel_rwf_t;
>  
> +enum {
> +	ENCODED_IOV_COMPRESSION_NONE,
> +	ENCODED_IOV_COMPRESSION_ZLIB,
> +	ENCODED_IOV_COMPRESSION_LZO,
> +	ENCODED_IOV_COMPRESSION_ZSTD,
> +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> +};
> +
> +enum {
> +	ENCODED_IOV_ENCRYPTION_NONE,
> +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> +};
> +
> +struct encoded_iov {
> +	__u64 len;
> +	__u64 unencoded_len;
> +	__u64 unencoded_offset;
> +	__u32 compression;
> +	__u32 encryption;
> +};
> +
>  /* high priority request, poll if possible */
>  #define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
>  
> @@ -299,8 +320,11 @@ typedef int __bitwise __kernel_rwf_t;
>  /* per-IO O_APPEND */
>  #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
>  
> +/* encoded (e.g., compressed or encrypted) IO */
> +#define RWF_ENCODED	((__force __kernel_rwf_t)0x00000020)
> +
>  /* mask of flags supported by the kernel */
>  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
> -			 RWF_APPEND)
> +			 RWF_APPEND | RWF_ENCODED)
>  
>  #endif /* _UAPI_LINUX_FS_H */
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 1146fcfa3215..d2e6d9caf353 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2948,24 +2948,15 @@ static int generic_write_check_limits(struct file *file, loff_t pos,
>  	return 0;
>  }
>  
> -/*
> - * Performs necessary checks before doing a write
> - *
> - * Can adjust writing position or amount of bytes to write.
> - * Returns appropriate error code that caller should return or
> - * zero in case that write should be allowed.
> - */
> -inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> +static int generic_write_checks_common(struct kiocb *iocb, loff_t *count)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = file->f_mapping->host;
> -	loff_t count;
> -	int ret;
>  
>  	if (IS_SWAPFILE(inode))
>  		return -ETXTBSY;
>  
> -	if (!iov_iter_count(from))
> +	if (!*count)
>  		return 0;
>  
>  	/* FIXME: this is for backwards compatibility with 2.4 */
> @@ -2975,8 +2966,21 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
>  		return -EINVAL;
>  
> -	count = iov_iter_count(from);
> -	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
> +	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
> +}
> +
> +/*
> + * Performs necessary checks before doing a write
> + *
> + * Can adjust writing position or amount of bytes to write.
> + * Returns a negative errno or the new number of bytes to write.
> + */
> +inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	loff_t count = iov_iter_count(from);
> +	int ret;
> +
> +	ret = generic_write_checks_common(iocb, &count);
>  	if (ret)
>  		return ret;
>  
> @@ -2985,6 +2989,58 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  }
>  EXPORT_SYMBOL(generic_write_checks);
>  
> +int generic_encoded_write_checks(struct kiocb *iocb,
> +				 struct encoded_iov *encoded)
> +{
> +	loff_t count = encoded->unencoded_len;
> +	int ret;
> +
> +	ret = generic_write_checks_common(iocb, &count);
> +	if (ret)
> +		return ret;
> +
> +	if (count != encoded->unencoded_len) {
> +		/*
> +		 * The write got truncated by generic_write_checks_common(). We
> +		 * can't do a partial encoded write.
> +		 */
> +		return -EFBIG;
> +	}
> +	return 0;
> +}
> +EXPORT_SYMBOL(generic_encoded_write_checks);
> +
> +ssize_t check_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
> +{
> +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> +		return -EPERM;
> +	if (iov_iter_single_seg_count(iter) != sizeof(struct encoded_iov))
> +		return -EINVAL;

I'm not sure what is precisely the right way of doing this within the
iov_iter world (maybe we should write some new helpers for that
usecase), but if you want to make forwards-compatibility much more
smooth please take a look at the new copy_struct_from_user() semantics
(it allows you to extend userspace-exposed structures without making it
painful for new software on old kernels).

Basically the semantics boil down to the following (ksize is the
kernel's struct size, and usize is the size that userspace used). All
new features must have their zero-value mean "don't use the new
feature".

  1. If ksize == usize, use it verbatim.
  2. If ksize > usize (old userspace), zero-fill the rest of the kernel
	 struct -- thus if userspace doesn't know about a new feature, it is
	 disabled.
  3. If ksize < usize (old kernel), check whether the trailing
     (usize - ksize) bytes are zero. If they are, then just use the
	 ksize prefix of the userspace struct. If they are non-zero then
	 give -E2BIG. Thus if userspace is newer than the kernel but isn't
	 using a new feature, it won't get an error.

This is how clone3(2) works (and openat2(2) will work), as well as some
older syscalls like perf_event_open(2) and sched_setattr(2). BPF also
has some similar semantics.

I really would like us to have a much more uniform way of defining
extensible APIs in Linux. By returning -EINVAL for all differently-sized
structs means that any new programs will give errors on older kernels
(even if they aren't using any new features).

> +	return iov_iter_count(iter) - sizeof(struct encoded_iov);
> +}
> +EXPORT_SYMBOL(check_encoded_read);
> +
> +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> +			 struct iov_iter *from)
> +{
> +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> +		return -EPERM;
> +	if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> +		return -EINVAL;
> +	if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> +		return -EFAULT;
> +	if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> +	    encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE)
> +		return -EINVAL;
> +	if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> +	    encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> +		return -EINVAL;
> +	if (encoded->unencoded_offset >= encoded->unencoded_len)
> +		return -EINVAL;
> +	return 0;
> +}
> +EXPORT_SYMBOL(import_encoded_write);
> +
>  /*
>   * Performs necessary checks before doing a clone.
>   *
Darrick J. Wong Oct. 21, 2019, 6:28 p.m. UTC | #4
On Tue, Oct 15, 2019 at 11:42:40AM -0700, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> Btrfs supports transparent compression: data written by the user can be
> compressed when written to disk and decompressed when read back.
> However, we'd like to add an interface to write pre-compressed data
> directly to the filesystem, and the matching interface to read
> compressed data without decompressing it. This adds support for
> so-called "encoded I/O" via preadv2() and pwritev2().
> 
> A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> this flag is set, iov[0].iov_base points to a struct encoded_iov which
> is used for metadata: namely, the compression algorithm, unencoded
> (i.e., decompressed) length, and what subrange of the unencoded data
> should be used (needed for truncated or hole-punched extents and when
> reading in the middle of an extent). For reads, the filesystem returns
> this information; for writes, the caller provides it to the filesystem.
> iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> used to extend the interface in the future. The remaining iovecs contain
> the encoded extent.
> 
> Filesystems must indicate that they support encoded writes by setting
> FMODE_ENCODED_IO in ->file_open().
> 
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
>  include/linux/fs.h      | 14 +++++++
>  include/uapi/linux/fs.h | 26 ++++++++++++-
>  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
>  3 files changed, 108 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e0d909d35763..54681f21e05e 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
>  /* File does not contribute to nr_files count */
>  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
>  
> +/* File supports encoded IO */
> +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> +
>  /*
>   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
>   * that indicates that they should check the contents of the iovec are
> @@ -314,6 +317,7 @@ enum rw_hint {
>  #define IOCB_SYNC		(1 << 5)
>  #define IOCB_WRITE		(1 << 6)
>  #define IOCB_NOWAIT		(1 << 7)
> +#define IOCB_ENCODED		(1 << 8)
>  
>  struct kiocb {
>  	struct file		*ki_filp;
> @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
>  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
>  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
>  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> +struct encoded_iov;
> +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> +				struct iov_iter *);
>  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
>  				struct file *file_out, loff_t pos_out,
>  				loff_t *count, unsigned int remap_flags);
> @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
>  			return -EOPNOTSUPP;
>  		ki->ki_flags |= IOCB_NOWAIT;
>  	}
> +	if (flags & RWF_ENCODED) {
> +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> +			return -EOPNOTSUPP;
> +		ki->ki_flags |= IOCB_ENCODED;
> +	}
>  	if (flags & RWF_HIPRI)
>  		ki->ki_flags |= IOCB_HIPRI;
>  	if (flags & RWF_DSYNC)
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 379a612f8f1d..ed92a8a257cb 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -284,6 +284,27 @@ struct fsxattr {
>  
>  typedef int __bitwise __kernel_rwf_t;
>  
> +enum {
> +	ENCODED_IOV_COMPRESSION_NONE,
> +	ENCODED_IOV_COMPRESSION_ZLIB,
> +	ENCODED_IOV_COMPRESSION_LZO,
> +	ENCODED_IOV_COMPRESSION_ZSTD,
> +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> +};
> +
> +enum {
> +	ENCODED_IOV_ENCRYPTION_NONE,
> +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> +};
> +
> +struct encoded_iov {
> +	__u64 len;
> +	__u64 unencoded_len;
> +	__u64 unencoded_offset;
> +	__u32 compression;
> +	__u32 encryption;

Can we add some must-be-zero padding space at the end here for whomever
comes along next wanting to add more encoding info?

(And maybe a manpage and some basic testing, to reiterate Dave...)

--D

> +};
> +
>  /* high priority request, poll if possible */
>  #define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
>  
> @@ -299,8 +320,11 @@ typedef int __bitwise __kernel_rwf_t;
>  /* per-IO O_APPEND */
>  #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
>  
> +/* encoded (e.g., compressed or encrypted) IO */
> +#define RWF_ENCODED	((__force __kernel_rwf_t)0x00000020)
> +
>  /* mask of flags supported by the kernel */
>  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
> -			 RWF_APPEND)
> +			 RWF_APPEND | RWF_ENCODED)
>  
>  #endif /* _UAPI_LINUX_FS_H */
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 1146fcfa3215..d2e6d9caf353 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2948,24 +2948,15 @@ static int generic_write_check_limits(struct file *file, loff_t pos,
>  	return 0;
>  }
>  
> -/*
> - * Performs necessary checks before doing a write
> - *
> - * Can adjust writing position or amount of bytes to write.
> - * Returns appropriate error code that caller should return or
> - * zero in case that write should be allowed.
> - */
> -inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> +static int generic_write_checks_common(struct kiocb *iocb, loff_t *count)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = file->f_mapping->host;
> -	loff_t count;
> -	int ret;
>  
>  	if (IS_SWAPFILE(inode))
>  		return -ETXTBSY;
>  
> -	if (!iov_iter_count(from))
> +	if (!*count)
>  		return 0;
>  
>  	/* FIXME: this is for backwards compatibility with 2.4 */
> @@ -2975,8 +2966,21 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
>  		return -EINVAL;
>  
> -	count = iov_iter_count(from);
> -	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
> +	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
> +}
> +
> +/*
> + * Performs necessary checks before doing a write
> + *
> + * Can adjust writing position or amount of bytes to write.
> + * Returns a negative errno or the new number of bytes to write.
> + */
> +inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	loff_t count = iov_iter_count(from);
> +	int ret;
> +
> +	ret = generic_write_checks_common(iocb, &count);
>  	if (ret)
>  		return ret;
>  
> @@ -2985,6 +2989,58 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  }
>  EXPORT_SYMBOL(generic_write_checks);
>  
> +int generic_encoded_write_checks(struct kiocb *iocb,
> +				 struct encoded_iov *encoded)
> +{
> +	loff_t count = encoded->unencoded_len;
> +	int ret;
> +
> +	ret = generic_write_checks_common(iocb, &count);
> +	if (ret)
> +		return ret;
> +
> +	if (count != encoded->unencoded_len) {
> +		/*
> +		 * The write got truncated by generic_write_checks_common(). We
> +		 * can't do a partial encoded write.
> +		 */
> +		return -EFBIG;
> +	}
> +	return 0;
> +}
> +EXPORT_SYMBOL(generic_encoded_write_checks);
> +
> +ssize_t check_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
> +{
> +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> +		return -EPERM;
> +	if (iov_iter_single_seg_count(iter) != sizeof(struct encoded_iov))
> +		return -EINVAL;
> +	return iov_iter_count(iter) - sizeof(struct encoded_iov);
> +}
> +EXPORT_SYMBOL(check_encoded_read);
> +
> +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> +			 struct iov_iter *from)
> +{
> +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> +		return -EPERM;
> +	if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> +		return -EINVAL;
> +	if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> +		return -EFAULT;
> +	if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> +	    encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE)
> +		return -EINVAL;
> +	if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> +	    encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> +		return -EINVAL;
> +	if (encoded->unencoded_offset >= encoded->unencoded_len)
> +		return -EINVAL;
> +	return 0;
> +}
> +EXPORT_SYMBOL(import_encoded_write);
> +
>  /*
>   * Performs necessary checks before doing a clone.
>   *
> -- 
> 2.23.0
>
Aleksa Sarai Oct. 21, 2019, 6:38 p.m. UTC | #5
On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> On Tue, Oct 15, 2019 at 11:42:40AM -0700, Omar Sandoval wrote:
> > From: Omar Sandoval <osandov@fb.com>
> > 
> > Btrfs supports transparent compression: data written by the user can be
> > compressed when written to disk and decompressed when read back.
> > However, we'd like to add an interface to write pre-compressed data
> > directly to the filesystem, and the matching interface to read
> > compressed data without decompressing it. This adds support for
> > so-called "encoded I/O" via preadv2() and pwritev2().
> > 
> > A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> > this flag is set, iov[0].iov_base points to a struct encoded_iov which
> > is used for metadata: namely, the compression algorithm, unencoded
> > (i.e., decompressed) length, and what subrange of the unencoded data
> > should be used (needed for truncated or hole-punched extents and when
> > reading in the middle of an extent). For reads, the filesystem returns
> > this information; for writes, the caller provides it to the filesystem.
> > iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> > used to extend the interface in the future. The remaining iovecs contain
> > the encoded extent.
> > 
> > Filesystems must indicate that they support encoded writes by setting
> > FMODE_ENCODED_IO in ->file_open().
> > 
> > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > ---
> >  include/linux/fs.h      | 14 +++++++
> >  include/uapi/linux/fs.h | 26 ++++++++++++-
> >  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
> >  3 files changed, 108 insertions(+), 14 deletions(-)
> > 
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index e0d909d35763..54681f21e05e 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> >  /* File does not contribute to nr_files count */
> >  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
> >  
> > +/* File supports encoded IO */
> > +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> > +
> >  /*
> >   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
> >   * that indicates that they should check the contents of the iovec are
> > @@ -314,6 +317,7 @@ enum rw_hint {
> >  #define IOCB_SYNC		(1 << 5)
> >  #define IOCB_WRITE		(1 << 6)
> >  #define IOCB_NOWAIT		(1 << 7)
> > +#define IOCB_ENCODED		(1 << 8)
> >  
> >  struct kiocb {
> >  	struct file		*ki_filp;
> > @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
> >  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
> >  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
> >  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> > +struct encoded_iov;
> > +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> > +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> > +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> > +				struct iov_iter *);
> >  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
> >  				struct file *file_out, loff_t pos_out,
> >  				loff_t *count, unsigned int remap_flags);
> > @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
> >  			return -EOPNOTSUPP;
> >  		ki->ki_flags |= IOCB_NOWAIT;
> >  	}
> > +	if (flags & RWF_ENCODED) {
> > +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> > +			return -EOPNOTSUPP;
> > +		ki->ki_flags |= IOCB_ENCODED;
> > +	}
> >  	if (flags & RWF_HIPRI)
> >  		ki->ki_flags |= IOCB_HIPRI;
> >  	if (flags & RWF_DSYNC)
> > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > index 379a612f8f1d..ed92a8a257cb 100644
> > --- a/include/uapi/linux/fs.h
> > +++ b/include/uapi/linux/fs.h
> > @@ -284,6 +284,27 @@ struct fsxattr {
> >  
> >  typedef int __bitwise __kernel_rwf_t;
> >  
> > +enum {
> > +	ENCODED_IOV_COMPRESSION_NONE,
> > +	ENCODED_IOV_COMPRESSION_ZLIB,
> > +	ENCODED_IOV_COMPRESSION_LZO,
> > +	ENCODED_IOV_COMPRESSION_ZSTD,
> > +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> > +};
> > +
> > +enum {
> > +	ENCODED_IOV_ENCRYPTION_NONE,
> > +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> > +};
> > +
> > +struct encoded_iov {
> > +	__u64 len;
> > +	__u64 unencoded_len;
> > +	__u64 unencoded_offset;
> > +	__u32 compression;
> > +	__u32 encryption;
> 
> Can we add some must-be-zero padding space at the end here for whomever
> comes along next wanting to add more encoding info?

I would suggest to copy the extension design of copy_struct_from_user().
Adding must-be-zero padding is a less-ideal solution to the extension
problem than length-based extension.

Also (I might be wrong) but shouldn't the __u64s be __aligned_u64 (as
with syscall structure arguments)?

> (And maybe a manpage and some basic testing, to reiterate Dave...)
> 
> --D
> 
> > +};
> > +
> >  /* high priority request, poll if possible */
> >  #define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
> >  
> > @@ -299,8 +320,11 @@ typedef int __bitwise __kernel_rwf_t;
> >  /* per-IO O_APPEND */
> >  #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
> >  
> > +/* encoded (e.g., compressed or encrypted) IO */
> > +#define RWF_ENCODED	((__force __kernel_rwf_t)0x00000020)
> > +
> >  /* mask of flags supported by the kernel */
> >  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
> > -			 RWF_APPEND)
> > +			 RWF_APPEND | RWF_ENCODED)
> >  
> >  #endif /* _UAPI_LINUX_FS_H */
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index 1146fcfa3215..d2e6d9caf353 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -2948,24 +2948,15 @@ static int generic_write_check_limits(struct file *file, loff_t pos,
> >  	return 0;
> >  }
> >  
> > -/*
> > - * Performs necessary checks before doing a write
> > - *
> > - * Can adjust writing position or amount of bytes to write.
> > - * Returns appropriate error code that caller should return or
> > - * zero in case that write should be allowed.
> > - */
> > -inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > +static int generic_write_checks_common(struct kiocb *iocb, loff_t *count)
> >  {
> >  	struct file *file = iocb->ki_filp;
> >  	struct inode *inode = file->f_mapping->host;
> > -	loff_t count;
> > -	int ret;
> >  
> >  	if (IS_SWAPFILE(inode))
> >  		return -ETXTBSY;
> >  
> > -	if (!iov_iter_count(from))
> > +	if (!*count)
> >  		return 0;
> >  
> >  	/* FIXME: this is for backwards compatibility with 2.4 */
> > @@ -2975,8 +2966,21 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> >  	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
> >  		return -EINVAL;
> >  
> > -	count = iov_iter_count(from);
> > -	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
> > +	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
> > +}
> > +
> > +/*
> > + * Performs necessary checks before doing a write
> > + *
> > + * Can adjust writing position or amount of bytes to write.
> > + * Returns a negative errno or the new number of bytes to write.
> > + */
> > +inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > +{
> > +	loff_t count = iov_iter_count(from);
> > +	int ret;
> > +
> > +	ret = generic_write_checks_common(iocb, &count);
> >  	if (ret)
> >  		return ret;
> >  
> > @@ -2985,6 +2989,58 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> >  }
> >  EXPORT_SYMBOL(generic_write_checks);
> >  
> > +int generic_encoded_write_checks(struct kiocb *iocb,
> > +				 struct encoded_iov *encoded)
> > +{
> > +	loff_t count = encoded->unencoded_len;
> > +	int ret;
> > +
> > +	ret = generic_write_checks_common(iocb, &count);
> > +	if (ret)
> > +		return ret;
> > +
> > +	if (count != encoded->unencoded_len) {
> > +		/*
> > +		 * The write got truncated by generic_write_checks_common(). We
> > +		 * can't do a partial encoded write.
> > +		 */
> > +		return -EFBIG;
> > +	}
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL(generic_encoded_write_checks);
> > +
> > +ssize_t check_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
> > +{
> > +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> > +		return -EPERM;
> > +	if (iov_iter_single_seg_count(iter) != sizeof(struct encoded_iov))
> > +		return -EINVAL;
> > +	return iov_iter_count(iter) - sizeof(struct encoded_iov);
> > +}
> > +EXPORT_SYMBOL(check_encoded_read);
> > +
> > +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> > +			 struct iov_iter *from)
> > +{
> > +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> > +		return -EPERM;
> > +	if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> > +		return -EINVAL;
> > +	if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> > +		return -EFAULT;
> > +	if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> > +	    encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE)
> > +		return -EINVAL;
> > +	if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> > +	    encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> > +		return -EINVAL;
> > +	if (encoded->unencoded_offset >= encoded->unencoded_len)
> > +		return -EINVAL;
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL(import_encoded_write);
> > +
> >  /*
> >   * Performs necessary checks before doing a clone.
> >   *
> > -- 
> > 2.23.0
> >
Darrick J. Wong Oct. 21, 2019, 7 p.m. UTC | #6
On Tue, Oct 22, 2019 at 05:38:31AM +1100, Aleksa Sarai wrote:
> On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Tue, Oct 15, 2019 at 11:42:40AM -0700, Omar Sandoval wrote:
> > > From: Omar Sandoval <osandov@fb.com>
> > > 
> > > Btrfs supports transparent compression: data written by the user can be
> > > compressed when written to disk and decompressed when read back.
> > > However, we'd like to add an interface to write pre-compressed data
> > > directly to the filesystem, and the matching interface to read
> > > compressed data without decompressing it. This adds support for
> > > so-called "encoded I/O" via preadv2() and pwritev2().
> > > 
> > > A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> > > this flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > is used for metadata: namely, the compression algorithm, unencoded
> > > (i.e., decompressed) length, and what subrange of the unencoded data
> > > should be used (needed for truncated or hole-punched extents and when
> > > reading in the middle of an extent). For reads, the filesystem returns
> > > this information; for writes, the caller provides it to the filesystem.
> > > iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> > > used to extend the interface in the future. The remaining iovecs contain
> > > the encoded extent.
> > > 
> > > Filesystems must indicate that they support encoded writes by setting
> > > FMODE_ENCODED_IO in ->file_open().
> > > 
> > > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > > ---
> > >  include/linux/fs.h      | 14 +++++++
> > >  include/uapi/linux/fs.h | 26 ++++++++++++-
> > >  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
> > >  3 files changed, 108 insertions(+), 14 deletions(-)
> > > 
> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > index e0d909d35763..54681f21e05e 100644
> > > --- a/include/linux/fs.h
> > > +++ b/include/linux/fs.h
> > > @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> > >  /* File does not contribute to nr_files count */
> > >  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
> > >  
> > > +/* File supports encoded IO */
> > > +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> > > +
> > >  /*
> > >   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
> > >   * that indicates that they should check the contents of the iovec are
> > > @@ -314,6 +317,7 @@ enum rw_hint {
> > >  #define IOCB_SYNC		(1 << 5)
> > >  #define IOCB_WRITE		(1 << 6)
> > >  #define IOCB_NOWAIT		(1 << 7)
> > > +#define IOCB_ENCODED		(1 << 8)
> > >  
> > >  struct kiocb {
> > >  	struct file		*ki_filp;
> > > @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
> > >  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
> > >  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
> > >  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> > > +struct encoded_iov;
> > > +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> > > +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> > > +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> > > +				struct iov_iter *);
> > >  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
> > >  				struct file *file_out, loff_t pos_out,
> > >  				loff_t *count, unsigned int remap_flags);
> > > @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
> > >  			return -EOPNOTSUPP;
> > >  		ki->ki_flags |= IOCB_NOWAIT;
> > >  	}
> > > +	if (flags & RWF_ENCODED) {
> > > +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> > > +			return -EOPNOTSUPP;
> > > +		ki->ki_flags |= IOCB_ENCODED;
> > > +	}
> > >  	if (flags & RWF_HIPRI)
> > >  		ki->ki_flags |= IOCB_HIPRI;
> > >  	if (flags & RWF_DSYNC)
> > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > > index 379a612f8f1d..ed92a8a257cb 100644
> > > --- a/include/uapi/linux/fs.h
> > > +++ b/include/uapi/linux/fs.h
> > > @@ -284,6 +284,27 @@ struct fsxattr {
> > >  
> > >  typedef int __bitwise __kernel_rwf_t;
> > >  
> > > +enum {
> > > +	ENCODED_IOV_COMPRESSION_NONE,
> > > +	ENCODED_IOV_COMPRESSION_ZLIB,
> > > +	ENCODED_IOV_COMPRESSION_LZO,
> > > +	ENCODED_IOV_COMPRESSION_ZSTD,
> > > +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> > > +};
> > > +
> > > +enum {
> > > +	ENCODED_IOV_ENCRYPTION_NONE,
> > > +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> > > +};
> > > +
> > > +struct encoded_iov {
> > > +	__u64 len;
> > > +	__u64 unencoded_len;
> > > +	__u64 unencoded_offset;
> > > +	__u32 compression;
> > > +	__u32 encryption;
> > 
> > Can we add some must-be-zero padding space at the end here for whomever
> > comes along next wanting to add more encoding info?
> 
> I would suggest to copy the extension design of copy_struct_from_user().
> Adding must-be-zero padding is a less-ideal solution to the extension
> problem than length-based extension.

Come to think of it, you /do/ have to specify iov_len so... yeah, do
that instead; we can always extend the structure later.

> Also (I might be wrong) but shouldn't the __u64s be __aligned_u64 (as
> with syscall structure arguments)?

<shrug> No idea, that's the first I've heard of that type and it doesn't
seem to be used by the fs code.  Why would we care about alignment for
an incore structure?

--D

> 
> > (And maybe a manpage and some basic testing, to reiterate Dave...)
> > 
> > --D
> > 
> > > +};
> > > +
> > >  /* high priority request, poll if possible */
> > >  #define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
> > >  
> > > @@ -299,8 +320,11 @@ typedef int __bitwise __kernel_rwf_t;
> > >  /* per-IO O_APPEND */
> > >  #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
> > >  
> > > +/* encoded (e.g., compressed or encrypted) IO */
> > > +#define RWF_ENCODED	((__force __kernel_rwf_t)0x00000020)
> > > +
> > >  /* mask of flags supported by the kernel */
> > >  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
> > > -			 RWF_APPEND)
> > > +			 RWF_APPEND | RWF_ENCODED)
> > >  
> > >  #endif /* _UAPI_LINUX_FS_H */
> > > diff --git a/mm/filemap.c b/mm/filemap.c
> > > index 1146fcfa3215..d2e6d9caf353 100644
> > > --- a/mm/filemap.c
> > > +++ b/mm/filemap.c
> > > @@ -2948,24 +2948,15 @@ static int generic_write_check_limits(struct file *file, loff_t pos,
> > >  	return 0;
> > >  }
> > >  
> > > -/*
> > > - * Performs necessary checks before doing a write
> > > - *
> > > - * Can adjust writing position or amount of bytes to write.
> > > - * Returns appropriate error code that caller should return or
> > > - * zero in case that write should be allowed.
> > > - */
> > > -inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > > +static int generic_write_checks_common(struct kiocb *iocb, loff_t *count)
> > >  {
> > >  	struct file *file = iocb->ki_filp;
> > >  	struct inode *inode = file->f_mapping->host;
> > > -	loff_t count;
> > > -	int ret;
> > >  
> > >  	if (IS_SWAPFILE(inode))
> > >  		return -ETXTBSY;
> > >  
> > > -	if (!iov_iter_count(from))
> > > +	if (!*count)
> > >  		return 0;
> > >  
> > >  	/* FIXME: this is for backwards compatibility with 2.4 */
> > > @@ -2975,8 +2966,21 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > >  	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
> > >  		return -EINVAL;
> > >  
> > > -	count = iov_iter_count(from);
> > > -	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
> > > +	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
> > > +}
> > > +
> > > +/*
> > > + * Performs necessary checks before doing a write
> > > + *
> > > + * Can adjust writing position or amount of bytes to write.
> > > + * Returns a negative errno or the new number of bytes to write.
> > > + */
> > > +inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > > +{
> > > +	loff_t count = iov_iter_count(from);
> > > +	int ret;
> > > +
> > > +	ret = generic_write_checks_common(iocb, &count);
> > >  	if (ret)
> > >  		return ret;
> > >  
> > > @@ -2985,6 +2989,58 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > >  }
> > >  EXPORT_SYMBOL(generic_write_checks);
> > >  
> > > +int generic_encoded_write_checks(struct kiocb *iocb,
> > > +				 struct encoded_iov *encoded)
> > > +{
> > > +	loff_t count = encoded->unencoded_len;
> > > +	int ret;
> > > +
> > > +	ret = generic_write_checks_common(iocb, &count);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	if (count != encoded->unencoded_len) {
> > > +		/*
> > > +		 * The write got truncated by generic_write_checks_common(). We
> > > +		 * can't do a partial encoded write.
> > > +		 */
> > > +		return -EFBIG;
> > > +	}
> > > +	return 0;
> > > +}
> > > +EXPORT_SYMBOL(generic_encoded_write_checks);
> > > +
> > > +ssize_t check_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
> > > +{
> > > +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> > > +		return -EPERM;
> > > +	if (iov_iter_single_seg_count(iter) != sizeof(struct encoded_iov))
> > > +		return -EINVAL;
> > > +	return iov_iter_count(iter) - sizeof(struct encoded_iov);
> > > +}
> > > +EXPORT_SYMBOL(check_encoded_read);
> > > +
> > > +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> > > +			 struct iov_iter *from)
> > > +{
> > > +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> > > +		return -EPERM;
> > > +	if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> > > +		return -EINVAL;
> > > +	if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> > > +		return -EFAULT;
> > > +	if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> > > +	    encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE)
> > > +		return -EINVAL;
> > > +	if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> > > +	    encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> > > +		return -EINVAL;
> > > +	if (encoded->unencoded_offset >= encoded->unencoded_len)
> > > +		return -EINVAL;
> > > +	return 0;
> > > +}
> > > +EXPORT_SYMBOL(import_encoded_write);
> > > +
> > >  /*
> > >   * Performs necessary checks before doing a clone.
> > >   *
> > > -- 
> > > 2.23.0
> > > 
> 
> 
> -- 
> Aleksa Sarai
> Senior Software Engineer (Containers)
> SUSE Linux GmbH
> <https://www.cyphar.com/>
Omar Sandoval Oct. 21, 2019, 7:07 p.m. UTC | #7
On Mon, Oct 21, 2019 at 11:28:06AM -0700, Darrick J. Wong wrote:
> On Tue, Oct 15, 2019 at 11:42:40AM -0700, Omar Sandoval wrote:
> > From: Omar Sandoval <osandov@fb.com>
> > 
> > Btrfs supports transparent compression: data written by the user can be
> > compressed when written to disk and decompressed when read back.
> > However, we'd like to add an interface to write pre-compressed data
> > directly to the filesystem, and the matching interface to read
> > compressed data without decompressing it. This adds support for
> > so-called "encoded I/O" via preadv2() and pwritev2().
> > 
> > A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> > this flag is set, iov[0].iov_base points to a struct encoded_iov which
> > is used for metadata: namely, the compression algorithm, unencoded
> > (i.e., decompressed) length, and what subrange of the unencoded data
> > should be used (needed for truncated or hole-punched extents and when
> > reading in the middle of an extent). For reads, the filesystem returns
> > this information; for writes, the caller provides it to the filesystem.
> > iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> > used to extend the interface in the future. The remaining iovecs contain
> > the encoded extent.
> > 
> > Filesystems must indicate that they support encoded writes by setting
> > FMODE_ENCODED_IO in ->file_open().
> > 
> > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > ---
> >  include/linux/fs.h      | 14 +++++++
> >  include/uapi/linux/fs.h | 26 ++++++++++++-
> >  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
> >  3 files changed, 108 insertions(+), 14 deletions(-)
> > 
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index e0d909d35763..54681f21e05e 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> >  /* File does not contribute to nr_files count */
> >  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
> >  
> > +/* File supports encoded IO */
> > +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> > +
> >  /*
> >   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
> >   * that indicates that they should check the contents of the iovec are
> > @@ -314,6 +317,7 @@ enum rw_hint {
> >  #define IOCB_SYNC		(1 << 5)
> >  #define IOCB_WRITE		(1 << 6)
> >  #define IOCB_NOWAIT		(1 << 7)
> > +#define IOCB_ENCODED		(1 << 8)
> >  
> >  struct kiocb {
> >  	struct file		*ki_filp;
> > @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
> >  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
> >  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
> >  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> > +struct encoded_iov;
> > +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> > +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> > +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> > +				struct iov_iter *);
> >  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
> >  				struct file *file_out, loff_t pos_out,
> >  				loff_t *count, unsigned int remap_flags);
> > @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
> >  			return -EOPNOTSUPP;
> >  		ki->ki_flags |= IOCB_NOWAIT;
> >  	}
> > +	if (flags & RWF_ENCODED) {
> > +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> > +			return -EOPNOTSUPP;
> > +		ki->ki_flags |= IOCB_ENCODED;
> > +	}
> >  	if (flags & RWF_HIPRI)
> >  		ki->ki_flags |= IOCB_HIPRI;
> >  	if (flags & RWF_DSYNC)
> > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > index 379a612f8f1d..ed92a8a257cb 100644
> > --- a/include/uapi/linux/fs.h
> > +++ b/include/uapi/linux/fs.h
> > @@ -284,6 +284,27 @@ struct fsxattr {
> >  
> >  typedef int __bitwise __kernel_rwf_t;
> >  
> > +enum {
> > +	ENCODED_IOV_COMPRESSION_NONE,
> > +	ENCODED_IOV_COMPRESSION_ZLIB,
> > +	ENCODED_IOV_COMPRESSION_LZO,
> > +	ENCODED_IOV_COMPRESSION_ZSTD,
> > +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> > +};
> > +
> > +enum {
> > +	ENCODED_IOV_ENCRYPTION_NONE,
> > +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> > +};
> > +
> > +struct encoded_iov {
> > +	__u64 len;
> > +	__u64 unencoded_len;
> > +	__u64 unencoded_offset;
> > +	__u32 compression;
> > +	__u32 encryption;
> 
> Can we add some must-be-zero padding space at the end here for whomever
> comes along next wanting to add more encoding info?

From the commit message:

iov_len must be set to sizeof(struct encoded_iov), which can be used to
extend the interface in the future.

> (And maybe a manpage and some basic testing, to reiterate Dave...)

I sent a man page as part of this thread:

https://lore.kernel.org/linux-btrfs/c7e8f93596fee7bb818dc0edf29f484036be1abb.1571164851.git.osandov@fb.com/

See my reply to Dave, I have tests in my xfstests repo that I haven't
sent yet.

> --D
Aleksa Sarai Oct. 22, 2019, 1:37 a.m. UTC | #8
On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> On Tue, Oct 22, 2019 at 05:38:31AM +1100, Aleksa Sarai wrote:
> > On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > > On Tue, Oct 15, 2019 at 11:42:40AM -0700, Omar Sandoval wrote:
> > > > From: Omar Sandoval <osandov@fb.com>
> > > > 
> > > > Btrfs supports transparent compression: data written by the user can be
> > > > compressed when written to disk and decompressed when read back.
> > > > However, we'd like to add an interface to write pre-compressed data
> > > > directly to the filesystem, and the matching interface to read
> > > > compressed data without decompressing it. This adds support for
> > > > so-called "encoded I/O" via preadv2() and pwritev2().
> > > > 
> > > > A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> > > > this flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > is used for metadata: namely, the compression algorithm, unencoded
> > > > (i.e., decompressed) length, and what subrange of the unencoded data
> > > > should be used (needed for truncated or hole-punched extents and when
> > > > reading in the middle of an extent). For reads, the filesystem returns
> > > > this information; for writes, the caller provides it to the filesystem.
> > > > iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> > > > used to extend the interface in the future. The remaining iovecs contain
> > > > the encoded extent.
> > > > 
> > > > Filesystems must indicate that they support encoded writes by setting
> > > > FMODE_ENCODED_IO in ->file_open().
> > > > 
> > > > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > > > ---
> > > >  include/linux/fs.h      | 14 +++++++
> > > >  include/uapi/linux/fs.h | 26 ++++++++++++-
> > > >  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
> > > >  3 files changed, 108 insertions(+), 14 deletions(-)
> > > > 
> > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > index e0d909d35763..54681f21e05e 100644
> > > > --- a/include/linux/fs.h
> > > > +++ b/include/linux/fs.h
> > > > @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> > > >  /* File does not contribute to nr_files count */
> > > >  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
> > > >  
> > > > +/* File supports encoded IO */
> > > > +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> > > > +
> > > >  /*
> > > >   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
> > > >   * that indicates that they should check the contents of the iovec are
> > > > @@ -314,6 +317,7 @@ enum rw_hint {
> > > >  #define IOCB_SYNC		(1 << 5)
> > > >  #define IOCB_WRITE		(1 << 6)
> > > >  #define IOCB_NOWAIT		(1 << 7)
> > > > +#define IOCB_ENCODED		(1 << 8)
> > > >  
> > > >  struct kiocb {
> > > >  	struct file		*ki_filp;
> > > > @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
> > > >  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
> > > >  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
> > > >  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> > > > +struct encoded_iov;
> > > > +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> > > > +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> > > > +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> > > > +				struct iov_iter *);
> > > >  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
> > > >  				struct file *file_out, loff_t pos_out,
> > > >  				loff_t *count, unsigned int remap_flags);
> > > > @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
> > > >  			return -EOPNOTSUPP;
> > > >  		ki->ki_flags |= IOCB_NOWAIT;
> > > >  	}
> > > > +	if (flags & RWF_ENCODED) {
> > > > +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> > > > +			return -EOPNOTSUPP;
> > > > +		ki->ki_flags |= IOCB_ENCODED;
> > > > +	}
> > > >  	if (flags & RWF_HIPRI)
> > > >  		ki->ki_flags |= IOCB_HIPRI;
> > > >  	if (flags & RWF_DSYNC)
> > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > > > index 379a612f8f1d..ed92a8a257cb 100644
> > > > --- a/include/uapi/linux/fs.h
> > > > +++ b/include/uapi/linux/fs.h
> > > > @@ -284,6 +284,27 @@ struct fsxattr {
> > > >  
> > > >  typedef int __bitwise __kernel_rwf_t;
> > > >  
> > > > +enum {
> > > > +	ENCODED_IOV_COMPRESSION_NONE,
> > > > +	ENCODED_IOV_COMPRESSION_ZLIB,
> > > > +	ENCODED_IOV_COMPRESSION_LZO,
> > > > +	ENCODED_IOV_COMPRESSION_ZSTD,
> > > > +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> > > > +};
> > > > +
> > > > +enum {
> > > > +	ENCODED_IOV_ENCRYPTION_NONE,
> > > > +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> > > > +};
> > > > +
> > > > +struct encoded_iov {
> > > > +	__u64 len;
> > > > +	__u64 unencoded_len;
> > > > +	__u64 unencoded_offset;
> > > > +	__u32 compression;
> > > > +	__u32 encryption;
> > > 
> > > Can we add some must-be-zero padding space at the end here for whomever
> > > comes along next wanting to add more encoding info?
> > 
> > I would suggest to copy the extension design of copy_struct_from_user().
> > Adding must-be-zero padding is a less-ideal solution to the extension
> > problem than length-based extension.
> 
> Come to think of it, you /do/ have to specify iov_len so... yeah, do
> that instead; we can always extend the structure later.
> 
> > Also (I might be wrong) but shouldn't the __u64s be __aligned_u64 (as
> > with syscall structure arguments)?
> 
> <shrug> No idea, that's the first I've heard of that type and it doesn't
> seem to be used by the fs code.  Why would we care about alignment for
> an incore structure?

When passing u64s from userspace, it's generally considered a good idea
to use __aligned_u64 -- the main reason is that 32-bit userspace on a
64-bit kernel will use different structure alignment for 64-bit fields.

This means you'd need to implement a bunch of COMPAT_SYSCALL-like
handling for that case. It's much simpler to use __aligned_u64 (and on
the plus side I don't think you need to add any fields to ensure the
padding is zero).

> > 
> > > (And maybe a manpage and some basic testing, to reiterate Dave...)
> > > 
> > > --D
> > > 
> > > > +};
> > > > +
> > > >  /* high priority request, poll if possible */
> > > >  #define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
> > > >  
> > > > @@ -299,8 +320,11 @@ typedef int __bitwise __kernel_rwf_t;
> > > >  /* per-IO O_APPEND */
> > > >  #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
> > > >  
> > > > +/* encoded (e.g., compressed or encrypted) IO */
> > > > +#define RWF_ENCODED	((__force __kernel_rwf_t)0x00000020)
> > > > +
> > > >  /* mask of flags supported by the kernel */
> > > >  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
> > > > -			 RWF_APPEND)
> > > > +			 RWF_APPEND | RWF_ENCODED)
> > > >  
> > > >  #endif /* _UAPI_LINUX_FS_H */
> > > > diff --git a/mm/filemap.c b/mm/filemap.c
> > > > index 1146fcfa3215..d2e6d9caf353 100644
> > > > --- a/mm/filemap.c
> > > > +++ b/mm/filemap.c
> > > > @@ -2948,24 +2948,15 @@ static int generic_write_check_limits(struct file *file, loff_t pos,
> > > >  	return 0;
> > > >  }
> > > >  
> > > > -/*
> > > > - * Performs necessary checks before doing a write
> > > > - *
> > > > - * Can adjust writing position or amount of bytes to write.
> > > > - * Returns appropriate error code that caller should return or
> > > > - * zero in case that write should be allowed.
> > > > - */
> > > > -inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > > > +static int generic_write_checks_common(struct kiocb *iocb, loff_t *count)
> > > >  {
> > > >  	struct file *file = iocb->ki_filp;
> > > >  	struct inode *inode = file->f_mapping->host;
> > > > -	loff_t count;
> > > > -	int ret;
> > > >  
> > > >  	if (IS_SWAPFILE(inode))
> > > >  		return -ETXTBSY;
> > > >  
> > > > -	if (!iov_iter_count(from))
> > > > +	if (!*count)
> > > >  		return 0;
> > > >  
> > > >  	/* FIXME: this is for backwards compatibility with 2.4 */
> > > > @@ -2975,8 +2966,21 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > > >  	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
> > > >  		return -EINVAL;
> > > >  
> > > > -	count = iov_iter_count(from);
> > > > -	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
> > > > +	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
> > > > +}
> > > > +
> > > > +/*
> > > > + * Performs necessary checks before doing a write
> > > > + *
> > > > + * Can adjust writing position or amount of bytes to write.
> > > > + * Returns a negative errno or the new number of bytes to write.
> > > > + */
> > > > +inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > > > +{
> > > > +	loff_t count = iov_iter_count(from);
> > > > +	int ret;
> > > > +
> > > > +	ret = generic_write_checks_common(iocb, &count);
> > > >  	if (ret)
> > > >  		return ret;
> > > >  
> > > > @@ -2985,6 +2989,58 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > > >  }
> > > >  EXPORT_SYMBOL(generic_write_checks);
> > > >  
> > > > +int generic_encoded_write_checks(struct kiocb *iocb,
> > > > +				 struct encoded_iov *encoded)
> > > > +{
> > > > +	loff_t count = encoded->unencoded_len;
> > > > +	int ret;
> > > > +
> > > > +	ret = generic_write_checks_common(iocb, &count);
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > > +	if (count != encoded->unencoded_len) {
> > > > +		/*
> > > > +		 * The write got truncated by generic_write_checks_common(). We
> > > > +		 * can't do a partial encoded write.
> > > > +		 */
> > > > +		return -EFBIG;
> > > > +	}
> > > > +	return 0;
> > > > +}
> > > > +EXPORT_SYMBOL(generic_encoded_write_checks);
> > > > +
> > > > +ssize_t check_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
> > > > +{
> > > > +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> > > > +		return -EPERM;
> > > > +	if (iov_iter_single_seg_count(iter) != sizeof(struct encoded_iov))
> > > > +		return -EINVAL;
> > > > +	return iov_iter_count(iter) - sizeof(struct encoded_iov);
> > > > +}
> > > > +EXPORT_SYMBOL(check_encoded_read);
> > > > +
> > > > +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> > > > +			 struct iov_iter *from)
> > > > +{
> > > > +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> > > > +		return -EPERM;
> > > > +	if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> > > > +		return -EINVAL;
> > > > +	if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> > > > +		return -EFAULT;
> > > > +	if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> > > > +	    encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE)
> > > > +		return -EINVAL;
> > > > +	if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> > > > +	    encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> > > > +		return -EINVAL;
> > > > +	if (encoded->unencoded_offset >= encoded->unencoded_len)
> > > > +		return -EINVAL;
> > > > +	return 0;
> > > > +}
> > > > +EXPORT_SYMBOL(import_encoded_write);
> > > > +
> > > >  /*
> > > >   * Performs necessary checks before doing a clone.
> > > >   *
Aleksa Sarai Oct. 22, 2019, 2:02 a.m. UTC | #9
On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> On Tue, Oct 22, 2019 at 05:38:31AM +1100, Aleksa Sarai wrote:
> > On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > > On Tue, Oct 15, 2019 at 11:42:40AM -0700, Omar Sandoval wrote:
> > > > From: Omar Sandoval <osandov@fb.com>
> > > > 
> > > > Btrfs supports transparent compression: data written by the user can be
> > > > compressed when written to disk and decompressed when read back.
> > > > However, we'd like to add an interface to write pre-compressed data
> > > > directly to the filesystem, and the matching interface to read
> > > > compressed data without decompressing it. This adds support for
> > > > so-called "encoded I/O" via preadv2() and pwritev2().
> > > > 
> > > > A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> > > > this flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > is used for metadata: namely, the compression algorithm, unencoded
> > > > (i.e., decompressed) length, and what subrange of the unencoded data
> > > > should be used (needed for truncated or hole-punched extents and when
> > > > reading in the middle of an extent). For reads, the filesystem returns
> > > > this information; for writes, the caller provides it to the filesystem.
> > > > iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> > > > used to extend the interface in the future. The remaining iovecs contain
> > > > the encoded extent.
> > > > 
> > > > Filesystems must indicate that they support encoded writes by setting
> > > > FMODE_ENCODED_IO in ->file_open().
> > > > 
> > > > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > > > ---
> > > >  include/linux/fs.h      | 14 +++++++
> > > >  include/uapi/linux/fs.h | 26 ++++++++++++-
> > > >  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
> > > >  3 files changed, 108 insertions(+), 14 deletions(-)
> > > > 
> > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > index e0d909d35763..54681f21e05e 100644
> > > > --- a/include/linux/fs.h
> > > > +++ b/include/linux/fs.h
> > > > @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> > > >  /* File does not contribute to nr_files count */
> > > >  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
> > > >  
> > > > +/* File supports encoded IO */
> > > > +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> > > > +
> > > >  /*
> > > >   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
> > > >   * that indicates that they should check the contents of the iovec are
> > > > @@ -314,6 +317,7 @@ enum rw_hint {
> > > >  #define IOCB_SYNC		(1 << 5)
> > > >  #define IOCB_WRITE		(1 << 6)
> > > >  #define IOCB_NOWAIT		(1 << 7)
> > > > +#define IOCB_ENCODED		(1 << 8)
> > > >  
> > > >  struct kiocb {
> > > >  	struct file		*ki_filp;
> > > > @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
> > > >  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
> > > >  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
> > > >  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> > > > +struct encoded_iov;
> > > > +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> > > > +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> > > > +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> > > > +				struct iov_iter *);
> > > >  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
> > > >  				struct file *file_out, loff_t pos_out,
> > > >  				loff_t *count, unsigned int remap_flags);
> > > > @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
> > > >  			return -EOPNOTSUPP;
> > > >  		ki->ki_flags |= IOCB_NOWAIT;
> > > >  	}
> > > > +	if (flags & RWF_ENCODED) {
> > > > +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> > > > +			return -EOPNOTSUPP;
> > > > +		ki->ki_flags |= IOCB_ENCODED;
> > > > +	}
> > > >  	if (flags & RWF_HIPRI)
> > > >  		ki->ki_flags |= IOCB_HIPRI;
> > > >  	if (flags & RWF_DSYNC)
> > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > > > index 379a612f8f1d..ed92a8a257cb 100644
> > > > --- a/include/uapi/linux/fs.h
> > > > +++ b/include/uapi/linux/fs.h
> > > > @@ -284,6 +284,27 @@ struct fsxattr {
> > > >  
> > > >  typedef int __bitwise __kernel_rwf_t;
> > > >  
> > > > +enum {
> > > > +	ENCODED_IOV_COMPRESSION_NONE,
> > > > +	ENCODED_IOV_COMPRESSION_ZLIB,
> > > > +	ENCODED_IOV_COMPRESSION_LZO,
> > > > +	ENCODED_IOV_COMPRESSION_ZSTD,
> > > > +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> > > > +};
> > > > +
> > > > +enum {
> > > > +	ENCODED_IOV_ENCRYPTION_NONE,
> > > > +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> > > > +};
> > > > +
> > > > +struct encoded_iov {
> > > > +	__u64 len;
> > > > +	__u64 unencoded_len;
> > > > +	__u64 unencoded_offset;
> > > > +	__u32 compression;
> > > > +	__u32 encryption;
> > > 
> > > Can we add some must-be-zero padding space at the end here for whomever
> > > comes along next wanting to add more encoding info?
> > 
> > I would suggest to copy the extension design of copy_struct_from_user().
> > Adding must-be-zero padding is a less-ideal solution to the extension
> > problem than length-based extension.
> 
> Come to think of it, you /do/ have to specify iov_len so... yeah, do
> that instead; we can always extend the structure later.

Just to clarify -- if we want to make the interface forward-compatible
from the outset (programs built 4 years from now running on 5.5), we
will need to implement this in the original merge. Otherwise userspace
will need to handle backwards-compatibility themselves once new features
are added.

@Omar: If it'd make your life easier, I can send some draft patches
	   which port copy_struct_from_user() to iovec-land.

> > Also (I might be wrong) but shouldn't the __u64s be __aligned_u64 (as
> > with syscall structure arguments)?
> 
> <shrug> No idea, that's the first I've heard of that type and it doesn't
> seem to be used by the fs code.  Why would we care about alignment for
> an incore structure?
> 
> --D
> 
> > 
> > > (And maybe a manpage and some basic testing, to reiterate Dave...)
> > > 
> > > --D
> > > 
> > > > +};
> > > > +
> > > >  /* high priority request, poll if possible */
> > > >  #define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
> > > >  
> > > > @@ -299,8 +320,11 @@ typedef int __bitwise __kernel_rwf_t;
> > > >  /* per-IO O_APPEND */
> > > >  #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
> > > >  
> > > > +/* encoded (e.g., compressed or encrypted) IO */
> > > > +#define RWF_ENCODED	((__force __kernel_rwf_t)0x00000020)
> > > > +
> > > >  /* mask of flags supported by the kernel */
> > > >  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
> > > > -			 RWF_APPEND)
> > > > +			 RWF_APPEND | RWF_ENCODED)
> > > >  
> > > >  #endif /* _UAPI_LINUX_FS_H */
> > > > diff --git a/mm/filemap.c b/mm/filemap.c
> > > > index 1146fcfa3215..d2e6d9caf353 100644
> > > > --- a/mm/filemap.c
> > > > +++ b/mm/filemap.c
> > > > @@ -2948,24 +2948,15 @@ static int generic_write_check_limits(struct file *file, loff_t pos,
> > > >  	return 0;
> > > >  }
> > > >  
> > > > -/*
> > > > - * Performs necessary checks before doing a write
> > > > - *
> > > > - * Can adjust writing position or amount of bytes to write.
> > > > - * Returns appropriate error code that caller should return or
> > > > - * zero in case that write should be allowed.
> > > > - */
> > > > -inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > > > +static int generic_write_checks_common(struct kiocb *iocb, loff_t *count)
> > > >  {
> > > >  	struct file *file = iocb->ki_filp;
> > > >  	struct inode *inode = file->f_mapping->host;
> > > > -	loff_t count;
> > > > -	int ret;
> > > >  
> > > >  	if (IS_SWAPFILE(inode))
> > > >  		return -ETXTBSY;
> > > >  
> > > > -	if (!iov_iter_count(from))
> > > > +	if (!*count)
> > > >  		return 0;
> > > >  
> > > >  	/* FIXME: this is for backwards compatibility with 2.4 */
> > > > @@ -2975,8 +2966,21 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > > >  	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
> > > >  		return -EINVAL;
> > > >  
> > > > -	count = iov_iter_count(from);
> > > > -	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
> > > > +	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
> > > > +}
> > > > +
> > > > +/*
> > > > + * Performs necessary checks before doing a write
> > > > + *
> > > > + * Can adjust writing position or amount of bytes to write.
> > > > + * Returns a negative errno or the new number of bytes to write.
> > > > + */
> > > > +inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > > > +{
> > > > +	loff_t count = iov_iter_count(from);
> > > > +	int ret;
> > > > +
> > > > +	ret = generic_write_checks_common(iocb, &count);
> > > >  	if (ret)
> > > >  		return ret;
> > > >  
> > > > @@ -2985,6 +2989,58 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > > >  }
> > > >  EXPORT_SYMBOL(generic_write_checks);
> > > >  
> > > > +int generic_encoded_write_checks(struct kiocb *iocb,
> > > > +				 struct encoded_iov *encoded)
> > > > +{
> > > > +	loff_t count = encoded->unencoded_len;
> > > > +	int ret;
> > > > +
> > > > +	ret = generic_write_checks_common(iocb, &count);
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > > +	if (count != encoded->unencoded_len) {
> > > > +		/*
> > > > +		 * The write got truncated by generic_write_checks_common(). We
> > > > +		 * can't do a partial encoded write.
> > > > +		 */
> > > > +		return -EFBIG;
> > > > +	}
> > > > +	return 0;
> > > > +}
> > > > +EXPORT_SYMBOL(generic_encoded_write_checks);
> > > > +
> > > > +ssize_t check_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
> > > > +{
> > > > +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> > > > +		return -EPERM;
> > > > +	if (iov_iter_single_seg_count(iter) != sizeof(struct encoded_iov))
> > > > +		return -EINVAL;
> > > > +	return iov_iter_count(iter) - sizeof(struct encoded_iov);
> > > > +}
> > > > +EXPORT_SYMBOL(check_encoded_read);
> > > > +
> > > > +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> > > > +			 struct iov_iter *from)
> > > > +{
> > > > +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> > > > +		return -EPERM;
> > > > +	if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> > > > +		return -EINVAL;
> > > > +	if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> > > > +		return -EFAULT;
> > > > +	if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> > > > +	    encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE)
> > > > +		return -EINVAL;
> > > > +	if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> > > > +	    encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> > > > +		return -EINVAL;
> > > > +	if (encoded->unencoded_offset >= encoded->unencoded_len)
> > > > +		return -EINVAL;
> > > > +	return 0;
> > > > +}
> > > > +EXPORT_SYMBOL(import_encoded_write);
> > > > +
> > > >  /*
> > > >   * Performs necessary checks before doing a clone.
> > > >   *
Omar Sandoval Oct. 30, 2019, 10:21 p.m. UTC | #10
On Tue, Oct 22, 2019 at 12:37:17PM +1100, Aleksa Sarai wrote:
> On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Tue, Oct 22, 2019 at 05:38:31AM +1100, Aleksa Sarai wrote:
> > > On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > > > On Tue, Oct 15, 2019 at 11:42:40AM -0700, Omar Sandoval wrote:
> > > > > From: Omar Sandoval <osandov@fb.com>
> > > > > 
> > > > > Btrfs supports transparent compression: data written by the user can be
> > > > > compressed when written to disk and decompressed when read back.
> > > > > However, we'd like to add an interface to write pre-compressed data
> > > > > directly to the filesystem, and the matching interface to read
> > > > > compressed data without decompressing it. This adds support for
> > > > > so-called "encoded I/O" via preadv2() and pwritev2().
> > > > > 
> > > > > A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> > > > > this flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > > is used for metadata: namely, the compression algorithm, unencoded
> > > > > (i.e., decompressed) length, and what subrange of the unencoded data
> > > > > should be used (needed for truncated or hole-punched extents and when
> > > > > reading in the middle of an extent). For reads, the filesystem returns
> > > > > this information; for writes, the caller provides it to the filesystem.
> > > > > iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> > > > > used to extend the interface in the future. The remaining iovecs contain
> > > > > the encoded extent.
> > > > > 
> > > > > Filesystems must indicate that they support encoded writes by setting
> > > > > FMODE_ENCODED_IO in ->file_open().
> > > > > 
> > > > > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > > > > ---
> > > > >  include/linux/fs.h      | 14 +++++++
> > > > >  include/uapi/linux/fs.h | 26 ++++++++++++-
> > > > >  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
> > > > >  3 files changed, 108 insertions(+), 14 deletions(-)
> > > > > 
> > > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > > index e0d909d35763..54681f21e05e 100644
> > > > > --- a/include/linux/fs.h
> > > > > +++ b/include/linux/fs.h
> > > > > @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> > > > >  /* File does not contribute to nr_files count */
> > > > >  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
> > > > >  
> > > > > +/* File supports encoded IO */
> > > > > +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> > > > > +
> > > > >  /*
> > > > >   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
> > > > >   * that indicates that they should check the contents of the iovec are
> > > > > @@ -314,6 +317,7 @@ enum rw_hint {
> > > > >  #define IOCB_SYNC		(1 << 5)
> > > > >  #define IOCB_WRITE		(1 << 6)
> > > > >  #define IOCB_NOWAIT		(1 << 7)
> > > > > +#define IOCB_ENCODED		(1 << 8)
> > > > >  
> > > > >  struct kiocb {
> > > > >  	struct file		*ki_filp;
> > > > > @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
> > > > >  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
> > > > >  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
> > > > >  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> > > > > +struct encoded_iov;
> > > > > +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> > > > > +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> > > > > +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> > > > > +				struct iov_iter *);
> > > > >  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
> > > > >  				struct file *file_out, loff_t pos_out,
> > > > >  				loff_t *count, unsigned int remap_flags);
> > > > > @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
> > > > >  			return -EOPNOTSUPP;
> > > > >  		ki->ki_flags |= IOCB_NOWAIT;
> > > > >  	}
> > > > > +	if (flags & RWF_ENCODED) {
> > > > > +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> > > > > +			return -EOPNOTSUPP;
> > > > > +		ki->ki_flags |= IOCB_ENCODED;
> > > > > +	}
> > > > >  	if (flags & RWF_HIPRI)
> > > > >  		ki->ki_flags |= IOCB_HIPRI;
> > > > >  	if (flags & RWF_DSYNC)
> > > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > > > > index 379a612f8f1d..ed92a8a257cb 100644
> > > > > --- a/include/uapi/linux/fs.h
> > > > > +++ b/include/uapi/linux/fs.h
> > > > > @@ -284,6 +284,27 @@ struct fsxattr {
> > > > >  
> > > > >  typedef int __bitwise __kernel_rwf_t;
> > > > >  
> > > > > +enum {
> > > > > +	ENCODED_IOV_COMPRESSION_NONE,
> > > > > +	ENCODED_IOV_COMPRESSION_ZLIB,
> > > > > +	ENCODED_IOV_COMPRESSION_LZO,
> > > > > +	ENCODED_IOV_COMPRESSION_ZSTD,
> > > > > +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> > > > > +};
> > > > > +
> > > > > +enum {
> > > > > +	ENCODED_IOV_ENCRYPTION_NONE,
> > > > > +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> > > > > +};
> > > > > +
> > > > > +struct encoded_iov {
> > > > > +	__u64 len;
> > > > > +	__u64 unencoded_len;
> > > > > +	__u64 unencoded_offset;
> > > > > +	__u32 compression;
> > > > > +	__u32 encryption;
> > > > 
> > > > Can we add some must-be-zero padding space at the end here for whomever
> > > > comes along next wanting to add more encoding info?
> > > 
> > > I would suggest to copy the extension design of copy_struct_from_user().
> > > Adding must-be-zero padding is a less-ideal solution to the extension
> > > problem than length-based extension.
> > 
> > Come to think of it, you /do/ have to specify iov_len so... yeah, do
> > that instead; we can always extend the structure later.
> > 
> > > Also (I might be wrong) but shouldn't the __u64s be __aligned_u64 (as
> > > with syscall structure arguments)?
> > 
> > <shrug> No idea, that's the first I've heard of that type and it doesn't
> > seem to be used by the fs code.  Why would we care about alignment for
> > an incore structure?
> 
> When passing u64s from userspace, it's generally considered a good idea
> to use __aligned_u64 -- the main reason is that 32-bit userspace on a
> 64-bit kernel will use different structure alignment for 64-bit fields.
> 
> This means you'd need to implement a bunch of COMPAT_SYSCALL-like
> handling for that case. It's much simpler to use __aligned_u64 (and on
> the plus side I don't think you need to add any fields to ensure the
> padding is zero).

I'll used __aligned_u64 for the next submission.
Omar Sandoval Oct. 30, 2019, 10:26 p.m. UTC | #11
On Tue, Oct 22, 2019 at 01:02:15PM +1100, Aleksa Sarai wrote:
> On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Tue, Oct 22, 2019 at 05:38:31AM +1100, Aleksa Sarai wrote:
> > > On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > > > On Tue, Oct 15, 2019 at 11:42:40AM -0700, Omar Sandoval wrote:
> > > > > From: Omar Sandoval <osandov@fb.com>
> > > > > 
> > > > > Btrfs supports transparent compression: data written by the user can be
> > > > > compressed when written to disk and decompressed when read back.
> > > > > However, we'd like to add an interface to write pre-compressed data
> > > > > directly to the filesystem, and the matching interface to read
> > > > > compressed data without decompressing it. This adds support for
> > > > > so-called "encoded I/O" via preadv2() and pwritev2().
> > > > > 
> > > > > A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> > > > > this flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > > is used for metadata: namely, the compression algorithm, unencoded
> > > > > (i.e., decompressed) length, and what subrange of the unencoded data
> > > > > should be used (needed for truncated or hole-punched extents and when
> > > > > reading in the middle of an extent). For reads, the filesystem returns
> > > > > this information; for writes, the caller provides it to the filesystem.
> > > > > iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> > > > > used to extend the interface in the future. The remaining iovecs contain
> > > > > the encoded extent.
> > > > > 
> > > > > Filesystems must indicate that they support encoded writes by setting
> > > > > FMODE_ENCODED_IO in ->file_open().
> > > > > 
> > > > > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > > > > ---
> > > > >  include/linux/fs.h      | 14 +++++++
> > > > >  include/uapi/linux/fs.h | 26 ++++++++++++-
> > > > >  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
> > > > >  3 files changed, 108 insertions(+), 14 deletions(-)
> > > > > 
> > > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > > index e0d909d35763..54681f21e05e 100644
> > > > > --- a/include/linux/fs.h
> > > > > +++ b/include/linux/fs.h
> > > > > @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> > > > >  /* File does not contribute to nr_files count */
> > > > >  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
> > > > >  
> > > > > +/* File supports encoded IO */
> > > > > +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> > > > > +
> > > > >  /*
> > > > >   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
> > > > >   * that indicates that they should check the contents of the iovec are
> > > > > @@ -314,6 +317,7 @@ enum rw_hint {
> > > > >  #define IOCB_SYNC		(1 << 5)
> > > > >  #define IOCB_WRITE		(1 << 6)
> > > > >  #define IOCB_NOWAIT		(1 << 7)
> > > > > +#define IOCB_ENCODED		(1 << 8)
> > > > >  
> > > > >  struct kiocb {
> > > > >  	struct file		*ki_filp;
> > > > > @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
> > > > >  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
> > > > >  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
> > > > >  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> > > > > +struct encoded_iov;
> > > > > +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> > > > > +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> > > > > +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> > > > > +				struct iov_iter *);
> > > > >  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
> > > > >  				struct file *file_out, loff_t pos_out,
> > > > >  				loff_t *count, unsigned int remap_flags);
> > > > > @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
> > > > >  			return -EOPNOTSUPP;
> > > > >  		ki->ki_flags |= IOCB_NOWAIT;
> > > > >  	}
> > > > > +	if (flags & RWF_ENCODED) {
> > > > > +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> > > > > +			return -EOPNOTSUPP;
> > > > > +		ki->ki_flags |= IOCB_ENCODED;
> > > > > +	}
> > > > >  	if (flags & RWF_HIPRI)
> > > > >  		ki->ki_flags |= IOCB_HIPRI;
> > > > >  	if (flags & RWF_DSYNC)
> > > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > > > > index 379a612f8f1d..ed92a8a257cb 100644
> > > > > --- a/include/uapi/linux/fs.h
> > > > > +++ b/include/uapi/linux/fs.h
> > > > > @@ -284,6 +284,27 @@ struct fsxattr {
> > > > >  
> > > > >  typedef int __bitwise __kernel_rwf_t;
> > > > >  
> > > > > +enum {
> > > > > +	ENCODED_IOV_COMPRESSION_NONE,
> > > > > +	ENCODED_IOV_COMPRESSION_ZLIB,
> > > > > +	ENCODED_IOV_COMPRESSION_LZO,
> > > > > +	ENCODED_IOV_COMPRESSION_ZSTD,
> > > > > +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> > > > > +};
> > > > > +
> > > > > +enum {
> > > > > +	ENCODED_IOV_ENCRYPTION_NONE,
> > > > > +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> > > > > +};
> > > > > +
> > > > > +struct encoded_iov {
> > > > > +	__u64 len;
> > > > > +	__u64 unencoded_len;
> > > > > +	__u64 unencoded_offset;
> > > > > +	__u32 compression;
> > > > > +	__u32 encryption;
> > > > 
> > > > Can we add some must-be-zero padding space at the end here for whomever
> > > > comes along next wanting to add more encoding info?
> > > 
> > > I would suggest to copy the extension design of copy_struct_from_user().
> > > Adding must-be-zero padding is a less-ideal solution to the extension
> > > problem than length-based extension.
> > 
> > Come to think of it, you /do/ have to specify iov_len so... yeah, do
> > that instead; we can always extend the structure later.
> 
> Just to clarify -- if we want to make the interface forward-compatible
> from the outset (programs built 4 years from now running on 5.5), we
> will need to implement this in the original merge. Otherwise userspace
> will need to handle backwards-compatibility themselves once new features
> are added.
> 
> @Omar: If it'd make your life easier, I can send some draft patches
> 	   which port copy_struct_from_user() to iovec-land.

You're right, I didn't think about the case of newer programs on older
kernels. I can do that for the next submission. RWF_ENCODED should
probably translate the E2BIG from copy_struct_from_user() to EINVAL,
though, to avoid ambiguity with the case that the buffer wasn't big
enough to return the encoded data.
Aleksa Sarai Oct. 30, 2019, 11:11 p.m. UTC | #12
On 2019-10-30, Omar Sandoval <osandov@osandov.com> wrote:
> On Tue, Oct 22, 2019 at 01:02:15PM +1100, Aleksa Sarai wrote:
> > On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > > On Tue, Oct 22, 2019 at 05:38:31AM +1100, Aleksa Sarai wrote:
> > > > On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > > > > On Tue, Oct 15, 2019 at 11:42:40AM -0700, Omar Sandoval wrote:
> > > > > > From: Omar Sandoval <osandov@fb.com>
> > > > > > 
> > > > > > Btrfs supports transparent compression: data written by the user can be
> > > > > > compressed when written to disk and decompressed when read back.
> > > > > > However, we'd like to add an interface to write pre-compressed data
> > > > > > directly to the filesystem, and the matching interface to read
> > > > > > compressed data without decompressing it. This adds support for
> > > > > > so-called "encoded I/O" via preadv2() and pwritev2().
> > > > > > 
> > > > > > A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> > > > > > this flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > > > is used for metadata: namely, the compression algorithm, unencoded
> > > > > > (i.e., decompressed) length, and what subrange of the unencoded data
> > > > > > should be used (needed for truncated or hole-punched extents and when
> > > > > > reading in the middle of an extent). For reads, the filesystem returns
> > > > > > this information; for writes, the caller provides it to the filesystem.
> > > > > > iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> > > > > > used to extend the interface in the future. The remaining iovecs contain
> > > > > > the encoded extent.
> > > > > > 
> > > > > > Filesystems must indicate that they support encoded writes by setting
> > > > > > FMODE_ENCODED_IO in ->file_open().
> > > > > > 
> > > > > > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > > > > > ---
> > > > > >  include/linux/fs.h      | 14 +++++++
> > > > > >  include/uapi/linux/fs.h | 26 ++++++++++++-
> > > > > >  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
> > > > > >  3 files changed, 108 insertions(+), 14 deletions(-)
> > > > > > 
> > > > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > > > index e0d909d35763..54681f21e05e 100644
> > > > > > --- a/include/linux/fs.h
> > > > > > +++ b/include/linux/fs.h
> > > > > > @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> > > > > >  /* File does not contribute to nr_files count */
> > > > > >  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
> > > > > >  
> > > > > > +/* File supports encoded IO */
> > > > > > +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> > > > > > +
> > > > > >  /*
> > > > > >   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
> > > > > >   * that indicates that they should check the contents of the iovec are
> > > > > > @@ -314,6 +317,7 @@ enum rw_hint {
> > > > > >  #define IOCB_SYNC		(1 << 5)
> > > > > >  #define IOCB_WRITE		(1 << 6)
> > > > > >  #define IOCB_NOWAIT		(1 << 7)
> > > > > > +#define IOCB_ENCODED		(1 << 8)
> > > > > >  
> > > > > >  struct kiocb {
> > > > > >  	struct file		*ki_filp;
> > > > > > @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
> > > > > >  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
> > > > > >  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
> > > > > >  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> > > > > > +struct encoded_iov;
> > > > > > +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> > > > > > +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> > > > > > +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> > > > > > +				struct iov_iter *);
> > > > > >  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
> > > > > >  				struct file *file_out, loff_t pos_out,
> > > > > >  				loff_t *count, unsigned int remap_flags);
> > > > > > @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
> > > > > >  			return -EOPNOTSUPP;
> > > > > >  		ki->ki_flags |= IOCB_NOWAIT;
> > > > > >  	}
> > > > > > +	if (flags & RWF_ENCODED) {
> > > > > > +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> > > > > > +			return -EOPNOTSUPP;
> > > > > > +		ki->ki_flags |= IOCB_ENCODED;
> > > > > > +	}
> > > > > >  	if (flags & RWF_HIPRI)
> > > > > >  		ki->ki_flags |= IOCB_HIPRI;
> > > > > >  	if (flags & RWF_DSYNC)
> > > > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > > > > > index 379a612f8f1d..ed92a8a257cb 100644
> > > > > > --- a/include/uapi/linux/fs.h
> > > > > > +++ b/include/uapi/linux/fs.h
> > > > > > @@ -284,6 +284,27 @@ struct fsxattr {
> > > > > >  
> > > > > >  typedef int __bitwise __kernel_rwf_t;
> > > > > >  
> > > > > > +enum {
> > > > > > +	ENCODED_IOV_COMPRESSION_NONE,
> > > > > > +	ENCODED_IOV_COMPRESSION_ZLIB,
> > > > > > +	ENCODED_IOV_COMPRESSION_LZO,
> > > > > > +	ENCODED_IOV_COMPRESSION_ZSTD,
> > > > > > +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> > > > > > +};
> > > > > > +
> > > > > > +enum {
> > > > > > +	ENCODED_IOV_ENCRYPTION_NONE,
> > > > > > +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> > > > > > +};
> > > > > > +
> > > > > > +struct encoded_iov {
> > > > > > +	__u64 len;
> > > > > > +	__u64 unencoded_len;
> > > > > > +	__u64 unencoded_offset;
> > > > > > +	__u32 compression;
> > > > > > +	__u32 encryption;
> > > > > 
> > > > > Can we add some must-be-zero padding space at the end here for whomever
> > > > > comes along next wanting to add more encoding info?
> > > > 
> > > > I would suggest to copy the extension design of copy_struct_from_user().
> > > > Adding must-be-zero padding is a less-ideal solution to the extension
> > > > problem than length-based extension.
> > > 
> > > Come to think of it, you /do/ have to specify iov_len so... yeah, do
> > > that instead; we can always extend the structure later.
> > 
> > Just to clarify -- if we want to make the interface forward-compatible
> > from the outset (programs built 4 years from now running on 5.5), we
> > will need to implement this in the original merge. Otherwise userspace
> > will need to handle backwards-compatibility themselves once new features
> > are added.
> > 
> > @Omar: If it'd make your life easier, I can send some draft patches
> > 	   which port copy_struct_from_user() to iovec-land.
> 
> You're right, I didn't think about the case of newer programs on older
> kernels. I can do that for the next submission. RWF_ENCODED should
> probably translate the E2BIG from copy_struct_from_user() to EINVAL,
> though, to avoid ambiguity with the case that the buffer wasn't big
> enough to return the encoded data.

Yeah, that seems fair enough. I would've preferred to keep the error
semantics the same everywhere, but adding additional ambiguity to such
error cases isn't a good idea.

It's a bit of a shame we don't have more granular EINVALs to make it
easier to figure out *why* you got an EINVAL (then we wouldn't have had
to abuse E2BIG to indicate to userspace "you're using a new feature on
an old kernel") -- but that's a more generic problem that probably won't
be solved any time soon.

Patch
diff mbox series

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e0d909d35763..54681f21e05e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -175,6 +175,9 @@  typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File does not contribute to nr_files count */
 #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
 
+/* File supports encoded IO */
+#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
+
 /*
  * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
  * that indicates that they should check the contents of the iovec are
@@ -314,6 +317,7 @@  enum rw_hint {
 #define IOCB_SYNC		(1 << 5)
 #define IOCB_WRITE		(1 << 6)
 #define IOCB_NOWAIT		(1 << 7)
+#define IOCB_ENCODED		(1 << 8)
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -3088,6 +3092,11 @@  extern int sb_min_blocksize(struct super_block *, int);
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
 extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
+struct encoded_iov;
+extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
+extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
+extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
+				struct iov_iter *);
 extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
 				loff_t *count, unsigned int remap_flags);
@@ -3403,6 +3412,11 @@  static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
 			return -EOPNOTSUPP;
 		ki->ki_flags |= IOCB_NOWAIT;
 	}
+	if (flags & RWF_ENCODED) {
+		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
+			return -EOPNOTSUPP;
+		ki->ki_flags |= IOCB_ENCODED;
+	}
 	if (flags & RWF_HIPRI)
 		ki->ki_flags |= IOCB_HIPRI;
 	if (flags & RWF_DSYNC)
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 379a612f8f1d..ed92a8a257cb 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -284,6 +284,27 @@  struct fsxattr {
 
 typedef int __bitwise __kernel_rwf_t;
 
+enum {
+	ENCODED_IOV_COMPRESSION_NONE,
+	ENCODED_IOV_COMPRESSION_ZLIB,
+	ENCODED_IOV_COMPRESSION_LZO,
+	ENCODED_IOV_COMPRESSION_ZSTD,
+	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
+};
+
+enum {
+	ENCODED_IOV_ENCRYPTION_NONE,
+	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
+};
+
+struct encoded_iov {
+	__u64 len;
+	__u64 unencoded_len;
+	__u64 unencoded_offset;
+	__u32 compression;
+	__u32 encryption;
+};
+
 /* high priority request, poll if possible */
 #define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
 
@@ -299,8 +320,11 @@  typedef int __bitwise __kernel_rwf_t;
 /* per-IO O_APPEND */
 #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
 
+/* encoded (e.g., compressed or encrypted) IO */
+#define RWF_ENCODED	((__force __kernel_rwf_t)0x00000020)
+
 /* mask of flags supported by the kernel */
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
-			 RWF_APPEND)
+			 RWF_APPEND | RWF_ENCODED)
 
 #endif /* _UAPI_LINUX_FS_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 1146fcfa3215..d2e6d9caf353 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2948,24 +2948,15 @@  static int generic_write_check_limits(struct file *file, loff_t pos,
 	return 0;
 }
 
-/*
- * Performs necessary checks before doing a write
- *
- * Can adjust writing position or amount of bytes to write.
- * Returns appropriate error code that caller should return or
- * zero in case that write should be allowed.
- */
-inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+static int generic_write_checks_common(struct kiocb *iocb, loff_t *count)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-	loff_t count;
-	int ret;
 
 	if (IS_SWAPFILE(inode))
 		return -ETXTBSY;
 
-	if (!iov_iter_count(from))
+	if (!*count)
 		return 0;
 
 	/* FIXME: this is for backwards compatibility with 2.4 */
@@ -2975,8 +2966,21 @@  inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
 		return -EINVAL;
 
-	count = iov_iter_count(from);
-	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
+}
+
+/*
+ * Performs necessary checks before doing a write
+ *
+ * Can adjust writing position or amount of bytes to write.
+ * Returns a negative errno or the new number of bytes to write.
+ */
+inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+	loff_t count = iov_iter_count(from);
+	int ret;
+
+	ret = generic_write_checks_common(iocb, &count);
 	if (ret)
 		return ret;
 
@@ -2985,6 +2989,58 @@  inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
 }
 EXPORT_SYMBOL(generic_write_checks);
 
+int generic_encoded_write_checks(struct kiocb *iocb,
+				 struct encoded_iov *encoded)
+{
+	loff_t count = encoded->unencoded_len;
+	int ret;
+
+	ret = generic_write_checks_common(iocb, &count);
+	if (ret)
+		return ret;
+
+	if (count != encoded->unencoded_len) {
+		/*
+		 * The write got truncated by generic_write_checks_common(). We
+		 * can't do a partial encoded write.
+		 */
+		return -EFBIG;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(generic_encoded_write_checks);
+
+ssize_t check_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
+{
+	if (!(iocb->ki_filp->f_flags & O_ENCODED))
+		return -EPERM;
+	if (iov_iter_single_seg_count(iter) != sizeof(struct encoded_iov))
+		return -EINVAL;
+	return iov_iter_count(iter) - sizeof(struct encoded_iov);
+}
+EXPORT_SYMBOL(check_encoded_read);
+
+int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
+			 struct iov_iter *from)
+{
+	if (!(iocb->ki_filp->f_flags & O_ENCODED))
+		return -EPERM;
+	if (iov_iter_single_seg_count(from) != sizeof(*encoded))
+		return -EINVAL;
+	if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
+		return -EFAULT;
+	if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
+	    encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE)
+		return -EINVAL;
+	if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
+	    encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
+		return -EINVAL;
+	if (encoded->unencoded_offset >= encoded->unencoded_len)
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL(import_encoded_write);
+
 /*
  * Performs necessary checks before doing a clone.
  *