diff mbox

[v3] fuse: add support for copy_file_range()

Message ID 20180629125341.30466-1-ndevos@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Niels de Vos June 29, 2018, 12:53 p.m. UTC
There are several FUSE filesystems that can implement server-side copy
or other efficient copy/duplication/clone methods. The copy_file_range()
syscall is the standard interface that users have access to while not
depending on external libraries that bypass FUSE.

Signed-off-by: Niels de Vos <ndevos@redhat.com>

---
v2: return ssize_t instead of long
v3: add nodeid_out to fuse_copy_file_range_in for libfuse expectations
---
 fs/fuse/file.c            |  66 +++++++++++++++++++++++
 fs/fuse/fuse_i.h          |   3 ++
 include/uapi/linux/fuse.h | 107 ++++++++++++++++++++++----------------
 3 files changed, 132 insertions(+), 44 deletions(-)

Comments

Niels de Vos Aug. 6, 2018, 10:46 a.m. UTC | #1
Hi Miklos,

On Fri, Jun 29, 2018 at 02:53:41PM +0200, Niels de Vos wrote:
> There are several FUSE filesystems that can implement server-side copy
> or other efficient copy/duplication/clone methods. The copy_file_range()
> syscall is the standard interface that users have access to while not
> depending on external libraries that bypass FUSE.

Could you have a look at this patch? A review would be most welcome.
This has been tested with libfuse, and the pull-request for that is
available at https://github.com/libfuse/libfuse/pull/259

Marcin had a look already too; with his feedback we landed at v3 of this
change.

Thanks,
Niels


> Signed-off-by: Niels de Vos <ndevos@redhat.com>
> 
> ---
> v2: return ssize_t instead of long
> v3: add nodeid_out to fuse_copy_file_range_in for libfuse expectations
> ---
>  fs/fuse/file.c            |  66 +++++++++++++++++++++++
>  fs/fuse/fuse_i.h          |   3 ++
>  include/uapi/linux/fuse.h | 107 ++++++++++++++++++++++----------------
>  3 files changed, 132 insertions(+), 44 deletions(-)
> 
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 67648ccbdd43..864939a1215d 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -3009,6 +3009,71 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
>  	return err;
>  }
>  
> +static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
> +				    struct file *file_out, loff_t pos_out,
> +				    size_t len, unsigned int flags)
> +{
> +	struct fuse_file *ff_in = file_in->private_data;
> +	struct fuse_file *ff_out = file_out->private_data;
> +	struct inode *inode_out = file_inode(file_out);
> +	struct fuse_inode *fi_out = get_fuse_inode(inode_out);
> +	struct fuse_conn *fc = ff_in->fc;
> +	FUSE_ARGS(args);
> +	struct fuse_copy_file_range_in inarg = {
> +		.fh_in = ff_in->fh,
> +		.off_in = pos_in,
> +		.nodeid_out = ff_out->nodeid,
> +		.fh_out = ff_out->fh,
> +		.off_out = pos_out,
> +		.len = len,
> +		.flags = flags
> +	};
> +	struct fuse_copy_file_range_out outarg;
> +	ssize_t err;
> +
> +	if (fc->no_copy_file_range)
> +		return -EOPNOTSUPP;
> +
> +	inode_lock(inode_out);
> +	set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
> +
> +	args.in.h.opcode = FUSE_COPY_FILE_RANGE;
> +	args.in.h.nodeid = ff_in->nodeid;
> +	args.in.numargs = 1;
> +	args.in.args[0].size = sizeof(inarg);
> +	args.in.args[0].value = &inarg;
> +	args.out.numargs = 1;
> +	args.out.args[0].size = sizeof(outarg);
> +	args.out.args[0].value = &outarg;
> +	err = fuse_simple_request(fc, &args);
> +	if (err == -ENOSYS) {
> +		fc->no_copy_file_range = 1;
> +		err = -EOPNOTSUPP;
> +	}
> +	if (err)
> +		goto out;
> +
> +	/* we might have extended the file */
> +	if (outarg.size > 0) {
> +		/* Size of inode_out may not have changed in case of
> +		 * overwrites, oh well. */
> +		bool changed = fuse_write_update_size(inode_out,
> +						      pos_out + outarg.size);
> +
> +		if (changed && fc->writeback_cache)
> +			file_update_time(file_out);
> +	}
> +
> +	fuse_invalidate_attr(inode_out);
> +
> +	err = outarg.size;
> +out:
> +	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
> +	inode_unlock(inode_out);
> +
> +	return err;
> +}
> +
>  static const struct file_operations fuse_file_operations = {
>  	.llseek		= fuse_file_llseek,
>  	.read_iter	= fuse_file_read_iter,
> @@ -3025,6 +3090,7 @@ static const struct file_operations fuse_file_operations = {
>  	.compat_ioctl	= fuse_file_compat_ioctl,
>  	.poll		= fuse_file_poll,
>  	.fallocate	= fuse_file_fallocate,
> +	.copy_file_range = fuse_copy_file_range,
>  };
>  
>  static const struct file_operations fuse_direct_io_file_operations = {
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 5256ad333b05..ea848bb7d9e2 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -637,6 +637,9 @@ struct fuse_conn {
>  	/** Allow other than the mounter user to access the filesystem ? */
>  	unsigned allow_other:1;
>  
> +	/** Does the filesystem support copy_file_range? */
> +	unsigned no_copy_file_range:1;
> +
>  	/** The number of requests waiting for completion */
>  	atomic_t num_waiting;
>  
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 92fa24c24c92..84aa810e04c8 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -116,6 +116,9 @@
>   *
>   *  7.27
>   *  - add FUSE_ABORT_ERROR
> + *
> + *  7.28
> + *  - add FUSE_COPY_FILE_RANGE
>   */
>  
>  #ifndef _LINUX_FUSE_H
> @@ -337,50 +340,51 @@ struct fuse_file_lock {
>  #define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0)
>  
>  enum fuse_opcode {
> -	FUSE_LOOKUP	   = 1,
> -	FUSE_FORGET	   = 2,  /* no reply */
> -	FUSE_GETATTR	   = 3,
> -	FUSE_SETATTR	   = 4,
> -	FUSE_READLINK	   = 5,
> -	FUSE_SYMLINK	   = 6,
> -	FUSE_MKNOD	   = 8,
> -	FUSE_MKDIR	   = 9,
> -	FUSE_UNLINK	   = 10,
> -	FUSE_RMDIR	   = 11,
> -	FUSE_RENAME	   = 12,
> -	FUSE_LINK	   = 13,
> -	FUSE_OPEN	   = 14,
> -	FUSE_READ	   = 15,
> -	FUSE_WRITE	   = 16,
> -	FUSE_STATFS	   = 17,
> -	FUSE_RELEASE       = 18,
> -	FUSE_FSYNC         = 20,
> -	FUSE_SETXATTR      = 21,
> -	FUSE_GETXATTR      = 22,
> -	FUSE_LISTXATTR     = 23,
> -	FUSE_REMOVEXATTR   = 24,
> -	FUSE_FLUSH         = 25,
> -	FUSE_INIT          = 26,
> -	FUSE_OPENDIR       = 27,
> -	FUSE_READDIR       = 28,
> -	FUSE_RELEASEDIR    = 29,
> -	FUSE_FSYNCDIR      = 30,
> -	FUSE_GETLK         = 31,
> -	FUSE_SETLK         = 32,
> -	FUSE_SETLKW        = 33,
> -	FUSE_ACCESS        = 34,
> -	FUSE_CREATE        = 35,
> -	FUSE_INTERRUPT     = 36,
> -	FUSE_BMAP          = 37,
> -	FUSE_DESTROY       = 38,
> -	FUSE_IOCTL         = 39,
> -	FUSE_POLL          = 40,
> -	FUSE_NOTIFY_REPLY  = 41,
> -	FUSE_BATCH_FORGET  = 42,
> -	FUSE_FALLOCATE     = 43,
> -	FUSE_READDIRPLUS   = 44,
> -	FUSE_RENAME2       = 45,
> -	FUSE_LSEEK         = 46,
> +	FUSE_LOOKUP	     = 1,
> +	FUSE_FORGET	     = 2,  /* no reply */
> +	FUSE_GETATTR	     = 3,
> +	FUSE_SETATTR	     = 4,
> +	FUSE_READLINK	     = 5,
> +	FUSE_SYMLINK	     = 6,
> +	FUSE_MKNOD	     = 8,
> +	FUSE_MKDIR	     = 9,
> +	FUSE_UNLINK	     = 10,
> +	FUSE_RMDIR	     = 11,
> +	FUSE_RENAME	     = 12,
> +	FUSE_LINK	     = 13,
> +	FUSE_OPEN	     = 14,
> +	FUSE_READ	     = 15,
> +	FUSE_WRITE	     = 16,
> +	FUSE_STATFS	     = 17,
> +	FUSE_RELEASE         = 18,
> +	FUSE_FSYNC           = 20,
> +	FUSE_SETXATTR        = 21,
> +	FUSE_GETXATTR        = 22,
> +	FUSE_LISTXATTR       = 23,
> +	FUSE_REMOVEXATTR     = 24,
> +	FUSE_FLUSH           = 25,
> +	FUSE_INIT            = 26,
> +	FUSE_OPENDIR         = 27,
> +	FUSE_READDIR         = 28,
> +	FUSE_RELEASEDIR      = 29,
> +	FUSE_FSYNCDIR        = 30,
> +	FUSE_GETLK           = 31,
> +	FUSE_SETLK           = 32,
> +	FUSE_SETLKW          = 33,
> +	FUSE_ACCESS          = 34,
> +	FUSE_CREATE          = 35,
> +	FUSE_INTERRUPT       = 36,
> +	FUSE_BMAP            = 37,
> +	FUSE_DESTROY         = 38,
> +	FUSE_IOCTL           = 39,
> +	FUSE_POLL            = 40,
> +	FUSE_NOTIFY_REPLY    = 41,
> +	FUSE_BATCH_FORGET    = 42,
> +	FUSE_FALLOCATE       = 43,
> +	FUSE_READDIRPLUS     = 44,
> +	FUSE_RENAME2         = 45,
> +	FUSE_LSEEK           = 46,
> +	FUSE_COPY_FILE_RANGE = 47,
>  
>  	/* CUSE specific operations */
>  	CUSE_INIT          = 4096,
> @@ -792,4 +796,19 @@ struct fuse_lseek_out {
>  	uint64_t	offset;
>  };
>  
> +struct fuse_copy_file_range_in {
> +	uint64_t	fh_in;
> +	uint64_t	off_in;
> +	uint64_t	nodeid_out;
> +	uint64_t	fh_out;
> +	uint64_t	off_out;
> +	uint64_t	len;
> +	uint32_t	flags;
> +};
> +
> +struct fuse_copy_file_range_out {
> +	uint32_t	size;
> +	uint32_t	padding;
> +};
> +
>  #endif /* _LINUX_FUSE_H */
> -- 
> 2.17.1
>
Miklos Szeredi Aug. 7, 2018, 12:02 p.m. UTC | #2
On Fri, Jun 29, 2018 at 2:53 PM, Niels de Vos <ndevos@redhat.com> wrote:
> There are several FUSE filesystems that can implement server-side copy
> or other efficient copy/duplication/clone methods. The copy_file_range()
> syscall is the standard interface that users have access to while not
> depending on external libraries that bypass FUSE.
>
> Signed-off-by: Niels de Vos <ndevos@redhat.com>
>
> ---
> v2: return ssize_t instead of long
> v3: add nodeid_out to fuse_copy_file_range_in for libfuse expectations
> ---
>  fs/fuse/file.c            |  66 +++++++++++++++++++++++
>  fs/fuse/fuse_i.h          |   3 ++
>  include/uapi/linux/fuse.h | 107 ++++++++++++++++++++++----------------
>  3 files changed, 132 insertions(+), 44 deletions(-)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 67648ccbdd43..864939a1215d 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -3009,6 +3009,71 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
>         return err;
>  }
>
> +static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
> +                                   struct file *file_out, loff_t pos_out,
> +                                   size_t len, unsigned int flags)
> +{
> +       struct fuse_file *ff_in = file_in->private_data;
> +       struct fuse_file *ff_out = file_out->private_data;
> +       struct inode *inode_out = file_inode(file_out);
> +       struct fuse_inode *fi_out = get_fuse_inode(inode_out);
> +       struct fuse_conn *fc = ff_in->fc;
> +       FUSE_ARGS(args);
> +       struct fuse_copy_file_range_in inarg = {
> +               .fh_in = ff_in->fh,
> +               .off_in = pos_in,
> +               .nodeid_out = ff_out->nodeid,
> +               .fh_out = ff_out->fh,
> +               .off_out = pos_out,
> +               .len = len,
> +               .flags = flags
> +       };
> +       struct fuse_copy_file_range_out outarg;
> +       ssize_t err;
> +
> +       if (fc->no_copy_file_range)
> +               return -EOPNOTSUPP;
> +
> +       inode_lock(inode_out);
> +       set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);

This one is only needed in the non-writeback-cache case and only if
the operation is size extending.

Here's how the writeback-cache is supposed to work: the kernel buffers
writes, just like a normal filesystem, as well as buffering related
metadata updates (size & [cm]time), again, just like a normal
filesystem.  This means we just don't care about i_size being updated
in userspace, any such change will be overwritten when the metadata is
flushed out.

In writeback-cache mode, when we do any other data modification, we
need to first flush out the cache so that the order of writes is not
mixed up.  See fallocate() for example.  We could be selective and
only flush the range covered by [pos, pos+len], but just flushing
everything is okay.

I could add these, but you already have a test for this set up, so, I
wouldn't mind if you post a new version.

> +
> +       args.in.h.opcode = FUSE_COPY_FILE_RANGE;
> +       args.in.h.nodeid = ff_in->nodeid;
> +       args.in.numargs = 1;
> +       args.in.args[0].size = sizeof(inarg);
> +       args.in.args[0].value = &inarg;
> +       args.out.numargs = 1;
> +       args.out.args[0].size = sizeof(outarg);
> +       args.out.args[0].value = &outarg;
> +       err = fuse_simple_request(fc, &args);
> +       if (err == -ENOSYS) {
> +               fc->no_copy_file_range = 1;
> +               err = -EOPNOTSUPP;
> +       }
> +       if (err)
> +               goto out;
> +
> +       /* we might have extended the file */
> +       if (outarg.size > 0) {
> +               /* Size of inode_out may not have changed in case of
> +                * overwrites, oh well. */
> +               bool changed = fuse_write_update_size(inode_out,
> +                                                     pos_out + outarg.size);
> +
> +               if (changed && fc->writeback_cache)
> +                       file_update_time(file_out);
> +       }
> +
> +       fuse_invalidate_attr(inode_out);
> +
> +       err = outarg.size;
> +out:
> +       clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
> +       inode_unlock(inode_out);
> +
> +       return err;
> +}
> +
>  static const struct file_operations fuse_file_operations = {
>         .llseek         = fuse_file_llseek,
>         .read_iter      = fuse_file_read_iter,
> @@ -3025,6 +3090,7 @@ static const struct file_operations fuse_file_operations = {
>         .compat_ioctl   = fuse_file_compat_ioctl,
>         .poll           = fuse_file_poll,
>         .fallocate      = fuse_file_fallocate,
> +       .copy_file_range = fuse_copy_file_range,
>  };
>
>  static const struct file_operations fuse_direct_io_file_operations = {
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 5256ad333b05..ea848bb7d9e2 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -637,6 +637,9 @@ struct fuse_conn {
>         /** Allow other than the mounter user to access the filesystem ? */
>         unsigned allow_other:1;
>
> +       /** Does the filesystem support copy_file_range? */
> +       unsigned no_copy_file_range:1;
> +
>         /** The number of requests waiting for completion */
>         atomic_t num_waiting;
>
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 92fa24c24c92..84aa810e04c8 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -116,6 +116,9 @@
>   *
>   *  7.27
>   *  - add FUSE_ABORT_ERROR
> + *
> + *  7.28
> + *  - add FUSE_COPY_FILE_RANGE
>   */
>
>  #ifndef _LINUX_FUSE_H
> @@ -337,50 +340,51 @@ struct fuse_file_lock {
>  #define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0)
>
>  enum fuse_opcode {
> -       FUSE_LOOKUP        = 1,
> -       FUSE_FORGET        = 2,  /* no reply */
> -       FUSE_GETATTR       = 3,
> -       FUSE_SETATTR       = 4,
> -       FUSE_READLINK      = 5,
> -       FUSE_SYMLINK       = 6,
> -       FUSE_MKNOD         = 8,
> -       FUSE_MKDIR         = 9,
> -       FUSE_UNLINK        = 10,
> -       FUSE_RMDIR         = 11,
> -       FUSE_RENAME        = 12,
> -       FUSE_LINK          = 13,
> -       FUSE_OPEN          = 14,
> -       FUSE_READ          = 15,
> -       FUSE_WRITE         = 16,
> -       FUSE_STATFS        = 17,
> -       FUSE_RELEASE       = 18,
> -       FUSE_FSYNC         = 20,
> -       FUSE_SETXATTR      = 21,
> -       FUSE_GETXATTR      = 22,
> -       FUSE_LISTXATTR     = 23,
> -       FUSE_REMOVEXATTR   = 24,
> -       FUSE_FLUSH         = 25,
> -       FUSE_INIT          = 26,
> -       FUSE_OPENDIR       = 27,
> -       FUSE_READDIR       = 28,
> -       FUSE_RELEASEDIR    = 29,
> -       FUSE_FSYNCDIR      = 30,
> -       FUSE_GETLK         = 31,
> -       FUSE_SETLK         = 32,
> -       FUSE_SETLKW        = 33,
> -       FUSE_ACCESS        = 34,
> -       FUSE_CREATE        = 35,
> -       FUSE_INTERRUPT     = 36,
> -       FUSE_BMAP          = 37,
> -       FUSE_DESTROY       = 38,
> -       FUSE_IOCTL         = 39,
> -       FUSE_POLL          = 40,
> -       FUSE_NOTIFY_REPLY  = 41,
> -       FUSE_BATCH_FORGET  = 42,
> -       FUSE_FALLOCATE     = 43,
> -       FUSE_READDIRPLUS   = 44,
> -       FUSE_RENAME2       = 45,
> -       FUSE_LSEEK         = 46,
> +       FUSE_LOOKUP          = 1,
> +       FUSE_FORGET          = 2,  /* no reply */
> +       FUSE_GETATTR         = 3,
> +       FUSE_SETATTR         = 4,
> +       FUSE_READLINK        = 5,
> +       FUSE_SYMLINK         = 6,
> +       FUSE_MKNOD           = 8,
> +       FUSE_MKDIR           = 9,
> +       FUSE_UNLINK          = 10,
> +       FUSE_RMDIR           = 11,
> +       FUSE_RENAME          = 12,
> +       FUSE_LINK            = 13,
> +       FUSE_OPEN            = 14,
> +       FUSE_READ            = 15,
> +       FUSE_WRITE           = 16,
> +       FUSE_STATFS          = 17,
> +       FUSE_RELEASE         = 18,
> +       FUSE_FSYNC           = 20,
> +       FUSE_SETXATTR        = 21,
> +       FUSE_GETXATTR        = 22,
> +       FUSE_LISTXATTR       = 23,
> +       FUSE_REMOVEXATTR     = 24,
> +       FUSE_FLUSH           = 25,
> +       FUSE_INIT            = 26,
> +       FUSE_OPENDIR         = 27,
> +       FUSE_READDIR         = 28,
> +       FUSE_RELEASEDIR      = 29,
> +       FUSE_FSYNCDIR        = 30,
> +       FUSE_GETLK           = 31,
> +       FUSE_SETLK           = 32,
> +       FUSE_SETLKW          = 33,
> +       FUSE_ACCESS          = 34,
> +       FUSE_CREATE          = 35,
> +       FUSE_INTERRUPT       = 36,
> +       FUSE_BMAP            = 37,
> +       FUSE_DESTROY         = 38,
> +       FUSE_IOCTL           = 39,
> +       FUSE_POLL            = 40,
> +       FUSE_NOTIFY_REPLY    = 41,
> +       FUSE_BATCH_FORGET    = 42,
> +       FUSE_FALLOCATE       = 43,
> +       FUSE_READDIRPLUS     = 44,
> +       FUSE_RENAME2         = 45,
> +       FUSE_LSEEK           = 46,
> +       FUSE_COPY_FILE_RANGE = 47,

Nit: please do tabulation with tabs instead of spaces.

>
>         /* CUSE specific operations */
>         CUSE_INIT          = 4096,
> @@ -792,4 +796,19 @@ struct fuse_lseek_out {
>         uint64_t        offset;
>  };
>
> +struct fuse_copy_file_range_in {
> +       uint64_t        fh_in;
> +       uint64_t        off_in;
> +       uint64_t        nodeid_out;
> +       uint64_t        fh_out;
> +       uint64_t        off_out;
> +       uint64_t        len;
> +       uint32_t        flags;

Why not uint64_t for flags?

> +};
> +
> +struct fuse_copy_file_range_out {
> +       uint32_t        size;
> +       uint32_t        padding;
> +};

Could reuse "struct fuse_write_out" for this.   Helps with the
userspace interface as well, since the same fuse_reply_write()
function can be used.

Thanks,
Miklos
Niels de Vos Aug. 21, 2018, 10:12 a.m. UTC | #3
On Tue, Aug 07, 2018 at 02:02:35PM +0200, Miklos Szeredi wrote:
> On Fri, Jun 29, 2018 at 2:53 PM, Niels de Vos <ndevos@redhat.com> wrote:
> > There are several FUSE filesystems that can implement server-side copy
> > or other efficient copy/duplication/clone methods. The copy_file_range()
> > syscall is the standard interface that users have access to while not
> > depending on external libraries that bypass FUSE.
> >
> > Signed-off-by: Niels de Vos <ndevos@redhat.com>
> >
> > ---
> > v2: return ssize_t instead of long
> > v3: add nodeid_out to fuse_copy_file_range_in for libfuse expectations
> > ---
> >  fs/fuse/file.c            |  66 +++++++++++++++++++++++
> >  fs/fuse/fuse_i.h          |   3 ++
> >  include/uapi/linux/fuse.h | 107 ++++++++++++++++++++++----------------
> >  3 files changed, 132 insertions(+), 44 deletions(-)
> >
> > diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> > index 67648ccbdd43..864939a1215d 100644
> > --- a/fs/fuse/file.c
> > +++ b/fs/fuse/file.c
> > @@ -3009,6 +3009,71 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
> >         return err;
> >  }
> >
> > +static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
> > +                                   struct file *file_out, loff_t pos_out,
> > +                                   size_t len, unsigned int flags)
> > +{
> > +       struct fuse_file *ff_in = file_in->private_data;
> > +       struct fuse_file *ff_out = file_out->private_data;
> > +       struct inode *inode_out = file_inode(file_out);
> > +       struct fuse_inode *fi_out = get_fuse_inode(inode_out);
> > +       struct fuse_conn *fc = ff_in->fc;
> > +       FUSE_ARGS(args);
> > +       struct fuse_copy_file_range_in inarg = {
> > +               .fh_in = ff_in->fh,
> > +               .off_in = pos_in,
> > +               .nodeid_out = ff_out->nodeid,
> > +               .fh_out = ff_out->fh,
> > +               .off_out = pos_out,
> > +               .len = len,
> > +               .flags = flags
> > +       };
> > +       struct fuse_copy_file_range_out outarg;
> > +       ssize_t err;
> > +
> > +       if (fc->no_copy_file_range)
> > +               return -EOPNOTSUPP;
> > +
> > +       inode_lock(inode_out);
> > +       set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
> 
> This one is only needed in the non-writeback-cache case and only if
> the operation is size extending.
> 
> Here's how the writeback-cache is supposed to work: the kernel buffers
> writes, just like a normal filesystem, as well as buffering related
> metadata updates (size & [cm]time), again, just like a normal
> filesystem.  This means we just don't care about i_size being updated
> in userspace, any such change will be overwritten when the metadata is
> flushed out.
> 
> In writeback-cache mode, when we do any other data modification, we
> need to first flush out the cache so that the order of writes is not
> mixed up.  See fallocate() for example.  We could be selective and
> only flush the range covered by [pos, pos+len], but just flushing
> everything is okay.

Thanks! I think I understood what you mean and I'll be sending an
updated version soon.

> I could add these, but you already have a test for this set up, so, I
> wouldn't mind if you post a new version.

No problem. I got something ready and tested on my side.


...
> > +       FUSE_POLL            = 40,
> > +       FUSE_NOTIFY_REPLY    = 41,
> > +       FUSE_BATCH_FORGET    = 42,
> > +       FUSE_FALLOCATE       = 43,
> > +       FUSE_READDIRPLUS     = 44,
> > +       FUSE_RENAME2         = 45,
> > +       FUSE_LSEEK           = 46,
> > +       FUSE_COPY_FILE_RANGE = 47,
> 
> Nit: please do tabulation with tabs instead of spaces.

Will do.


> >
> >         /* CUSE specific operations */
> >         CUSE_INIT          = 4096,
> > @@ -792,4 +796,19 @@ struct fuse_lseek_out {
> >         uint64_t        offset;
> >  };
> >
> > +struct fuse_copy_file_range_in {
> > +       uint64_t        fh_in;
> > +       uint64_t        off_in;
> > +       uint64_t        nodeid_out;
> > +       uint64_t        fh_out;
> > +       uint64_t        off_out;
> > +       uint64_t        len;
> > +       uint32_t        flags;
> 
> Why not uint64_t for flags?

Everything else uses uint32_t for flags in this file. I'll make it
uint64_t in the next update.


> > +};
> > +
> > +struct fuse_copy_file_range_out {
> > +       uint32_t        size;
> > +       uint32_t        padding;
> > +};
> 
> Could reuse "struct fuse_write_out" for this.   Helps with the
> userspace interface as well, since the same fuse_reply_write()
> function can be used.

I considered that before as well. In case the interface changes an
updated struct fuse_copy_file_range_out can always be added later. And
hopefully there is no reason to change it at all.

At the moment I am running a few more tests to verify an updated patch,
and will send it out later today.

Niels
diff mbox

Patch

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 67648ccbdd43..864939a1215d 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -3009,6 +3009,71 @@  static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	return err;
 }
 
+static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
+				    struct file *file_out, loff_t pos_out,
+				    size_t len, unsigned int flags)
+{
+	struct fuse_file *ff_in = file_in->private_data;
+	struct fuse_file *ff_out = file_out->private_data;
+	struct inode *inode_out = file_inode(file_out);
+	struct fuse_inode *fi_out = get_fuse_inode(inode_out);
+	struct fuse_conn *fc = ff_in->fc;
+	FUSE_ARGS(args);
+	struct fuse_copy_file_range_in inarg = {
+		.fh_in = ff_in->fh,
+		.off_in = pos_in,
+		.nodeid_out = ff_out->nodeid,
+		.fh_out = ff_out->fh,
+		.off_out = pos_out,
+		.len = len,
+		.flags = flags
+	};
+	struct fuse_copy_file_range_out outarg;
+	ssize_t err;
+
+	if (fc->no_copy_file_range)
+		return -EOPNOTSUPP;
+
+	inode_lock(inode_out);
+	set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
+
+	args.in.h.opcode = FUSE_COPY_FILE_RANGE;
+	args.in.h.nodeid = ff_in->nodeid;
+	args.in.numargs = 1;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.out.numargs = 1;
+	args.out.args[0].size = sizeof(outarg);
+	args.out.args[0].value = &outarg;
+	err = fuse_simple_request(fc, &args);
+	if (err == -ENOSYS) {
+		fc->no_copy_file_range = 1;
+		err = -EOPNOTSUPP;
+	}
+	if (err)
+		goto out;
+
+	/* we might have extended the file */
+	if (outarg.size > 0) {
+		/* Size of inode_out may not have changed in case of
+		 * overwrites, oh well. */
+		bool changed = fuse_write_update_size(inode_out,
+						      pos_out + outarg.size);
+
+		if (changed && fc->writeback_cache)
+			file_update_time(file_out);
+	}
+
+	fuse_invalidate_attr(inode_out);
+
+	err = outarg.size;
+out:
+	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
+	inode_unlock(inode_out);
+
+	return err;
+}
+
 static const struct file_operations fuse_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read_iter	= fuse_file_read_iter,
@@ -3025,6 +3090,7 @@  static const struct file_operations fuse_file_operations = {
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
 	.fallocate	= fuse_file_fallocate,
+	.copy_file_range = fuse_copy_file_range,
 };
 
 static const struct file_operations fuse_direct_io_file_operations = {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 5256ad333b05..ea848bb7d9e2 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -637,6 +637,9 @@  struct fuse_conn {
 	/** Allow other than the mounter user to access the filesystem ? */
 	unsigned allow_other:1;
 
+	/** Does the filesystem support copy_file_range? */
+	unsigned no_copy_file_range:1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 92fa24c24c92..84aa810e04c8 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -116,6 +116,9 @@ 
  *
  *  7.27
  *  - add FUSE_ABORT_ERROR
+ *
+ *  7.28
+ *  - add FUSE_COPY_FILE_RANGE
  */
 
 #ifndef _LINUX_FUSE_H
@@ -337,50 +340,51 @@  struct fuse_file_lock {
 #define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0)
 
 enum fuse_opcode {
-	FUSE_LOOKUP	   = 1,
-	FUSE_FORGET	   = 2,  /* no reply */
-	FUSE_GETATTR	   = 3,
-	FUSE_SETATTR	   = 4,
-	FUSE_READLINK	   = 5,
-	FUSE_SYMLINK	   = 6,
-	FUSE_MKNOD	   = 8,
-	FUSE_MKDIR	   = 9,
-	FUSE_UNLINK	   = 10,
-	FUSE_RMDIR	   = 11,
-	FUSE_RENAME	   = 12,
-	FUSE_LINK	   = 13,
-	FUSE_OPEN	   = 14,
-	FUSE_READ	   = 15,
-	FUSE_WRITE	   = 16,
-	FUSE_STATFS	   = 17,
-	FUSE_RELEASE       = 18,
-	FUSE_FSYNC         = 20,
-	FUSE_SETXATTR      = 21,
-	FUSE_GETXATTR      = 22,
-	FUSE_LISTXATTR     = 23,
-	FUSE_REMOVEXATTR   = 24,
-	FUSE_FLUSH         = 25,
-	FUSE_INIT          = 26,
-	FUSE_OPENDIR       = 27,
-	FUSE_READDIR       = 28,
-	FUSE_RELEASEDIR    = 29,
-	FUSE_FSYNCDIR      = 30,
-	FUSE_GETLK         = 31,
-	FUSE_SETLK         = 32,
-	FUSE_SETLKW        = 33,
-	FUSE_ACCESS        = 34,
-	FUSE_CREATE        = 35,
-	FUSE_INTERRUPT     = 36,
-	FUSE_BMAP          = 37,
-	FUSE_DESTROY       = 38,
-	FUSE_IOCTL         = 39,
-	FUSE_POLL          = 40,
-	FUSE_NOTIFY_REPLY  = 41,
-	FUSE_BATCH_FORGET  = 42,
-	FUSE_FALLOCATE     = 43,
-	FUSE_READDIRPLUS   = 44,
-	FUSE_RENAME2       = 45,
-	FUSE_LSEEK         = 46,
+	FUSE_LOOKUP	     = 1,
+	FUSE_FORGET	     = 2,  /* no reply */
+	FUSE_GETATTR	     = 3,
+	FUSE_SETATTR	     = 4,
+	FUSE_READLINK	     = 5,
+	FUSE_SYMLINK	     = 6,
+	FUSE_MKNOD	     = 8,
+	FUSE_MKDIR	     = 9,
+	FUSE_UNLINK	     = 10,
+	FUSE_RMDIR	     = 11,
+	FUSE_RENAME	     = 12,
+	FUSE_LINK	     = 13,
+	FUSE_OPEN	     = 14,
+	FUSE_READ	     = 15,
+	FUSE_WRITE	     = 16,
+	FUSE_STATFS	     = 17,
+	FUSE_RELEASE         = 18,
+	FUSE_FSYNC           = 20,
+	FUSE_SETXATTR        = 21,
+	FUSE_GETXATTR        = 22,
+	FUSE_LISTXATTR       = 23,
+	FUSE_REMOVEXATTR     = 24,
+	FUSE_FLUSH           = 25,
+	FUSE_INIT            = 26,
+	FUSE_OPENDIR         = 27,
+	FUSE_READDIR         = 28,
+	FUSE_RELEASEDIR      = 29,
+	FUSE_FSYNCDIR        = 30,
+	FUSE_GETLK           = 31,
+	FUSE_SETLK           = 32,
+	FUSE_SETLKW          = 33,
+	FUSE_ACCESS          = 34,
+	FUSE_CREATE          = 35,
+	FUSE_INTERRUPT       = 36,
+	FUSE_BMAP            = 37,
+	FUSE_DESTROY         = 38,
+	FUSE_IOCTL           = 39,
+	FUSE_POLL            = 40,
+	FUSE_NOTIFY_REPLY    = 41,
+	FUSE_BATCH_FORGET    = 42,
+	FUSE_FALLOCATE       = 43,
+	FUSE_READDIRPLUS     = 44,
+	FUSE_RENAME2         = 45,
+	FUSE_LSEEK           = 46,
+	FUSE_COPY_FILE_RANGE = 47,
 
 	/* CUSE specific operations */
 	CUSE_INIT          = 4096,
@@ -792,4 +796,19 @@  struct fuse_lseek_out {
 	uint64_t	offset;
 };
 
+struct fuse_copy_file_range_in {
+	uint64_t	fh_in;
+	uint64_t	off_in;
+	uint64_t	nodeid_out;
+	uint64_t	fh_out;
+	uint64_t	off_out;
+	uint64_t	len;
+	uint32_t	flags;
+};
+
+struct fuse_copy_file_range_out {
+	uint32_t	size;
+	uint32_t	padding;
+};
+
 #endif /* _LINUX_FUSE_H */