diff mbox

fs: try to clone files first in vfs_copy_file_range

Message ID 1480063201-17906-2-git-send-email-hch@lst.de (mailing list archive)
State New, archived
Headers show

Commit Message

Christoph Hellwig Nov. 25, 2016, 8:40 a.m. UTC
A clone is a perfectly fine implementation of a file copy, so most
file systems just implement the copy that way.  Instead of duplicating
this logic move it to the VFS.  Currently btrfs and XFS implement copies
the same way as clones and there is no behavior change for them, cifs
only implements clones and grow support for copy_file_range with this
patch.  NFS implements both, so this will allow copy_file_range to work
on servers that only implement CLONE and be lot more efficient on servers
that implements CLONE and COPY.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/ctree.h  |  3 ---
 fs/btrfs/file.c   |  1 -
 fs/btrfs/ioctl.c  | 12 ------------
 fs/read_write.c   | 27 ++++++++++++++++++++++-----
 fs/xfs/xfs_file.c | 19 -------------------
 5 files changed, 22 insertions(+), 40 deletions(-)

Comments

Amir Goldstein Nov. 29, 2016, 5:15 a.m. UTC | #1
On Fri, Nov 25, 2016 at 10:40 AM, Christoph Hellwig <hch@lst.de> wrote:
> A clone is a perfectly fine implementation of a file copy, so most
> file systems just implement the copy that way.  Instead of duplicating
> this logic move it to the VFS.  Currently btrfs and XFS implement copies
> the same way as clones and there is no behavior change for them, cifs
> only implements clones and grow support for copy_file_range with this
> patch.  NFS implements both, so this will allow copy_file_range to work
> on servers that only implement CLONE and be lot more efficient on servers
> that implements CLONE and COPY.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/btrfs/ctree.h  |  3 ---
>  fs/btrfs/file.c   |  1 -
>  fs/btrfs/ioctl.c  | 12 ------------
>  fs/read_write.c   | 27 ++++++++++++++++++++++-----
>  fs/xfs/xfs_file.c | 19 -------------------
>  5 files changed, 22 insertions(+), 40 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index e4e01a9..a49df8b 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -3229,9 +3229,6 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
>                       loff_t pos, size_t write_bytes,
>                       struct extent_state **cached);
>  int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
> -ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
> -                             struct file *file_out, loff_t pos_out,
> -                             size_t len, unsigned int flags);
>  int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
>                            struct file *file_out, loff_t pos_out, u64 len);
>
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 3a14c87..991cc99 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2998,7 +2998,6 @@ const struct file_operations btrfs_file_operations = {
>  #ifdef CONFIG_COMPAT
>         .compat_ioctl   = btrfs_compat_ioctl,
>  #endif
> -       .copy_file_range = btrfs_copy_file_range,
>         .clone_file_range = btrfs_clone_file_range,
>         .dedupe_file_range = btrfs_dedupe_file_range,
>  };
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 7acbd2c..dab7462 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -3980,18 +3980,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
>         return ret;
>  }
>
> -ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
> -                             struct file *file_out, loff_t pos_out,
> -                             size_t len, unsigned int flags)
> -{
> -       ssize_t ret;
> -
> -       ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
> -       if (ret == 0)
> -               ret = len;
> -       return ret;
> -}
> -
>  int btrfs_clone_file_range(struct file *src_file, loff_t off,
>                 struct file *dst_file, loff_t destoff, u64 len)
>  {
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 190e0d36..6674a4b 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1542,20 +1542,37 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>         if (ret)
>                 return ret;
>
> -       ret = -EOPNOTSUPP;
> -       if (file_out->f_op->copy_file_range)
> +       /*
> +        * Try cloning first, this is supported by more file systems, and
> +        * more efficient if both clone and copy are supported (e.g. NFS).
> +        */

For fs that support both copy and clone, I am not convinced that it up to
VFS to take that decision for the application.
If application 'chose' copy over clone maybe it had a reason.
I suggest to move this to 'clone fallback' after copy_file_range and before
do_splice_direct.

IOWs, for all file systems in question except NFS,
clone_file_range is the 'generic' implementation used by copy_file_range.
So complying to VFS standards, fallback to generic handler if there is no
fs specific handler.

A hypothetical case why copy_range implementation would be preferred
by an application that runs over specific hypothetical fs - copy_file_range
can be more easily implemented as a killable copy loop, returning the length
that was copy before interrupted by a signal.

> +       if (file_in->f_op->clone_file_range) {
> +               ret = file_in->f_op->clone_file_range(file_in, pos_in,
> +                               file_out, pos_out, len);
> +               if (ret == 0) {
> +                       ret = len;
> +                       goto done;
> +               }
> +       }
> +
> +       if (file_out->f_op->copy_file_range) {
>                 ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
>                                                       pos_out, len, flags);
> -       if (ret == -EOPNOTSUPP)
> -               ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> -                               len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> +               if (ret != -EOPNOTSUPP)
> +                       goto done;
> +       }
>
> +       ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> +                       len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> +
> +done:
>         if (ret > 0) {
>                 fsnotify_access(file_in);
>                 add_rchar(current, ret);
>                 fsnotify_modify(file_out);
>                 add_wchar(current, ret);
>         }
> +
>         inc_syscr(current);
>         inc_syscw(current);
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 6e4f7f9..86ecc9b 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -909,24 +909,6 @@ xfs_file_fallocate(
>         return error;
>  }
>
> -STATIC ssize_t
> -xfs_file_copy_range(
> -       struct file     *file_in,
> -       loff_t          pos_in,
> -       struct file     *file_out,
> -       loff_t          pos_out,
> -       size_t          len,
> -       unsigned int    flags)
> -{
> -       int             error;
> -
> -       error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> -                                    len, false);
> -       if (error)
> -               return error;
> -       return len;
> -}
> -
>  STATIC int
>  xfs_file_clone_range(
>         struct file     *file_in,
> @@ -1625,7 +1607,6 @@ const struct file_operations xfs_file_operations = {
>         .fsync          = xfs_file_fsync,
>         .get_unmapped_area = thp_get_unmapped_area,
>         .fallocate      = xfs_file_fallocate,
> -       .copy_file_range = xfs_file_copy_range,
>         .clone_file_range = xfs_file_clone_range,
>         .dedupe_file_range = xfs_file_dedupe_range,
>  };
> --
> 2.1.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 29, 2016, 8:55 a.m. UTC | #2
[fullquote removed, please get your email etiquette right or I'll stop
responding]


> For fs that support both copy and clone, I am not convinced that it up to
> VFS to take that decision for the application.

We had that discussion many times.  Yes, the system can decide it, as
the application does not see the difference.

> If application 'chose' copy over clone maybe it had a reason.

Yes, because it's lazy and simply doesn't give a f**k how the data
is copied.  Clone provides much stronger guarantees, but if you don't
need them you'll just use call that will always work.

> I suggest to move this to 'clone fallback' after copy_file_range and before
> do_splice_direct.

That won't do the right thing for the NFS case.

> A hypothetical case why copy_range implementation would be preferred
> by an application that runs over specific hypothetical fs - copy_file_range
> can be more easily implemented as a killable copy loop, returning the length
> that was copy before interrupted by a signal.

Clone is a fast metadata operation and is finished before you even
had a chance to kill it.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Nov. 29, 2016, 10:28 a.m. UTC | #3
On Tue, Nov 29, 2016 at 10:55 AM, Christoph Hellwig <hch@lst.de> wrote:
> [fullquote removed, please get your email etiquette right or I'll stop
> responding]
>

Sorry. I though the practice was to keep original patch in tact for review.

>
>> A hypothetical case why copy_range implementation would be preferred
>> by an application that runs over specific hypothetical fs - copy_file_range
>> can be more easily implemented as a killable copy loop, returning the length
>> that was copy before interrupted by a signal.
>
> Clone is a fast metadata operation and is finished before you even
> had a chance to kill it.

To be fair, Clone is a fast metadata operation on xfs/btrfs/ocfs2.
I don't think we can know for sure how a future file systems will choose to
implement clone, nor can we tell for sure how any version of remote Windows CIFS
server will implement it.
The only thing we do know for sure is that the API difference between
clone_range and copy_range must be respected.

But in any case, if you say all this was already discussed in the past,
I personally have no strong feeling against forcing clone.

Thanks for clarifying my questions,
Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 29, 2016, 12:08 p.m. UTC | #4
On Tue, Nov 29, 2016 at 12:28:01PM +0200, Amir Goldstein wrote:
> On Tue, Nov 29, 2016 at 10:55 AM, Christoph Hellwig <hch@lst.de> wrote:
> > [fullquote removed, please get your email etiquette right or I'll stop
> > responding]
> >
> 
> Sorry. I though the practice was to keep original patch in tact for review.

The practice is to quote what's relevant.  You generally have a lot of
leeway to decide how much exactly you think fits, but a fullquote only
ever makes sense when forwarding the mail to someone not previously
involved with the thread.

> To be fair, Clone is a fast metadata operation on xfs/btrfs/ocfs2.
> I don't think we can know for sure how a future file systems will choose to
> implement clone, nor can we tell for sure how any version of remote Windows CIFS
> server will implement it.

If it's not a fast and atomic metadata operation it must not implement
clone_file_range, but should implement copy_file_range instead.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e4e01a9..a49df8b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3229,9 +3229,6 @@  int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 		      loff_t pos, size_t write_bytes,
 		      struct extent_state **cached);
 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
-			      struct file *file_out, loff_t pos_out,
-			      size_t len, unsigned int flags);
 int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
 			   struct file *file_out, loff_t pos_out, u64 len);
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3a14c87..991cc99 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2998,7 +2998,6 @@  const struct file_operations btrfs_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_compat_ioctl,
 #endif
-	.copy_file_range = btrfs_copy_file_range,
 	.clone_file_range = btrfs_clone_file_range,
 	.dedupe_file_range = btrfs_dedupe_file_range,
 };
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7acbd2c..dab7462 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3980,18 +3980,6 @@  static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 	return ret;
 }
 
-ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
-			      struct file *file_out, loff_t pos_out,
-			      size_t len, unsigned int flags)
-{
-	ssize_t ret;
-
-	ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
-	if (ret == 0)
-		ret = len;
-	return ret;
-}
-
 int btrfs_clone_file_range(struct file *src_file, loff_t off,
 		struct file *dst_file, loff_t destoff, u64 len)
 {
diff --git a/fs/read_write.c b/fs/read_write.c
index 190e0d36..6674a4b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1542,20 +1542,37 @@  ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	if (ret)
 		return ret;
 
-	ret = -EOPNOTSUPP;
-	if (file_out->f_op->copy_file_range)
+	/*
+	 * Try cloning first, this is supported by more file systems, and
+	 * more efficient if both clone and copy are supported (e.g. NFS).
+	 */
+	if (file_in->f_op->clone_file_range) {
+		ret = file_in->f_op->clone_file_range(file_in, pos_in,
+				file_out, pos_out, len);
+		if (ret == 0) {
+			ret = len;
+			goto done;
+		}
+	}
+
+	if (file_out->f_op->copy_file_range) {
 		ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
 						      pos_out, len, flags);
-	if (ret == -EOPNOTSUPP)
-		ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
-				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+		if (ret != -EOPNOTSUPP)
+			goto done;
+	}
 
+	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+
+done:
 	if (ret > 0) {
 		fsnotify_access(file_in);
 		add_rchar(current, ret);
 		fsnotify_modify(file_out);
 		add_wchar(current, ret);
 	}
+
 	inc_syscr(current);
 	inc_syscw(current);
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 6e4f7f9..86ecc9b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -909,24 +909,6 @@  xfs_file_fallocate(
 	return error;
 }
 
-STATIC ssize_t
-xfs_file_copy_range(
-	struct file	*file_in,
-	loff_t		pos_in,
-	struct file	*file_out,
-	loff_t		pos_out,
-	size_t		len,
-	unsigned int	flags)
-{
-	int		error;
-
-	error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-				     len, false);
-	if (error)
-		return error;
-	return len;
-}
-
 STATIC int
 xfs_file_clone_range(
 	struct file	*file_in,
@@ -1625,7 +1607,6 @@  const struct file_operations xfs_file_operations = {
 	.fsync		= xfs_file_fsync,
 	.get_unmapped_area = thp_get_unmapped_area,
 	.fallocate	= xfs_file_fallocate,
-	.copy_file_range = xfs_file_copy_range,
 	.clone_file_range = xfs_file_clone_range,
 	.dedupe_file_range = xfs_file_dedupe_range,
 };