diff mbox

[v1,9/8] copy_file_range.2: New page documenting copy_file_range()

Message ID 1441397823-1203-10-git-send-email-Anna.Schumaker@Netapp.com (mailing list archive)
State New, archived
Headers show

Commit Message

Schumaker, Anna Sept. 4, 2015, 8:17 p.m. UTC
copy_file_range() is a new system call for copying ranges of data
completely in the kernel.  This gives filesystems an opportunity to
implement some kind of "copy acceleration", such as reflinks or
server-side-copy (in the case of NFS).

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 man2/copy_file_range.2

Comments

Darrick J. Wong Sept. 4, 2015, 9:38 p.m. UTC | #1
On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
> copy_file_range() is a new system call for copying ranges of data
> completely in the kernel.  This gives filesystems an opportunity to
> implement some kind of "copy acceleration", such as reflinks or
> server-side-copy (in the case of NFS).
> 
> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> ---
>  man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 168 insertions(+)
>  create mode 100644 man2/copy_file_range.2
> 
> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> new file mode 100644
> index 0000000..4a4cb73
> --- /dev/null
> +++ b/man2/copy_file_range.2
> @@ -0,0 +1,168 @@
> +.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
> +.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
> +.SH NAME
> +copy_file_range \- Copy a range of data from one file to another
> +.SH SYNOPSIS
> +.nf
> +.B #include <linux/copy.h>
> +.B #include <sys/syscall.h>
> +.B #include <unistd.h>
> +
> +.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
> +.BI "                int " fd_out ", loff_t * " off_out ", size_t " len ",
> +.BI "                unsigned int " flags );
> +.fi
> +.SH DESCRIPTION
> +The
> +.BR copy_file_range ()
> +system call performs an in-kernel copy between two file descriptors
> +without all that tedious mucking about in userspace.

;)

> +It copies up to
> +.I len
> +bytes of data from file descriptor
> +.I fd_in
> +to file descriptor
> +.I fd_out
> +at
> +.IR off_out .
> +The file descriptors must not refer to the same file.

Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
with itself.

> +
> +The following semantics apply for
> +.IR fd_in ,
> +and similar statements apply to
> +.IR off_out :
> +.IP * 3
> +If
> +.I off_in
> +is NULL, then bytes are read from
> +.I fd_in
> +starting from the current file offset and the current
> +file offset is adjusted appropriately.
> +.IP *
> +If
> +.I off_in
> +is not NULL, then
> +.I off_in
> +must point to a buffer that specifies the starting
> +offset where bytes from
> +.I fd_in
> +will be read.  The current file offset of
> +.I fd_in
> +is not changed, but
> +.I off_in
> +is adjusted appropriately.
> +.PP
> +The default behavior of
> +.BR copy_file_range ()
> +is filesystem specific, and might result in creating a
> +copy-on-write reflink.
> +In the event that a given filesystem does not implement
> +any form of copy acceleration, the kernel will perform
> +a deep copy of the requested range by reading bytes from

I wonder if it's wise to allow deep copies -- what happens if len == 1T?
Will this syscall just block for a really long time?

> +.I fd_in
> +and writing them to
> +.IR fd_out .

"...if COPY_REFLINK is not set in flags."

> +
> +Currently, Linux only supports the following flag:
> +.TP 1.9i
> +.B COPY_REFLINK
> +Only perform the copy if the filesystem can do it as a reflink.
> +Do not fall back on performing a deep copy.
> +.SH RETURN VALUE
> +Upon successful completion,
> +.BR copy_file_range ()
> +will return the number of bytes copied between files.
> +This could be less than the length originally requested.
> +
> +On error,
> +.BR copy_file_range ()
> +returns \-1 and
> +.I errno
> +is set to indicate the error.
> +.SH ERRORS
> +.TP
> +.B EBADF
> +One or more file descriptors are not valid,
> +or do not have proper read-write mode.

"or fd_out is not opened for writing"?

> +.TP
> +.B EINVAL
> +Requested range extends beyond the end of the file;
> +.I flags
> +argument is set to an invalid value.
> +.TP
> +.B EOPNOTSUPP
> +.B COPY_REFLINK
> +was specified in
> +.IR flags ,
> +but the target filesystem does not support reflinks.
> +.TP
> +.B EXDEV
> +Target filesystem doesn't support cross-filesystem copies.
> +.SH VERSIONS

Perhaps this ought to list a few more errors (EIO, ENOSPC, ENOSYS, EPERM...)
that can be returned?  (I was looking at the fallocate manpage.)

--D

> +The
> +.BR copy_file_range ()
> +system call first appeared in Linux 4.3.
> +.SH CONFORMING TO
> +The
> +.BR copy_file_range ()
> +system call is a nonstandard Linux extension.
> +.SH EXAMPLE
> +.nf
> +
> +#define _GNU_SOURCE
> +#include <fcntl.h>
> +#include <linux/copy.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <sys/stat.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +
> +
> +int main(int argc, char **argv)
> +{
> +    int fd_in, fd_out;
> +    struct stat stat;
> +    loff_t len, ret;
> +
> +    if (argc != 3) {
> +        fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
> +        exit(EXIT_FAILURE);
> +    }
> +
> +    fd_in = open(argv[1], O_RDONLY);
> +    if (fd_in == -1) {
> +        perror("open (argv[1])");
> +        exit(EXIT_FAILURE);
> +    }
> +
> +    if (fstat(fd_in, &stat) == -1) {
> +        perror("fstat");
> +        exit(EXIT_FAILURE);
> +    }
> +    len = stat.st_size;
> +
> +    fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
> +    if (fd_out == -1) {
> +        perror("open (argv[2])");
> +        exit(EXIT_FAILURE);
> +    }
> +
> +    do {
> +        ret = syscall(__NR_copy_file_range, fd_in, NULL,
> +                      fd_out, NULL, len, 0);
> +        if (ret == -1) {
> +            perror("copy_file_range");
> +            exit(EXIT_FAILURE);
> +        }
> +
> +        len -= ret;
> +    } while (len > 0);
> +
> +    close(fd_in);
> +    close(fd_out);
> +    exit(EXIT_SUCCESS);
> +}
> +.fi
> +.SH SEE ALSO
> +.BR splice (2)
> -- 
> 2.5.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger Sept. 4, 2015, 10:31 p.m. UTC | #2
On Sep 4, 2015, at 3:38 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> 
> On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
>> copy_file_range() is a new system call for copying ranges of data
>> completely in the kernel.  This gives filesystems an opportunity to
>> implement some kind of "copy acceleration", such as reflinks or
>> server-side-copy (in the case of NFS).
>> 
>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
>> ---
>> man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 168 insertions(+)
>> create mode 100644 man2/copy_file_range.2
>> 
>> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
>> new file mode 100644
>> index 0000000..4a4cb73
>> --- /dev/null
>> +++ b/man2/copy_file_range.2
>> @@ -0,0 +1,168 @@
>> +.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
>> +.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
>> +.SH NAME
>> +copy_file_range \- Copy a range of data from one file to another
>> +.SH SYNOPSIS
>> +.nf
>> +.B #include <linux/copy.h>
>> +.B #include <sys/syscall.h>
>> +.B #include <unistd.h>
>> +
>> +.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
>> +.BI "                int " fd_out ", loff_t * " off_out ", size_t " len ",
>> +.BI "                unsigned int " flags );
>> +.fi
>> +.SH DESCRIPTION
>> +The
>> +.BR copy_file_range ()
>> +system call performs an in-kernel copy between two file descriptors
>> +without all that tedious mucking about in userspace.
> 
> ;)
> 
>> +It copies up to
>> +.I len
>> +bytes of data from file descriptor
>> +.I fd_in
>> +to file descriptor
>> +.I fd_out
>> +at
>> +.IR off_out .
>> +The file descriptors must not refer to the same file.
> 
> Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
> with itself.
> 
>> +
>> +The following semantics apply for
>> +.IR fd_in ,
>> +and similar statements apply to
>> +.IR off_out :
>> +.IP * 3
>> +If
>> +.I off_in
>> +is NULL, then bytes are read from
>> +.I fd_in
>> +starting from the current file offset and the current
>> +file offset is adjusted appropriately.
>> +.IP *
>> +If
>> +.I off_in
>> +is not NULL, then
>> +.I off_in
>> +must point to a buffer that specifies the starting
>> +offset where bytes from
>> +.I fd_in
>> +will be read.  The current file offset of
>> +.I fd_in
>> +is not changed, but
>> +.I off_in
>> +is adjusted appropriately.
>> +.PP
>> +The default behavior of
>> +.BR copy_file_range ()
>> +is filesystem specific, and might result in creating a
>> +copy-on-write reflink.
>> +In the event that a given filesystem does not implement
>> +any form of copy acceleration, the kernel will perform
>> +a deep copy of the requested range by reading bytes from
> 
> I wonder if it's wise to allow deep copies -- what happens if
> len == 1T? Will this syscall just block for a really long time?

It should be interruptible, and return the number of bytes copied
so far, just like read() and write().  That allows
the caller to continue where it left off, or abort and delete the
target file, or whatever it wants to do.

Cheers, Andreas

>> +.I fd_in
>> +and writing them to
>> +.IR fd_out .
> 
> "...if COPY_REFLINK is not set in flags."
> 
>> +
>> +Currently, Linux only supports the following flag:
>> +.TP 1.9i
>> +.B COPY_REFLINK
>> +Only perform the copy if the filesystem can do it as a reflink.
>> +Do not fall back on performing a deep copy.
>> +.SH RETURN VALUE
>> +Upon successful completion,
>> +.BR copy_file_range ()
>> +will return the number of bytes copied between files.
>> +This could be less than the length originally requested.
>> +
>> +On error,
>> +.BR copy_file_range ()
>> +returns \-1 and
>> +.I errno
>> +is set to indicate the error.
>> +.SH ERRORS
>> +.TP
>> +.B EBADF
>> +One or more file descriptors are not valid,
>> +or do not have proper read-write mode.
> 
> "or fd_out is not opened for writing"?
> 
>> +.TP
>> +.B EINVAL
>> +Requested range extends beyond the end of the file;
>> +.I flags
>> +argument is set to an invalid value.
>> +.TP
>> +.B EOPNOTSUPP
>> +.B COPY_REFLINK
>> +was specified in
>> +.IR flags ,
>> +but the target filesystem does not support reflinks.
>> +.TP
>> +.B EXDEV
>> +Target filesystem doesn't support cross-filesystem copies.
>> +.SH VERSIONS
> 
> Perhaps this ought to list a few more errors (EIO, ENOSPC, ENOSYS, EPERM...)
> that can be returned?  (I was looking at the fallocate manpage.)
> 
> --D
> 
>> +The
>> +.BR copy_file_range ()
>> +system call first appeared in Linux 4.3.
>> +.SH CONFORMING TO
>> +The
>> +.BR copy_file_range ()
>> +system call is a nonstandard Linux extension.
>> +.SH EXAMPLE
>> +.nf
>> +
>> +#define _GNU_SOURCE
>> +#include <fcntl.h>
>> +#include <linux/copy.h>
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <sys/stat.h>
>> +#include <sys/syscall.h>
>> +#include <unistd.h>
>> +
>> +
>> +int main(int argc, char **argv)
>> +{
>> +    int fd_in, fd_out;
>> +    struct stat stat;
>> +    loff_t len, ret;
>> +
>> +    if (argc != 3) {
>> +        fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
>> +        exit(EXIT_FAILURE);
>> +    }
>> +
>> +    fd_in = open(argv[1], O_RDONLY);
>> +    if (fd_in == -1) {
>> +        perror("open (argv[1])");
>> +        exit(EXIT_FAILURE);
>> +    }
>> +
>> +    if (fstat(fd_in, &stat) == -1) {
>> +        perror("fstat");
>> +        exit(EXIT_FAILURE);
>> +    }
>> +    len = stat.st_size;
>> +
>> +    fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
>> +    if (fd_out == -1) {
>> +        perror("open (argv[2])");
>> +        exit(EXIT_FAILURE);
>> +    }
>> +
>> +    do {
>> +        ret = syscall(__NR_copy_file_range, fd_in, NULL,
>> +                      fd_out, NULL, len, 0);
>> +        if (ret == -1) {
>> +            perror("copy_file_range");
>> +            exit(EXIT_FAILURE);
>> +        }
>> +
>> +        len -= ret;
>> +    } while (len > 0);
>> +
>> +    close(fd_in);
>> +    close(fd_out);
>> +    exit(EXIT_SUCCESS);
>> +}
>> +.fi
>> +.SH SEE ALSO
>> +.BR splice (2)
>> -- 
>> 2.5.1
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Cheers, Andreas





--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna Sept. 8, 2015, 3:04 p.m. UTC | #3
On 09/04/2015 05:38 PM, Darrick J. Wong wrote:
> On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
>> copy_file_range() is a new system call for copying ranges of data
>> completely in the kernel.  This gives filesystems an opportunity to
>> implement some kind of "copy acceleration", such as reflinks or
>> server-side-copy (in the case of NFS).
>>
>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
>> ---
>>  man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 168 insertions(+)
>>  create mode 100644 man2/copy_file_range.2
>>
>> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
>> new file mode 100644
>> index 0000000..4a4cb73
>> --- /dev/null
>> +++ b/man2/copy_file_range.2
>> @@ -0,0 +1,168 @@
>> +.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
>> +.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
>> +.SH NAME
>> +copy_file_range \- Copy a range of data from one file to another
>> +.SH SYNOPSIS
>> +.nf
>> +.B #include <linux/copy.h>
>> +.B #include <sys/syscall.h>
>> +.B #include <unistd.h>
>> +
>> +.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
>> +.BI "                int " fd_out ", loff_t * " off_out ", size_t " len ",
>> +.BI "                unsigned int " flags );
>> +.fi
>> +.SH DESCRIPTION
>> +The
>> +.BR copy_file_range ()
>> +system call performs an in-kernel copy between two file descriptors
>> +without all that tedious mucking about in userspace.
> 
> ;)
> 
>> +It copies up to
>> +.I len
>> +bytes of data from file descriptor
>> +.I fd_in
>> +to file descriptor
>> +.I fd_out
>> +at
>> +.IR off_out .
>> +The file descriptors must not refer to the same file.
> 
> Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
> with itself.

I've never really thought about it... Zach had that in his initial submission, so I mentioned it in the man page.  Should I remove that bit?


> 
>> +
>> +The following semantics apply for
>> +.IR fd_in ,
>> +and similar statements apply to
>> +.IR off_out :
>> +.IP * 3
>> +If
>> +.I off_in
>> +is NULL, then bytes are read from
>> +.I fd_in
>> +starting from the current file offset and the current
>> +file offset is adjusted appropriately.
>> +.IP *
>> +If
>> +.I off_in
>> +is not NULL, then
>> +.I off_in
>> +must point to a buffer that specifies the starting
>> +offset where bytes from
>> +.I fd_in
>> +will be read.  The current file offset of
>> +.I fd_in
>> +is not changed, but
>> +.I off_in
>> +is adjusted appropriately.
>> +.PP
>> +The default behavior of
>> +.BR copy_file_range ()
>> +is filesystem specific, and might result in creating a
>> +copy-on-write reflink.
>> +In the event that a given filesystem does not implement
>> +any form of copy acceleration, the kernel will perform
>> +a deep copy of the requested range by reading bytes from
> 
> I wonder if it's wise to allow deep copies -- what happens if len == 1T?
> Will this syscall just block for a really long time?

We use rw_verify_area() (similar to read and write), so we won't allow a value of len that large.  I can mention this in an updated version of this man page!


> 
>> +.I fd_in
>> +and writing them to
>> +.IR fd_out .
> 
> "...if COPY_REFLINK is not set in flags."

Sure.

> 
>> +
>> +Currently, Linux only supports the following flag:
>> +.TP 1.9i
>> +.B COPY_REFLINK
>> +Only perform the copy if the filesystem can do it as a reflink.
>> +Do not fall back on performing a deep copy.
>> +.SH RETURN VALUE
>> +Upon successful completion,
>> +.BR copy_file_range ()
>> +will return the number of bytes copied between files.
>> +This could be less than the length originally requested.
>> +
>> +On error,
>> +.BR copy_file_range ()
>> +returns \-1 and
>> +.I errno
>> +is set to indicate the error.
>> +.SH ERRORS
>> +.TP
>> +.B EBADF
>> +One or more file descriptors are not valid,
>> +or do not have proper read-write mode.
> 
> "or fd_out is not opened for writing"?

I'll add that.

> 
>> +.TP
>> +.B EINVAL
>> +Requested range extends beyond the end of the file;
>> +.I flags
>> +argument is set to an invalid value.
>> +.TP
>> +.B EOPNOTSUPP
>> +.B COPY_REFLINK
>> +was specified in
>> +.IR flags ,
>> +but the target filesystem does not support reflinks.
>> +.TP
>> +.B EXDEV
>> +Target filesystem doesn't support cross-filesystem copies.
>> +.SH VERSIONS
> 
> Perhaps this ought to list a few more errors (EIO, ENOSPC, ENOSYS, EPERM...)
> that can be returned?  (I was looking at the fallocate manpage.)

Okay.  I'll poke around for what else could be returned!

Thanks,
Anna

> 
> --D
> 
>> +The
>> +.BR copy_file_range ()
>> +system call first appeared in Linux 4.3.
>> +.SH CONFORMING TO
>> +The
>> +.BR copy_file_range ()
>> +system call is a nonstandard Linux extension.
>> +.SH EXAMPLE
>> +.nf
>> +
>> +#define _GNU_SOURCE
>> +#include <fcntl.h>
>> +#include <linux/copy.h>
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <sys/stat.h>
>> +#include <sys/syscall.h>
>> +#include <unistd.h>
>> +
>> +
>> +int main(int argc, char **argv)
>> +{
>> +    int fd_in, fd_out;
>> +    struct stat stat;
>> +    loff_t len, ret;
>> +
>> +    if (argc != 3) {
>> +        fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
>> +        exit(EXIT_FAILURE);
>> +    }
>> +
>> +    fd_in = open(argv[1], O_RDONLY);
>> +    if (fd_in == -1) {
>> +        perror("open (argv[1])");
>> +        exit(EXIT_FAILURE);
>> +    }
>> +
>> +    if (fstat(fd_in, &stat) == -1) {
>> +        perror("fstat");
>> +        exit(EXIT_FAILURE);
>> +    }
>> +    len = stat.st_size;
>> +
>> +    fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
>> +    if (fd_out == -1) {
>> +        perror("open (argv[2])");
>> +        exit(EXIT_FAILURE);
>> +    }
>> +
>> +    do {
>> +        ret = syscall(__NR_copy_file_range, fd_in, NULL,
>> +                      fd_out, NULL, len, 0);
>> +        if (ret == -1) {
>> +            perror("copy_file_range");
>> +            exit(EXIT_FAILURE);
>> +        }
>> +
>> +        len -= ret;
>> +    } while (len > 0);
>> +
>> +    close(fd_in);
>> +    close(fd_out);
>> +    exit(EXIT_SUCCESS);
>> +}
>> +.fi
>> +.SH SEE ALSO
>> +.BR splice (2)
>> -- 
>> 2.5.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna Sept. 8, 2015, 3:05 p.m. UTC | #4
On 09/04/2015 06:31 PM, Andreas Dilger wrote:
> On Sep 4, 2015, at 3:38 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
>>
>> On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
>>> copy_file_range() is a new system call for copying ranges of data
>>> completely in the kernel.  This gives filesystems an opportunity to
>>> implement some kind of "copy acceleration", such as reflinks or
>>> server-side-copy (in the case of NFS).
>>>
>>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>> ---
>>> man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
>>> 1 file changed, 168 insertions(+)
>>> create mode 100644 man2/copy_file_range.2
>>>
>>> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
>>> new file mode 100644
>>> index 0000000..4a4cb73
>>> --- /dev/null
>>> +++ b/man2/copy_file_range.2
>>> @@ -0,0 +1,168 @@
>>> +.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
>>> +.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
>>> +.SH NAME
>>> +copy_file_range \- Copy a range of data from one file to another
>>> +.SH SYNOPSIS
>>> +.nf
>>> +.B #include <linux/copy.h>
>>> +.B #include <sys/syscall.h>
>>> +.B #include <unistd.h>
>>> +
>>> +.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
>>> +.BI "                int " fd_out ", loff_t * " off_out ", size_t " len ",
>>> +.BI "                unsigned int " flags );
>>> +.fi
>>> +.SH DESCRIPTION
>>> +The
>>> +.BR copy_file_range ()
>>> +system call performs an in-kernel copy between two file descriptors
>>> +without all that tedious mucking about in userspace.
>>
>> ;)
>>
>>> +It copies up to
>>> +.I len
>>> +bytes of data from file descriptor
>>> +.I fd_in
>>> +to file descriptor
>>> +.I fd_out
>>> +at
>>> +.IR off_out .
>>> +The file descriptors must not refer to the same file.
>>
>> Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
>> with itself.
>>
>>> +
>>> +The following semantics apply for
>>> +.IR fd_in ,
>>> +and similar statements apply to
>>> +.IR off_out :
>>> +.IP * 3
>>> +If
>>> +.I off_in
>>> +is NULL, then bytes are read from
>>> +.I fd_in
>>> +starting from the current file offset and the current
>>> +file offset is adjusted appropriately.
>>> +.IP *
>>> +If
>>> +.I off_in
>>> +is not NULL, then
>>> +.I off_in
>>> +must point to a buffer that specifies the starting
>>> +offset where bytes from
>>> +.I fd_in
>>> +will be read.  The current file offset of
>>> +.I fd_in
>>> +is not changed, but
>>> +.I off_in
>>> +is adjusted appropriately.
>>> +.PP
>>> +The default behavior of
>>> +.BR copy_file_range ()
>>> +is filesystem specific, and might result in creating a
>>> +copy-on-write reflink.
>>> +In the event that a given filesystem does not implement
>>> +any form of copy acceleration, the kernel will perform
>>> +a deep copy of the requested range by reading bytes from
>>
>> I wonder if it's wise to allow deep copies -- what happens if
>> len == 1T? Will this syscall just block for a really long time?
> 
> It should be interruptible, and return the length of the number of
> bytes copied so far, just like read() and write().  That allows
> the caller to continue where it left off, or abort and delete the
> target file, or whatever it wants to do.

We already return the number of bytes copied so far, so I'll look into making it interruptible!

Thanks,
Anna

> 
> Cheers, Andreas
> 
>>> +.I fd_in
>>> +and writing them to
>>> +.IR fd_out .
>>
>> "...if COPY_REFLINK is not set in flags."
>>
>>> +
>>> +Currently, Linux only supports the following flag:
>>> +.TP 1.9i
>>> +.B COPY_REFLINK
>>> +Only perform the copy if the filesystem can do it as a reflink.
>>> +Do not fall back on performing a deep copy.
>>> +.SH RETURN VALUE
>>> +Upon successful completion,
>>> +.BR copy_file_range ()
>>> +will return the number of bytes copied between files.
>>> +This could be less than the length originally requested.
>>> +
>>> +On error,
>>> +.BR copy_file_range ()
>>> +returns \-1 and
>>> +.I errno
>>> +is set to indicate the error.
>>> +.SH ERRORS
>>> +.TP
>>> +.B EBADF
>>> +One or more file descriptors are not valid,
>>> +or do not have proper read-write mode.
>>
>> "or fd_out is not opened for writing"?
>>
>>> +.TP
>>> +.B EINVAL
>>> +Requested range extends beyond the end of the file;
>>> +.I flags
>>> +argument is set to an invalid value.
>>> +.TP
>>> +.B EOPNOTSUPP
>>> +.B COPY_REFLINK
>>> +was specified in
>>> +.IR flags ,
>>> +but the target filesystem does not support reflinks.
>>> +.TP
>>> +.B EXDEV
>>> +Target filesystem doesn't support cross-filesystem copies.
>>> +.SH VERSIONS
>>
>> Perhaps this ought to list a few more errors (EIO, ENOSPC, ENOSYS, EPERM...)
>> that can be returned?  (I was looking at the fallocate manpage.)
>>
>> --D
>>
>>> +The
>>> +.BR copy_file_range ()
>>> +system call first appeared in Linux 4.3.
>>> +.SH CONFORMING TO
>>> +The
>>> +.BR copy_file_range ()
>>> +system call is a nonstandard Linux extension.
>>> +.SH EXAMPLE
>>> +.nf
>>> +
>>> +#define _GNU_SOURCE
>>> +#include <fcntl.h>
>>> +#include <linux/copy.h>
>>> +#include <stdio.h>
>>> +#include <stdlib.h>
>>> +#include <sys/stat.h>
>>> +#include <sys/syscall.h>
>>> +#include <unistd.h>
>>> +
>>> +
>>> +int main(int argc, char **argv)
>>> +{
>>> +    int fd_in, fd_out;
>>> +    struct stat stat;
>>> +    loff_t len, ret;
>>> +
>>> +    if (argc != 3) {
>>> +        fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +    fd_in = open(argv[1], O_RDONLY);
>>> +    if (fd_in == -1) {
>>> +        perror("open (argv[1])");
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +    if (fstat(fd_in, &stat) == -1) {
>>> +        perror("fstat");
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +    len = stat.st_size;
>>> +
>>> +    fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
>>> +    if (fd_out == -1) {
>>> +        perror("open (argv[2])");
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +    do {
>>> +        ret = syscall(__NR_copy_file_range, fd_in, NULL,
>>> +                      fd_out, NULL, len, 0);
>>> +        if (ret == -1) {
>>> +            perror("copy_file_range");
>>> +            exit(EXIT_FAILURE);
>>> +        }
>>> +
>>> +        len -= ret;
>>> +    } while (len > 0);
>>> +
>>> +    close(fd_in);
>>> +    close(fd_out);
>>> +    exit(EXIT_SUCCESS);
>>> +}
>>> +.fi
>>> +.SH SEE ALSO
>>> +.BR splice (2)
>>> -- 
>>> 2.5.1
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> Cheers, Andreas
> 
> 
> 
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Sept. 8, 2015, 8:39 p.m. UTC | #5
On Tue, Sep 08, 2015 at 11:04:03AM -0400, Anna Schumaker wrote:
> On 09/04/2015 05:38 PM, Darrick J. Wong wrote:
> > On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
> >> copy_file_range() is a new system call for copying ranges of data
> >> completely in the kernel.  This gives filesystems an opportunity to
> >> implement some kind of "copy acceleration", such as reflinks or
> >> server-side-copy (in the case of NFS).
> >>
> >> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> >> ---
> >>  man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 168 insertions(+)
> >>  create mode 100644 man2/copy_file_range.2
> >>
> >> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> >> new file mode 100644
> >> index 0000000..4a4cb73
> >> --- /dev/null
> >> +++ b/man2/copy_file_range.2
> >> @@ -0,0 +1,168 @@
> >> +.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
> >> +.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
> >> +.SH NAME
> >> +copy_file_range \- Copy a range of data from one file to another
> >> +.SH SYNOPSIS
> >> +.nf
> >> +.B #include <linux/copy.h>
> >> +.B #include <sys/syscall.h>
> >> +.B #include <unistd.h>
> >> +
> >> +.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
> >> +.BI "                int " fd_out ", loff_t * " off_out ", size_t " len ",
> >> +.BI "                unsigned int " flags );
> >> +.fi
> >> +.SH DESCRIPTION
> >> +The
> >> +.BR copy_file_range ()
> >> +system call performs an in-kernel copy between two file descriptors
> >> +without all that tedious mucking about in userspace.
> > 
> > ;)
> > 
> >> +It copies up to
> >> +.I len
> >> +bytes of data from file descriptor
> >> +.I fd_in
> >> +to file descriptor
> >> +.I fd_out
> >> +at
> >> +.IR off_out .
> >> +The file descriptors must not refer to the same file.
> > 
> > Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
> > with itself.
> 
> I've never really thought about it... Zach had that in his initial
> submission, so mentioned it in the man page.  Should I remove that bit?

Yes, please!

I could be wrong, but I think btrfs only started supporting files that share
blocks with themselves relatively recently(?)

I'm not sure why zab added this; I was hoping he'd speak up. ;)

> 
> > 
> >> +
> >> +The following semantics apply for
> >> +.IR fd_in ,
> >> +and similar statements apply to
> >> +.IR off_out :
> >> +.IP * 3
> >> +If
> >> +.I off_in
> >> +is NULL, then bytes are read from
> >> +.I fd_in
> >> +starting from the current file offset and the current
> >> +file offset is adjusted appropriately.
> >> +.IP *
> >> +If
> >> +.I off_in
> >> +is not NULL, then
> >> +.I off_in
> >> +must point to a buffer that specifies the starting
> >> +offset where bytes from
> >> +.I fd_in
> >> +will be read.  The current file offset of
> >> +.I fd_in
> >> +is not changed, but
> >> +.I off_in
> >> +is adjusted appropriately.
> >> +.PP
> >> +The default behavior of
> >> +.BR copy_file_range ()
> >> +is filesystem specific, and might result in creating a
> >> +copy-on-write reflink.
> >> +In the event that a given filesystem does not implement
> >> +any form of copy acceleration, the kernel will perform
> >> +a deep copy of the requested range by reading bytes from
> > 
> > I wonder if it's wise to allow deep copies -- what happens if len == 1T?
> > Will this syscall just block for a really long time?
> 
> We use rw_verify_area(), (similar to read and write) so we won't allow a
> value of len that long.  I can mention this in an updated version of this man
> page!

Ok.  I guess MAX_RW_COUNT limits us to about 4G at once, which for a splice
copy is probably reasonable.

The reason why I asked about len == 1T specifically is that I can (with
somewhat long delays) reflink about 260 million extents at a time on XFS,
which is about 1TB.  Given that locks get held for the duration, it's probably
not a bad thing to limit userspace to 4G at a time.

(But hey, it's fun to stress-test once in a while. :))

--D

> 
> 
> > 
> >> +.I fd_in
> >> +and writing them to
> >> +.IR fd_out .
> > 
> > "...if COPY_REFLINK is not set in flags."
> 
> Sure.
> 
> > 
> >> +
> >> +Currently, Linux only supports the following flag:
> >> +.TP 1.9i
> >> +.B COPY_REFLINK
> >> +Only perform the copy if the filesystem can do it as a reflink.
> >> +Do not fall back on performing a deep copy.
> >> +.SH RETURN VALUE
> >> +Upon successful completion,
> >> +.BR copy_file_range ()
> >> +will return the number of bytes copied between files.
> >> +This could be less than the length originally requested.
> >> +
> >> +On error,
> >> +.BR copy_file_range ()
> >> +returns \-1 and
> >> +.I errno
> >> +is set to indicate the error.
> >> +.SH ERRORS
> >> +.TP
> >> +.B EBADF
> >> +One or more file descriptors are not valid,
> >> +or do not have proper read-write mode.
> > 
> > "or fd_out is not opened for writing"?
> 
> I'll add that.
> 
> > 
> >> +.TP
> >> +.B EINVAL
> >> +Requested range extends beyond the end of the file;
> >> +.I flags
> >> +argument is set to an invalid value.
> >> +.TP
> >> +.B EOPNOTSUPP
> >> +.B COPY_REFLINK
> >> +was specified in
> >> +.IR flags ,
> >> +but the target filesystem does not support reflinks.
> >> +.TP
> >> +.B EXDEV
> >> +Target filesystem doesn't support cross-filesystem copies.
> >> +.SH VERSIONS
> > 
> > Perhaps this ought to list a few more errors (EIO, ENOSPC, ENOSYS, EPERM...)
> > that can be returned?  (I was looking at the fallocate manpage.)
> 
> Okay.  I'll poke around for what else could be returned!
> 
> Thanks,
> Anna
> 
> > 
> > --D
> > 
> >> +The
> >> +.BR copy_file_range ()
> >> +system call first appeared in Linux 4.3.
> >> +.SH CONFORMING TO
> >> +The
> >> +.BR copy_file_range ()
> >> +system call is a nonstandard Linux extension.
> >> +.SH EXAMPLE
> >> +.nf
> >> +
> >> +#define _GNU_SOURCE
> >> +#include <fcntl.h>
> >> +#include <linux/copy.h>
> >> +#include <stdio.h>
> >> +#include <stdlib.h>
> >> +#include <sys/stat.h>
> >> +#include <sys/syscall.h>
> >> +#include <unistd.h>
> >> +
> >> +
> >> +int main(int argc, char **argv)
> >> +{
> >> +    int fd_in, fd_out;
> >> +    struct stat stat;
> >> +    loff_t len, ret;
> >> +
> >> +    if (argc != 3) {
> >> +        fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
> >> +        exit(EXIT_FAILURE);
> >> +    }
> >> +
> >> +    fd_in = open(argv[1], O_RDONLY);
> >> +    if (fd_in == -1) {
> >> +        perror("open (argv[1])");
> >> +        exit(EXIT_FAILURE);
> >> +    }
> >> +
> >> +    if (fstat(fd_in, &stat) == -1) {
> >> +        perror("fstat");
> >> +        exit(EXIT_FAILURE);
> >> +    }
> >> +    len = stat.st_size;
> >> +
> >> +    fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
> >> +    if (fd_out == -1) {
> >> +        perror("open (argv[2])");
> >> +        exit(EXIT_FAILURE);
> >> +    }
> >> +
> >> +    do {
> >> +        ret = syscall(__NR_copy_file_range, fd_in, NULL,
> >> +                      fd_out, NULL, len, 0);
> >> +        if (ret == -1) {
> >> +            perror("copy_file_range");
> >> +            exit(EXIT_FAILURE);
> >> +        }
> >> +
> >> +        len -= ret;
> >> +    } while (len > 0);
> >> +
> >> +    close(fd_in);
> >> +    close(fd_out);
> >> +    exit(EXIT_SUCCESS);
> >> +}
> >> +.fi
> >> +.SH SEE ALSO
> >> +.BR splice (2)
> >> -- 
> >> 2.5.1
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba Sept. 9, 2015, 9:16 a.m. UTC | #6
On Tue, Sep 08, 2015 at 01:39:18PM -0700, Darrick J. Wong wrote:
> > >> +The file descriptors must not refer to the same file.
> > > 
> > > Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
> > > with itself.
> > 
> > I've never really thought about it... Zach had that in his initial
> > submission, so mentioned it in the man page.  Should I remove that bit?
> 
> Yes, please!
> 
> I could be wrong, but I think btrfs only started supporting files that share
> blocks with themselves relatively recently(?)

The support was added into the cloning ioctl itself; otherwise, same-file
clones were always possible with a sidestep using another file.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Austin S. Hemmelgarn Sept. 9, 2015, 11:38 a.m. UTC | #7
On 2015-09-08 16:39, Darrick J. Wong wrote:
> On Tue, Sep 08, 2015 at 11:04:03AM -0400, Anna Schumaker wrote:
>> On 09/04/2015 05:38 PM, Darrick J. Wong wrote:
>>> On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
>>>> copy_file_range() is a new system call for copying ranges of data
>>>> completely in the kernel.  This gives filesystems an opportunity to
>>>> implement some kind of "copy acceleration", such as reflinks or
>>>> server-side-copy (in the case of NFS).
>>>>
>>>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>>> ---
>>>>   man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>>   1 file changed, 168 insertions(+)
>>>>   create mode 100644 man2/copy_file_range.2
>>>>
>>>> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
>>>> new file mode 100644
>>>> index 0000000..4a4cb73
>>>> --- /dev/null
>>>> +++ b/man2/copy_file_range.2
>>>> @@ -0,0 +1,168 @@
>>>> +.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
>>>> +.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
>>>> +.SH NAME
>>>> +copy_file_range \- Copy a range of data from one file to another
>>>> +.SH SYNOPSIS
>>>> +.nf
>>>> +.B #include <linux/copy.h>
>>>> +.B #include <sys/syscall.h>
>>>> +.B #include <unistd.h>
>>>> +
>>>> +.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
>>>> +.BI "                int " fd_out ", loff_t * " off_out ", size_t " len ",
>>>> +.BI "                unsigned int " flags );
>>>> +.fi
>>>> +.SH DESCRIPTION
>>>> +The
>>>> +.BR copy_file_range ()
>>>> +system call performs an in-kernel copy between two file descriptors
>>>> +without all that tedious mucking about in userspace.
>>>
>>> ;)
>>>
>>>> +It copies up to
>>>> +.I len
>>>> +bytes of data from file descriptor
>>>> +.I fd_in
>>>> +to file descriptor
>>>> +.I fd_out
>>>> +at
>>>> +.IR off_out .
>>>> +The file descriptors must not refer to the same file.
>>>
>>> Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
>>> with itself.
>>
>> I've never really thought about it... Zach had that in his initial
>> submission, so mentioned it in the man page.  Should I remove that bit?
>
> Yes, please!
>
> I could be wrong, but I think btrfs only started supporting files that share
> blocks with themselves relatively recently(?)
>
> I'm not sure why zab added this; was hoping he'd speak up. ;)
>
>>
>>>
>>>> +
>>>> +The following semantics apply for
>>>> +.IR fd_in ,
>>>> +and similar statements apply to
>>>> +.IR off_out :
>>>> +.IP * 3
>>>> +If
>>>> +.I off_in
>>>> +is NULL, then bytes are read from
>>>> +.I fd_in
>>>> +starting from the current file offset and the current
>>>> +file offset is adjusted appropriately.
>>>> +.IP *
>>>> +If
>>>> +.I off_in
>>>> +is not NULL, then
>>>> +.I off_in
>>>> +must point to a buffer that specifies the starting
>>>> +offset where bytes from
>>>> +.I fd_in
>>>> +will be read.  The current file offset of
>>>> +.I fd_in
>>>> +is not changed, but
>>>> +.I off_in
>>>> +is adjusted appropriately.
>>>> +.PP
>>>> +The default behavior of
>>>> +.BR copy_file_range ()
>>>> +is filesystem specific, and might result in creating a
>>>> +copy-on-write reflink.
>>>> +In the event that a given filesystem does not implement
>>>> +any form of copy acceleration, the kernel will perform
>>>> +a deep copy of the requested range by reading bytes from
>>>
>>> I wonder if it's wise to allow deep copies -- what happens if len == 1T?
>>> Will this syscall just block for a really long time?
>>
>> We use rw_verify_area(), (similar to read and write) so we won't allow a
>> value of len that long.  I can mention this in an updated version of this man
>> page!
>
> Ok.  I guess MAX_RW_COUNT limits us to about 4G at once, which for a splice
> copy is probably reasonable.
>
> The reason why I asked about len == 1T specifically is that I can (with
> somewhat long delays) reflink about 260 million extents at a time on XFS,
> which is about 1TB.  Given that locks get held for the duration, it's probably
> not a bad thing to limit userspace to 4G at a time.
I'd personally love to see that made tunable via a sysctl (kind of like how
you can control the maximum number of AIO requests in flight), and for
that matter we might want to be able to limit the number of in-progress
copies.
>
> (But hey, it's fun to stress-test once in a while. :))
>
> --D
>
>>
>>
>>>
>>>> +.I fd_in
>>>> +and writing them to
>>>> +.IR fd_out .
>>>
>>> "...if COPY_REFLINK is not set in flags."
>>
>> Sure.
>>
>>>
>>>> +
>>>> +Currently, Linux only supports the following flag:
>>>> +.TP 1.9i
>>>> +.B COPY_REFLINK
>>>> +Only perform the copy if the filesystem can do it as a reflink.
>>>> +Do not fall back on performing a deep copy.
>>>> +.SH RETURN VALUE
>>>> +Upon successful completion,
>>>> +.BR copy_file_range ()
>>>> +will return the number of bytes copied between files.
>>>> +This could be less than the length originally requested.
>>>> +
>>>> +On error,
>>>> +.BR copy_file_range ()
>>>> +returns \-1 and
>>>> +.I errno
>>>> +is set to indicate the error.
>>>> +.SH ERRORS
>>>> +.TP
>>>> +.B EBADF
>>>> +One or more file descriptors are not valid,
>>>> +or do not have proper read-write mode.
>>>
>>> "or fd_out is not opened for writing"?
>>
>> I'll add that.
>>
>>>
>>>> +.TP
>>>> +.B EINVAL
>>>> +Requested range extends beyond the end of the file;
>>>> +.I flags
>>>> +argument is set to an invalid value.
>>>> +.TP
>>>> +.B EOPNOTSUPP
>>>> +.B COPY_REFLINK
>>>> +was specified in
>>>> +.IR flags ,
>>>> +but the target filesystem does not support reflinks.
>>>> +.TP
>>>> +.B EXDEV
>>>> +Target filesystem doesn't support cross-filesystem copies.
>>>> +.SH VERSIONS
>>>
>>> Perhaps this ought to list a few more errors (EIO, ENOSPC, ENOSYS, EPERM...)
>>> that can be returned?  (I was looking at the fallocate manpage.)
>>
>> Okay.  I'll poke around for what else could be returned!
>>
>> Thanks,
>> Anna
>>
>>>
>>> --D
>>>
>>>> +The
>>>> +.BR copy_file_range ()
>>>> +system call first appeared in Linux 4.3.
>>>> +.SH CONFORMING TO
>>>> +The
>>>> +.BR copy_file_range ()
>>>> +system call is a nonstandard Linux extension.
>>>> +.SH EXAMPLE
>>>> +.nf
>>>> +
>>>> +#define _GNU_SOURCE
>>>> +#include <fcntl.h>
>>>> +#include <linux/copy.h>
>>>> +#include <stdio.h>
>>>> +#include <stdlib.h>
>>>> +#include <sys/stat.h>
>>>> +#include <sys/syscall.h>
>>>> +#include <unistd.h>
>>>> +
>>>> +
>>>> +int main(int argc, char **argv)
>>>> +{
>>>> +    int fd_in, fd_out;
>>>> +    struct stat stat;
>>>> +    loff_t len, ret;
>>>> +
>>>> +    if (argc != 3) {
>>>> +        fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
>>>> +        exit(EXIT_FAILURE);
>>>> +    }
>>>> +
>>>> +    fd_in = open(argv[1], O_RDONLY);
>>>> +    if (fd_in == -1) {
>>>> +        perror("open (argv[1])");
>>>> +        exit(EXIT_FAILURE);
>>>> +    }
>>>> +
>>>> +    if (fstat(fd_in, &stat) == -1) {
>>>> +        perror("fstat");
>>>> +        exit(EXIT_FAILURE);
>>>> +    }
>>>> +    len = stat.st_size;
>>>> +
>>>> +    fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
>>>> +    if (fd_out == -1) {
>>>> +        perror("open (argv[2])");
>>>> +        exit(EXIT_FAILURE);
>>>> +    }
>>>> +
>>>> +    do {
>>>> +        ret = syscall(__NR_copy_file_range, fd_in, NULL,
>>>> +                      fd_out, NULL, len, 0);
>>>> +        if (ret == -1) {
>>>> +            perror("copy_file_range");
>>>> +            exit(EXIT_FAILURE);
>>>> +        }
>>>> +
>>>> +        len -= ret;
>>>> +    } while (len > 0);
>>>> +
>>>> +    close(fd_in);
>>>> +    close(fd_out);
>>>> +    exit(EXIT_SUCCESS);
>>>> +}
>>>> +.fi
>>>> +.SH SEE ALSO
>>>> +.BR splice (2)
>>>> --
>>>> 2.5.1
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Darrick J. Wong Sept. 9, 2015, 5:17 p.m. UTC | #8
On Wed, Sep 09, 2015 at 07:38:14AM -0400, Austin S Hemmelgarn wrote:
> On 2015-09-08 16:39, Darrick J. Wong wrote:
> >On Tue, Sep 08, 2015 at 11:04:03AM -0400, Anna Schumaker wrote:
> >>On 09/04/2015 05:38 PM, Darrick J. Wong wrote:
> >>>On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
> >>>>copy_file_range() is a new system call for copying ranges of data
> >>>>completely in the kernel.  This gives filesystems an opportunity to
> >>>>implement some kind of "copy acceleration", such as reflinks or
> >>>>server-side-copy (in the case of NFS).
> >>>>
> >>>>Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> >>>>---
> >>>>  man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>  1 file changed, 168 insertions(+)
> >>>>  create mode 100644 man2/copy_file_range.2
> >>>>
> >>>>diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> >>>>new file mode 100644
> >>>>index 0000000..4a4cb73
> >>>>--- /dev/null
> >>>>+++ b/man2/copy_file_range.2
> >>>>@@ -0,0 +1,168 @@
> >>>>+.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
> >>>>+.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
> >>>>+.SH NAME
> >>>>+copy_file_range \- Copy a range of data from one file to another
> >>>>+.SH SYNOPSIS
> >>>>+.nf
> >>>>+.B #include <linux/copy.h>
> >>>>+.B #include <sys/syscall.h>
> >>>>+.B #include <unistd.h>
> >>>>+
> >>>>+.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
> >>>>+.BI "                int " fd_out ", loff_t * " off_out ", size_t " len ",
> >>>>+.BI "                unsigned int " flags );
> >>>>+.fi
> >>>>+.SH DESCRIPTION
> >>>>+The
> >>>>+.BR copy_file_range ()
> >>>>+system call performs an in-kernel copy between two file descriptors
> >>>>+without all that tedious mucking about in userspace.
> >>>
> >>>;)
> >>>
> >>>>+It copies up to
> >>>>+.I len
> >>>>+bytes of data from file descriptor
> >>>>+.I fd_in
> >>>>+to file descriptor
> >>>>+.I fd_out
> >>>>+at
> >>>>+.IR off_out .
> >>>>+The file descriptors must not refer to the same file.
> >>>
> >>>Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
> >>>with itself.
> >>
> >>I've never really thought about it... Zach had that in his initial
> >>submission, so mentioned it in the man page.  Should I remove that bit?
> >
> >Yes, please!
> >
> >I could be wrong, but I think btrfs only started supporting files that share
> >blocks with themselves relatively recently(?)
> >
> >I'm not sure why zab added this; was hoping he'd speak up. ;)
> >
> >>
> >>>
> >>>>+
> >>>>+The following semantics apply for
> >>>>+.IR fd_in ,
> >>>>+and similar statements apply to
> >>>>+.IR off_out :
> >>>>+.IP * 3
> >>>>+If
> >>>>+.I off_in
> >>>>+is NULL, then bytes are read from
> >>>>+.I fd_in
> >>>>+starting from the current file offset and the current
> >>>>+file offset is adjusted appropriately.
> >>>>+.IP *
> >>>>+If
> >>>>+.I off_in
> >>>>+is not NULL, then
> >>>>+.I off_in
> >>>>+must point to a buffer that specifies the starting
> >>>>+offset where bytes from
> >>>>+.I fd_in
> >>>>+will be read.  The current file offset of
> >>>>+.I fd_in
> >>>>+is not changed, but
> >>>>+.I off_in
> >>>>+is adjusted appropriately.
> >>>>+.PP
> >>>>+The default behavior of
> >>>>+.BR copy_file_range ()
> >>>>+is filesystem specific, and might result in creating a
> >>>>+copy-on-write reflink.
> >>>>+In the event that a given filesystem does not implement
> >>>>+any form of copy acceleration, the kernel will perform
> >>>>+a deep copy of the requested range by reading bytes from
> >>>
> >>>I wonder if it's wise to allow deep copies -- what happens if len == 1T?
> >>>Will this syscall just block for a really long time?
> >>
> >>We use rw_verify_area(), (similar to read and write) so we won't allow a
> >>value of len that long.  I can mention this in an updated version of this man
> >>page!
> >
> >Ok.  I guess MAX_RW_COUNT limits us to about 4G at once, which for a splice

Heh, MAX_RW_COUNT is actually INT_MAX (rounded down to a page boundary), so
it's about 2GB at once, not 4G.

> >copy is probably reasonable.
> >
> >The reason why I asked about len == 1T specifically is that I can (with
> >somewhat long delays) reflink about 260 million extents at a time on XFS,
> >which is about 1TB.  Given that locks get held for the duration, it's probably
> >not a bad thing to limit userspace to 4G at a time.
>
> I'd personally love to see that be tunable by a sysctl (kind of like
> how you can control the maximum number of AIO requests in flight),
> and for that matter we might want to be able to limit the number of
> in-progress copies going on.

Now that I think about it, btrfs' reflink ioctl doesn't seem to have any
particular limit on how much you can reflink in a single call.  XFS doesn't
have a limit either.  Given that reflink should create a tiny amount of IO
compared to the number of bytes being manipulated, should we allow a higher
limit when ssize_t is large enough?

Copy-through-the-pagecache should stick to MAX_RW_COUNT.

I noticed that btrfs won't dedupe more than 16M per call.  Any thoughts?

--D

> >
> >(But hey, it's fun to stress-test once in a while. :))
> >
> >--D
> >
> >>
> >>
> >>>
> >>>>+.I fd_in
> >>>>+and writing them to
> >>>>+.IR fd_out .
> >>>
> >>>"...if COPY_REFLINK is not set in flags."
> >>
> >>Sure.
> >>
> >>>
> >>>>+
> >>>>+Currently, Linux only supports the following flag:
> >>>>+.TP 1.9i
> >>>>+.B COPY_REFLINK
> >>>>+Only perform the copy if the filesystem can do it as a reflink.
> >>>>+Do not fall back on performing a deep copy.
> >>>>+.SH RETURN VALUE
> >>>>+Upon successful completion,
> >>>>+.BR copy_file_range ()
> >>>>+will return the number of bytes copied between files.
> >>>>+This could be less than the length originally requested.
> >>>>+
> >>>>+On error,
> >>>>+.BR copy_file_range ()
> >>>>+returns \-1 and
> >>>>+.I errno
> >>>>+is set to indicate the error.
> >>>>+.SH ERRORS
> >>>>+.TP
> >>>>+.B EBADF
> >>>>+One or more file descriptors are not valid,
> >>>>+or do not have proper read-write mode.
> >>>
> >>>"or fd_out is not opened for writing"?
> >>
> >>I'll add that.
> >>
> >>>
> >>>>+.TP
> >>>>+.B EINVAL
> >>>>+Requested range extends beyond the end of the file;
> >>>>+.I flags
> >>>>+argument is set to an invalid value.
> >>>>+.TP
> >>>>+.B EOPNOTSUPP
> >>>>+.B COPY_REFLINK
> >>>>+was specified in
> >>>>+.IR flags ,
> >>>>+but the target filesystem does not support reflinks.
> >>>>+.TP
> >>>>+.B EXDEV
> >>>>+Target filesystem doesn't support cross-filesystem copies.
> >>>>+.SH VERSIONS
> >>>
> >>>Perhaps this ought to list a few more errors (EIO, ENOSPC, ENOSYS, EPERM...)
> >>>that can be returned?  (I was looking at the fallocate manpage.)
> >>
> >>Okay.  I'll poke around for what else could be returned!
> >>
> >>Thanks,
> >>Anna
> >>
> >>>
> >>>--D
> >>>
> >>>>+The
> >>>>+.BR copy_file_range ()
> >>>>+system call first appeared in Linux 4.3.
> >>>>+.SH CONFORMING TO
> >>>>+The
> >>>>+.BR copy_file_range ()
> >>>>+system call is a nonstandard Linux extension.
> >>>>+.SH EXAMPLE
> >>>>+.nf
> >>>>+
> >>>>+#define _GNU_SOURCE
> >>>>+#include <fcntl.h>
> >>>>+#include <linux/copy.h>
> >>>>+#include <stdio.h>
> >>>>+#include <stdlib.h>
> >>>>+#include <sys/stat.h>
> >>>>+#include <sys/syscall.h>
> >>>>+#include <unistd.h>
> >>>>+
> >>>>+
> >>>>+int main(int argc, char **argv)
> >>>>+{
> >>>>+    int fd_in, fd_out;
> >>>>+    struct stat stat;
> >>>>+    loff_t len, ret;
> >>>>+
> >>>>+    if (argc != 3) {
> >>>>+        fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
> >>>>+        exit(EXIT_FAILURE);
> >>>>+    }
> >>>>+
> >>>>+    fd_in = open(argv[1], O_RDONLY);
> >>>>+    if (fd_in == -1) {
> >>>>+        perror("open (argv[1])");
> >>>>+        exit(EXIT_FAILURE);
> >>>>+    }
> >>>>+
> >>>>+    if (fstat(fd_in, &stat) == -1) {
> >>>>+        perror("fstat");
> >>>>+        exit(EXIT_FAILURE);
> >>>>+    }
> >>>>+    len = stat.st_size;
> >>>>+
> >>>>+    fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
> >>>>+    if (fd_out == -1) {
> >>>>+        perror("open (argv[2])");
> >>>>+        exit(EXIT_FAILURE);
> >>>>+    }
> >>>>+
> >>>>+    do {
> >>>>+        ret = syscall(__NR_copy_file_range, fd_in, NULL,
> >>>>+                      fd_out, NULL, len, 0);
> >>>>+        if (ret == -1) {
> >>>>+            perror("copy_file_range");
> >>>>+            exit(EXIT_FAILURE);
> >>>>+        }
> >>>>+
> >>>>+        len -= ret;
> >>>>+    } while (len > 0);
> >>>>+
> >>>>+    close(fd_in);
> >>>>+    close(fd_out);
> >>>>+    exit(EXIT_SUCCESS);
> >>>>+}
> >>>>+.fi
> >>>>+.SH SEE ALSO
> >>>>+.BR splice (2)
> >>>>--
> >>>>2.5.1
> >>>>
> >>>>--
> >>>>To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> >>>>the body of a message to majordomo@vger.kernel.org
> >>>>More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>
> >--
> >To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> >the body of a message to majordomo@vger.kernel.org
> >More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
> 
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna Sept. 9, 2015, 5:31 p.m. UTC | #9
On 09/09/2015 01:17 PM, Darrick J. Wong wrote:
> On Wed, Sep 09, 2015 at 07:38:14AM -0400, Austin S Hemmelgarn wrote:
>> On 2015-09-08 16:39, Darrick J. Wong wrote:
>>> On Tue, Sep 08, 2015 at 11:04:03AM -0400, Anna Schumaker wrote:
>>>> On 09/04/2015 05:38 PM, Darrick J. Wong wrote:
>>>>> On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
>>>>>> copy_file_range() is a new system call for copying ranges of data
>>>>>> completely in the kernel.  This gives filesystems an opportunity to
>>>>>> implement some kind of "copy acceleration", such as reflinks or
>>>>>> server-side-copy (in the case of NFS).
>>>>>>
>>>>>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>>>>> ---
>>>>>>  man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>  1 file changed, 168 insertions(+)
>>>>>>  create mode 100644 man2/copy_file_range.2
>>>>>>
>>>>>> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
>>>>>> new file mode 100644
>>>>>> index 0000000..4a4cb73
>>>>>> --- /dev/null
>>>>>> +++ b/man2/copy_file_range.2
>>>>>> @@ -0,0 +1,168 @@
>>>>>> +.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
>>>>>> +.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
>>>>>> +.SH NAME
>>>>>> +copy_file_range \- Copy a range of data from one file to another
>>>>>> +.SH SYNOPSIS
>>>>>> +.nf
>>>>>> +.B #include <linux/copy.h>
>>>>>> +.B #include <sys/syscall.h>
>>>>>> +.B #include <unistd.h>
>>>>>> +
>>>>>> +.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
>>>>>> +.BI "                int " fd_out ", loff_t * " off_out ", size_t " len ",
>>>>>> +.BI "                unsigned int " flags );
>>>>>> +.fi
>>>>>> +.SH DESCRIPTION
>>>>>> +The
>>>>>> +.BR copy_file_range ()
>>>>>> +system call performs an in-kernel copy between two file descriptors
>>>>>> +without all that tedious mucking about in userspace.
>>>>>
>>>>> ;)
>>>>>
>>>>>> +It copies up to
>>>>>> +.I len
>>>>>> +bytes of data from file descriptor
>>>>>> +.I fd_in
>>>>>> +to file descriptor
>>>>>> +.I fd_out
>>>>>> +at
>>>>>> +.IR off_out .
>>>>>> +The file descriptors must not refer to the same file.
>>>>>
>>>>> Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
>>>>> with itself.
>>>>
>>>> I've never really thought about it... Zach had that in his initial
>>>> submission, so mentioned it in the man page.  Should I remove that bit?
>>>
>>> Yes, please!
>>>
>>> I could be wrong, but I think btrfs only started supporting files that share
>>> blocks with themselves relatively recently(?)
>>>
>>> I'm not sure why zab added this; was hoping he'd speak up. ;)
>>>
>>>>
>>>>>
>>>>>> +
>>>>>> +The following semantics apply for
>>>>>> +.IR fd_in ,
>>>>>> +and similar statements apply to
>>>>>> +.IR off_out :
>>>>>> +.IP * 3
>>>>>> +If
>>>>>> +.I off_in
>>>>>> +is NULL, then bytes are read from
>>>>>> +.I fd_in
>>>>>> +starting from the current file offset and the current
>>>>>> +file offset is adjusted appropriately.
>>>>>> +.IP *
>>>>>> +If
>>>>>> +.I off_in
>>>>>> +is not NULL, then
>>>>>> +.I off_in
>>>>>> +must point to a buffer that specifies the starting
>>>>>> +offset where bytes from
>>>>>> +.I fd_in
>>>>>> +will be read.  The current file offset of
>>>>>> +.I fd_in
>>>>>> +is not changed, but
>>>>>> +.I off_in
>>>>>> +is adjusted appropriately.
>>>>>> +.PP
>>>>>> +The default behavior of
>>>>>> +.BR copy_file_range ()
>>>>>> +is filesystem specific, and might result in creating a
>>>>>> +copy-on-write reflink.
>>>>>> +In the event that a given filesystem does not implement
>>>>>> +any form of copy acceleration, the kernel will perform
>>>>>> +a deep copy of the requested range by reading bytes from
>>>>>
>>>>> I wonder if it's wise to allow deep copies -- what happens if len == 1T?
>>>>> Will this syscall just block for a really long time?
>>>>
>>>> We use rw_verify_area(), (similar to read and write) so we won't allow a
>>>> value of len that long.  I can mention this in an updated version of this man
>>>> page!
>>>
>>> Ok.  I guess MAX_RW_COUNT limits us to about 4G at once, which for a splice
> 
> Heh, INT_MAX, so 2GB at once.
> 
>>> copy is probably reasonable.
>>>
>>> The reason why I asked about len == 1T specifically is that I can (with
>>> somewhat long delays) reflink about 260 million extents at a time on XFS,
>>> which is about 1TB.  Given that locks get held for the duration, it's probably
>>> not a bad thing to limit userspace to 4G at a time.
>>
>> I'd personally love to see that be tunable by a sysctl (kind of like
>> how you can control the maximum number of AIO requests in flight),
>> and for that matter we might want to be able to limit the number of
>> in-progress copies going on.
> 
> Now that I think about it, btrfs' reflink ioctl doesn't seem to have any
> particular limit on how much you can reflink in a single call.  XFS doesn't
> have a limit either.  Given that reflink should create a tiny amount of IO
> compared to the number of bytes being manipulated, should we allow a higher
> limit when ssize_t is large enough?
> 
> Copy-through-the-pagecache should stick to MAX_RW_COUNT.

Should I keep rejecting pagecache copies if len > MAX_RW_COUNT?  Or would it be okay to change the value of len to MAX_RW_COUNT in this case?

Anna

> 
> I noticed that btrfs won't dedupe more than 16M per call.  Any thoughts?
> 
> --D
> 
>>>
>>> (But hey, it's fun to stress-test once in a while. :))
>>>
>>> --D
>>>
>>>>
>>>>
>>>>>
>>>>>> +.I fd_in
>>>>>> +and writing them to
>>>>>> +.IR fd_out .
>>>>>
>>>>> "...if COPY_REFLINK is not set in flags."
>>>>
>>>> Sure.
>>>>
>>>>>
>>>>>> +
>>>>>> +Currently, Linux only supports the following flag:
>>>>>> +.TP 1.9i
>>>>>> +.B COPY_REFLINK
>>>>>> +Only perform the copy if the filesystem can do it as a reflink.
>>>>>> +Do not fall back on performing a deep copy.
>>>>>> +.SH RETURN VALUE
>>>>>> +Upon successful completion,
>>>>>> +.BR copy_file_range ()
>>>>>> +will return the number of bytes copied between files.
>>>>>> +This could be less than the length originally requested.
>>>>>> +
>>>>>> +On error,
>>>>>> +.BR copy_file_range ()
>>>>>> +returns \-1 and
>>>>>> +.I errno
>>>>>> +is set to indicate the error.
>>>>>> +.SH ERRORS
>>>>>> +.TP
>>>>>> +.B EBADF
>>>>>> +One or more file descriptors are not valid,
>>>>>> +or do not have proper read-write mode.
>>>>>
>>>>> "or fd_out is not opened for writing"?
>>>>
>>>> I'll add that.
>>>>
>>>>>
>>>>>> +.TP
>>>>>> +.B EINVAL
>>>>>> +Requested range extends beyond the end of the file;
>>>>>> +.I flags
>>>>>> +argument is set to an invalid value.
>>>>>> +.TP
>>>>>> +.B EOPNOTSUPP
>>>>>> +.B COPY_REFLINK
>>>>>> +was specified in
>>>>>> +.IR flags ,
>>>>>> +but the target filesystem does not support reflinks.
>>>>>> +.TP
>>>>>> +.B EXDEV
>>>>>> +Target filesystem doesn't support cross-filesystem copies.
>>>>>> +.SH VERSIONS
>>>>>
>>>>> Perhaps this ought to list a few more errors (EIO, ENOSPC, ENOSYS, EPERM...)
>>>>> that can be returned?  (I was looking at the fallocate manpage.)
>>>>
>>>> Okay.  I'll poke around for what else could be returned!
>>>>
>>>> Thanks,
>>>> Anna
>>>>
>>>>>
>>>>> --D
>>>>>
>>>>>> +The
>>>>>> +.BR copy_file_range ()
>>>>>> +system call first appeared in Linux 4.3.
>>>>>> +.SH CONFORMING TO
>>>>>> +The
>>>>>> +.BR copy_file_range ()
>>>>>> +system call is a nonstandard Linux extension.
>>>>>> +.SH EXAMPLE
>>>>>> +.nf
>>>>>> +
>>>>>> +#define _GNU_SOURCE
>>>>>> +#include <fcntl.h>
>>>>>> +#include <linux/copy.h>
>>>>>> +#include <stdio.h>
>>>>>> +#include <stdlib.h>
>>>>>> +#include <sys/stat.h>
>>>>>> +#include <sys/syscall.h>
>>>>>> +#include <unistd.h>
>>>>>> +
>>>>>> +
>>>>>> +int main(int argc, char **argv)
>>>>>> +{
>>>>>> +    int fd_in, fd_out;
>>>>>> +    struct stat stat;
>>>>>> +    loff_t len, ret;
>>>>>> +
>>>>>> +    if (argc != 3) {
>>>>>> +        fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
>>>>>> +        exit(EXIT_FAILURE);
>>>>>> +    }
>>>>>> +
>>>>>> +    fd_in = open(argv[1], O_RDONLY);
>>>>>> +    if (fd_in == -1) {
>>>>>> +        perror("open (argv[1])");
>>>>>> +        exit(EXIT_FAILURE);
>>>>>> +    }
>>>>>> +
>>>>>> +    if (fstat(fd_in, &stat) == -1) {
>>>>>> +        perror("fstat");
>>>>>> +        exit(EXIT_FAILURE);
>>>>>> +    }
>>>>>> +    len = stat.st_size;
>>>>>> +
>>>>>> +    fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
>>>>>> +    if (fd_out == -1) {
>>>>>> +        perror("open (argv[2])");
>>>>>> +        exit(EXIT_FAILURE);
>>>>>> +    }
>>>>>> +
>>>>>> +    do {
>>>>>> +        ret = syscall(__NR_copy_file_range, fd_in, NULL,
>>>>>> +                      fd_out, NULL, len, 0);
>>>>>> +        if (ret == -1) {
>>>>>> +            perror("copy_file_range");
>>>>>> +            exit(EXIT_FAILURE);
>>>>>> +        }
>>>>>> +
>>>>>> +        len -= ret;
>>>>>> +    } while (len > 0);
>>>>>> +
>>>>>> +    close(fd_in);
>>>>>> +    close(fd_out);
>>>>>> +    exit(EXIT_SUCCESS);
>>>>>> +}
>>>>>> +.fi
>>>>>> +.SH SEE ALSO
>>>>>> +.BR splice (2)
>>>>>> --
>>>>>> 2.5.1
>>>>>>
>>>>>> --
>>>>>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>
>>
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Sept. 9, 2015, 6:12 p.m. UTC | #10
On Wed, Sep 09, 2015 at 01:31:24PM -0400, Anna Schumaker wrote:
> On 09/09/2015 01:17 PM, Darrick J. Wong wrote:
> > On Wed, Sep 09, 2015 at 07:38:14AM -0400, Austin S Hemmelgarn wrote:
> >> On 2015-09-08 16:39, Darrick J. Wong wrote:
> >>> On Tue, Sep 08, 2015 at 11:04:03AM -0400, Anna Schumaker wrote:
> >>>> On 09/04/2015 05:38 PM, Darrick J. Wong wrote:
> >>>>> On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
> >>>>>> copy_file_range() is a new system call for copying ranges of data
> >>>>>> completely in the kernel.  This gives filesystems an opportunity to
> >>>>>> implement some kind of "copy acceleration", such as reflinks or
> >>>>>> server-side-copy (in the case of NFS).
> >>>>>>
> >>>>>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> >>>>>> ---
> >>>>>>  man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>>>  1 file changed, 168 insertions(+)
> >>>>>>  create mode 100644 man2/copy_file_range.2
> >>>>>>
> >>>>>> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> >>>>>> new file mode 100644
> >>>>>> index 0000000..4a4cb73
> >>>>>> --- /dev/null
> >>>>>> +++ b/man2/copy_file_range.2
> >>>>>> @@ -0,0 +1,168 @@
> >>>>>> +.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
> >>>>>> +.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
> >>>>>> +.SH NAME
> >>>>>> +copy_file_range \- Copy a range of data from one file to another
> >>>>>> +.SH SYNOPSIS
> >>>>>> +.nf
> >>>>>> +.B #include <linux/copy.h>
> >>>>>> +.B #include <sys/syscall.h>
> >>>>>> +.B #include <unistd.h>
> >>>>>> +
> >>>>>> +.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
> >>>>>> +.BI "                int " fd_out ", loff_t * " off_out ", size_t " len ",
> >>>>>> +.BI "                unsigned int " flags );
> >>>>>> +.fi
> >>>>>> +.SH DESCRIPTION
> >>>>>> +The
> >>>>>> +.BR copy_file_range ()
> >>>>>> +system call performs an in-kernel copy between two file descriptors
> >>>>>> +without all that tedious mucking about in userspace.
> >>>>>
> >>>>> ;)
> >>>>>
> >>>>>> +It copies up to
> >>>>>> +.I len
> >>>>>> +bytes of data from file descriptor
> >>>>>> +.I fd_in
> >>>>>> +to file descriptor
> >>>>>> +.I fd_out
> >>>>>> +at
> >>>>>> +.IR off_out .
> >>>>>> +The file descriptors must not refer to the same file.
> >>>>>
> >>>>> Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
> >>>>> with itself.
> >>>>
> >>>> I've never really thought about it... Zach had that in his initial
> >>>> submission, so mentioned it in the man page.  Should I remove that bit?
> >>>
> >>> Yes, please!
> >>>
> >>> I could be wrong, but I think btrfs only started supporting files that share
> >>> blocks with themselves relatively recently(?)
> >>>
> >>> I'm not sure why zab added this; was hoping he'd speak up. ;)
> >>>
> >>>>
> >>>>>
> >>>>>> +
> >>>>>> +The following semantics apply for
> >>>>>> +.IR fd_in ,
> >>>>>> +and similar statements apply to
> >>>>>> +.IR off_out :
> >>>>>> +.IP * 3
> >>>>>> +If
> >>>>>> +.I off_in
> >>>>>> +is NULL, then bytes are read from
> >>>>>> +.I fd_in
> >>>>>> +starting from the current file offset and the current
> >>>>>> +file offset is adjusted appropriately.
> >>>>>> +.IP *
> >>>>>> +If
> >>>>>> +.I off_in
> >>>>>> +is not NULL, then
> >>>>>> +.I off_in
> >>>>>> +must point to a buffer that specifies the starting
> >>>>>> +offset where bytes from
> >>>>>> +.I fd_in
> >>>>>> +will be read.  The current file offset of
> >>>>>> +.I fd_in
> >>>>>> +is not changed, but
> >>>>>> +.I off_in
> >>>>>> +is adjusted appropriately.
> >>>>>> +.PP
> >>>>>> +The default behavior of
> >>>>>> +.BR copy_file_range ()
> >>>>>> +is filesystem specific, and might result in creating a
> >>>>>> +copy-on-write reflink.
> >>>>>> +In the event that a given filesystem does not implement
> >>>>>> +any form of copy acceleration, the kernel will perform
> >>>>>> +a deep copy of the requested range by reading bytes from
> >>>>>
> >>>>> I wonder if it's wise to allow deep copies -- what happens if len == 1T?
> >>>>> Will this syscall just block for a really long time?
> >>>>
> >>>> We use rw_verify_area(), (similar to read and write) so we won't allow a
> >>>> value of len that long.  I can mention this in an updated version of this man
> >>>> page!
> >>>
> >>> Ok.  I guess MAX_RW_COUNT limits us to about 4G at once, which for a splice
> > 
> > Heh, INT_MAX, so 2GB at once.
> > 
> >>> copy is probably reasonable.
> >>>
> >>> The reason why I asked about len == 1T specifically is that I can (with
> >>> somewhat long delays) reflink about 260 million extents at a time on XFS,
> >>> which is about 1TB.  Given that locks get held for the duration, it's probably
> >>> not a bad thing to limit userspace to 4G at a time.
> >>
> >> I'd personally love to see that be tunable by a sysctl (kind of like
> >> how you can control the maximum number of AIO requests in flight),
> >> and for that matter we might want to be able to limit the number of
> >> in-progress copies going on.
> > 
> > Now that I think about it, btrfs' reflink ioctl doesn't seem to have any
> > particular limit on how much you can reflink in a single call.  XFS doesn't
> > have a limit either.  Given that reflink should create a tiny amount of IO
> > compared to the number of bytes being manipulated, should we allow a higher
> > limit when ssize_t is large enough?
> > 
> > Copy-through-the-pagecache should stick to MAX_RW_COUNT.
> 
> Should I keep rejecting pagecache copies if len > MAX_RW_COUNT?  Or would it
> be okay to change the value of len to MAX_RW_COUNT in this case?

OH.  Heh.

rw_verify_area returns either an error code or a len that's been clamped to
MAX_RW_COUNT.  However, the syscall code only checks for errors, and otherwise
ignores the clamp.  So I guess the length has never been clamped.

Since the syscall returns ssize_t, I think it's fine to keep around the return
value from rw_verify_area and use it to clamp len if we have to fall back on
pagecache copy.  Otherwise we'll let each FS' copy routine decide its maximum.

--D

> 
> Anna
> 
> > 
> > I noticed that btrfs won't dedupe more than 16M per call.  Any thoughts?
> > 
> > --D
> > 
> >>>
> >>> (But hey, it's fun to stress-test once in a while. :))
> >>>
> >>> --D
> >>>
> >>>>
> >>>>
> >>>>>
> >>>>>> +.I fd_in
> >>>>>> +and writing them to
> >>>>>> +.IR fd_out .
> >>>>>
> >>>>> "...if COPY_REFLINK is not set in flags."
> >>>>
> >>>> Sure.
> >>>>
> >>>>>
> >>>>>> +
> >>>>>> +Currently, Linux only supports the following flag:
> >>>>>> +.TP 1.9i
> >>>>>> +.B COPY_REFLINK
> >>>>>> +Only perform the copy if the filesystem can do it as a reflink.
> >>>>>> +Do not fall back on performing a deep copy.
> >>>>>> +.SH RETURN VALUE
> >>>>>> +Upon successful completion,
> >>>>>> +.BR copy_file_range ()
> >>>>>> +will return the number of bytes copied between files.
> >>>>>> +This could be less than the length originally requested.
> >>>>>> +
> >>>>>> +On error,
> >>>>>> +.BR copy_file_range ()
> >>>>>> +returns \-1 and
> >>>>>> +.I errno
> >>>>>> +is set to indicate the error.
> >>>>>> +.SH ERRORS
> >>>>>> +.TP
> >>>>>> +.B EBADF
> >>>>>> +One or more file descriptors are not valid,
> >>>>>> +or do not have proper read-write mode.
> >>>>>
> >>>>> "or fd_out is not opened for writing"?
> >>>>
> >>>> I'll add that.
> >>>>
> >>>>>
> >>>>>> +.TP
> >>>>>> +.B EINVAL
> >>>>>> +Requested range extends beyond the end of the file;
> >>>>>> +.I flags
> >>>>>> +argument is set to an invalid value.
> >>>>>> +.TP
> >>>>>> +.B EOPNOTSUPP
> >>>>>> +.B COPY_REFLINK
> >>>>>> +was specified in
> >>>>>> +.IR flags ,
> >>>>>> +but the target filesystem does not support reflinks.
> >>>>>> +.TP
> >>>>>> +.B EXDEV
> >>>>>> +Target filesystem doesn't support cross-filesystem copies.
> >>>>>> +.SH VERSIONS
> >>>>>
> >>>>> Perhaps this ought to list a few more errors (EIO, ENOSPC, ENOSYS, EPERM...)
> >>>>> that can be returned?  (I was looking at the fallocate manpage.)
> >>>>
> >>>> Okay.  I'll poke around for what else could be returned!
> >>>>
> >>>> Thanks,
> >>>> Anna
> >>>>
> >>>>>
> >>>>> --D
> >>>>>
> >>>>>> +The
> >>>>>> +.BR copy_file_range ()
> >>>>>> +system call first appeared in Linux 4.3.
> >>>>>> +.SH CONFORMING TO
> >>>>>> +The
> >>>>>> +.BR copy_file_range ()
> >>>>>> +system call is a nonstandard Linux extension.
> >>>>>> +.SH EXAMPLE
> >>>>>> +.nf
> >>>>>> +
> >>>>>> +#define _GNU_SOURCE
> >>>>>> +#include <fcntl.h>
> >>>>>> +#include <linux/copy.h>
> >>>>>> +#include <stdio.h>
> >>>>>> +#include <stdlib.h>
> >>>>>> +#include <sys/stat.h>
> >>>>>> +#include <sys/syscall.h>
> >>>>>> +#include <unistd.h>
> >>>>>> +
> >>>>>> +
> >>>>>> +int main(int argc, char **argv)
> >>>>>> +{
> >>>>>> +    int fd_in, fd_out;
> >>>>>> +    struct stat stat;
> >>>>>> +    loff_t len, ret;
> >>>>>> +
> >>>>>> +    if (argc != 3) {
> >>>>>> +        fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
> >>>>>> +        exit(EXIT_FAILURE);
> >>>>>> +    }
> >>>>>> +
> >>>>>> +    fd_in = open(argv[1], O_RDONLY);
> >>>>>> +    if (fd_in == -1) {
> >>>>>> +        perror("open (argv[1])");
> >>>>>> +        exit(EXIT_FAILURE);
> >>>>>> +    }
> >>>>>> +
> >>>>>> +    if (fstat(fd_in, &stat) == -1) {
> >>>>>> +        perror("fstat");
> >>>>>> +        exit(EXIT_FAILURE);
> >>>>>> +    }
> >>>>>> +    len = stat.st_size;
> >>>>>> +
> >>>>>> +    fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
> >>>>>> +    if (fd_out == -1) {
> >>>>>> +        perror("open (argv[2])");
> >>>>>> +        exit(EXIT_FAILURE);
> >>>>>> +    }
> >>>>>> +
> >>>>>> +    do {
> >>>>>> +        ret = syscall(__NR_copy_file_range, fd_in, NULL,
> >>>>>> +                      fd_out, NULL, len, 0);
> >>>>>> +        if (ret == -1) {
> >>>>>> +            perror("copy_file_range");
> >>>>>> +            exit(EXIT_FAILURE);
> >>>>>> +        }
> >>>>>> +
> >>>>>> +        len -= ret;
> >>>>>> +    } while (len > 0);
> >>>>>> +
> >>>>>> +    close(fd_in);
> >>>>>> +    close(fd_out);
> >>>>>> +    exit(EXIT_SUCCESS);
> >>>>>> +}
> >>>>>> +.fi
> >>>>>> +.SH SEE ALSO
> >>>>>> +.BR splice (2)
> >>>>>> --
> >>>>>> 2.5.1
> >>>>>>
> >>>>>> --
> >>>>>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> >>>>>> the body of a message to majordomo@vger.kernel.org
> >>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>>>
> >>> --
> >>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> >>> the body of a message to majordomo@vger.kernel.org
> >>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>>
> >>
> >>
> > 
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna Sept. 9, 2015, 7:25 p.m. UTC | #11
On 09/09/2015 02:12 PM, Darrick J. Wong wrote:
> On Wed, Sep 09, 2015 at 01:31:24PM -0400, Anna Schumaker wrote:
>> On 09/09/2015 01:17 PM, Darrick J. Wong wrote:
>>> On Wed, Sep 09, 2015 at 07:38:14AM -0400, Austin S Hemmelgarn wrote:
>>>> On 2015-09-08 16:39, Darrick J. Wong wrote:
>>>>> On Tue, Sep 08, 2015 at 11:04:03AM -0400, Anna Schumaker wrote:
>>>>>> On 09/04/2015 05:38 PM, Darrick J. Wong wrote:
>>>>>>> On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
>>>>>>>> copy_file_range() is a new system call for copying ranges of data
>>>>>>>> completely in the kernel.  This gives filesystems an opportunity to
>>>>>>>> implement some kind of "copy acceleration", such as reflinks or
>>>>>>>> server-side-copy (in the case of NFS).
>>>>>>>>
>>>>>>>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>>>>>>> ---
>>>>>>>>  man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>  1 file changed, 168 insertions(+)
>>>>>>>>  create mode 100644 man2/copy_file_range.2
>>>>>>>>
>>>>>>>> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
>>>>>>>> new file mode 100644
>>>>>>>> index 0000000..4a4cb73
>>>>>>>> --- /dev/null
>>>>>>>> +++ b/man2/copy_file_range.2
>>>>>>>> @@ -0,0 +1,168 @@
>>>>>>>> +.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
>>>>>>>> +.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
>>>>>>>> +.SH NAME
>>>>>>>> +copy_file_range \- Copy a range of data from one file to another
>>>>>>>> +.SH SYNOPSIS
>>>>>>>> +.nf
>>>>>>>> +.B #include <linux/copy.h>
>>>>>>>> +.B #include <sys/syscall.h>
>>>>>>>> +.B #include <unistd.h>
>>>>>>>> +
>>>>>>>> +.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
>>>>>>>> +.BI "                int " fd_out ", loff_t * " off_out ", size_t " len ",
>>>>>>>> +.BI "                unsigned int " flags );
>>>>>>>> +.fi
>>>>>>>> +.SH DESCRIPTION
>>>>>>>> +The
>>>>>>>> +.BR copy_file_range ()
>>>>>>>> +system call performs an in-kernel copy between two file descriptors
>>>>>>>> +without all that tedious mucking about in userspace.
>>>>>>>
>>>>>>> ;)
>>>>>>>
>>>>>>>> +It copies up to
>>>>>>>> +.I len
>>>>>>>> +bytes of data from file descriptor
>>>>>>>> +.I fd_in
>>>>>>>> +to file descriptor
>>>>>>>> +.I fd_out
>>>>>>>> +at
>>>>>>>> +.IR off_out .
>>>>>>>> +The file descriptors must not refer to the same file.
>>>>>>>
>>>>>>> Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
>>>>>>> with itself.
>>>>>>
>>>>>> I've never really thought about it... Zach had that in his initial
>>>>>> submission, so mentioned it in the man page.  Should I remove that bit?
>>>>>
>>>>> Yes, please!
>>>>>
>>>>> I could be wrong, but I think btrfs only started supporting files that share
>>>>> blocks with themselves relatively recently(?)
>>>>>
>>>>> I'm not sure why zab added this; was hoping he'd speak up. ;)
>>>>>
>>>>>>
>>>>>>>
>>>>>>>> +
>>>>>>>> +The following semantics apply for
>>>>>>>> +.IR fd_in ,
>>>>>>>> +and similar statements apply to
>>>>>>>> +.IR off_out :
>>>>>>>> +.IP * 3
>>>>>>>> +If
>>>>>>>> +.I off_in
>>>>>>>> +is NULL, then bytes are read from
>>>>>>>> +.I fd_in
>>>>>>>> +starting from the current file offset and the current
>>>>>>>> +file offset is adjusted appropriately.
>>>>>>>> +.IP *
>>>>>>>> +If
>>>>>>>> +.I off_in
>>>>>>>> +is not NULL, then
>>>>>>>> +.I off_in
>>>>>>>> +must point to a buffer that specifies the starting
>>>>>>>> +offset where bytes from
>>>>>>>> +.I fd_in
>>>>>>>> +will be read.  The current file offset of
>>>>>>>> +.I fd_in
>>>>>>>> +is not changed, but
>>>>>>>> +.I off_in
>>>>>>>> +is adjusted appropriately.
>>>>>>>> +.PP
>>>>>>>> +The default behavior of
>>>>>>>> +.BR copy_file_range ()
>>>>>>>> +is filesystem specific, and might result in creating a
>>>>>>>> +copy-on-write reflink.
>>>>>>>> +In the event that a given filesystem does not implement
>>>>>>>> +any form of copy acceleration, the kernel will perform
>>>>>>>> +a deep copy of the requested range by reading bytes from
>>>>>>>
>>>>>>> I wonder if it's wise to allow deep copies -- what happens if len == 1T?
>>>>>>> Will this syscall just block for a really long time?
>>>>>>
>>>>>> We use rw_verify_area(), (similar to read and write) so we won't allow a
>>>>>> value of len that long.  I can mention this in an updated version of this man
>>>>>> page!
>>>>>
>>>>> Ok.  I guess MAX_RW_COUNT limits us to about 4G at once, which for a splice
>>>
>>> Heh, INT_MAX, so 2GB at once.
>>>
>>>>> copy is probably reasonable.
>>>>>
>>>>> The reason why I asked about len == 1T specifically is that I can (with
>>>>> somewhat long delays) reflink about 260 million extents at a time on XFS,
>>>>> which is about 1TB.  Given that locks get held for the duration, it's probably
>>>>> not a bad thing to limit userspace to 4G at a time.
>>>>
>>>> I'd personally love to see that be tunable by a sysctl (kind of like
>>>> how you can control the maximum number of AIO requests in flight),
>>>> and for that matter we might want to be able to limit the number of
>>>> in-progress copies going on.
>>>
>>> Now that I think about it, btrfs' reflink ioctl doesn't seem to have any
>>> particular limit on how much you can reflink in a single call.  XFS doesn't
>>> have a limit either.  Given that reflink should create a tiny amount of IO
>>> compared to the number of bytes being manipulated, should we allow a higher
>>> limit when ssize_t is large enough?
>>>
>>> Copy-through-the-pagecache should stick to MAX_RW_COUNT.
>>
>> Should I keep rejecting pagecache copies if len > MAX_RW_COUNT?  Or would it
>> be okay to change the value of len to MAX_RW_COUNT in this case?
> 
> OH.  Heh.
> 
> rw_verify_area returns either an error code or a len that's been clamped to
> MAX_RW_COUNT.  However, the syscall code only checks for errors, and otherwise
> ignores the clamp.  So I guess the length has never been clamped.
> 
> Since the syscall returns ssize_t, I think it's fine to keep around the return
> value from rw_verify_area and use it to clamp len if we have to fall back on
> pagecache copy.  Otherwise we'll let each FS' copy routine decide its maximum.

Okay.  I'll use the return value to clamp the copy length, but move this code so that it's only for pagecache copies.  Thanks!

Anna

> 
> --D
> 
>>
>> Anna
>>
>>>
>>> I noticed that btrfs won't dedupe more than 16M per call.  Any thoughts?
>>>
>>> --D
>>>
>>>>>
>>>>> (But hey, it's fun to stress-test once in a while. :))
>>>>>
>>>>> --D
>>>>>
>>>>>>
>>>>>>
>>>>>>>
>>>>>>>> +.I fd_in
>>>>>>>> +and writing them to
>>>>>>>> +.IR fd_out .
>>>>>>>
>>>>>>> "...if COPY_REFLINK is not set in flags."
>>>>>>
>>>>>> Sure.
>>>>>>
>>>>>>>
>>>>>>>> +
>>>>>>>> +Currently, Linux only supports the following flag:
>>>>>>>> +.TP 1.9i
>>>>>>>> +.B COPY_REFLINK
>>>>>>>> +Only perform the copy if the filesystem can do it as a reflink.
>>>>>>>> +Do not fall back on performing a deep copy.
>>>>>>>> +.SH RETURN VALUE
>>>>>>>> +Upon successful completion,
>>>>>>>> +.BR copy_file_range ()
>>>>>>>> +will return the number of bytes copied between files.
>>>>>>>> +This could be less than the length originally requested.
>>>>>>>> +
>>>>>>>> +On error,
>>>>>>>> +.BR copy_file_range ()
>>>>>>>> +returns \-1 and
>>>>>>>> +.I errno
>>>>>>>> +is set to indicate the error.
>>>>>>>> +.SH ERRORS
>>>>>>>> +.TP
>>>>>>>> +.B EBADF
>>>>>>>> +One or more file descriptors are not valid,
>>>>>>>> +or do not have proper read-write mode.
>>>>>>>
>>>>>>> "or fd_out is not opened for writing"?
>>>>>>
>>>>>> I'll add that.
>>>>>>
>>>>>>>
>>>>>>>> +.TP
>>>>>>>> +.B EINVAL
>>>>>>>> +Requested range extends beyond the end of the file;
>>>>>>>> +.I flags
>>>>>>>> +argument is set to an invalid value.
>>>>>>>> +.TP
>>>>>>>> +.B EOPNOTSUPP
>>>>>>>> +.B COPY_REFLINK
>>>>>>>> +was specified in
>>>>>>>> +.IR flags ,
>>>>>>>> +but the target filesystem does not support reflinks.
>>>>>>>> +.TP
>>>>>>>> +.B EXDEV
>>>>>>>> +Target filesystem doesn't support cross-filesystem copies.
>>>>>>>> +.SH VERSIONS
>>>>>>>
>>>>>>> Perhaps this ought to list a few more errors (EIO, ENOSPC, ENOSYS, EPERM...)
>>>>>>> that can be returned?  (I was looking at the fallocate manpage.)
>>>>>>
>>>>>> Okay.  I'll poke around for what else could be returned!
>>>>>>
>>>>>> Thanks,
>>>>>> Anna
>>>>>>
>>>>>>>
>>>>>>> --D
>>>>>>>
>>>>>>>> +The
>>>>>>>> +.BR copy_file_range ()
>>>>>>>> +system call first appeared in Linux 4.3.
>>>>>>>> +.SH CONFORMING TO
>>>>>>>> +The
>>>>>>>> +.BR copy_file_range ()
>>>>>>>> +system call is a nonstandard Linux extension.
>>>>>>>> +.SH EXAMPLE
>>>>>>>> +.nf
>>>>>>>> +
>>>>>>>> +#define _GNU_SOURCE
>>>>>>>> +#include <fcntl.h>
>>>>>>>> +#include <linux/copy.h>
>>>>>>>> +#include <stdio.h>
>>>>>>>> +#include <stdlib.h>
>>>>>>>> +#include <sys/stat.h>
>>>>>>>> +#include <sys/syscall.h>
>>>>>>>> +#include <unistd.h>
>>>>>>>> +
>>>>>>>> +
>>>>>>>> +int main(int argc, char **argv)
>>>>>>>> +{
>>>>>>>> +    int fd_in, fd_out;
>>>>>>>> +    struct stat stat;
>>>>>>>> +    loff_t len, ret;
>>>>>>>> +
>>>>>>>> +    if (argc != 3) {
>>>>>>>> +        fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
>>>>>>>> +        exit(EXIT_FAILURE);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    fd_in = open(argv[1], O_RDONLY);
>>>>>>>> +    if (fd_in == -1) {
>>>>>>>> +        perror("open (argv[1])");
>>>>>>>> +        exit(EXIT_FAILURE);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    if (fstat(fd_in, &stat) == -1) {
>>>>>>>> +        perror("fstat");
>>>>>>>> +        exit(EXIT_FAILURE);
>>>>>>>> +    }
>>>>>>>> +    len = stat.st_size;
>>>>>>>> +
>>>>>>>> +    fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
>>>>>>>> +    if (fd_out == -1) {
>>>>>>>> +        perror("open (argv[2])");
>>>>>>>> +        exit(EXIT_FAILURE);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    do {
>>>>>>>> +        ret = syscall(__NR_copy_file_range, fd_in, NULL,
>>>>>>>> +                      fd_out, NULL, len, 0);
>>>>>>>> +        if (ret == -1) {
>>>>>>>> +            perror("copy_file_range");
>>>>>>>> +            exit(EXIT_FAILURE);
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        len -= ret;
>>>>>>>> +    } while (len > 0);
>>>>>>>> +
>>>>>>>> +    close(fd_in);
>>>>>>>> +    close(fd_out);
>>>>>>>> +    exit(EXIT_SUCCESS);
>>>>>>>> +}
>>>>>>>> +.fi
>>>>>>>> +.SH SEE ALSO
>>>>>>>> +.BR splice (2)
>>>>>>>> --
>>>>>>>> 2.5.1
>>>>>>>>
>>>>>>>> --
>>>>>>>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>>>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>>
>>>>> --
>>>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>>>> the body of a message to majordomo@vger.kernel.org
>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>
>>>>
>>>>
>>>
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba Sept. 10, 2015, 3:42 p.m. UTC | #12
On Wed, Sep 09, 2015 at 10:17:57AM -0700, Darrick J. Wong wrote:
> I noticed that btrfs won't dedupe more than 16M per call.  Any thoughts?

btrfs_ioctl_file_extent_same:

3138         /*
3139          * Limit the total length we will dedupe for each operation.
3140          * This is intended to bound the total time spent in this
3141          * ioctl to something sane.
3142          */
3143         if (len > BTRFS_MAX_DEDUPE_LEN)
3144                 len = BTRFS_MAX_DEDUPE_LEN;

The deduplication compares the source and destination blocks and does
not use the checksum based approach (btrfs_cmp_data()). The 16M limit is
artificial; I don't have an estimate of whether the value is ok or not.
The longer the dedupe chunk, the lower the chance of finding more matching
extents, so the chunk sizes used in practice are in the range of hundreds
of kilobytes. But this obviously depends on the data, and
many-megabyte-sized chunks could easily fit some use cases.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Sept. 10, 2015, 4:43 p.m. UTC | #13
On Thu, Sep 10, 2015 at 05:42:51PM +0200, David Sterba wrote:
> On Wed, Sep 09, 2015 at 10:17:57AM -0700, Darrick J. Wong wrote:
> > I noticed that btrfs won't dedupe more than 16M per call.  Any thoughts?
> 
> btrfs_ioctl_file_extent_same:
> 
> 3138         /*
> 3139          * Limit the total length we will dedupe for each operation.
> 3140          * This is intended to bound the total time spent in this
> 3141          * ioctl to something sane.
> 3142          */
> 3143         if (len > BTRFS_MAX_DEDUPE_LEN)
> 3144                 len = BTRFS_MAX_DEDUPE_LEN;
> 
> The deduplication compares the source and destination blocks and does
> not use the checksum based approach (btrfs_cmp_data()). The 16M limit is
> artifical, I don't have an estimate whether the value is ok or not. The
> longer dedupe chunk the lower the chance to find more matching extents,
> so the practially used chunk sizes are in range of hundreds of
> kilobytes. But this obviously depends on data and many-megabyte-sized
> chunks could fit some usecases easily.

I guessed that 16M was a 'reasonable default maximum' since the semantics seem
to be "link these two ranges together if all block contents match", not "I
think these ranges match, link together any blocks which actually /do/ match".
Personally, I doubt that it'll often be the case that a dedupe tool finds >16M
chunks to dedupe *and* for whatever reason can't just call iteratively.

Internally it could do some fadvise-like magic to avoid polluting the page
cache with the compares, but I agree that not letting the call take forever
is a good thing.

Oh well.  It /could/ be a per-fs tunable if anyone yells loudly.

--D
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
new file mode 100644
index 0000000..4a4cb73
--- /dev/null
+++ b/man2/copy_file_range.2
@@ -0,0 +1,168 @@ 
+.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
+.TH COPY_FILE_RANGE 2 2015-08-31 "Linux" "Linux Programmer's Manual"
+.SH NAME
+copy_file_range \- Copy a range of data from one file to another
+.SH SYNOPSIS
+.nf
+.B #include <linux/copy.h>
+.B #include <sys/syscall.h>
+.B #include <unistd.h>
+
+.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
+.BI "                int " fd_out ", loff_t * " off_out ", size_t " len ",
+.BI "                unsigned int " flags );
+.fi
+.SH DESCRIPTION
+The
+.BR copy_file_range ()
+system call performs an in-kernel copy between two file descriptors
+without all that tedious mucking about in userspace.
+It copies up to
+.I len
+bytes of data from file descriptor
+.I fd_in
+to file descriptor
+.I fd_out
+at
+.IR off_out .
+The file descriptors must not refer to the same file.
+
+The following semantics apply for
+.IR off_in ,
+and similar statements apply to
+.IR off_out :
+.IP * 3
+If
+.I off_in
+is NULL, then bytes are read from
+.I fd_in
+starting from the current file offset and the current
+file offset is adjusted appropriately.
+.IP *
+If
+.I off_in
+is not NULL, then
+.I off_in
+must point to a buffer that specifies the starting
+offset where bytes from
+.I fd_in
+will be read.  The current file offset of
+.I fd_in
+is not changed, but
+.I off_in
+is adjusted appropriately.
+.PP
+The default behavior of
+.BR copy_file_range ()
+is filesystem specific, and might result in creating a
+copy-on-write reflink.
+In the event that a given filesystem does not implement
+any form of copy acceleration, the kernel will perform
+a deep copy of the requested range by reading bytes from
+.I fd_in
+and writing them to
+.IR fd_out .
+
+Currently, Linux only supports the following flag:
+.TP 1.9i
+.B COPY_REFLINK
+Only perform the copy if the filesystem can do it as a reflink.
+Do not fall back on performing a deep copy.
+.SH RETURN VALUE
+Upon successful completion,
+.BR copy_file_range ()
+will return the number of bytes copied between files.
+This could be less than the length originally requested.
+
+On error,
+.BR copy_file_range ()
+returns \-1 and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+One or more file descriptors are not valid,
+or do not have proper read-write mode.
+.TP
+.B EINVAL
+Requested range extends beyond the end of the file, or the
+.I flags
+argument is set to an invalid value.
+.TP
+.B EOPNOTSUPP
+.B COPY_REFLINK
+was specified in
+.IR flags ,
+but the target filesystem does not support reflinks.
+.TP
+.B EXDEV
+Target filesystem doesn't support cross-filesystem copies.
+.SH VERSIONS
+The
+.BR copy_file_range ()
+system call first appeared in Linux 4.3.
+.SH CONFORMING TO
+The
+.BR copy_file_range ()
+system call is a nonstandard Linux extension.
+.SH EXAMPLE
+.nf
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/copy.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+
+int main(int argc, char **argv)    /* copy argv[1] to argv[2] */
+{
+    int fd_in, fd_out;
+    struct stat stat;
+    loff_t len, ret;    /* bytes still to copy; result of the last call */
+
+    if (argc != 3) {
+        fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
+        exit(EXIT_FAILURE);
+    }
+
+    fd_in = open(argv[1], O_RDONLY);
+    if (fd_in == -1) {
+        perror("open (argv[1])");
+        exit(EXIT_FAILURE);
+    }
+
+    if (fstat(fd_in, &stat) == -1) {
+        perror("fstat");
+        exit(EXIT_FAILURE);
+    }
+    len = stat.st_size;    /* request the whole source file */
+
+    fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
+    if (fd_out == -1) {
+        perror("open (argv[2])");
+        exit(EXIT_FAILURE);
+    }
+
+    do {    /* one call may copy fewer than len bytes, so repeat */
+        ret = syscall(__NR_copy_file_range, fd_in, NULL,
+                      fd_out, NULL, len, 0);
+        if (ret == -1) {
+            perror("copy_file_range");
+            exit(EXIT_FAILURE);
+        }
+
+        len -= ret;
+    } while (len > 0 && ret > 0);    /* ret == 0: nothing copied, stop */
+
+    close(fd_in);
+    close(fd_out);
+    exit(EXIT_SUCCESS);
+}
+.fi
+.SH SEE ALSO
+.BR splice (2)