diff mbox

[RFC,v0,1/4] vfs: add copy_range syscall and vfs entry point

Message ID 1368566126-17610-2-git-send-email-zab@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Zach Brown May 14, 2013, 9:15 p.m. UTC
This adds a syscall and vfs entry point for clone_range which offloads
data copying between existing files.

The syscall is a thin wrapper around the vfs entry point.  Its arguments
are inspired by sys_splice().

The behaviour of the vfs helper is derived from the current btrfs
CLONE_RANGE ioctl.
---
 fs/Makefile                       |   2 +-
 fs/copy_range.c                   | 127 ++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h                |   3 +
 include/uapi/asm-generic/unistd.h |   4 +-
 kernel/sys_ni.c                   |   1 +
 5 files changed, 135 insertions(+), 2 deletions(-)
 create mode 100644 fs/copy_range.c

Comments

Eric Wong May 15, 2013, 7:44 p.m. UTC | #1
Zach Brown <zab@redhat.com> wrote:
> This adds a syscall and vfs entry point for clone_range which offloads
> data copying between existing files.
> 
> The syscall is a thin wrapper around the vfs entry point.  Its arguments
> are inspired by sys_splice().

Why introduce a new syscall instead of extending sys_splice?

Perhaps adding a new SPLICE_F_COPYRANGE flag to avoid compatibility
problems with userspace which may expect ESPIPE when given two non-pipes
would be useful.

If the user doesn't need a out offset, then sendfile() should also be
able to transparently utilize COPY/CLONE_RANGE, too.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zach Brown May 15, 2013, 8:03 p.m. UTC | #2
On Wed, May 15, 2013 at 07:44:05PM +0000, Eric Wong wrote:
> Why introduce a new syscall instead of extending sys_splice?

Personally, I think it's ugly to have different operations use the same
syscall just because their arguments match.

But that preference aside, sure, if the consensus is that we'd rather
use the splice() entry point then I can duck tape the pieces together to
make it work.

> If the user doesn't need a out offset, then sendfile() should also be
> able to transparently utilize COPY/CLONE_RANGE, too.

Perhaps, yeah.

- z
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ric Wheeler May 16, 2013, 9:16 p.m. UTC | #3
On 05/15/2013 04:03 PM, Zach Brown wrote:
> On Wed, May 15, 2013 at 07:44:05PM +0000, Eric Wong wrote:
>> Why introduce a new syscall instead of extending sys_splice?
> Personally, I think it's ugly to have different operations use the same
> syscall just because their arguments match.

I agree with Zach - having a system call called "splice" do copy offloads is not 
intuitive.

This is a very reasonable name for something that battled its way through 
several standards bodies (for NFS and SCSI :)), so we should give it a 
reasonable name

thanks!

Ric

>
> But that preference aside, sure, if the consensus is that we'd rather
> use the splice() entry point then I can duck tape the pieces together to
> make it work.
>
>> If the user doesn't need a out offset, then sendfile() should also be
>> able to transparently utilize COPY/CLONE_RANGE, too.
> Perhaps, yeah.
>
> - z

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Wong May 21, 2013, 7:47 p.m. UTC | #4
Zach Brown <zab@redhat.com> wrote:
> On Wed, May 15, 2013 at 07:44:05PM +0000, Eric Wong wrote:
> > Why introduce a new syscall instead of extending sys_splice?
> 
> Personally, I think it's ugly to have different operations use the same
> syscall just because their arguments match.

Fair enough.  I think adding a (currently unused) flags parameter would
make sense for future-proofing.

If this were extended to sockets/pipes, peek/nonblock/waitall flags
may be added.

Basically, something like splice/tee, but without needing to
create/manage a pipe in userspace or needing extra syscalls/looping.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zach Brown May 21, 2013, 7:50 p.m. UTC | #5
On Tue, May 21, 2013 at 07:47:19PM +0000, Eric Wong wrote:
> Zach Brown <zab@redhat.com> wrote:
> > On Wed, May 15, 2013 at 07:44:05PM +0000, Eric Wong wrote:
> > > Why introduce a new syscall instead of extending sys_splice?
> > 
> > Personally, I think it's ugly to have different operations use the same
> > syscall just because their arguments match.
> 
> Fair enough.  I think adding a (currently unused) flags parameter would
> make sense for future-proofing.

Yeah, that seems reasonble.  

- z
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/Makefile b/fs/Makefile
index 4fe6df3..1be83b3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@  obj-y :=	open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o splice.o sync.o utimes.o \
-		stack.o fs_struct.o statfs.o
+		stack.o fs_struct.o statfs.o copy_range.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/copy_range.c b/fs/copy_range.c
new file mode 100644
index 0000000..3000b9f
--- /dev/null
+++ b/fs/copy_range.c
@@ -0,0 +1,127 @@ 
+/*
+ * "copy_range": offload data copying between existing files
+ *
+ * Copyright (C) 2013 Zach Brown <zab@redhat.com>
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/syscalls.h>
+#include <linux/export.h>
+#include <linux/fsnotify.h>
+
+/**
+ * vfs_copy_range - copy range of bytes from source file to existing file
+ * @file_in:   source regular file
+ * @pos_in:    starting byte offset to copy from the source file
+ * @file_out:  destination regular file
+ * @pos_out:   starting byte offset to copy to in the destination file
+ * @count:     number of bytes to copy
+ *
+ * Returns number of bytes successfully copied from the start of the range or
+ * a negative errno error value.
+ *
+ * The number of bytes successfully written can be less than the input
+ * count if an error is encountered.  In this partial success case the
+ * contents of the destination range after the copied bytes can be a mix
+ * of pre-existing bytes, bytes from the source range, or zeros,
+ * depending on the implementation.
+ *
+ * The source range must be entirely within i_size in the source file.
+ * A destination range outside of the size of the destination file will
+ * extend its size.
+ */
+ssize_t vfs_copy_range(struct file *file_in, loff_t pos_in,
+		       struct file *file_out, loff_t pos_out,
+		       size_t count)
+{
+	struct inode *inode_in;
+	struct inode *inode_out;
+	ssize_t ret;
+
+	if (count == 0)
+		return 0;
+
+	/* copy_range allows full ssize_t count, ignoring MAX_RW_COUNT  */
+	ret = rw_verify_area(READ, file_in, &pos_in, count);
+	if (ret >= 0)
+		ret = rw_verify_area(WRITE, file_out, &pos_out, count);
+	if (ret < 0)
+		return ret;
+
+	if (!(file_in->f_mode & FMODE_READ) ||
+	    !(file_out->f_mode & FMODE_WRITE) ||
+	    (file_out->f_flags & O_APPEND) ||
+	    !file_in->f_op || !file_in->f_op->copy_range)
+		return -EINVAL;
+
+	inode_in = file_inode(file_in);
+	inode_out = file_inode(file_out);
+
+	/* make sure offsets don't wrap and the input is inside i_size */
+	if (pos_in + count < pos_in || pos_out + count < pos_out ||
+	    pos_in + count > i_size_read(inode_in))
+		return -EINVAL;
+
+	/* XXX do we want this test?  btrfs_ioctl_clone_range() */
+	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+		return -EISDIR;
+
+	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+		return -EINVAL;
+
+	if (inode_in->i_sb != inode_out->i_sb ||
+	    file_in->f_path.mnt != file_out->f_path.mnt)
+		return -EXDEV;
+
+	/* forbid ranges in the same file for now */
+	if (inode_in == inode_out)
+		return -EINVAL;
+
+	ret = mnt_want_write_file(file_out);
+	if (ret)
+		return ret;
+
+	ret = file_in->f_op->copy_range(file_in, pos_in, file_out, pos_out,
+					count);
+	if (ret > 0) {
+		fsnotify_access(file_in);
+		add_rchar(current, ret);
+		fsnotify_modify(file_out);
+		add_wchar(current, ret);
+	}
+	inc_syscr(current);
+	inc_syscw(current);
+
+	mnt_drop_write_file(file_out);
+
+	return ret;
+}
+EXPORT_SYMBOL(vfs_copy_range);
+
+SYSCALL_DEFINE5(copy_range, int, fd_in, loff_t __user *, upos_in,
+		int, fd_out, loff_t __user *, upos_out, size_t, count)
+{
+	loff_t pos_in;
+	loff_t pos_out;
+	struct fd f_in;
+	struct fd f_out;
+	ssize_t ret;
+
+	if (get_user(pos_in, upos_in) || get_user(pos_out, upos_out))
+		return -EFAULT;
+
+	f_in = fdget(fd_in);
+	f_out = fdget(fd_out);
+
+	if (f_in.file && f_out.file)
+		ret = vfs_copy_range(f_in.file, pos_in, f_out.file, pos_out,
+				     count);
+	else
+		ret = -EBADF;
+
+	fdput(f_in);
+	fdput(f_out);
+
+	return ret;
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 43db02e..6214893 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1543,6 +1543,7 @@  struct file_operations {
 	long (*fallocate)(struct file *file, int mode, loff_t offset,
 			  loff_t len);
 	int (*show_fdinfo)(struct seq_file *m, struct file *f);
+	ssize_t (*copy_range)(struct file *, loff_t, struct file *, loff_t, size_t);
 };
 
 struct inode_operations {
@@ -1588,6 +1589,8 @@  extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
 		unsigned long, loff_t *);
 extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
 		unsigned long, loff_t *);
+extern ssize_t vfs_copy_range(struct file *, loff_t , struct file *, loff_t,
+		size_t);
 
 struct super_operations {
    	struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 0cc74c4..3935d1c 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -692,9 +692,11 @@  __SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \
 __SYSCALL(__NR_kcmp, sys_kcmp)
 #define __NR_finit_module 273
 __SYSCALL(__NR_finit_module, sys_finit_module)
+#define __NR_copy_range 274
+__SYSCALL(__NR_copy_range, sys_copy_range)
 
 #undef __NR_syscalls
-#define __NR_syscalls 274
+#define __NR_syscalls 275
 
 /*
  * All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7078052..af7808a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -151,6 +151,7 @@  cond_syscall(sys_process_vm_readv);
 cond_syscall(sys_process_vm_writev);
 cond_syscall(compat_sys_process_vm_readv);
 cond_syscall(compat_sys_process_vm_writev);
+cond_syscall(sys_copy_range);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);