Message ID | 20170614170653.58083-1-kolga@netapp.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi Olga, On 06/14/2017 01:06 PM, Olga Kornievskaia wrote: > This is a proposal to allow 64bit count to copy and return as > a result of a copy_file_range. No attempt was made to share code > with the 32bit function because 32bit interface should probably > get depreciated. > > Why use 64bit? Current uses of 32bit are by clone_file_range() > which could use 64bit count and NFS copy_file_range also supports > 64bit count value. > > Also with this proposal off-the-bet allow the userland to copy > between different mount points. > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> > --- > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ > include/linux/fs.h | 4 + > include/linux/syscalls.h | 3 + > include/uapi/asm-generic/unistd.h | 4 +- > kernel/sys_ni.c | 1 + > 7 files changed, 149 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl > index 448ac21..1f5f1e7 100644 > --- a/arch/x86/entry/syscalls/syscall_32.tbl > +++ b/arch/x86/entry/syscalls/syscall_32.tbl > @@ -391,3 +391,4 @@ > 382 i386 pkey_free sys_pkey_free > 383 i386 statx sys_statx > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl > +385 i386 copy_file_range64 sys_copy_file_range64 > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > index 5aef183..e658bbe 100644 > --- a/arch/x86/entry/syscalls/syscall_64.tbl > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > @@ -339,6 +339,7 @@ > 330 common pkey_alloc sys_pkey_alloc > 331 common pkey_free sys_pkey_free > 332 common statx sys_statx > +333 64 copy_file_range64 sys_copy_file_range64 > > # > # x32-specific system call numbers start at 512 to avoid cache impact > diff --git a/fs/read_write.c b/fs/read_write.c > index 47c1d44..e2665c1 100644 > --- a/fs/read_write.c > +++ b/fs/read_write.c > @@ -1700,6 +1700,142 @@ ssize_t 
vfs_copy_file_range(struct file *file_in, loff_t pos_in, > return ret; > } > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, > + struct file *file_out, loff_t pos_out, > + u64 len, unsigned int flags) > +{ > + struct inode *inode_in = file_inode(file_in); > + struct inode *inode_out = file_inode(file_out); > + u64 ret; > + > + if (flags != 0) > + return -EINVAL; > + > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) > + return -EISDIR; > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) > + return -EINVAL; > + > + ret = rw_verify_area(READ, file_in, &pos_in, len); > + if (unlikely(ret)) > + return ret; Doesn't rw_verify_area() only accept 32-bit ranges? You might have to use clone_verify_area() instead (maybe consider renaming that function to something generic?) Thanks, Anna > + > + ret = rw_verify_area(WRITE, file_out, &pos_out, len); > + if (unlikely(ret)) > + return ret; > + > + if (!(file_in->f_mode & FMODE_READ) || > + !(file_out->f_mode & FMODE_WRITE) || > + (file_out->f_flags & O_APPEND)) > + return -EBADF; > + > + if (len == 0) > + return 0; > + > + file_start_write(file_out); > + > + /* > + * Try cloning first, this is supported by more file systems, and > + * more efficient if both clone and copy are supported (e.g. NFS). > + */ > + if (file_in->f_op->clone_file_range) { > + ret = file_in->f_op->clone_file_range(file_in, pos_in, > + file_out, pos_out, len); > + if (ret == 0) { > + ret = len; > + goto done; > + } > + } > + > + if (file_out->f_op->copy_file_range64) { > + ret = file_out->f_op->copy_file_range64(file_in, pos_in, > + file_out, pos_out, len, flags); > + if (ret != -EOPNOTSUPP) > + goto done; > + } > + > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, > + len > MAX_RW_COUNT ? 
MAX_RW_COUNT : len, 0); > + > +done: > + if (ret > 0) { > + fsnotify_access(file_in); > + add_rchar(current, ret); > + fsnotify_modify(file_out); > + add_wchar(current, ret); > + } > + > + inc_syscr(current); > + inc_syscw(current); > + > + file_end_write(file_out); > + > + return ret; > +} > +EXPORT_SYMBOL(vfs_copy_file_range64); > + > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, > + int, fd_out, loff_t __user *, off_out, > + u64, len, unsigned int, flags) > +{ > + loff_t pos_in; > + loff_t pos_out; > + struct fd f_in; > + struct fd f_out; > + u64 ret = -EBADF; > + > + f_in = fdget(fd_in); > + if (!f_in.file) > + goto out2; > + > + f_out = fdget(fd_out); > + if (!f_out.file) > + goto out1; > + > + ret = -EFAULT; > + if (off_in) { > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) > + goto out; > + } else { > + pos_in = f_in.file->f_pos; > + } > + > + if (off_out) { > + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) > + goto out; > + } else { > + pos_out = f_out.file->f_pos; > + } > + > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, > + flags); > + if (ret > 0) { > + pos_in += ret; > + pos_out += ret; > + > + if (off_in) { > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) > + ret = -EFAULT; > + } else { > + f_in.file->f_pos = pos_in; > + } > + > + if (off_out) { > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) > + ret = -EFAULT; > + } else { > + f_out.file->f_pos = pos_out; > + } > + } > + > +out: > + fdput(f_out); > +out1: > + fdput(f_in); > +out2: > + return ret; > +} > + > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) > { > struct inode *inode = file_inode(file); > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 803e5a9..2727a98 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1690,6 +1690,8 @@ struct file_operations { > u64); > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, > u64); > + u64 
(*copy_file_range64)(struct file *, loff_t, struct file *, > + loff_t, u64, unsigned int); > }; > > struct inode_operations { > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, > loff_t len, bool *is_same); > extern int vfs_dedupe_file_range(struct file *file, > struct file_dedupe_range *same); > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, > + loff_t, u64, unsigned int); > > struct super_operations { > struct inode *(*alloc_inode)(struct super_block *sb); > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index 980c3c9..f7ea78e 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, > asmlinkage long sys_pkey_free(int pkey); > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, > unsigned mask, struct statx __user *buffer); > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, > + int fd_out, loff_t __user *off_out, > + u64 len, unsigned int flags); > > #endif > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h > index 061185a..e385a76 100644 > --- a/include/uapi/asm-generic/unistd.h > +++ b/include/uapi/asm-generic/unistd.h > @@ -731,9 +731,11 @@ > __SYSCALL(__NR_pkey_free, sys_pkey_free) > #define __NR_statx 291 > __SYSCALL(__NR_statx, sys_statx) > +#define __NR_copy_file_range64 292 > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) > > #undef __NR_syscalls > -#define __NR_syscalls 292 > +#define __NR_syscalls 293 > > /* > * All syscalls below here should go away really, > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c > index 8acef85..8e0cfd4 100644 > --- a/kernel/sys_ni.c > +++ b/kernel/sys_ni.c > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) > cond_syscall(sys_capget); > cond_syscall(sys_capset); > cond_syscall(sys_copy_file_range); > 
+cond_syscall(sys_copy_file_range64); > > /* arch-specific weak syscall entries */ > cond_syscall(sys_pciconfig_read); >
On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote: > This is a proposal to allow 64bit count to copy and return as > a result of a copy_file_range. No attempt was made to share code > with the 32bit function because 32bit interface should probably > get depreciated. > > Why use 64bit? Current uses of 32bit are by clone_file_range() > which could use 64bit count and NFS copy_file_range also supports > 64bit count value. Please provide a very good use case that actually matters to a large portion of the Linux users. > Also with this proposal off-the-bet allow the userland to copy > between different mount points. Totally different issue that we've discussed N times. Trying to sneak it in behind people's backs isn't going to improve the situation in any way.
On Thu, Jun 15, 2017 at 4:15 AM, Christoph Hellwig <hch@infradead.org> wrote: > On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote: >> This is a proposal to allow 64bit count to copy and return as >> a result of a copy_file_range. No attempt was made to share code >> with the 32bit function because 32bit interface should probably >> get depreciated. >> >> Why use 64bit? Current uses of 32bit are by clone_file_range() >> which could use 64bit count and NFS copy_file_range also supports >> 64bit count value. > > Please provide a very good use case that actually matters to a large > portion of the Linux users. Reason: it will not hurt any user but it will help for server-to-server NFS copy because for each invocation of copy_file_range() it requires that the client sends COPY_NOTIFY, then COPY, and then a copy triggers an establishment of clientid/session before the reading of the source file can happen. All of that could be saved by having a 64bit value. >> Also with this proposal off-the-bet allow the userland to copy >> between different mount points. > > Totally different issue that we've discussed N times. Trying to sneak > it in behind peoples backs isn't going to improve the situation in any way. I would think that sneaking would have been to remove the check and not mention it in the commit description. I have explicitly brought the topic up in the commit description to bring attention to the issue as a way to restart the conversation about it. I have asked several times for the reason why the check cannot be removed. I'm asking again: can you please explain why. > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
> On Jun 15, 2017, at 7:07 AM, Olga Kornievskaia <aglo@umich.edu> wrote: > > On Thu, Jun 15, 2017 at 4:15 AM, Christoph Hellwig <hch@infradead.org> wrote: >> On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote: >>> This is a proposal to allow 64bit count to copy and return as >>> a result of a copy_file_range. No attempt was made to share code >>> with the 32bit function because 32bit interface should probably >>> get depreciated. >>> >>> Why use 64bit? Current uses of 32bit are by clone_file_range() >>> which could use 64bit count and NFS copy_file_range also supports >>> 64bit count value. >> >> Please provide a very good use case that actually matters to a large >> portion of the Linux users. > > Reason: it will not hurt any user but it will help for > server-to-server NFS copy because for each invocation of of > copy_file_range() it requires that the client sends COPY_NOTIFY, then > COPY and then a copy triggers a establishment of clientid/session > before the reading of the source file can happen. All that could be > saved by have a 64bit value. What is the overhead of sending a couple of RPCs vs. copying 4GB of data on the server? > Current copy_file_range would work for any file size, it would just > need to be called in a loop by the application. With the given > proposal, there wouldn't be a need for the application to loop due to > the API size limitation. It might still loop if a partial copy was > done. Given that there always needs to be a loop to handle partial completion, adding the 64-bit syscall doesn't really simplify the application at all, but adds duplicate code to the kernel for little benefit. Cheers, Andreas
On Thu, Jun 15, 2017 at 2:35 PM, Andreas Dilger <adilger@dilger.ca> wrote: > >> On Jun 15, 2017, at 7:07 AM, Olga Kornievskaia <aglo@umich.edu> wrote: >> >> On Thu, Jun 15, 2017 at 4:15 AM, Christoph Hellwig <hch@infradead.org> wrote: >>> On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote: >>>> This is a proposal to allow 64bit count to copy and return as >>>> a result of a copy_file_range. No attempt was made to share code >>>> with the 32bit function because 32bit interface should probably >>>> get depreciated. >>>> >>>> Why use 64bit? Current uses of 32bit are by clone_file_range() >>>> which could use 64bit count and NFS copy_file_range also supports >>>> 64bit count value. >>> >>> Please provide a very good use case that actually matters to a large >>> portion of the Linux users. >> >> Reason: it will not hurt any user but it will help for >> server-to-server NFS copy because for each invocation of of >> copy_file_range() it requires that the client sends COPY_NOTIFY, then >> COPY and then a copy triggers a establishment of clientid/session >> before the reading of the source file can happen. All that could be >> saved by have a 64bit value. > > What is the overhead of sending a couple of RPCs vs. copying 4GB of > data on the server? We get about 11% improvement having an interface that removes the need for multiple calls. That's testing 8gb copy that normally would have needed 2 copies. >> Current copy_file_range would work for any file size, it would just >> need to be called in a loop by the application. With the given >> proposal, there wouldn't be a need for the application to loop due to >> the API size limitation. It might still loop if a partial copy was >> done. > > > Given that there always needs to be a loop to handle partial completion, > adding the 64-bit syscall doesn't really simplify the application at > all, but adds duplicate code to the kernel for little benefit. 
If the 32bit version were deprecated, then there wouldn't be duplicate code.
On Thu, Jun 15, 2017 at 12:35:29PM -0600, Andreas Dilger wrote: > > > On Jun 15, 2017, at 7:07 AM, Olga Kornievskaia <aglo@umich.edu> wrote: > > Reason: it will not hurt any user but it will help for > > server-to-server NFS copy because for each invocation of of > > copy_file_range() it requires that the client sends COPY_NOTIFY, then > > COPY and then a copy triggers a establishment of clientid/session > > before the reading of the source file can happen. All that could be > > saved by have a 64bit value. > > What is the overhead of sending a couple of RPCs vs. copying 4GB of > data on the server? It makes a difference in the clone case, doesn't it? (Though I'd think best there would be a special "copy to end of file" value.) --b.
On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote: > It makes a difference in the clone case, doesn't it? (Though I'd think > best there would be a special "copy to end of file" value.) Clone uses a 0 length as "whole" file. Both for the NFS operation and the Linux ioctl.
> On Jun 17, 2017, at 6:03 AM, Christoph Hellwig <hch@infradead.org> wrote: > > On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote: >> It makes a difference in the clone case, doesn't it? (Though I'd think >> best there would be a special "copy to end of file" value.) > > Clones uses a 0 length as "whole" file. Both for the NFS operation > and the Linux ioctl. Does that actually work? There is a check in vfs_copy_file_range(): if (len == 0) return 0;
On Sat, Jun 17, 2017 at 08:42:54AM -0400, Olga Kornievskaia wrote: > > > On Jun 17, 2017, at 6:03 AM, Christoph Hellwig <hch@infradead.org> wrote: > > > > On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote: > >> It makes a difference in the clone case, doesn't it? (Though I'd think > >> best there would be a special "copy to end of file" value.) > > > > Clones uses a 0 length as "whole" file. Both for the NFS operation > > and the Linux ioctl. > > Does that actually work? There is check vfs_copy_file_range() Please re-read the above sentence. I'm talking about NFS clone and IOC_CLONE_RANGE, not copy_file_range. copy_file_range had to follow the NFS semantics that don't have the special 0 case.
On Sat, Jun 17, 2017 at 08:09:38AM -0700, Christoph Hellwig wrote: > On Sat, Jun 17, 2017 at 08:42:54AM -0400, Olga Kornievskaia wrote: > > > > > On Jun 17, 2017, at 6:03 AM, Christoph Hellwig <hch@infradead.org> wrote: > > > > > > On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote: > > >> It makes a difference in the clone case, doesn't it? (Though I'd think > > >> best there would be a special "copy to end of file" value.) > > > > > > Clones uses a 0 length as "whole" file. Both for the NFS operation > > > and the Linux ioctl. > > > > Does that actually work? There is check vfs_copy_file_range() > > Please re-read the above sentence. I'm talking about NFS clone and > IOC_CLONE_RANGE, not copy_file_range. copy_file_range had to follow > the NFS semantics that don't have the special 0 case. Nope, COPY has it: https://tools.ietf.org/html/rfc7862#section-15.2.3 A count of 0 (zero) requests that all bytes from ca_src_offset through EOF be copied to the destination. I think leaving it out of copy_file_range was just an oversight. --b.
On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote: > This is a proposal to allow 64bit count to copy and return as > a result of a copy_file_range. No attempt was made to share code > with the 32bit function because 32bit interface should probably > get depreciated. > > Why use 64bit? Current uses of 32bit are by clone_file_range() > which could use 64bit count and NFS copy_file_range also supports > 64bit count value. > > Also with this proposal off-the-bet allow the userland to copy > between different mount points. > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> > --- > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ > include/linux/fs.h | 4 + > include/linux/syscalls.h | 3 + > include/uapi/asm-generic/unistd.h | 4 +- > kernel/sys_ni.c | 1 + > 7 files changed, 149 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl > index 448ac21..1f5f1e7 100644 > --- a/arch/x86/entry/syscalls/syscall_32.tbl > +++ b/arch/x86/entry/syscalls/syscall_32.tbl > @@ -391,3 +391,4 @@ > 382 i386 pkey_free sys_pkey_free > 383 i386 statx sys_statx > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl > +385 i386 copy_file_range64 sys_copy_file_range64 > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > index 5aef183..e658bbe 100644 > --- a/arch/x86/entry/syscalls/syscall_64.tbl > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > @@ -339,6 +339,7 @@ > 330 common pkey_alloc sys_pkey_alloc > 331 common pkey_free sys_pkey_free > 332 common statx sys_statx > +333 64 copy_file_range64 sys_copy_file_range64 > > # > # x32-specific system call numbers start at 512 to avoid cache impact > diff --git a/fs/read_write.c b/fs/read_write.c > index 47c1d44..e2665c1 100644 > --- a/fs/read_write.c > +++ b/fs/read_write.c > @@ -1700,6 +1700,142 @@ ssize_t 
vfs_copy_file_range(struct file *file_in, loff_t pos_in, > return ret; > } > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, > + struct file *file_out, loff_t pos_out, > + u64 len, unsigned int flags) > +{ > + struct inode *inode_in = file_inode(file_in); > + struct inode *inode_out = file_inode(file_out); > + u64 ret; > + > + if (flags != 0) > + return -EINVAL; > + > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) > + return -EISDIR; > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) > + return -EINVAL; > + > + ret = rw_verify_area(READ, file_in, &pos_in, len); > + if (unlikely(ret)) > + return ret; > + > + ret = rw_verify_area(WRITE, file_out, &pos_out, len); > + if (unlikely(ret)) > + return ret; > + > + if (!(file_in->f_mode & FMODE_READ) || > + !(file_out->f_mode & FMODE_WRITE) || > + (file_out->f_flags & O_APPEND)) > + return -EBADF; > + > + if (len == 0) > + return 0; > + > + file_start_write(file_out); > + > + /* > + * Try cloning first, this is supported by more file systems, and > + * more efficient if both clone and copy are supported (e.g. NFS). > + */ > + if (file_in->f_op->clone_file_range) { > + ret = file_in->f_op->clone_file_range(file_in, pos_in, > + file_out, pos_out, len); > + if (ret == 0) { > + ret = len; > + goto done; > + } > + } > + > + if (file_out->f_op->copy_file_range64) { > + ret = file_out->f_op->copy_file_range64(file_in, pos_in, > + file_out, pos_out, len, flags); > + if (ret != -EOPNOTSUPP) > + goto done; > + } > + > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, > + len > MAX_RW_COUNT ? 
MAX_RW_COUNT : len, 0); > + > +done: > + if (ret > 0) { > + fsnotify_access(file_in); > + add_rchar(current, ret); > + fsnotify_modify(file_out); > + add_wchar(current, ret); > + } > + > + inc_syscr(current); > + inc_syscw(current); > + > + file_end_write(file_out); > + > + return ret; > +} > +EXPORT_SYMBOL(vfs_copy_file_range64); > + > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, > + int, fd_out, loff_t __user *, off_out, > + u64, len, unsigned int, flags) > +{ > + loff_t pos_in; > + loff_t pos_out; > + struct fd f_in; > + struct fd f_out; > + u64 ret = -EBADF; > + > + f_in = fdget(fd_in); > + if (!f_in.file) > + goto out2; > + > + f_out = fdget(fd_out); > + if (!f_out.file) > + goto out1; > + > + ret = -EFAULT; > + if (off_in) { > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) > + goto out; > + } else { > + pos_in = f_in.file->f_pos; > + } > + > + if (off_out) { > + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) > + goto out; > + } else { > + pos_out = f_out.file->f_pos; > + } > + > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, > + flags); > + if (ret > 0) { > + pos_in += ret; > + pos_out += ret; > + > + if (off_in) { > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) > + ret = -EFAULT; > + } else { > + f_in.file->f_pos = pos_in; > + } > + > + if (off_out) { > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) > + ret = -EFAULT; > + } else { > + f_out.file->f_pos = pos_out; > + } > + } > + > +out: > + fdput(f_out); > +out1: > + fdput(f_in); > +out2: > + return ret; > +} > + > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) > { > struct inode *inode = file_inode(file); > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 803e5a9..2727a98 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1690,6 +1690,8 @@ struct file_operations { > u64); > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, > u64); > + u64 
(*copy_file_range64)(struct file *, loff_t, struct file *, > + loff_t, u64, unsigned int); > }; > > struct inode_operations { > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, > loff_t len, bool *is_same); > extern int vfs_dedupe_file_range(struct file *file, > struct file_dedupe_range *same); > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, > + loff_t, u64, unsigned int); > > struct super_operations { > struct inode *(*alloc_inode)(struct super_block *sb); > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index 980c3c9..f7ea78e 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, > asmlinkage long sys_pkey_free(int pkey); > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, > unsigned mask, struct statx __user *buffer); > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, > + int fd_out, loff_t __user *off_out, > + u64 len, unsigned int flags); > > #endif > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h > index 061185a..e385a76 100644 > --- a/include/uapi/asm-generic/unistd.h > +++ b/include/uapi/asm-generic/unistd.h > @@ -731,9 +731,11 @@ > __SYSCALL(__NR_pkey_free, sys_pkey_free) > #define __NR_statx 291 > __SYSCALL(__NR_statx, sys_statx) > +#define __NR_copy_file_range64 292 > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) > > #undef __NR_syscalls > -#define __NR_syscalls 292 > +#define __NR_syscalls 293 > > /* > * All syscalls below here should go away really, > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c > index 8acef85..8e0cfd4 100644 > --- a/kernel/sys_ni.c > +++ b/kernel/sys_ni.c > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) > cond_syscall(sys_capget); > cond_syscall(sys_capset); > cond_syscall(sys_copy_file_range); > 
+cond_syscall(sys_copy_file_range64); > > /* arch-specific weak syscall entries */ > cond_syscall(sys_pciconfig_read); Hi Olga, We discussed this a bit privately, but I'll note it here too. Server-to-server copy seems like a nice thing to me as well. There are several filesystems that can likely make use of that ability. I don't have a real opinion on the API change either, but you're making a very subtle change with this patch. Prior to this, the existing copy_file_range calls could count on the superblocks being the same. Now, it looks like you can pass them any old file pointer. The exsiting copy_file_range file ops need to be fixed to vet the struct files they've been handed if you want to do this. It would also be nice to see something on copy_file_range and clone_file_range in Documentation/filesystems/vfs.txt. Cheers,
On 06/24/2017 09:23 AM, Jeff Layton wrote: > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote: >> This is a proposal to allow 64bit count to copy and return as >> a result of a copy_file_range. No attempt was made to share code >> with the 32bit function because 32bit interface should probably >> get depreciated. >> >> Why use 64bit? Current uses of 32bit are by clone_file_range() >> which could use 64bit count and NFS copy_file_range also supports >> 64bit count value. >> >> Also with this proposal off-the-bet allow the userland to copy >> between different mount points. >> >> Signed-off-by: Olga Kornievskaia <kolga@netapp.com> >> --- >> arch/x86/entry/syscalls/syscall_32.tbl | 1 + >> arch/x86/entry/syscalls/syscall_64.tbl | 1 + >> fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ >> include/linux/fs.h | 4 + >> include/linux/syscalls.h | 3 + >> include/uapi/asm-generic/unistd.h | 4 +- >> kernel/sys_ni.c | 1 + >> 7 files changed, 149 insertions(+), 1 deletion(-) >> >> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl >> index 448ac21..1f5f1e7 100644 >> --- a/arch/x86/entry/syscalls/syscall_32.tbl >> +++ b/arch/x86/entry/syscalls/syscall_32.tbl >> @@ -391,3 +391,4 @@ >> 382 i386 pkey_free sys_pkey_free >> 383 i386 statx sys_statx >> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl >> +385 i386 copy_file_range64 sys_copy_file_range64 >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl >> index 5aef183..e658bbe 100644 >> --- a/arch/x86/entry/syscalls/syscall_64.tbl >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl >> @@ -339,6 +339,7 @@ >> 330 common pkey_alloc sys_pkey_alloc >> 331 common pkey_free sys_pkey_free >> 332 common statx sys_statx >> +333 64 copy_file_range64 sys_copy_file_range64 >> >> # >> # x32-specific system call numbers start at 512 to avoid cache impact >> diff --git a/fs/read_write.c b/fs/read_write.c >> index 47c1d44..e2665c1 100644 >> --- 
a/fs/read_write.c >> +++ b/fs/read_write.c >> @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, >> return ret; >> } >> >> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, >> + struct file *file_out, loff_t pos_out, >> + u64 len, unsigned int flags) >> +{ >> + struct inode *inode_in = file_inode(file_in); >> + struct inode *inode_out = file_inode(file_out); >> + u64 ret; >> + >> + if (flags != 0) >> + return -EINVAL; >> + >> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) >> + return -EISDIR; >> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) >> + return -EINVAL; >> + >> + ret = rw_verify_area(READ, file_in, &pos_in, len); >> + if (unlikely(ret)) >> + return ret; >> + >> + ret = rw_verify_area(WRITE, file_out, &pos_out, len); >> + if (unlikely(ret)) >> + return ret; >> + >> + if (!(file_in->f_mode & FMODE_READ) || >> + !(file_out->f_mode & FMODE_WRITE) || >> + (file_out->f_flags & O_APPEND)) >> + return -EBADF; >> + >> + if (len == 0) >> + return 0; >> + >> + file_start_write(file_out); >> + >> + /* >> + * Try cloning first, this is supported by more file systems, and >> + * more efficient if both clone and copy are supported (e.g. NFS). >> + */ >> + if (file_in->f_op->clone_file_range) { >> + ret = file_in->f_op->clone_file_range(file_in, pos_in, >> + file_out, pos_out, len); >> + if (ret == 0) { >> + ret = len; >> + goto done; >> + } >> + } >> + >> + if (file_out->f_op->copy_file_range64) { >> + ret = file_out->f_op->copy_file_range64(file_in, pos_in, >> + file_out, pos_out, len, flags); >> + if (ret != -EOPNOTSUPP) >> + goto done; >> + } >> + >> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, >> + len > MAX_RW_COUNT ? 
MAX_RW_COUNT : len, 0); >> + >> +done: >> + if (ret > 0) { >> + fsnotify_access(file_in); >> + add_rchar(current, ret); >> + fsnotify_modify(file_out); >> + add_wchar(current, ret); >> + } >> + >> + inc_syscr(current); >> + inc_syscw(current); >> + >> + file_end_write(file_out); >> + >> + return ret; >> +} >> +EXPORT_SYMBOL(vfs_copy_file_range64); >> + >> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, >> + int, fd_out, loff_t __user *, off_out, >> + u64, len, unsigned int, flags) >> +{ >> + loff_t pos_in; >> + loff_t pos_out; >> + struct fd f_in; >> + struct fd f_out; >> + u64 ret = -EBADF; >> + >> + f_in = fdget(fd_in); >> + if (!f_in.file) >> + goto out2; >> + >> + f_out = fdget(fd_out); >> + if (!f_out.file) >> + goto out1; >> + >> + ret = -EFAULT; >> + if (off_in) { >> + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) >> + goto out; >> + } else { >> + pos_in = f_in.file->f_pos; >> + } >> + >> + if (off_out) { >> + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) >> + goto out; >> + } else { >> + pos_out = f_out.file->f_pos; >> + } >> + >> + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, >> + flags); >> + if (ret > 0) { >> + pos_in += ret; >> + pos_out += ret; >> + >> + if (off_in) { >> + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) >> + ret = -EFAULT; >> + } else { >> + f_in.file->f_pos = pos_in; >> + } >> + >> + if (off_out) { >> + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) >> + ret = -EFAULT; >> + } else { >> + f_out.file->f_pos = pos_out; >> + } >> + } >> + >> +out: >> + fdput(f_out); >> +out1: >> + fdput(f_in); >> +out2: >> + return ret; >> +} >> + >> static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) >> { >> struct inode *inode = file_inode(file); >> diff --git a/include/linux/fs.h b/include/linux/fs.h >> index 803e5a9..2727a98 100644 >> --- a/include/linux/fs.h >> +++ b/include/linux/fs.h >> @@ -1690,6 +1690,8 @@ struct file_operations { >> u64); 
>> ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, >> u64); >> + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, >> + loff_t, u64, unsigned int); >> }; >> >> struct inode_operations { >> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, >> loff_t len, bool *is_same); >> extern int vfs_dedupe_file_range(struct file *file, >> struct file_dedupe_range *same); >> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, >> + loff_t, u64, unsigned int); >> >> struct super_operations { >> struct inode *(*alloc_inode)(struct super_block *sb); >> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h >> index 980c3c9..f7ea78e 100644 >> --- a/include/linux/syscalls.h >> +++ b/include/linux/syscalls.h >> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, >> asmlinkage long sys_pkey_free(int pkey); >> asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, >> unsigned mask, struct statx __user *buffer); >> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, >> + int fd_out, loff_t __user *off_out, >> + u64 len, unsigned int flags); >> >> #endif >> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h >> index 061185a..e385a76 100644 >> --- a/include/uapi/asm-generic/unistd.h >> +++ b/include/uapi/asm-generic/unistd.h >> @@ -731,9 +731,11 @@ >> __SYSCALL(__NR_pkey_free, sys_pkey_free) >> #define __NR_statx 291 >> __SYSCALL(__NR_statx, sys_statx) >> +#define __NR_copy_file_range64 292 >> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) >> >> #undef __NR_syscalls >> -#define __NR_syscalls 292 >> +#define __NR_syscalls 293 >> >> /* >> * All syscalls below here should go away really, >> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c >> index 8acef85..8e0cfd4 100644 >> --- a/kernel/sys_ni.c >> +++ b/kernel/sys_ni.c >> @@ -178,6 +178,7 @@ asmlinkage long 
sys_ni_syscall(void) >> cond_syscall(sys_capget); >> cond_syscall(sys_capset); >> cond_syscall(sys_copy_file_range); >> +cond_syscall(sys_copy_file_range64); >> >> /* arch-specific weak syscall entries */ >> cond_syscall(sys_pciconfig_read); > > Hi Olga, > > We discussed this a bit privately, but I'll note it here too. > > Server-to-server copy seems like a nice thing to me as well. There are > several filesystems that can likely make use of that ability. > > I don't have a real opinion on the API change either, but you're making > a very subtle change with this patch. Prior to this, the existing > copy_file_range calls could count on the superblocks being the same. > Now, it looks like you can pass them any old file pointer. What if we add a new file op for xdev copy that gets called when superblocks are different, but filesystem type is the same? We could keep the current copy_file_range fop to fall back on if superblocks are the same. Thoughts? Anna > > The existing copy_file_range file ops need to be fixed to vet the struct > files they've been handed if you want to do this. > > It would also be nice to see something on copy_file_range and > clone_file_range in Documentation/filesystems/vfs.txt. > > Cheers, >
On Mon, 2017-06-26 at 09:21 -0400, Anna Schumaker wrote: > > On 06/24/2017 09:23 AM, Jeff Layton wrote: > > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote: > > > This is a proposal to allow 64bit count to copy and return as > > > a result of a copy_file_range. No attempt was made to share code > > > with the 32bit function because 32bit interface should probably > > > get depreciated. > > > > > > Why use 64bit? Current uses of 32bit are by clone_file_range() > > > which could use 64bit count and NFS copy_file_range also supports > > > 64bit count value. > > > > > > Also with this proposal off-the-bet allow the userland to copy > > > between different mount points. > > > > > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> > > > --- > > > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > > > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > > > fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ > > > include/linux/fs.h | 4 + > > > include/linux/syscalls.h | 3 + > > > include/uapi/asm-generic/unistd.h | 4 +- > > > kernel/sys_ni.c | 1 + > > > 7 files changed, 149 insertions(+), 1 deletion(-) > > > > > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl > > > index 448ac21..1f5f1e7 100644 > > > --- a/arch/x86/entry/syscalls/syscall_32.tbl > > > +++ b/arch/x86/entry/syscalls/syscall_32.tbl > > > @@ -391,3 +391,4 @@ > > > 382 i386 pkey_free sys_pkey_free > > > 383 i386 statx sys_statx > > > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl > > > +385 i386 copy_file_range64 sys_copy_file_range64 > > > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > > > index 5aef183..e658bbe 100644 > > > --- a/arch/x86/entry/syscalls/syscall_64.tbl > > > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > > > @@ -339,6 +339,7 @@ > > > 330 common pkey_alloc sys_pkey_alloc > > > 331 common pkey_free sys_pkey_free > > > 332 common statx sys_statx > > > +333 64 copy_file_range64 
sys_copy_file_range64 > > > > > > # > > > # x32-specific system call numbers start at 512 to avoid cache impact > > > diff --git a/fs/read_write.c b/fs/read_write.c > > > index 47c1d44..e2665c1 100644 > > > --- a/fs/read_write.c > > > +++ b/fs/read_write.c > > > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, > > > return ret; > > > } > > > > > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, > > > + struct file *file_out, loff_t pos_out, > > > + u64 len, unsigned int flags) > > > +{ > > > + struct inode *inode_in = file_inode(file_in); > > > + struct inode *inode_out = file_inode(file_out); > > > + u64 ret; > > > + > > > + if (flags != 0) > > > + return -EINVAL; > > > + > > > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) > > > + return -EISDIR; > > > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) > > > + return -EINVAL; > > > + > > > + ret = rw_verify_area(READ, file_in, &pos_in, len); > > > + if (unlikely(ret)) > > > + return ret; > > > + > > > + ret = rw_verify_area(WRITE, file_out, &pos_out, len); > > > + if (unlikely(ret)) > > > + return ret; > > > + > > > + if (!(file_in->f_mode & FMODE_READ) || > > > + !(file_out->f_mode & FMODE_WRITE) || > > > + (file_out->f_flags & O_APPEND)) > > > + return -EBADF; > > > + > > > + if (len == 0) > > > + return 0; > > > + > > > + file_start_write(file_out); > > > + > > > + /* > > > + * Try cloning first, this is supported by more file systems, and > > > + * more efficient if both clone and copy are supported (e.g. NFS). 
> > > + */ > > > + if (file_in->f_op->clone_file_range) { > > > + ret = file_in->f_op->clone_file_range(file_in, pos_in, > > > + file_out, pos_out, len); > > > + if (ret == 0) { > > > + ret = len; > > > + goto done; > > > + } > > > + } > > > + > > > + if (file_out->f_op->copy_file_range64) { > > > + ret = file_out->f_op->copy_file_range64(file_in, pos_in, > > > + file_out, pos_out, len, flags); > > > + if (ret != -EOPNOTSUPP) > > > + goto done; > > > + } > > > + > > > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, > > > + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); > > > + > > > +done: > > > + if (ret > 0) { > > > + fsnotify_access(file_in); > > > + add_rchar(current, ret); > > > + fsnotify_modify(file_out); > > > + add_wchar(current, ret); > > > + } > > > + > > > + inc_syscr(current); > > > + inc_syscw(current); > > > + > > > + file_end_write(file_out); > > > + > > > + return ret; > > > +} > > > +EXPORT_SYMBOL(vfs_copy_file_range64); > > > + > > > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, > > > + int, fd_out, loff_t __user *, off_out, > > > + u64, len, unsigned int, flags) > > > +{ > > > + loff_t pos_in; > > > + loff_t pos_out; > > > + struct fd f_in; > > > + struct fd f_out; > > > + u64 ret = -EBADF; > > > + > > > + f_in = fdget(fd_in); > > > + if (!f_in.file) > > > + goto out2; > > > + > > > + f_out = fdget(fd_out); > > > + if (!f_out.file) > > > + goto out1; > > > + > > > + ret = -EFAULT; > > > + if (off_in) { > > > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) > > > + goto out; > > > + } else { > > > + pos_in = f_in.file->f_pos; > > > + } > > > + > > > + if (off_out) { > > > + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) > > > + goto out; > > > + } else { > > > + pos_out = f_out.file->f_pos; > > > + } > > > + > > > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, > > > + flags); > > > + if (ret > 0) { > > > + pos_in += ret; > > > + pos_out += ret; > > > + > > > + if 
(off_in) { > > > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) > > > + ret = -EFAULT; > > > + } else { > > > + f_in.file->f_pos = pos_in; > > > + } > > > + > > > + if (off_out) { > > > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) > > > + ret = -EFAULT; > > > + } else { > > > + f_out.file->f_pos = pos_out; > > > + } > > > + } > > > + > > > +out: > > > + fdput(f_out); > > > +out1: > > > + fdput(f_in); > > > +out2: > > > + return ret; > > > +} > > > + > > > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) > > > { > > > struct inode *inode = file_inode(file); > > > diff --git a/include/linux/fs.h b/include/linux/fs.h > > > index 803e5a9..2727a98 100644 > > > --- a/include/linux/fs.h > > > +++ b/include/linux/fs.h > > > @@ -1690,6 +1690,8 @@ struct file_operations { > > > u64); > > > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, > > > u64); > > > + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, > > > + loff_t, u64, unsigned int); > > > }; > > > > > > struct inode_operations { > > > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, > > > loff_t len, bool *is_same); > > > extern int vfs_dedupe_file_range(struct file *file, > > > struct file_dedupe_range *same); > > > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, > > > + loff_t, u64, unsigned int); > > > > > > struct super_operations { > > > struct inode *(*alloc_inode)(struct super_block *sb); > > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > > > index 980c3c9..f7ea78e 100644 > > > --- a/include/linux/syscalls.h > > > +++ b/include/linux/syscalls.h > > > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, > > > asmlinkage long sys_pkey_free(int pkey); > > > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, > > > unsigned mask, struct statx __user *buffer); > > > +asmlinkage long 
sys_copy_file_range64(int fd_in, loff_t __user *off_in, > > > + int fd_out, loff_t __user *off_out, > > > + u64 len, unsigned int flags); > > > > > > #endif > > > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h > > > index 061185a..e385a76 100644 > > > --- a/include/uapi/asm-generic/unistd.h > > > +++ b/include/uapi/asm-generic/unistd.h > > > @@ -731,9 +731,11 @@ > > > __SYSCALL(__NR_pkey_free, sys_pkey_free) > > > #define __NR_statx 291 > > > __SYSCALL(__NR_statx, sys_statx) > > > +#define __NR_copy_file_range64 292 > > > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) > > > > > > #undef __NR_syscalls > > > -#define __NR_syscalls 292 > > > +#define __NR_syscalls 293 > > > > > > /* > > > * All syscalls below here should go away really, > > > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c > > > index 8acef85..8e0cfd4 100644 > > > --- a/kernel/sys_ni.c > > > +++ b/kernel/sys_ni.c > > > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) > > > cond_syscall(sys_capget); > > > cond_syscall(sys_capset); > > > cond_syscall(sys_copy_file_range); > > > +cond_syscall(sys_copy_file_range64); > > > > > > /* arch-specific weak syscall entries */ > > > cond_syscall(sys_pciconfig_read); > > > > Hi Olga, > > > > We discussed this a bit privately, but I'll note it here too. > > > > Server-to-server copy seems like a nice thing to me as well. There are > > several filesystems that can likely make use of that ability. > > > > I don't have a real opinion on the API change either, but you're making > > a very subtle change with this patch. Prior to this, the existing > > copy_file_range calls could count on the superblocks being the same. > > Now, it looks like you can pass them any old file pointer. > > What if we add a new file op for xdev copy that gets called when > superblocks are different, but filesystem type is the same? We could > keep the current copy_file_range fop to fall back on if superblocks > are the same. 
I don't think that would really help much. There are only two filesystems with copy_file_range operations -- NFSv4 and CIFS. I don't think we really need another special-case op for this. What I would probably suggest is to just push the check for the same superblock down into the copy_file_range ops in one patch (with no functional change). Then, you could go and lift that restriction in the NFSv4 operation without regressing CIFS. The CIFS folks could eventually do the same for theirs.
On Mon, Jun 26, 2017 at 9:46 AM, Jeff Layton <jlayton@redhat.com> wrote: > On Mon, 2017-06-26 at 09:21 -0400, Anna Schumaker wrote: >> >> On 06/24/2017 09:23 AM, Jeff Layton wrote: >> > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote: >> > > This is a proposal to allow 64bit count to copy and return as >> > > a result of a copy_file_range. No attempt was made to share code >> > > with the 32bit function because 32bit interface should probably >> > > get depreciated. >> > > >> > > Why use 64bit? Current uses of 32bit are by clone_file_range() >> > > which could use 64bit count and NFS copy_file_range also supports >> > > 64bit count value. >> > > >> > > Also with this proposal off-the-bet allow the userland to copy >> > > between different mount points. >> > > >> > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> >> > > --- >> > > arch/x86/entry/syscalls/syscall_32.tbl | 1 + >> > > arch/x86/entry/syscalls/syscall_64.tbl | 1 + >> > > fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ >> > > include/linux/fs.h | 4 + >> > > include/linux/syscalls.h | 3 + >> > > include/uapi/asm-generic/unistd.h | 4 +- >> > > kernel/sys_ni.c | 1 + >> > > 7 files changed, 149 insertions(+), 1 deletion(-) >> > > >> > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl >> > > index 448ac21..1f5f1e7 100644 >> > > --- a/arch/x86/entry/syscalls/syscall_32.tbl >> > > +++ b/arch/x86/entry/syscalls/syscall_32.tbl >> > > @@ -391,3 +391,4 @@ >> > > 382 i386 pkey_free sys_pkey_free >> > > 383 i386 statx sys_statx >> > > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl >> > > +385 i386 copy_file_range64 sys_copy_file_range64 >> > > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl >> > > index 5aef183..e658bbe 100644 >> > > --- a/arch/x86/entry/syscalls/syscall_64.tbl >> > > +++ b/arch/x86/entry/syscalls/syscall_64.tbl >> > > @@ -339,6 +339,7 @@ >> > > 330 common pkey_alloc 
sys_pkey_alloc >> > > 331 common pkey_free sys_pkey_free >> > > 332 common statx sys_statx >> > > +333 64 copy_file_range64 sys_copy_file_range64 >> > > >> > > # >> > > # x32-specific system call numbers start at 512 to avoid cache impact >> > > diff --git a/fs/read_write.c b/fs/read_write.c >> > > index 47c1d44..e2665c1 100644 >> > > --- a/fs/read_write.c >> > > +++ b/fs/read_write.c >> > > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, >> > > return ret; >> > > } >> > > >> > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, >> > > + struct file *file_out, loff_t pos_out, >> > > + u64 len, unsigned int flags) >> > > +{ >> > > + struct inode *inode_in = file_inode(file_in); >> > > + struct inode *inode_out = file_inode(file_out); >> > > + u64 ret; >> > > + >> > > + if (flags != 0) >> > > + return -EINVAL; >> > > + >> > > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) >> > > + return -EISDIR; >> > > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) >> > > + return -EINVAL; >> > > + >> > > + ret = rw_verify_area(READ, file_in, &pos_in, len); >> > > + if (unlikely(ret)) >> > > + return ret; >> > > + >> > > + ret = rw_verify_area(WRITE, file_out, &pos_out, len); >> > > + if (unlikely(ret)) >> > > + return ret; >> > > + >> > > + if (!(file_in->f_mode & FMODE_READ) || >> > > + !(file_out->f_mode & FMODE_WRITE) || >> > > + (file_out->f_flags & O_APPEND)) >> > > + return -EBADF; >> > > + >> > > + if (len == 0) >> > > + return 0; >> > > + >> > > + file_start_write(file_out); >> > > + >> > > + /* >> > > + * Try cloning first, this is supported by more file systems, and >> > > + * more efficient if both clone and copy are supported (e.g. NFS). 
>> > > + */ >> > > + if (file_in->f_op->clone_file_range) { >> > > + ret = file_in->f_op->clone_file_range(file_in, pos_in, >> > > + file_out, pos_out, len); >> > > + if (ret == 0) { >> > > + ret = len; >> > > + goto done; >> > > + } >> > > + } >> > > + >> > > + if (file_out->f_op->copy_file_range64) { >> > > + ret = file_out->f_op->copy_file_range64(file_in, pos_in, >> > > + file_out, pos_out, len, flags); >> > > + if (ret != -EOPNOTSUPP) >> > > + goto done; >> > > + } >> > > + >> > > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, >> > > + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); >> > > + >> > > +done: >> > > + if (ret > 0) { >> > > + fsnotify_access(file_in); >> > > + add_rchar(current, ret); >> > > + fsnotify_modify(file_out); >> > > + add_wchar(current, ret); >> > > + } >> > > + >> > > + inc_syscr(current); >> > > + inc_syscw(current); >> > > + >> > > + file_end_write(file_out); >> > > + >> > > + return ret; >> > > +} >> > > +EXPORT_SYMBOL(vfs_copy_file_range64); >> > > + >> > > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, >> > > + int, fd_out, loff_t __user *, off_out, >> > > + u64, len, unsigned int, flags) >> > > +{ >> > > + loff_t pos_in; >> > > + loff_t pos_out; >> > > + struct fd f_in; >> > > + struct fd f_out; >> > > + u64 ret = -EBADF; >> > > + >> > > + f_in = fdget(fd_in); >> > > + if (!f_in.file) >> > > + goto out2; >> > > + >> > > + f_out = fdget(fd_out); >> > > + if (!f_out.file) >> > > + goto out1; >> > > + >> > > + ret = -EFAULT; >> > > + if (off_in) { >> > > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) >> > > + goto out; >> > > + } else { >> > > + pos_in = f_in.file->f_pos; >> > > + } >> > > + >> > > + if (off_out) { >> > > + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) >> > > + goto out; >> > > + } else { >> > > + pos_out = f_out.file->f_pos; >> > > + } >> > > + >> > > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, >> > > + flags); >> > > + if (ret > 
0) { >> > > + pos_in += ret; >> > > + pos_out += ret; >> > > + >> > > + if (off_in) { >> > > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) >> > > + ret = -EFAULT; >> > > + } else { >> > > + f_in.file->f_pos = pos_in; >> > > + } >> > > + >> > > + if (off_out) { >> > > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) >> > > + ret = -EFAULT; >> > > + } else { >> > > + f_out.file->f_pos = pos_out; >> > > + } >> > > + } >> > > + >> > > +out: >> > > + fdput(f_out); >> > > +out1: >> > > + fdput(f_in); >> > > +out2: >> > > + return ret; >> > > +} >> > > + >> > > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) >> > > { >> > > struct inode *inode = file_inode(file); >> > > diff --git a/include/linux/fs.h b/include/linux/fs.h >> > > index 803e5a9..2727a98 100644 >> > > --- a/include/linux/fs.h >> > > +++ b/include/linux/fs.h >> > > @@ -1690,6 +1690,8 @@ struct file_operations { >> > > u64); >> > > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, >> > > u64); >> > > + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, >> > > + loff_t, u64, unsigned int); >> > > }; >> > > >> > > struct inode_operations { >> > > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, >> > > loff_t len, bool *is_same); >> > > extern int vfs_dedupe_file_range(struct file *file, >> > > struct file_dedupe_range *same); >> > > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, >> > > + loff_t, u64, unsigned int); >> > > >> > > struct super_operations { >> > > struct inode *(*alloc_inode)(struct super_block *sb); >> > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h >> > > index 980c3c9..f7ea78e 100644 >> > > --- a/include/linux/syscalls.h >> > > +++ b/include/linux/syscalls.h >> > > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, >> > > asmlinkage long sys_pkey_free(int pkey); >> > > asmlinkage long 
sys_statx(int dfd, const char __user *path, unsigned flags, >> > > unsigned mask, struct statx __user *buffer); >> > > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, >> > > + int fd_out, loff_t __user *off_out, >> > > + u64 len, unsigned int flags); >> > > >> > > #endif >> > > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h >> > > index 061185a..e385a76 100644 >> > > --- a/include/uapi/asm-generic/unistd.h >> > > +++ b/include/uapi/asm-generic/unistd.h >> > > @@ -731,9 +731,11 @@ >> > > __SYSCALL(__NR_pkey_free, sys_pkey_free) >> > > #define __NR_statx 291 >> > > __SYSCALL(__NR_statx, sys_statx) >> > > +#define __NR_copy_file_range64 292 >> > > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) >> > > >> > > #undef __NR_syscalls >> > > -#define __NR_syscalls 292 >> > > +#define __NR_syscalls 293 >> > > >> > > /* >> > > * All syscalls below here should go away really, >> > > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c >> > > index 8acef85..8e0cfd4 100644 >> > > --- a/kernel/sys_ni.c >> > > +++ b/kernel/sys_ni.c >> > > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) >> > > cond_syscall(sys_capget); >> > > cond_syscall(sys_capset); >> > > cond_syscall(sys_copy_file_range); >> > > +cond_syscall(sys_copy_file_range64); >> > > >> > > /* arch-specific weak syscall entries */ >> > > cond_syscall(sys_pciconfig_read); >> > >> > Hi Olga, >> > >> > We discussed this a bit privately, but I'll note it here too. >> > >> > Server-to-server copy seems like a nice thing to me as well. There are >> > several filesystems that can likely make use of that ability. >> > >> > I don't have a real opinion on the API change either, but you're making >> > a very subtle change with this patch. Prior to this, the existing >> > copy_file_range calls could count on the superblocks being the same. >> > Now, it looks like you can pass them any old file pointer. 
>> >> What if we add a new file op for xdev copy that gets called when >> superblocks are different, but filesystem type is the same? We could >> keep the current copy_file_range fop to fall back on if superblocks >> are the same. > > I don't think that would really help much. There are only two > filesystems with copy_file_range operations -- nfsv4 and cifs. I don't > think we really need another special case op for this. > > What I would probably suggest is to just push the check for the same > superblock down into the copy_file_range ops in one patch (with no > functional change). Then, you could go and lift that restriction in the > NFSv4 operation without regressing cifs. The CIFS folks could eventually > do the same for theirs. The CIFS folks already check whether the target and source destinations are different, and if so they return EXDEV. So I believe checking that the file system ops are the same for fd_in and fd_out would be sufficient for the current uses?
On Mon, 2017-06-26 at 10:40 -0400, Olga Kornievskaia wrote: > > On Jun 26, 2017, at 9:46 AM, Jeff Layton <jlayton@redhat.com> wrote: > > > > On Mon, 2017-06-26 at 09:21 -0400, Anna Schumaker wrote: > > > On 06/24/2017 09:23 AM, Jeff Layton wrote: > > > > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote: > > > > > This is a proposal to allow 64bit count to copy and return as > > > > > a result of a copy_file_range. No attempt was made to share code > > > > > with the 32bit function because 32bit interface should probably > > > > > get depreciated. > > > > > > > > > > Why use 64bit? Current uses of 32bit are by clone_file_range() > > > > > which could use 64bit count and NFS copy_file_range also supports > > > > > 64bit count value. > > > > > > > > > > Also with this proposal off-the-bet allow the userland to copy > > > > > between different mount points. > > > > > > > > > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> > > > > > --- > > > > > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > > > > > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > > > > > fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ > > > > > include/linux/fs.h | 4 + > > > > > include/linux/syscalls.h | 3 + > > > > > include/uapi/asm-generic/unistd.h | 4 +- > > > > > kernel/sys_ni.c | 1 + > > > > > 7 files changed, 149 insertions(+), 1 deletion(-) > > > > > > > > > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl > > > > > index 448ac21..1f5f1e7 100644 > > > > > --- a/arch/x86/entry/syscalls/syscall_32.tbl > > > > > +++ b/arch/x86/entry/syscalls/syscall_32.tbl > > > > > @@ -391,3 +391,4 @@ > > > > > 382 i386 pkey_free sys_pkey_free > > > > > 383 i386 statx sys_statx > > > > > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl > > > > > +385 i386 copy_file_range64 sys_copy_file_range64 > > > > > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > > > > > index 5aef183..e658bbe 100644 > 
> > > > --- a/arch/x86/entry/syscalls/syscall_64.tbl > > > > > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > > > > > @@ -339,6 +339,7 @@ > > > > > 330 common pkey_alloc sys_pkey_alloc > > > > > 331 common pkey_free sys_pkey_free > > > > > 332 common statx sys_statx > > > > > +333 64 copy_file_range64 sys_copy_file_range64 > > > > > > > > > > # > > > > > # x32-specific system call numbers start at 512 to avoid cache impact > > > > > diff --git a/fs/read_write.c b/fs/read_write.c > > > > > index 47c1d44..e2665c1 100644 > > > > > --- a/fs/read_write.c > > > > > +++ b/fs/read_write.c > > > > > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, > > > > > return ret; > > > > > } > > > > > > > > > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, > > > > > + struct file *file_out, loff_t pos_out, > > > > > + u64 len, unsigned int flags) > > > > > +{ > > > > > + struct inode *inode_in = file_inode(file_in); > > > > > + struct inode *inode_out = file_inode(file_out); > > > > > + u64 ret; > > > > > + > > > > > + if (flags != 0) > > > > > + return -EINVAL; > > > > > + > > > > > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) > > > > > + return -EISDIR; > > > > > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) > > > > > + return -EINVAL; > > > > > + > > > > > + ret = rw_verify_area(READ, file_in, &pos_in, len); > > > > > + if (unlikely(ret)) > > > > > + return ret; > > > > > + > > > > > + ret = rw_verify_area(WRITE, file_out, &pos_out, len); > > > > > + if (unlikely(ret)) > > > > > + return ret; > > > > > + > > > > > + if (!(file_in->f_mode & FMODE_READ) || > > > > > + !(file_out->f_mode & FMODE_WRITE) || > > > > > + (file_out->f_flags & O_APPEND)) > > > > > + return -EBADF; > > > > > + > > > > > + if (len == 0) > > > > > + return 0; > > > > > + > > > > > + file_start_write(file_out); > > > > > + > > > > > + /* > > > > > + * Try cloning first, this is supported by more file systems, and > > 
> > > + * more efficient if both clone and copy are supported (e.g. NFS). > > > > > + */ > > > > > + if (file_in->f_op->clone_file_range) { > > > > > + ret = file_in->f_op->clone_file_range(file_in, pos_in, > > > > > + file_out, pos_out, len); > > > > > + if (ret == 0) { > > > > > + ret = len; > > > > > + goto done; > > > > > + } > > > > > + } > > > > > + > > > > > + if (file_out->f_op->copy_file_range64) { > > > > > + ret = file_out->f_op->copy_file_range64(file_in, pos_in, > > > > > + file_out, pos_out, len, flags); > > > > > + if (ret != -EOPNOTSUPP) > > > > > + goto done; > > > > > + } > > > > > + > > > > > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, > > > > > + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); > > > > > + > > > > > +done: > > > > > + if (ret > 0) { > > > > > + fsnotify_access(file_in); > > > > > + add_rchar(current, ret); > > > > > + fsnotify_modify(file_out); > > > > > + add_wchar(current, ret); > > > > > + } > > > > > + > > > > > + inc_syscr(current); > > > > > + inc_syscw(current); > > > > > + > > > > > + file_end_write(file_out); > > > > > + > > > > > + return ret; > > > > > +} > > > > > +EXPORT_SYMBOL(vfs_copy_file_range64); > > > > > + > > > > > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, > > > > > + int, fd_out, loff_t __user *, off_out, > > > > > + u64, len, unsigned int, flags) > > > > > +{ > > > > > + loff_t pos_in; > > > > > + loff_t pos_out; > > > > > + struct fd f_in; > > > > > + struct fd f_out; > > > > > + u64 ret = -EBADF; > > > > > + > > > > > + f_in = fdget(fd_in); > > > > > + if (!f_in.file) > > > > > + goto out2; > > > > > + > > > > > + f_out = fdget(fd_out); > > > > > + if (!f_out.file) > > > > > + goto out1; > > > > > + > > > > > + ret = -EFAULT; > > > > > + if (off_in) { > > > > > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) > > > > > + goto out; > > > > > + } else { > > > > > + pos_in = f_in.file->f_pos; > > > > > + } > > > > > + > > > > > + if (off_out) { > > > > > + 
if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) > > > > > + goto out; > > > > > + } else { > > > > > + pos_out = f_out.file->f_pos; > > > > > + } > > > > > + > > > > > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, > > > > > + flags); > > > > > + if (ret > 0) { > > > > > + pos_in += ret; > > > > > + pos_out += ret; > > > > > + > > > > > + if (off_in) { > > > > > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) > > > > > + ret = -EFAULT; > > > > > + } else { > > > > > + f_in.file->f_pos = pos_in; > > > > > + } > > > > > + > > > > > + if (off_out) { > > > > > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) > > > > > + ret = -EFAULT; > > > > > + } else { > > > > > + f_out.file->f_pos = pos_out; > > > > > + } > > > > > + } > > > > > + > > > > > +out: > > > > > + fdput(f_out); > > > > > +out1: > > > > > + fdput(f_in); > > > > > +out2: > > > > > + return ret; > > > > > +} > > > > > + > > > > > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) > > > > > { > > > > > struct inode *inode = file_inode(file); > > > > > diff --git a/include/linux/fs.h b/include/linux/fs.h > > > > > index 803e5a9..2727a98 100644 > > > > > --- a/include/linux/fs.h > > > > > +++ b/include/linux/fs.h > > > > > @@ -1690,6 +1690,8 @@ struct file_operations { > > > > > u64); > > > > > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, > > > > > u64); > > > > > + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, > > > > > + loff_t, u64, unsigned int); > > > > > }; > > > > > > > > > > struct inode_operations { > > > > > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, > > > > > loff_t len, bool *is_same); > > > > > extern int vfs_dedupe_file_range(struct file *file, > > > > > struct file_dedupe_range *same); > > > > > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, > > > > > + loff_t, u64, unsigned int); > > > > > > > > > > 
struct super_operations { > > > > > struct inode *(*alloc_inode)(struct super_block *sb); > > > > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > > > > > index 980c3c9..f7ea78e 100644 > > > > > --- a/include/linux/syscalls.h > > > > > +++ b/include/linux/syscalls.h > > > > > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, > > > > > asmlinkage long sys_pkey_free(int pkey); > > > > > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, > > > > > unsigned mask, struct statx __user *buffer); > > > > > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, > > > > > + int fd_out, loff_t __user *off_out, > > > > > + u64 len, unsigned int flags); > > > > > > > > > > #endif > > > > > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h > > > > > index 061185a..e385a76 100644 > > > > > --- a/include/uapi/asm-generic/unistd.h > > > > > +++ b/include/uapi/asm-generic/unistd.h > > > > > @@ -731,9 +731,11 @@ > > > > > __SYSCALL(__NR_pkey_free, sys_pkey_free) > > > > > #define __NR_statx 291 > > > > > __SYSCALL(__NR_statx, sys_statx) > > > > > +#define __NR_copy_file_range64 292 > > > > > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) > > > > > > > > > > #undef __NR_syscalls > > > > > -#define __NR_syscalls 292 > > > > > +#define __NR_syscalls 293 > > > > > > > > > > /* > > > > > * All syscalls below here should go away really, > > > > > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c > > > > > index 8acef85..8e0cfd4 100644 > > > > > --- a/kernel/sys_ni.c > > > > > +++ b/kernel/sys_ni.c > > > > > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) > > > > > cond_syscall(sys_capget); > > > > > cond_syscall(sys_capset); > > > > > cond_syscall(sys_copy_file_range); > > > > > +cond_syscall(sys_copy_file_range64); > > > > > > > > > > /* arch-specific weak syscall entries */ > > > > > cond_syscall(sys_pciconfig_read); > > > > > > > > Hi 
Olga, > > > > > > > > We discussed this a bit privately, but I'll note it here too. > > > > > > > > Server-to-server copy seems like a nice thing to me as well. There are > > > > several filesystems that can likely make use of that ability. > > > > > > > > I don't have a real opinion on the API change either, but you're making > > > > a very subtle change with this patch. Prior to this, the existing > > > > copy_file_range calls could count on the superblocks being the same. > > > > Now, it looks like you can pass them any old file pointer. > > > > > > What if we add a new file op for xdev copy that gets called when > > > superblocks are different, but filesystem type is the same? We could > > > keep the current copy_file_range fop to fall back on if superblocks > > > are the same. > > > > I don't think that would really help much. There are only two > > filesystems with copy_file_range operations -- nfsv4 and cifs. I don't > > think we really need another special case op for this. > > > > What I would probably suggest is to just push the check for the same > > superblock down into the copy_file_range ops in one patch (with no > > functional change). Then, you could go and lift that restriction in the > > NFSv4 operation without regressing cifs. The CIFS folks could eventually > > do the same for theirs. > > CIFS folks already check if their target and source destination are different then they return with EXDEV. So I believe the check that file system ops that are the same for the fd_in and fd_out would be sufficient for the current uses? > Yes that should cover it.
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 448ac21..1f5f1e7 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -391,3 +391,4 @@ 382 i386 pkey_free sys_pkey_free 383 i386 statx sys_statx 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl +385 i386 copy_file_range64 sys_copy_file_range64 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5aef183..e658bbe 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -339,6 +339,7 @@ 330 common pkey_alloc sys_pkey_alloc 331 common pkey_free sys_pkey_free 332 common statx sys_statx +333 64 copy_file_range64 sys_copy_file_range64 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/read_write.c b/fs/read_write.c index 47c1d44..e2665c1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, return ret; } +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 len, unsigned int flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + u64 ret; + + if (flags != 0) + return -EINVAL; + + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + ret = rw_verify_area(READ, file_in, &pos_in, len); + if (unlikely(ret)) + return ret; + + ret = rw_verify_area(WRITE, file_out, &pos_out, len); + if (unlikely(ret)) + return ret; + + if (!(file_in->f_mode & FMODE_READ) || + !(file_out->f_mode & FMODE_WRITE) || + (file_out->f_flags & O_APPEND)) + return -EBADF; + + if (len == 0) + return 0; + + file_start_write(file_out); + + /* + * Try cloning first, this is supported by more file systems, and + * more efficient if 
both clone and copy are supported (e.g. NFS). + */ + if (file_in->f_op->clone_file_range) { + ret = file_in->f_op->clone_file_range(file_in, pos_in, + file_out, pos_out, len); + if (ret == 0) { + ret = len; + goto done; + } + } + + if (file_out->f_op->copy_file_range64) { + ret = file_out->f_op->copy_file_range64(file_in, pos_in, + file_out, pos_out, len, flags); + if (ret != -EOPNOTSUPP) + goto done; + } + + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); + +done: + if (ret > 0) { + fsnotify_access(file_in); + add_rchar(current, ret); + fsnotify_modify(file_out); + add_wchar(current, ret); + } + + inc_syscr(current); + inc_syscw(current); + + file_end_write(file_out); + + return ret; +} +EXPORT_SYMBOL(vfs_copy_file_range64); + +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, + int, fd_out, loff_t __user *, off_out, + u64, len, unsigned int, flags) +{ + loff_t pos_in; + loff_t pos_out; + struct fd f_in; + struct fd f_out; + u64 ret = -EBADF; + + f_in = fdget(fd_in); + if (!f_in.file) + goto out2; + + f_out = fdget(fd_out); + if (!f_out.file) + goto out1; + + ret = -EFAULT; + if (off_in) { + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) + goto out; + } else { + pos_in = f_in.file->f_pos; + } + + if (off_out) { + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) + goto out; + } else { + pos_out = f_out.file->f_pos; + } + + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, + flags); + if (ret > 0) { + pos_in += ret; + pos_out += ret; + + if (off_in) { + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) + ret = -EFAULT; + } else { + f_in.file->f_pos = pos_in; + } + + if (off_out) { + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) + ret = -EFAULT; + } else { + f_out.file->f_pos = pos_out; + } + } + +out: + fdput(f_out); +out1: + fdput(f_in); +out2: + return ret; +} + static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) 
{ struct inode *inode = file_inode(file); diff --git a/include/linux/fs.h b/include/linux/fs.h index 803e5a9..2727a98 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1690,6 +1690,8 @@ struct file_operations { u64); ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, u64); + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, + loff_t, u64, unsigned int); }; struct inode_operations { @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, loff_t len, bool *is_same); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, + loff_t, u64, unsigned int); struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 980c3c9..f7ea78e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, asmlinkage long sys_pkey_free(int pkey); asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx __user *buffer); +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, + int fd_out, loff_t __user *off_out, + u64 len, unsigned int flags); #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 061185a..e385a76 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -731,9 +731,11 @@ __SYSCALL(__NR_pkey_free, sys_pkey_free) #define __NR_statx 291 __SYSCALL(__NR_statx, sys_statx) +#define __NR_copy_file_range64 292 +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) #undef __NR_syscalls -#define __NR_syscalls 292 +#define __NR_syscalls 293 /* * All syscalls below here should go away really, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 8acef85..8e0cfd4 100644 --- 
a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) cond_syscall(sys_capget); cond_syscall(sys_capset); cond_syscall(sys_copy_file_range); +cond_syscall(sys_copy_file_range64); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read);
This is a proposal to allow 64bit count to copy and return as a result of a copy_file_range. No attempt was made to share code with the 32bit function because 32bit interface should probably get deprecated. Why use 64bit? Current uses of 32bit are by clone_file_range() which could use 64bit count and NFS copy_file_range also supports 64bit count value. Also with this proposal off the bat allow the userland to copy between different mount points. Signed-off-by: Olga Kornievskaia <kolga@netapp.com> --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ include/linux/fs.h | 4 + include/linux/syscalls.h | 3 + include/uapi/asm-generic/unistd.h | 4 +- kernel/sys_ni.c | 1 + 7 files changed, 149 insertions(+), 1 deletion(-)