Message ID | 153870033496.29072.3660384210745578982.stgit@magnolia (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | fs: fixes for serious clone/dedupe problems | expand |
On Fri, Oct 5, 2018 at 3:46 AM Darrick J. Wong <darrick.wong@oracle.com> wrote: > > From: Darrick J. Wong <darrick.wong@oracle.com> > > Change the clone_file_range and dedupe_file_range functions to return > the number of bytes they operated on. This is the precursor to allowing > fs implementations to return short clone/dedupe results to the user, > which will enable us to obey resource limits in a graceful manner. > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > --- [...] > diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c > index aeaefd2a551b..6d792d817538 100644 > --- a/fs/overlayfs/file.c > +++ b/fs/overlayfs/file.c > @@ -487,16 +487,21 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in, > OVL_COPY); > } > > -static int ovl_clone_file_range(struct file *file_in, loff_t pos_in, > +static s64 ovl_clone_file_range(struct file *file_in, loff_t pos_in, > struct file *file_out, loff_t pos_out, u64 len) > { > - return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, > - OVL_CLONE); > + int ret; > + > + ret = ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, > + OVL_CLONE); > + return ret < 0 ? ret : len; > } > > -static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, > +static s64 ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, > struct file *file_out, loff_t pos_out, u64 len) > { > + int ret; > + > /* > * Don't copy up because of a dedupe request, this wouldn't make sense > * most of the time (data would be duplicated instead of deduplicated). > @@ -505,8 +510,9 @@ static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, > !ovl_inode_upper(file_inode(file_out))) > return -EPERM; > > - return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, > - OVL_DEDUPE); > + ret = ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, > + OVL_DEDUPE); > + return ret < 0 ? ret : len; > } > This is not pretty at all. You are blocking the propagation of partial dedupe/clone result of files that are accessed via overlay over xfs. Please extend the interface change to the vfs helpers (i.e. vfs_clone_file_range()) and then the change above is not needed. Of course you would need to change the 3 callers of vfs_clone_file_range() that expect 0 is ok. Please take a look at commit a725356b6659 ("vfs: swap names of {do,vfs}_clone_file_range()") That was just merged for rc7. I do apologize for the churn, but it's a semantic mistake that I made that needed fixing, so please rebase your work on top of that and take care not to trip over it. ioctl_file_clone() and ovl_copy_up_data() just need to interpret positive return value correctly. nfsd4_clone_file_range() should have the same return value as vfs_clone_file_range() to be interpreted in nfsd4_clone(), following same practice as nfsd4_copy_file_range(). [...] > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 2a4141d36ebf..e5755340e825 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1759,10 +1759,12 @@ struct file_operations { > #endif > ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, > loff_t, size_t, unsigned int); > - int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, > - u64); > - int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, > - u64); > + s64 (*clone_file_range)(struct file *file_in, loff_t pos_in, > + struct file *file_out, loff_t pos_out, > + u64 count); > + s64 (*dedupe_file_range)(struct file *file_in, loff_t pos_in, > + struct file *file_out, loff_t pos_out, > + u64 count); Matthew has objected a similar interface change when it was proposed by Miklos: https://marc.info/?l=linux-fsdevel&m=152570317110292&w=2 https://marc.info/?l=linux-fsdevel&m=152569298704781&w=2 He claimed that the interface should look like this: + loff_t (*dedupe_file_range)(struct file *src, loff_t src_off, + struct file *dst, loff_t dst_off, loff_t len); Thanks, Amir.
On Fri, Oct 05, 2018 at 11:06:54AM +0300, Amir Goldstein wrote: > On Fri, Oct 5, 2018 at 3:46 AM Darrick J. Wong <darrick.wong@oracle.com> wrote: > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > Change the clone_file_range and dedupe_file_range functions to return > > the number of bytes they operated on. This is the precursor to allowing > > fs implementations to return short clone/dedupe results to the user, > > which will enable us to obey resource limits in a graceful manner. > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > --- > > [...] > > > diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c > > index aeaefd2a551b..6d792d817538 100644 > > --- a/fs/overlayfs/file.c > > +++ b/fs/overlayfs/file.c > > @@ -487,16 +487,21 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in, > > OVL_COPY); > > } > > > > -static int ovl_clone_file_range(struct file *file_in, loff_t pos_in, > > +static s64 ovl_clone_file_range(struct file *file_in, loff_t pos_in, > > struct file *file_out, loff_t pos_out, u64 len) > > { > > - return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, > > - OVL_CLONE); > > + int ret; > > + > > + ret = ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, > > + OVL_CLONE); > > + return ret < 0 ? ret : len; > > } > > > > -static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, > > +static s64 ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, > > struct file *file_out, loff_t pos_out, u64 len) > > { > > + int ret; > > + > > /* > > * Don't copy up because of a dedupe request, this wouldn't make sense > > * most of the time (data would be duplicated instead of deduplicated). > > @@ -505,8 +510,9 @@ static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, > > !ovl_inode_upper(file_inode(file_out))) > > return -EPERM; > > > > - return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, > > - OVL_DEDUPE); > > + ret = ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, > > + OVL_DEDUPE); > > + return ret < 0 ? ret : len; > > } > > > > This is not pretty at all. > You are blocking the propagation of partial dedupe/clone result > of files that are accessed via overlay over xfs. > > Please extend the interface change to the vfs helpers > (i.e. vfs_clone_file_range()) and then the change above is not needed. > > Of course you would need to change the 3 callers of > vfs_clone_file_range() that expect 0 is ok. Ok, I'll plumb the bytes-finished return value all the way through the internal APIs. > Please take a look at commit > a725356b6659 ("vfs: swap names of {do,vfs}_clone_file_range()") > > That was just merged for rc7. > > I do apologize for the churn, but it's a semantic mistake that > I made that needed fixing, so please rebase your work on top > of that and take care not to trip over it. Err... ok. That makes working on this a little messy, we'll see if I can get this mess rebased in time for 5.0. > ioctl_file_clone() and ovl_copy_up_data() just need to interpret > positive return value correctly. > nfsd4_clone_file_range() should have the same return value as > vfs_clone_file_range() to be interpreted in nfsd4_clone(), following > same practice as nfsd4_copy_file_range(). > > [...] > > > diff --git a/include/linux/fs.h b/include/linux/fs.h > > index 2a4141d36ebf..e5755340e825 100644 > > --- a/include/linux/fs.h > > +++ b/include/linux/fs.h > > @@ -1759,10 +1759,12 @@ struct file_operations { > > #endif > > ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, > > loff_t, size_t, unsigned int); > > - int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, > > - u64); > > - int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, > > - u64); > > + s64 (*clone_file_range)(struct file *file_in, loff_t pos_in, > > + struct file *file_out, loff_t pos_out, > > + u64 count); > > + s64 (*dedupe_file_range)(struct file *file_in, loff_t pos_in, > > + struct file *file_out, loff_t pos_out, > > + u64 count); > > Matthew has objected a similar interface change when it was proposed by Miklos: > https://marc.info/?l=linux-fsdevel&m=152570317110292&w=2 > https://marc.info/?l=linux-fsdevel&m=152569298704781&w=2 > > He claimed that the interface should look like this: > + loff_t (*dedupe_file_range)(struct file *src, loff_t src_off, > + struct file *dst, loff_t dst_off, loff_t len); I don't really like loff_t (why does the typename for a size include "offset" in the name??) but I guess that's not horrible. I've never liked how functions take size_t (unsigned) but return ssize_t (signed) anyway. --D > Thanks, > Amir.
On Thu, Oct 04, 2018 at 05:45:35PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <darrick.wong@oracle.com> > > Change the clone_file_range and dedupe_file_range functions to return > the number of bytes they operated on. This is the precursor to allowing > fs implementations to return short clone/dedupe results to the user, > which will enable us to obey resource limits in a graceful manner. For clone this doesn't make any sense, as we don't allow 'short clones'.
On Sat, Oct 06, 2018 at 03:41:34AM -0700, Christoph Hellwig wrote: > On Thu, Oct 04, 2018 at 05:45:35PM -0700, Darrick J. Wong wrote: > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > Change the clone_file_range and dedupe_file_range functions to return > > the number of bytes they operated on. This is the precursor to allowing > > fs implementations to return short clone/dedupe results to the user, > > which will enable us to obey resource limits in a graceful manner. > > For clone this doesn't make any sense, as we don't allow 'short clones'. That's correct. The overall goal is that copy_file_range will be able to call ->clone_file_range with a "I can handle a short clone" flag so that the fs (and the prep functions) know that they're allowed to shorten the request length, and we have a way to communicate the actual results back to userspace. Neither FS_IOC_CLONE nor FS_IOC_CLONERANGE will be able to take advantage of this, of course. They won't pass in the flag, and any implementation that wants to shorten the length will pass back -EINVAL instead. --D
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2cddfe7806a4..864651257142 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3218,7 +3218,7 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); -int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, +s64 btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, struct file *dst_file, loff_t dst_loff, u64 olen); @@ -3250,7 +3250,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, +s64 btrfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len); /* tree-defrag.c */ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d60b6caf09e8..35ba974f1333 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3627,13 +3627,14 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, return ret; } -int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, +s64 btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, struct file *dst_file, loff_t dst_loff, u64 olen) { struct inode *src = file_inode(src_file); struct inode *dst = file_inode(dst_file); u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + int ret; if (WARN_ON_ONCE(bs < PAGE_SIZE)) { /* @@ -3644,7 +3645,8 @@ int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, return -EINVAL; } - return btrfs_extent_same(src, src_loff, olen, dst, dst_loff); + ret = btrfs_extent_same(src, src_loff, olen, dst, dst_loff); + return ret < 0 ? ret : olen; } static int clone_finish_inode_update(struct btrfs_trans_handle *trans, @@ -4348,10 +4350,13 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, return ret; } -int btrfs_clone_file_range(struct file *src_file, loff_t off, +s64 btrfs_clone_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, u64 len) { - return btrfs_clone_files(dst_file, src_file, off, len, destoff); + int ret; + + ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); + return ret < 0 ? ret : len; } static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 4288a6ecaf75..f914861f844f 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -180,7 +180,7 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t return nfs42_proc_allocate(filep, offset, len); } -static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, +static s64 nfs42_clone_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, u64 count) { struct inode *dst_inode = file_inode(dst_file); @@ -240,7 +240,7 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, inode_unlock(src_inode); } out: - return ret; + return ret < 0 ? ret : count; } #endif /* CONFIG_NFS_V4_2 */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9fa35cb6f6e0..c4b78ee4a593 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2527,24 +2527,30 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) return offset; } -static int ocfs2_file_clone_range(struct file *file_in, +static s64 ocfs2_file_clone_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) { - return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); + int ret; + + ret = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, false); + return ret < 0 ? ret : len; } -static int ocfs2_file_dedupe_range(struct file *file_in, +static s64 ocfs2_file_dedupe_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) { - return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, true); + int ret; + + ret = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, true); + return ret < 0 ? ret : len; } const struct inode_operations ocfs2_file_iops = { diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index aeaefd2a551b..6d792d817538 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -487,16 +487,21 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in, OVL_COPY); } -static int ovl_clone_file_range(struct file *file_in, loff_t pos_in, +static s64 ovl_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) { - return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, - OVL_CLONE); + int ret; + + ret = ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, + OVL_CLONE); + return ret < 0 ? ret : len; } -static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, +static s64 ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) { + int ret; + /* * Don't copy up because of a dedupe request, this wouldn't make sense * most of the time (data would be duplicated instead of deduplicated). @@ -505,8 +510,9 @@ static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, !ovl_inode_upper(file_inode(file_out))) return -EPERM; - return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, - OVL_DEDUPE); + ret = ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, + OVL_DEDUPE); + return ret < 0 ? ret : len; } const struct file_operations ovl_file_operations = { diff --git a/fs/read_write.c b/fs/read_write.c index 99b2f809180c..f51751281454 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1589,10 +1589,12 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * more efficient if both clone and copy are supported (e.g. NFS). */ if (file_in->f_op->clone_file_range) { - ret = file_in->f_op->clone_file_range(file_in, pos_in, - file_out, pos_out, len); - if (ret == 0) { - ret = len; + s64 cloned; + + cloned = file_in->f_op->clone_file_range(file_in, pos_in, + file_out, pos_out, min(MAX_RW_COUNT, len)); + if (cloned >= 0) { + ret = cloned; goto done; } } @@ -1799,6 +1801,7 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); + s64 cloned; int ret; if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) @@ -1830,14 +1833,16 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, if (ret) return ret; - ret = file_in->f_op->clone_file_range(file_in, pos_in, + cloned = file_in->f_op->clone_file_range(file_in, pos_in, file_out, pos_out, len); - if (!ret) { - fsnotify_access(file_in); - fsnotify_modify(file_out); - } + if (cloned < 0) + return cloned; + else if (len && cloned != len) + return -EINVAL; - return ret; + fsnotify_access(file_in); + fsnotify_modify(file_out); + return 0; } EXPORT_SYMBOL(vfs_clone_file_range); @@ -1937,7 +1942,7 @@ int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, } EXPORT_SYMBOL(vfs_dedupe_file_range_compare); -int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, +s64 vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, u64 len) { s64 ret; @@ -1989,7 +1994,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) int i; int ret; u16 count = same->dest_count; - int deduped; + s64 deduped; if (!(file->f_mode & FMODE_READ)) return -EINVAL; @@ -2046,7 +2051,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) else if (deduped < 0) info->status = deduped; else - info->bytes_deduped = len; + info->bytes_deduped = deduped; next_fdput: fdput(dst_fd); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 61a5ad2600e8..efa95e0d8cee 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -919,7 +919,7 @@ xfs_file_fallocate( return error; } -STATIC int +STATIC s64 xfs_file_clone_range( struct file *file_in, loff_t pos_in, @@ -927,11 +927,14 @@ xfs_file_clone_range( loff_t pos_out, u64 len) { - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); + int ret; + + ret = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, false); + return ret < 0 ? ret : len; } -STATIC int +STATIC s64 xfs_file_dedupe_range( struct file *file_in, loff_t pos_in, @@ -939,8 +942,11 @@ xfs_file_dedupe_range( loff_t pos_out, u64 len) { - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, true); + int ret; + + ret = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, true); + return ret < 0 ? ret : len; } STATIC int diff --git a/include/linux/fs.h b/include/linux/fs.h index 2a4141d36ebf..e5755340e825 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1759,10 +1759,12 @@ struct file_operations { #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); - int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, - u64); - int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, - u64); + s64 (*clone_file_range)(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 count); + s64 (*dedupe_file_range)(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 count); int (*fadvise)(struct file *, loff_t, loff_t, int); } __randomize_layout; @@ -1835,7 +1837,7 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, loff_t len, bool *is_same); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); -extern int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, +extern s64 vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, u64 len);