diff mbox

[6/6] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features

Message ID 147873190419.2820.6549312356582219407.stgit@birch.djwong.org (mailing list archive)
State New, archived
Headers show

Commit Message

Darrick J. Wong Nov. 9, 2016, 10:51 p.m. UTC
Connect the new VFS clone_range, copy_range, and dedupe_range features
to the existing reflink capability of ocfs2.  Compared to the existing
ocfs2 reflink ioctl, we have to do things a little differently to support
the VFS semantics (we can clone subranges of a file but we don't clone
xattrs), but the VFS ioctls are more broadly supported.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/ocfs2/file.c         |   62 ++++-
 fs/ocfs2/file.h         |    3 
 fs/ocfs2/refcounttree.c |  619 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |    7 +
 4 files changed, 688 insertions(+), 3 deletions(-)

Comments

Zhen Ren Nov. 11, 2016, 5:49 a.m. UTC | #1
Hi,

A few issues obvious to me:

On 11/10/2016 06:51 AM, Darrick J. Wong wrote:
> Connect the new VFS clone_range, copy_range, and dedupe_range features
> to the existing reflink capability of ocfs2.  Compared to the existing
> ocfs2 reflink ioctl We have to do things a little differently to support
> the VFS semantics (we can clone subranges of a file but we don't clone
> xattrs), but the VFS ioctls are more broadly supported.

How can I test the new ocfs2 reflink (with this patch) manually? What commands should I
use to do xxx_range things?

>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>   fs/ocfs2/file.c         |   62 ++++-
>   fs/ocfs2/file.h         |    3
>   fs/ocfs2/refcounttree.c |  619 +++++++++++++++++++++++++++++++++++++++++++++++
>   fs/ocfs2/refcounttree.h |    7 +
>   4 files changed, 688 insertions(+), 3 deletions(-)
>
>
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 000c234..d5a022d 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
>   	*done = ret;
>   }
>   
> -static int ocfs2_remove_inode_range(struct inode *inode,
> -				    struct buffer_head *di_bh, u64 byte_start,
> -				    u64 byte_len)
> +int ocfs2_remove_inode_range(struct inode *inode,
> +			     struct buffer_head *di_bh, u64 byte_start,
> +			     u64 byte_len)
>   {
>   	int ret = 0, flags = 0, done = 0, i;
>   	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
> @@ -2440,6 +2440,56 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
>   	return offset;
>   }
>   
> +static ssize_t ocfs2_file_copy_range(struct file *file_in,
> +				     loff_t pos_in,
> +				     struct file *file_out,
> +				     loff_t pos_out,
> +				     size_t len,
> +				     unsigned int flags)
> +{
> +	int error;
> +
> +	error = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> +					  len, false);
> +	if (error)
> +		return error;
> +	return len;
> +}
> +
> +static int ocfs2_file_clone_range(struct file *file_in,
> +				  loff_t pos_in,
> +				  struct file *file_out,
> +				  loff_t pos_out,
> +				  u64 len)
> +{
> +	return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> +					 len, false);
> +}
> +
> +#define OCFS2_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
> +static ssize_t ocfs2_file_dedupe_range(struct file *src_file,
> +				       u64 loff,
> +				       u64 len,
> +				       struct file *dst_file,
> +				       u64 dst_loff)
> +{
> +	int error;
> +
> +	/*
> +	 * Limit the total length we will dedupe for each operation.
> +	 * This is intended to bound the total time spent in this
> +	 * ioctl to something sane.
> +	 */
> +	if (len > OCFS2_MAX_DEDUPE_LEN)
> +		len = OCFS2_MAX_DEDUPE_LEN;
> +
> +	error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff,
> +					  len, true);
> +	if (error)
> +		return error;
> +	return len;
> +}
> +
>   const struct inode_operations ocfs2_file_iops = {
>   	.setattr	= ocfs2_setattr,
>   	.getattr	= ocfs2_getattr,
> @@ -2479,6 +2529,9 @@ const struct file_operations ocfs2_fops = {
>   	.splice_read	= generic_file_splice_read,
>   	.splice_write	= iter_file_splice_write,
>   	.fallocate	= ocfs2_fallocate,
> +	.copy_file_range = ocfs2_file_copy_range,
> +	.clone_file_range = ocfs2_file_clone_range,
> +	.dedupe_file_range = ocfs2_file_dedupe_range,
>   };
>   
>   const struct file_operations ocfs2_dops = {
> @@ -2524,6 +2577,9 @@ const struct file_operations ocfs2_fops_no_plocks = {
>   	.splice_read	= generic_file_splice_read,
>   	.splice_write	= iter_file_splice_write,
>   	.fallocate	= ocfs2_fallocate,
> +	.copy_file_range = ocfs2_file_copy_range,
> +	.clone_file_range = ocfs2_file_clone_range,
> +	.dedupe_file_range = ocfs2_file_dedupe_range,
>   };
>   
>   const struct file_operations ocfs2_dops_no_plocks = {
> diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
> index e8c62f2..897fd9a 100644
> --- a/fs/ocfs2/file.h
> +++ b/fs/ocfs2/file.h
> @@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
>   
>   int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
>   				   size_t count);
> +int ocfs2_remove_inode_range(struct inode *inode,
> +			     struct buffer_head *di_bh, u64 byte_start,
> +			     u64 byte_len);
>   #endif /* OCFS2_FILE_H */
> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
> index d92b6c6..3e2198c 100644
> --- a/fs/ocfs2/refcounttree.c
> +++ b/fs/ocfs2/refcounttree.c
> @@ -34,6 +34,7 @@
>   #include "xattr.h"
>   #include "namei.h"
>   #include "ocfs2_trace.h"
> +#include "file.h"
>   
>   #include <linux/bio.h>
>   #include <linux/blkdev.h>
> @@ -4447,3 +4448,621 @@ int ocfs2_reflink_ioctl(struct inode *inode,
>   
>   	return error;
>   }
> +
> +/* Update destination inode size, if necessary. */
> +static int ocfs2_reflink_update_dest(struct inode *dest,
> +				     struct buffer_head *d_bh,
> +				     loff_t newlen)
> +{
> +	handle_t *handle;
> +	struct ocfs2_dinode *di = (struct ocfs2_dinode *)d_bh->b_data;
> +	int ret;
> +
> +	if (newlen <= i_size_read(dest))
> +		return 0;
> +
> +	handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
> +				   OCFS2_INODE_UPDATE_CREDITS);
> +	if (IS_ERR(handle)) {
> +		ret = PTR_ERR(handle);
> +		mlog_errno(ret);
> +		return ret;
> +	}
> +
> +	ret = ocfs2_journal_access_di(handle, INODE_CACHE(dest), d_bh,
> +				      OCFS2_JOURNAL_ACCESS_WRITE);
> +	if (ret) {
> +		mlog_errno(ret);
> +		goto out_commit;
> +	}
> +
> +	spin_lock(&OCFS2_I(dest)->ip_lock);
> +	if (newlen > i_size_read(dest)) {
> +		i_size_write(dest, newlen);
> +		di->i_size = newlen;

The on-disk dinode fields are little-endian, so this should be:

di->i_size = cpu_to_le64(newlen);

> +	}
> +	spin_unlock(&OCFS2_I(dest)->ip_lock);
> +

Should ocfs2_update_inode_fsync_trans() be called here? It looks like you introduced this
function to improve efficiency.
I just want to jog your memory about it, though I don't know the details of why it
would be needed.

Eric

> +	ocfs2_journal_dirty(handle, d_bh);
> +
> +out_commit:
> +	ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
> +	return ret;
> +}
> +
> +/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
> +static int ocfs2_reflink_remap_extent(struct inode *s_inode,
> +				      struct buffer_head *s_bh,
> +				      loff_t pos_in,
> +				      struct inode *t_inode,
> +				      struct buffer_head *t_bh,
> +				      loff_t pos_out,
> +				      loff_t len,
> +				      struct ocfs2_cached_dealloc_ctxt *dealloc)
> +{
> +	struct ocfs2_extent_tree s_et;
> +	struct ocfs2_extent_tree t_et;
> +	struct ocfs2_dinode *dis;
> +	struct buffer_head *ref_root_bh = NULL;
> +	struct ocfs2_refcount_tree *ref_tree;
> +	struct ocfs2_super *osb;
> +	loff_t pstart, plen;
> +	u32 p_cluster, num_clusters, slast, spos, tpos;
> +	unsigned int ext_flags;
> +	int ret = 0;
> +
> +	osb = OCFS2_SB(s_inode->i_sb);
> +	dis = (struct ocfs2_dinode *)s_bh->b_data;
> +	ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
> +	ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
> +
> +	spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
> +	tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
> +	slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
> +
> +	while (spos < slast) {
> +		if (fatal_signal_pending(current)) {
> +			ret = -EINTR;
> +			goto out;
> +		}
> +
> +		/* Look up the extent. */
> +		ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
> +					 &num_clusters, &ext_flags);
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}
> +
> +		num_clusters = min_t(u32, num_clusters, slast - spos);
> +
> +		/* Punch out the dest range. */
> +		pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
> +		plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
> +		ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}
> +
> +		if (p_cluster == 0)
> +			goto next_loop;
> +
> +		/* Lock the refcount btree... */
> +		ret = ocfs2_lock_refcount_tree(osb,
> +					       le64_to_cpu(dis->i_refcount_loc),
> +					       1, &ref_tree, &ref_root_bh);
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}
> +
> +		/* Mark s_inode's extent as refcounted. */
> +		if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
> +			ret = ocfs2_add_refcount_flag(s_inode, &s_et,
> +						      &ref_tree->rf_ci,
> +						      ref_root_bh, spos,
> +						      p_cluster, num_clusters,
> +						      dealloc, NULL);
> +			if (ret) {
> +				mlog_errno(ret);
> +				goto out_unlock_refcount;
> +			}
> +		}
> +
> +		/* Map in the new extent. */
> +		ext_flags |= OCFS2_EXT_REFCOUNTED;
> +		ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
> +						  &ref_tree->rf_ci,
> +						  ref_root_bh,
> +						  tpos, p_cluster,
> +						  num_clusters,
> +						  ext_flags,
> +						  dealloc);
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out_unlock_refcount;
> +		}
> +
> +		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
> +		brelse(ref_root_bh);
> +next_loop:
> +		spos += num_clusters;
> +		tpos += num_clusters;
> +	}
> +
> +out:
> +	return ret;
> +out_unlock_refcount:
> +	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
> +	brelse(ref_root_bh);
> +	return ret;
> +}
> +
> +/* Set up refcount tree and remap s_inode to t_inode. */
> +static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
> +				      struct buffer_head *s_bh,
> +				      loff_t pos_in,
> +				      struct inode *t_inode,
> +				      struct buffer_head *t_bh,
> +				      loff_t pos_out,
> +				      loff_t len)
> +{
> +	struct ocfs2_cached_dealloc_ctxt dealloc;
> +	struct ocfs2_super *osb;
> +	struct ocfs2_dinode *dis;
> +	struct ocfs2_dinode *dit;
> +	int ret;
> +
> +	osb = OCFS2_SB(s_inode->i_sb);
> +	dis = (struct ocfs2_dinode *)s_bh->b_data;
> +	dit = (struct ocfs2_dinode *)t_bh->b_data;
> +	ocfs2_init_dealloc_ctxt(&dealloc);
> +
> +	/*
> +	 * If both inodes belong to two different refcount groups then
> +	 * forget it because we don't know how (or want) to go merging
> +	 * refcount trees.
> +	 */
> +	ret = -EOPNOTSUPP;
> +	if (ocfs2_is_refcount_inode(s_inode) &&
> +	    ocfs2_is_refcount_inode(t_inode) &&
> +	    le64_to_cpu(dis->i_refcount_loc) !=
> +	    le64_to_cpu(dit->i_refcount_loc))
> +		goto out;
> +
> +	/* Neither inode has a refcount tree.  Add one to s_inode. */
> +	if (!ocfs2_is_refcount_inode(s_inode) &&
> +	    !ocfs2_is_refcount_inode(t_inode)) {
> +		ret = ocfs2_create_refcount_tree(s_inode, s_bh);
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}
> +	}
> +
> +	/* Ensure that both inodes end up with the same refcount tree. */
> +	if (!ocfs2_is_refcount_inode(s_inode)) {
> +		ret = ocfs2_set_refcount_tree(s_inode, s_bh,
> +					      le64_to_cpu(dit->i_refcount_loc));
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}
> +	}
> +	if (!ocfs2_is_refcount_inode(t_inode)) {
> +		ret = ocfs2_set_refcount_tree(t_inode, t_bh,
> +					      le64_to_cpu(dis->i_refcount_loc));
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}
> +	}
> +
> +	/*
> +	 * If we're reflinking the entire file and the source is inline
> +	 * data, just copy the contents.
> +	 */
> +	if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
> +	    i_size_read(t_inode) <= len &&
> +	    (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
> +		ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
> +		if (ret)
> +			mlog_errno(ret);
> +		goto out;
> +	}
> +
> +	ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
> +					 pos_out, len, &dealloc);
> +	if (ret) {
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
> +out:
> +	if (ocfs2_dealloc_has_cluster(&dealloc)) {
> +		ocfs2_schedule_truncate_log_flush(osb, 1);
> +		ocfs2_run_deallocs(osb, &dealloc);
> +	}
> +
> +	return ret;
> +}
> +
> +/* Lock an inode and grab a bh pointing to the inode. */
> +static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
> +				     struct buffer_head **bh1,
> +				     struct inode *t_inode,
> +				     struct buffer_head **bh2)
> +{
> +	struct inode *inode1;
> +	struct inode *inode2;
> +	struct ocfs2_inode_info *oi1;
> +	struct ocfs2_inode_info *oi2;
> +	bool same_inode = (s_inode == t_inode);
> +	int status;
> +
> +	/* First grab the VFS and rw locks. */
> +	inode1 = s_inode;
> +	inode2 = t_inode;
> +	if (inode1->i_ino > inode2->i_ino)
> +		swap(inode1, inode2);
> +
> +	inode_lock(inode1);
> +	status = ocfs2_rw_lock(inode1, 1);
> +	if (status) {
> +		mlog_errno(status);
> +		goto out_i1;
> +	}
> +	if (!same_inode) {
> +		inode_lock_nested(inode2, I_MUTEX_CHILD);
> +		status = ocfs2_rw_lock(inode2, 1);
> +		if (status) {
> +			mlog_errno(status);
> +			goto out_i2;
> +		}
> +	}
> +
> +	/* Now go for the cluster locks */
> +	oi1 = OCFS2_I(inode1);
> +	oi2 = OCFS2_I(inode2);
> +
> +	trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
> +				(unsigned long long)oi2->ip_blkno);
> +
> +	if (*bh1)
> +		*bh1 = NULL;
> +	if (*bh2)
> +		*bh2 = NULL;
> +
> +	/* We always want to lock the one with the lower lockid first. */
> +	if (oi1->ip_blkno > oi2->ip_blkno)
> +		mlog_errno(-ENOLCK);
> +
> +	/* lock id1 */
> +	status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
> +	if (status < 0) {
> +		if (status != -ENOENT)
> +			mlog_errno(status);
> +		goto out_rw2;
> +	}
> +
> +	/* lock id2 */
> +	if (!same_inode) {
> +		status = ocfs2_inode_lock_nested(inode2, bh2, 1,
> +						 OI_LS_REFLINK_TARGET);
> +		if (status < 0) {
> +			if (status != -ENOENT)
> +				mlog_errno(status);
> +			goto out_cl1;
> +		}
> +	} else
> +		*bh2 = *bh1;
> +
> +	trace_ocfs2_double_lock_end(
> +			(unsigned long long)OCFS2_I(inode1)->ip_blkno,
> +			(unsigned long long)OCFS2_I(inode2)->ip_blkno);
> +
> +	return 0;
> +
> +out_cl1:
> +	ocfs2_inode_unlock(inode1, 1);
> +	brelse(*bh1);
> +	*bh1 = NULL;
> +out_rw2:
> +	ocfs2_rw_unlock(inode2, 1);
> +out_i2:
> +	inode_unlock(inode2);
> +	ocfs2_rw_unlock(inode1, 1);
> +out_i1:
> +	inode_unlock(inode1);
> +	return status;
> +}
> +
> +/* Unlock both inodes and release buffers. */
> +static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
> +					struct buffer_head *s_bh,
> +					struct inode *t_inode,
> +					struct buffer_head *t_bh)
> +{
> +	ocfs2_inode_unlock(s_inode, 1);
> +	ocfs2_rw_unlock(s_inode, 1);
> +	inode_unlock(s_inode);
> +	brelse(s_bh);
> +
> +	if (s_inode == t_inode)
> +		return;
> +
> +	ocfs2_inode_unlock(t_inode, 1);
> +	ocfs2_rw_unlock(t_inode, 1);
> +	inode_unlock(t_inode);
> +	brelse(t_bh);
> +}
> +
> +/*
> + * Read a page's worth of file data into the page cache.  Return the page
> + * locked.
> + */
> +static struct page *ocfs2_reflink_get_page(struct inode *inode,
> +					   loff_t offset)
> +{
> +	struct address_space *mapping;
> +	struct page *page;
> +	pgoff_t n;
> +
> +	n = offset >> PAGE_SHIFT;
> +	mapping = inode->i_mapping;
> +	page = read_mapping_page(mapping, n, NULL);
> +	if (IS_ERR(page))
> +		return page;
> +	if (!PageUptodate(page)) {
> +		put_page(page);
> +		return ERR_PTR(-EIO);
> +	}
> +	lock_page(page);
> +	return page;
> +}
> +
> +/*
> + * Compare extents of two files to see if they are the same.
> + */
> +static int ocfs2_reflink_compare_extents(struct inode *src,
> +					 loff_t srcoff,
> +					 struct inode *dest,
> +					 loff_t destoff,
> +					 loff_t len,
> +					 bool *is_same)
> +{
> +	loff_t src_poff;
> +	loff_t dest_poff;
> +	void *src_addr;
> +	void *dest_addr;
> +	struct page *src_page;
> +	struct page *dest_page;
> +	loff_t cmp_len;
> +	bool same;
> +	int error;
> +
> +	error = -EINVAL;
> +	same = true;
> +	while (len) {
> +		src_poff = srcoff & (PAGE_SIZE - 1);
> +		dest_poff = destoff & (PAGE_SIZE - 1);
> +		cmp_len = min(PAGE_SIZE - src_poff,
> +			      PAGE_SIZE - dest_poff);
> +		cmp_len = min(cmp_len, len);
> +		if (cmp_len <= 0) {
> +			mlog_errno(-EUCLEAN);
> +			goto out_error;
> +		}
> +
> +		src_page = ocfs2_reflink_get_page(src, srcoff);
> +		if (IS_ERR(src_page)) {
> +			error = PTR_ERR(src_page);
> +			goto out_error;
> +		}
> +		dest_page = ocfs2_reflink_get_page(dest, destoff);
> +		if (IS_ERR(dest_page)) {
> +			error = PTR_ERR(dest_page);
> +			unlock_page(src_page);
> +			put_page(src_page);
> +			goto out_error;
> +		}
> +		src_addr = kmap_atomic(src_page);
> +		dest_addr = kmap_atomic(dest_page);
> +
> +		flush_dcache_page(src_page);
> +		flush_dcache_page(dest_page);
> +
> +		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
> +			same = false;
> +
> +		kunmap_atomic(dest_addr);
> +		kunmap_atomic(src_addr);
> +		unlock_page(dest_page);
> +		unlock_page(src_page);
> +		put_page(dest_page);
> +		put_page(src_page);
> +
> +		if (!same)
> +			break;
> +
> +		srcoff += cmp_len;
> +		destoff += cmp_len;
> +		len -= cmp_len;
> +	}
> +
> +	*is_same = same;
> +	return 0;
> +
> +out_error:
> +	return error;
> +}
> +
> +/* Link a range of blocks from one file to another. */
> +int ocfs2_reflink_remap_range(struct file *file_in,
> +			      loff_t pos_in,
> +			      struct file *file_out,
> +			      loff_t pos_out,
> +			      u64 len,
> +			      bool is_dedupe)
> +{
> +	struct inode *inode_in = file_inode(file_in);
> +	struct inode *inode_out = file_inode(file_out);
> +	struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
> +	struct buffer_head *in_bh = NULL, *out_bh = NULL;
> +	loff_t bs = 1 << OCFS2_SB(inode_in->i_sb)->s_clustersize_bits;
> +	bool same_inode = (inode_in == inode_out);
> +	bool is_same = false;
> +	loff_t isize;
> +	ssize_t ret;
> +	loff_t blen;
> +
> +	if (!ocfs2_refcount_tree(osb))
> +		return -EOPNOTSUPP;
> +	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
> +		return -EROFS;
> +
> +	/* Lock both files against IO */
> +	ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
> +	if (ret)
> +		return ret;
> +
> +	ret = -EINVAL;
> +	if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
> +	    (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
> +		goto out_unlock;
> +
> +	/* Don't touch certain kinds of inodes */
> +	ret = -EPERM;
> +	if (IS_IMMUTABLE(inode_out))
> +		goto out_unlock;
> +
> +	ret = -ETXTBSY;
> +	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
> +		goto out_unlock;
> +
> +	/* Don't reflink dirs, pipes, sockets... */
> +	ret = -EISDIR;
> +	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> +		goto out_unlock;
> +	ret = -EINVAL;
> +	if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
> +		goto out_unlock;
> +	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> +		goto out_unlock;
> +
> +	/* Are we going all the way to the end? */
> +	isize = i_size_read(inode_in);
> +	if (isize == 0) {
> +		ret = 0;
> +		goto out_unlock;
> +	}
> +
> +	if (len == 0)
> +		len = isize - pos_in;
> +
> +	/* Ensure offsets don't wrap and the input is inside i_size */
> +	if (pos_in + len < pos_in || pos_out + len < pos_out ||
> +	    pos_in + len > isize)
> +		goto out_unlock;
> +
> +	/* Don't allow dedupe past EOF in the dest file */
> +	if (is_dedupe) {
> +		loff_t	disize;
> +
> +		disize = i_size_read(inode_out);
> +		if (pos_out >= disize || pos_out + len > disize)
> +			goto out_unlock;
> +	}
> +
> +	/* If we're linking to EOF, continue to the block boundary. */
> +	if (pos_in + len == isize)
> +		blen = ALIGN(isize, bs) - pos_in;
> +	else
> +		blen = len;
> +
> +	/* Only reflink if we're aligned to block boundaries */
> +	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
> +	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
> +		goto out_unlock;
> +
> +	/* Don't allow overlapped reflink within the same file */
> +	if (same_inode) {
> +		if (pos_out + blen > pos_in && pos_out < pos_in + blen)
> +			goto out_unlock;
> +	}
> +
> +	/* Wait for the completion of any pending IOs on both files */
> +	inode_dio_wait(inode_in);
> +	if (!same_inode)
> +		inode_dio_wait(inode_out);
> +
> +	ret = filemap_write_and_wait_range(inode_in->i_mapping,
> +			pos_in, pos_in + len - 1);
> +	if (ret)
> +		goto out_unlock;
> +
> +	ret = filemap_write_and_wait_range(inode_out->i_mapping,
> +			pos_out, pos_out + len - 1);
> +	if (ret)
> +		goto out_unlock;
> +
> +	/*
> +	 * Check that the extents are the same.
> +	 */
> +	if (is_dedupe) {
> +		ret = ocfs2_reflink_compare_extents(inode_in, pos_in,
> +						    inode_out, pos_out,
> +						    len, &is_same);
> +		if (ret)
> +			goto out_unlock;
> +		if (!is_same) {
> +			ret = -EBADE;
> +			goto out_unlock;
> +		}
> +	}
> +
> +	/* Lock out changes to the allocation maps */
> +	down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
> +	if (!same_inode)
> +		down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
> +				  SINGLE_DEPTH_NESTING);
> +
> +	/*
> +	 * Invalidate the page cache so that we can clear any CoW mappings
> +	 * in the destination file.
> +	 */
> +	truncate_inode_pages_range(&inode_out->i_data, pos_out,
> +				   PAGE_ALIGN(pos_out + len) - 1);
> +
> +	ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
> +					 out_bh, pos_out, len);
> +
> +	up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
> +	if (!same_inode)
> +		up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
> +	if (ret) {
> +		mlog_errno(ret);
> +		goto out_unlock;
> +	}
> +
> +	/*
> +	 * Empty the extent map so that we may get the right extent
> +	 * record from the disk.
> +	 */
> +	ocfs2_extent_map_trunc(inode_in, 0);
> +	ocfs2_extent_map_trunc(inode_out, 0);
> +
> +	ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
> +	if (ret) {
> +		mlog_errno(ret);
> +		goto out_unlock;
> +	}
> +
> +	ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
> +	return 0;
> +
> +out_unlock:
> +	ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
> +	return ret;
> +}
> diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
> index 553edfb..c023e88 100644
> --- a/fs/ocfs2/refcounttree.h
> +++ b/fs/ocfs2/refcounttree.h
> @@ -117,4 +117,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
>   			const char __user *oldname,
>   			const char __user *newname,
>   			bool preserve);
> +int ocfs2_reflink_remap_range(struct file *file_in,
> +			      loff_t pos_in,
> +			      struct file *file_out,
> +			      loff_t pos_out,
> +			      u64 len,
> +			      bool is_dedupe);
> +
>   #endif /* OCFS2_REFCOUNTTREE_H */
>
>
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel@oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
Darrick J. Wong Nov. 11, 2016, 6:20 a.m. UTC | #2
On Fri, Nov 11, 2016 at 01:49:48PM +0800, Eric Ren wrote:
> Hi,
> 
> A few issues obvious to me:
> 
> On 11/10/2016 06:51 AM, Darrick J. Wong wrote:
> >Connect the new VFS clone_range, copy_range, and dedupe_range features
> >to the existing reflink capability of ocfs2.  Compared to the existing
> >ocfs2 reflink ioctl We have to do things a little differently to support
> >the VFS semantics (we can clone subranges of a file but we don't clone
> >xattrs), but the VFS ioctls are more broadly supported.
> 
> How can I test the new ocfs2 reflink (with this patch) manually? What
> commands should I use to do xxx_range things?

See the 'reflink', 'dedupe', and 'copy_range' commands in xfs_io.

The first two were added in xfsprogs 4.3, and copy_range in 4.7.

--D

> 
> >
> >Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> >---
> >  fs/ocfs2/file.c         |   62 ++++-
> >  fs/ocfs2/file.h         |    3
> >  fs/ocfs2/refcounttree.c |  619 +++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/ocfs2/refcounttree.h |    7 +
> >  4 files changed, 688 insertions(+), 3 deletions(-)
> >
> >
> >diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> >index 000c234..d5a022d 100644
> >--- a/fs/ocfs2/file.c
> >+++ b/fs/ocfs2/file.c
> >@@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
> >  	*done = ret;
> >  }
> >-static int ocfs2_remove_inode_range(struct inode *inode,
> >-				    struct buffer_head *di_bh, u64 byte_start,
> >-				    u64 byte_len)
> >+int ocfs2_remove_inode_range(struct inode *inode,
> >+			     struct buffer_head *di_bh, u64 byte_start,
> >+			     u64 byte_len)
> >  {
> >  	int ret = 0, flags = 0, done = 0, i;
> >  	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
> >@@ -2440,6 +2440,56 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
> >  	return offset;
> >  }
> >+static ssize_t ocfs2_file_copy_range(struct file *file_in,
> >+				     loff_t pos_in,
> >+				     struct file *file_out,
> >+				     loff_t pos_out,
> >+				     size_t len,
> >+				     unsigned int flags)
> >+{
> >+	int error;
> >+
> >+	error = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> >+					  len, false);
> >+	if (error)
> >+		return error;
> >+	return len;
> >+}
> >+
> >+static int ocfs2_file_clone_range(struct file *file_in,
> >+				  loff_t pos_in,
> >+				  struct file *file_out,
> >+				  loff_t pos_out,
> >+				  u64 len)
> >+{
> >+	return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> >+					 len, false);
> >+}
> >+
> >+#define OCFS2_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
> >+static ssize_t ocfs2_file_dedupe_range(struct file *src_file,
> >+				       u64 loff,
> >+				       u64 len,
> >+				       struct file *dst_file,
> >+				       u64 dst_loff)
> >+{
> >+	int error;
> >+
> >+	/*
> >+	 * Limit the total length we will dedupe for each operation.
> >+	 * This is intended to bound the total time spent in this
> >+	 * ioctl to something sane.
> >+	 */
> >+	if (len > OCFS2_MAX_DEDUPE_LEN)
> >+		len = OCFS2_MAX_DEDUPE_LEN;
> >+
> >+	error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff,
> >+					  len, true);
> >+	if (error)
> >+		return error;
> >+	return len;
> >+}
> >+
> >  const struct inode_operations ocfs2_file_iops = {
> >  	.setattr	= ocfs2_setattr,
> >  	.getattr	= ocfs2_getattr,
> >@@ -2479,6 +2529,9 @@ const struct file_operations ocfs2_fops = {
> >  	.splice_read	= generic_file_splice_read,
> >  	.splice_write	= iter_file_splice_write,
> >  	.fallocate	= ocfs2_fallocate,
> >+	.copy_file_range = ocfs2_file_copy_range,
> >+	.clone_file_range = ocfs2_file_clone_range,
> >+	.dedupe_file_range = ocfs2_file_dedupe_range,
> >  };
> >  const struct file_operations ocfs2_dops = {
> >@@ -2524,6 +2577,9 @@ const struct file_operations ocfs2_fops_no_plocks = {
> >  	.splice_read	= generic_file_splice_read,
> >  	.splice_write	= iter_file_splice_write,
> >  	.fallocate	= ocfs2_fallocate,
> >+	.copy_file_range = ocfs2_file_copy_range,
> >+	.clone_file_range = ocfs2_file_clone_range,
> >+	.dedupe_file_range = ocfs2_file_dedupe_range,
> >  };
> >  const struct file_operations ocfs2_dops_no_plocks = {
> >diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
> >index e8c62f2..897fd9a 100644
> >--- a/fs/ocfs2/file.h
> >+++ b/fs/ocfs2/file.h
> >@@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
> >  int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
> >  				   size_t count);
> >+int ocfs2_remove_inode_range(struct inode *inode,
> >+			     struct buffer_head *di_bh, u64 byte_start,
> >+			     u64 byte_len);
> >  #endif /* OCFS2_FILE_H */
> >diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
> >index d92b6c6..3e2198c 100644
> >--- a/fs/ocfs2/refcounttree.c
> >+++ b/fs/ocfs2/refcounttree.c
> >@@ -34,6 +34,7 @@
> >  #include "xattr.h"
> >  #include "namei.h"
> >  #include "ocfs2_trace.h"
> >+#include "file.h"
> >  #include <linux/bio.h>
> >  #include <linux/blkdev.h>
> >@@ -4447,3 +4448,621 @@ int ocfs2_reflink_ioctl(struct inode *inode,
> >  	return error;
> >  }
> >+
> >+/* Update destination inode size, if necessary. */
> >+static int ocfs2_reflink_update_dest(struct inode *dest,
> >+				     struct buffer_head *d_bh,
> >+				     loff_t newlen)
> >+{
> >+	handle_t *handle;
> >+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)d_bh->b_data;
> >+	int ret;
> >+
> >+	if (newlen <= i_size_read(dest))
> >+		return 0;
> >+
> >+	handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
> >+				   OCFS2_INODE_UPDATE_CREDITS);
> >+	if (IS_ERR(handle)) {
> >+		ret = PTR_ERR(handle);
> >+		mlog_errno(ret);
> >+		return ret;
> >+	}
> >+
> >+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(dest), d_bh,
> >+				      OCFS2_JOURNAL_ACCESS_WRITE);
> >+	if (ret) {
> >+		mlog_errno(ret);
> >+		goto out_commit;
> >+	}
> >+
> >+	spin_lock(&OCFS2_I(dest)->ip_lock);
> >+	if (newlen > i_size_read(dest)) {
> >+		i_size_write(dest, newlen);
> >+		di->i_size = newlen;
> 
> di->i_size = cpu_to_le64(newlen);
> 
> >+	}
> >+	spin_unlock(&OCFS2_I(dest)->ip_lock);
> >+
> 
> Add ocfs2_update_inode_fsync_trans() here? Looks this function was
> introduced by you to improve efficiency.
> Just want to awake your memory about this, though I don't know about the
> details why it should be.
> 
> Eric
> 
> >+	ocfs2_journal_dirty(handle, d_bh);
> >+
> >+out_commit:
> >+	ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
> >+	return ret;
> >+}
> >+
> >+/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
> >+static int ocfs2_reflink_remap_extent(struct inode *s_inode,
> >+				      struct buffer_head *s_bh,
> >+				      loff_t pos_in,
> >+				      struct inode *t_inode,
> >+				      struct buffer_head *t_bh,
> >+				      loff_t pos_out,
> >+				      loff_t len,
> >+				      struct ocfs2_cached_dealloc_ctxt *dealloc)
> >+{
> >+	struct ocfs2_extent_tree s_et;
> >+	struct ocfs2_extent_tree t_et;
> >+	struct ocfs2_dinode *dis;
> >+	struct buffer_head *ref_root_bh = NULL;
> >+	struct ocfs2_refcount_tree *ref_tree;
> >+	struct ocfs2_super *osb;
> >+	loff_t pstart, plen;
> >+	u32 p_cluster, num_clusters, slast, spos, tpos;
> >+	unsigned int ext_flags;
> >+	int ret = 0;
> >+
> >+	osb = OCFS2_SB(s_inode->i_sb);
> >+	dis = (struct ocfs2_dinode *)s_bh->b_data;
> >+	ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
> >+	ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
> >+
> >+	spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
> >+	tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
> >+	slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
> >+
> >+	while (spos < slast) {
> >+		if (fatal_signal_pending(current)) {
> >+			ret = -EINTR;
> >+			goto out;
> >+		}
> >+
> >+		/* Look up the extent. */
> >+		ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
> >+					 &num_clusters, &ext_flags);
> >+		if (ret) {
> >+			mlog_errno(ret);
> >+			goto out;
> >+		}
> >+
> >+		num_clusters = min_t(u32, num_clusters, slast - spos);
> >+
> >+		/* Punch out the dest range. */
> >+		pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
> >+		plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
> >+		ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
> >+		if (ret) {
> >+			mlog_errno(ret);
> >+			goto out;
> >+		}
> >+
> >+		if (p_cluster == 0)
> >+			goto next_loop;
> >+
> >+		/* Lock the refcount btree... */
> >+		ret = ocfs2_lock_refcount_tree(osb,
> >+					       le64_to_cpu(dis->i_refcount_loc),
> >+					       1, &ref_tree, &ref_root_bh);
> >+		if (ret) {
> >+			mlog_errno(ret);
> >+			goto out;
> >+		}
> >+
> >+		/* Mark s_inode's extent as refcounted. */
> >+		if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
> >+			ret = ocfs2_add_refcount_flag(s_inode, &s_et,
> >+						      &ref_tree->rf_ci,
> >+						      ref_root_bh, spos,
> >+						      p_cluster, num_clusters,
> >+						      dealloc, NULL);
> >+			if (ret) {
> >+				mlog_errno(ret);
> >+				goto out_unlock_refcount;
> >+			}
> >+		}
> >+
> >+		/* Map in the new extent. */
> >+		ext_flags |= OCFS2_EXT_REFCOUNTED;
> >+		ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
> >+						  &ref_tree->rf_ci,
> >+						  ref_root_bh,
> >+						  tpos, p_cluster,
> >+						  num_clusters,
> >+						  ext_flags,
> >+						  dealloc);
> >+		if (ret) {
> >+			mlog_errno(ret);
> >+			goto out_unlock_refcount;
> >+		}
> >+
> >+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
> >+		brelse(ref_root_bh);
> >+next_loop:
> >+		spos += num_clusters;
> >+		tpos += num_clusters;
> >+	}
> >+
> >+out:
> >+	return ret;
> >+out_unlock_refcount:
> >+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
> >+	brelse(ref_root_bh);
> >+	return ret;
> >+}
> >+
> >+/* Set up refcount tree and remap s_inode to t_inode. */
> >+static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
> >+				      struct buffer_head *s_bh,
> >+				      loff_t pos_in,
> >+				      struct inode *t_inode,
> >+				      struct buffer_head *t_bh,
> >+				      loff_t pos_out,
> >+				      loff_t len)
> >+{
> >+	struct ocfs2_cached_dealloc_ctxt dealloc;
> >+	struct ocfs2_super *osb;
> >+	struct ocfs2_dinode *dis;
> >+	struct ocfs2_dinode *dit;
> >+	int ret;
> >+
> >+	osb = OCFS2_SB(s_inode->i_sb);
> >+	dis = (struct ocfs2_dinode *)s_bh->b_data;
> >+	dit = (struct ocfs2_dinode *)t_bh->b_data;
> >+	ocfs2_init_dealloc_ctxt(&dealloc);
> >+
> >+	/*
> >+	 * If both inodes belong to two different refcount groups then
> >+	 * forget it because we don't know how (or want) to go merging
> >+	 * refcount trees.
> >+	 */
> >+	ret = -EOPNOTSUPP;
> >+	if (ocfs2_is_refcount_inode(s_inode) &&
> >+	    ocfs2_is_refcount_inode(t_inode) &&
> >+	    le64_to_cpu(dis->i_refcount_loc) !=
> >+	    le64_to_cpu(dit->i_refcount_loc))
> >+		goto out;
> >+
> >+	/* Neither inode has a refcount tree.  Add one to s_inode. */
> >+	if (!ocfs2_is_refcount_inode(s_inode) &&
> >+	    !ocfs2_is_refcount_inode(t_inode)) {
> >+		ret = ocfs2_create_refcount_tree(s_inode, s_bh);
> >+		if (ret) {
> >+			mlog_errno(ret);
> >+			goto out;
> >+		}
> >+	}
> >+
> >+	/* Ensure that both inodes end up with the same refcount tree. */
> >+	if (!ocfs2_is_refcount_inode(s_inode)) {
> >+		ret = ocfs2_set_refcount_tree(s_inode, s_bh,
> >+					      le64_to_cpu(dit->i_refcount_loc));
> >+		if (ret) {
> >+			mlog_errno(ret);
> >+			goto out;
> >+		}
> >+	}
> >+	if (!ocfs2_is_refcount_inode(t_inode)) {
> >+		ret = ocfs2_set_refcount_tree(t_inode, t_bh,
> >+					      le64_to_cpu(dis->i_refcount_loc));
> >+		if (ret) {
> >+			mlog_errno(ret);
> >+			goto out;
> >+		}
> >+	}
> >+
> >+	/*
> >+	 * If we're reflinking the entire file and the source is inline
> >+	 * data, just copy the contents.
> >+	 */
> >+	if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
> >+	    i_size_read(t_inode) <= len &&
> >+	    (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
> >+		ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
> >+		if (ret)
> >+			mlog_errno(ret);
> >+		goto out;
> >+	}
> >+
> >+	ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
> >+					 pos_out, len, &dealloc);
> >+	if (ret) {
> >+		mlog_errno(ret);
> >+		goto out;
> >+	}
> >+
> >+out:
> >+	if (ocfs2_dealloc_has_cluster(&dealloc)) {
> >+		ocfs2_schedule_truncate_log_flush(osb, 1);
> >+		ocfs2_run_deallocs(osb, &dealloc);
> >+	}
> >+
> >+	return ret;
> >+}
> >+
> >+/* Lock an inode and grab a bh pointing to the inode. */
> >+static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
> >+				     struct buffer_head **bh1,
> >+				     struct inode *t_inode,
> >+				     struct buffer_head **bh2)
> >+{
> >+	struct inode *inode1;
> >+	struct inode *inode2;
> >+	struct ocfs2_inode_info *oi1;
> >+	struct ocfs2_inode_info *oi2;
> >+	bool same_inode = (s_inode == t_inode);
> >+	int status;
> >+
> >+	/* First grab the VFS and rw locks. */
> >+	inode1 = s_inode;
> >+	inode2 = t_inode;
> >+	if (inode1->i_ino > inode2->i_ino)
> >+		swap(inode1, inode2);
> >+
> >+	inode_lock(inode1);
> >+	status = ocfs2_rw_lock(inode1, 1);
> >+	if (status) {
> >+		mlog_errno(status);
> >+		goto out_i1;
> >+	}
> >+	if (!same_inode) {
> >+		inode_lock_nested(inode2, I_MUTEX_CHILD);
> >+		status = ocfs2_rw_lock(inode2, 1);
> >+		if (status) {
> >+			mlog_errno(status);
> >+			goto out_i2;
> >+		}
> >+	}
> >+
> >+	/* Now go for the cluster locks */
> >+	oi1 = OCFS2_I(inode1);
> >+	oi2 = OCFS2_I(inode2);
> >+
> >+	trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
> >+				(unsigned long long)oi2->ip_blkno);
> >+
> >+	if (*bh1)
> >+		*bh1 = NULL;
> >+	if (*bh2)
> >+		*bh2 = NULL;
> >+
> >+	/* We always want to lock the one with the lower lockid first. */
> >+	if (oi1->ip_blkno > oi2->ip_blkno)
> >+		mlog_errno(-ENOLCK);
> >+
> >+	/* lock id1 */
> >+	status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
> >+	if (status < 0) {
> >+		if (status != -ENOENT)
> >+			mlog_errno(status);
> >+		goto out_rw2;
> >+	}
> >+
> >+	/* lock id2 */
> >+	if (!same_inode) {
> >+		status = ocfs2_inode_lock_nested(inode2, bh2, 1,
> >+						 OI_LS_REFLINK_TARGET);
> >+		if (status < 0) {
> >+			if (status != -ENOENT)
> >+				mlog_errno(status);
> >+			goto out_cl1;
> >+		}
> >+	} else
> >+		*bh2 = *bh1;
> >+
> >+	trace_ocfs2_double_lock_end(
> >+			(unsigned long long)OCFS2_I(inode1)->ip_blkno,
> >+			(unsigned long long)OCFS2_I(inode2)->ip_blkno);
> >+
> >+	return 0;
> >+
> >+out_cl1:
> >+	ocfs2_inode_unlock(inode1, 1);
> >+	brelse(*bh1);
> >+	*bh1 = NULL;
> >+out_rw2:
> >+	ocfs2_rw_unlock(inode2, 1);
> >+out_i2:
> >+	inode_unlock(inode2);
> >+	ocfs2_rw_unlock(inode1, 1);
> >+out_i1:
> >+	inode_unlock(inode1);
> >+	return status;
> >+}
> >+
> >+/* Unlock both inodes and release buffers. */
> >+static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
> >+					struct buffer_head *s_bh,
> >+					struct inode *t_inode,
> >+					struct buffer_head *t_bh)
> >+{
> >+	ocfs2_inode_unlock(s_inode, 1);
> >+	ocfs2_rw_unlock(s_inode, 1);
> >+	inode_unlock(s_inode);
> >+	brelse(s_bh);
> >+
> >+	if (s_inode == t_inode)
> >+		return;
> >+
> >+	ocfs2_inode_unlock(t_inode, 1);
> >+	ocfs2_rw_unlock(t_inode, 1);
> >+	inode_unlock(t_inode);
> >+	brelse(t_bh);
> >+}
> >+
> >+/*
> >+ * Read a page's worth of file data into the page cache.  Return the page
> >+ * locked.
> >+ */
> >+static struct page *ocfs2_reflink_get_page(struct inode *inode,
> >+					   loff_t offset)
> >+{
> >+	struct address_space *mapping;
> >+	struct page *page;
> >+	pgoff_t n;
> >+
> >+	n = offset >> PAGE_SHIFT;
> >+	mapping = inode->i_mapping;
> >+	page = read_mapping_page(mapping, n, NULL);
> >+	if (IS_ERR(page))
> >+		return page;
> >+	if (!PageUptodate(page)) {
> >+		put_page(page);
> >+		return ERR_PTR(-EIO);
> >+	}
> >+	lock_page(page);
> >+	return page;
> >+}
> >+
> >+/*
> >+ * Compare extents of two files to see if they are the same.
> >+ */
> >+static int ocfs2_reflink_compare_extents(struct inode *src,
> >+					 loff_t srcoff,
> >+					 struct inode *dest,
> >+					 loff_t destoff,
> >+					 loff_t len,
> >+					 bool *is_same)
> >+{
> >+	loff_t src_poff;
> >+	loff_t dest_poff;
> >+	void *src_addr;
> >+	void *dest_addr;
> >+	struct page *src_page;
> >+	struct page *dest_page;
> >+	loff_t cmp_len;
> >+	bool same;
> >+	int error;
> >+
> >+	error = -EINVAL;
> >+	same = true;
> >+	while (len) {
> >+		src_poff = srcoff & (PAGE_SIZE - 1);
> >+		dest_poff = destoff & (PAGE_SIZE - 1);
> >+		cmp_len = min(PAGE_SIZE - src_poff,
> >+			      PAGE_SIZE - dest_poff);
> >+		cmp_len = min(cmp_len, len);
> >+		if (cmp_len <= 0) {
> >+			mlog_errno(-EUCLEAN);
> >+			goto out_error;
> >+		}
> >+
> >+		src_page = ocfs2_reflink_get_page(src, srcoff);
> >+		if (IS_ERR(src_page)) {
> >+			error = PTR_ERR(src_page);
> >+			goto out_error;
> >+		}
> >+		dest_page = ocfs2_reflink_get_page(dest, destoff);
> >+		if (IS_ERR(dest_page)) {
> >+			error = PTR_ERR(dest_page);
> >+			unlock_page(src_page);
> >+			put_page(src_page);
> >+			goto out_error;
> >+		}
> >+		src_addr = kmap_atomic(src_page);
> >+		dest_addr = kmap_atomic(dest_page);
> >+
> >+		flush_dcache_page(src_page);
> >+		flush_dcache_page(dest_page);
> >+
> >+		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
> >+			same = false;
> >+
> >+		kunmap_atomic(dest_addr);
> >+		kunmap_atomic(src_addr);
> >+		unlock_page(dest_page);
> >+		unlock_page(src_page);
> >+		put_page(dest_page);
> >+		put_page(src_page);
> >+
> >+		if (!same)
> >+			break;
> >+
> >+		srcoff += cmp_len;
> >+		destoff += cmp_len;
> >+		len -= cmp_len;
> >+	}
> >+
> >+	*is_same = same;
> >+	return 0;
> >+
> >+out_error:
> >+	return error;
> >+}
> >+
> >+/* Link a range of blocks from one file to another. */
> >+int ocfs2_reflink_remap_range(struct file *file_in,
> >+			      loff_t pos_in,
> >+			      struct file *file_out,
> >+			      loff_t pos_out,
> >+			      u64 len,
> >+			      bool is_dedupe)
> >+{
> >+	struct inode *inode_in = file_inode(file_in);
> >+	struct inode *inode_out = file_inode(file_out);
> >+	struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
> >+	struct buffer_head *in_bh = NULL, *out_bh = NULL;
> >+	loff_t bs = 1 << OCFS2_SB(inode_in->i_sb)->s_clustersize_bits;
> >+	bool same_inode = (inode_in == inode_out);
> >+	bool is_same = false;
> >+	loff_t isize;
> >+	ssize_t ret;
> >+	loff_t blen;
> >+
> >+	if (!ocfs2_refcount_tree(osb))
> >+		return -EOPNOTSUPP;
> >+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
> >+		return -EROFS;
> >+
> >+	/* Lock both files against IO */
> >+	ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
> >+	if (ret)
> >+		return ret;
> >+
> >+	ret = -EINVAL;
> >+	if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
> >+	    (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
> >+		goto out_unlock;
> >+
> >+	/* Don't touch certain kinds of inodes */
> >+	ret = -EPERM;
> >+	if (IS_IMMUTABLE(inode_out))
> >+		goto out_unlock;
> >+
> >+	ret = -ETXTBSY;
> >+	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
> >+		goto out_unlock;
> >+
> >+	/* Don't reflink dirs, pipes, sockets... */
> >+	ret = -EISDIR;
> >+	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> >+		goto out_unlock;
> >+	ret = -EINVAL;
> >+	if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
> >+		goto out_unlock;
> >+	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> >+		goto out_unlock;
> >+
> >+	/* Are we going all the way to the end? */
> >+	isize = i_size_read(inode_in);
> >+	if (isize == 0) {
> >+		ret = 0;
> >+		goto out_unlock;
> >+	}
> >+
> >+	if (len == 0)
> >+		len = isize - pos_in;
> >+
> >+	/* Ensure offsets don't wrap and the input is inside i_size */
> >+	if (pos_in + len < pos_in || pos_out + len < pos_out ||
> >+	    pos_in + len > isize)
> >+		goto out_unlock;
> >+
> >+	/* Don't allow dedupe past EOF in the dest file */
> >+	if (is_dedupe) {
> >+		loff_t	disize;
> >+
> >+		disize = i_size_read(inode_out);
> >+		if (pos_out >= disize || pos_out + len > disize)
> >+			goto out_unlock;
> >+	}
> >+
> >+	/* If we're linking to EOF, continue to the block boundary. */
> >+	if (pos_in + len == isize)
> >+		blen = ALIGN(isize, bs) - pos_in;
> >+	else
> >+		blen = len;
> >+
> >+	/* Only reflink if we're aligned to block boundaries */
> >+	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
> >+	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
> >+		goto out_unlock;
> >+
> >+	/* Don't allow overlapped reflink within the same file */
> >+	if (same_inode) {
> >+		if (pos_out + blen > pos_in && pos_out < pos_in + blen)
> >+			goto out_unlock;
> >+	}
> >+
> >+	/* Wait for the completion of any pending IOs on both files */
> >+	inode_dio_wait(inode_in);
> >+	if (!same_inode)
> >+		inode_dio_wait(inode_out);
> >+
> >+	ret = filemap_write_and_wait_range(inode_in->i_mapping,
> >+			pos_in, pos_in + len - 1);
> >+	if (ret)
> >+		goto out_unlock;
> >+
> >+	ret = filemap_write_and_wait_range(inode_out->i_mapping,
> >+			pos_out, pos_out + len - 1);
> >+	if (ret)
> >+		goto out_unlock;
> >+
> >+	/*
> >+	 * Check that the extents are the same.
> >+	 */
> >+	if (is_dedupe) {
> >+		ret = ocfs2_reflink_compare_extents(inode_in, pos_in,
> >+						    inode_out, pos_out,
> >+						    len, &is_same);
> >+		if (ret)
> >+			goto out_unlock;
> >+		if (!is_same) {
> >+			ret = -EBADE;
> >+			goto out_unlock;
> >+		}
> >+	}
> >+
> >+	/* Lock out changes to the allocation maps */
> >+	down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
> >+	if (!same_inode)
> >+		down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
> >+				  SINGLE_DEPTH_NESTING);
> >+
> >+	/*
> >+	 * Invalidate the page cache so that we can clear any CoW mappings
> >+	 * in the destination file.
> >+	 */
> >+	truncate_inode_pages_range(&inode_out->i_data, pos_out,
> >+				   PAGE_ALIGN(pos_out + len) - 1);
> >+
> >+	ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
> >+					 out_bh, pos_out, len);
> >+
> >+	up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
> >+	if (!same_inode)
> >+		up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
> >+	if (ret) {
> >+		mlog_errno(ret);
> >+		goto out_unlock;
> >+	}
> >+
> >+	/*
> >+	 * Empty the extent map so that we may get the right extent
> >+	 * record from the disk.
> >+	 */
> >+	ocfs2_extent_map_trunc(inode_in, 0);
> >+	ocfs2_extent_map_trunc(inode_out, 0);
> >+
> >+	ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
> >+	if (ret) {
> >+		mlog_errno(ret);
> >+		goto out_unlock;
> >+	}
> >+
> >+	ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
> >+	return 0;
> >+
> >+out_unlock:
> >+	ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
> >+	return ret;
> >+}
> >diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
> >index 553edfb..c023e88 100644
> >--- a/fs/ocfs2/refcounttree.h
> >+++ b/fs/ocfs2/refcounttree.h
> >@@ -117,4 +117,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
> >  			const char __user *oldname,
> >  			const char __user *newname,
> >  			bool preserve);
> >+int ocfs2_reflink_remap_range(struct file *file_in,
> >+			      loff_t pos_in,
> >+			      struct file *file_out,
> >+			      loff_t pos_out,
> >+			      u64 len,
> >+			      bool is_dedupe);
> >+
> >  #endif /* OCFS2_REFCOUNTTREE_H */
> >
> >
> >_______________________________________________
> >Ocfs2-devel mailing list
> >Ocfs2-devel@oss.oracle.com
> >https://oss.oracle.com/mailman/listinfo/ocfs2-devel
> >
>
Zhen Ren Nov. 11, 2016, 6:45 a.m. UTC | #3
On 11/11/2016 02:20 PM, Darrick J. Wong wrote:
> On Fri, Nov 11, 2016 at 01:49:48PM +0800, Eric Ren wrote:
>> Hi,
>>
>> A few issues obvious to me:
>>
>> On 11/10/2016 06:51 AM, Darrick J. Wong wrote:
>>> Connect the new VFS clone_range, copy_range, and dedupe_range features
>>> to the existing reflink capability of ocfs2.  Compared to the existing
>>> ocfs2 reflink ioctl We have to do things a little differently to support
>>> the VFS semantics (we can clone subranges of a file but we don't clone
>>> xattrs), but the VFS ioctls are more broadly supported.
>> How can I test the new ocfs2 reflink (with this patch) manually? What
>> commands should I use to do xxx_range things?
> See the 'reflink', 'dedupe', and 'copy_range' commands in xfs_io.
>
> The first two were added in xfsprogs 4.3, and copy_range in 4.7.

OK, thanks. I think you are missing the following two inline comments:

>>> +	spin_lock(&OCFS2_I(dest)->ip_lock);
>>> +	if (newlen > i_size_read(dest)) {
>>> +		i_size_write(dest, newlen);
>>> +		di->i_size = newlen;
>> di->i_size = cpu_to_le64(newlen);
>>
>>> +	}
>>> +	spin_unlock(&OCFS2_I(dest)->ip_lock);
>>> +
>> Add ocfs2_update_inode_fsync_trans() here? Looks this function was
>> introduced by you to improve efficiency.
>> Just want to awake your memory about this, though I don't know about the
>> details why it should be.
>>
>> Eric
Thanks,
Eric
Darrick J. Wong Nov. 11, 2016, 9:01 a.m. UTC | #4
On Fri, Nov 11, 2016 at 02:45:54PM +0800, Eric Ren wrote:
> On 11/11/2016 02:20 PM, Darrick J. Wong wrote:
> >On Fri, Nov 11, 2016 at 01:49:48PM +0800, Eric Ren wrote:
> >>Hi,
> >>
> >>A few issues obvious to me:
> >>
> >>On 11/10/2016 06:51 AM, Darrick J. Wong wrote:
> >>>Connect the new VFS clone_range, copy_range, and dedupe_range features
> >>>to the existing reflink capability of ocfs2.  Compared to the existing
> >>>ocfs2 reflink ioctl We have to do things a little differently to support
> >>>the VFS semantics (we can clone subranges of a file but we don't clone
> >>>xattrs), but the VFS ioctls are more broadly supported.
> >>How can I test the new ocfs2 reflink (with this patch) manually? What
> >>commands should I use to do xxx_range things?
> >See the 'reflink', 'dedupe', and 'copy_range' commands in xfs_io.
> >
> >The first two were added in xfsprogs 4.3, and copy_range in 4.7.
> 
> OK, thanks. I think you are missing the following two inline comments:
> 
> >>>+	spin_lock(&OCFS2_I(dest)->ip_lock);
> >>>+	if (newlen > i_size_read(dest)) {
> >>>+		i_size_write(dest, newlen);
> >>>+		di->i_size = newlen;
> >>di->i_size = cpu_to_le64(newlen);

Good catch!

> >>>+	}
> >>>+	spin_unlock(&OCFS2_I(dest)->ip_lock);
> >>>+
> >>Add ocfs2_update_inode_fsync_trans() here? Looks this function was
> >>introduced by you to improve efficiency.
> >>Just want to awake your memory about this, though I don't know about the
> >>details why it should be.

D'oh!  Yes, I did miss that.

The function updates the destination inode's information.  Specifically,
it updates i_size if we reflinked blocks into the file past EOF.
Looking at it some more, I also need to update i_blocks or the stat(2) info
will be wrong, and I also need to convert inline data to extents prior
to reflinking.

--D

> >>
> >>Eric
> Thanks,
> Eric
diff mbox

Patch

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 000c234..d5a022d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1667,9 +1667,9 @@  static void ocfs2_calc_trunc_pos(struct inode *inode,
 	*done = ret;
 }
 
-static int ocfs2_remove_inode_range(struct inode *inode,
-				    struct buffer_head *di_bh, u64 byte_start,
-				    u64 byte_len)
+int ocfs2_remove_inode_range(struct inode *inode,
+			     struct buffer_head *di_bh, u64 byte_start,
+			     u64 byte_len)
 {
 	int ret = 0, flags = 0, done = 0, i;
 	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
@@ -2440,6 +2440,56 @@  static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
 	return offset;
 }
 
+static ssize_t ocfs2_file_copy_range(struct file *file_in,
+				     loff_t pos_in,
+				     struct file *file_out,
+				     loff_t pos_out,
+				     size_t len,
+				     unsigned int flags)
+{
+	int error;
+
+	error = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+					  len, false);
+	if (error)
+		return error;
+	return len;
+}
+
+static int ocfs2_file_clone_range(struct file *file_in,
+				  loff_t pos_in,
+				  struct file *file_out,
+				  loff_t pos_out,
+				  u64 len)
+{
+	return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+					 len, false);
+}
+
+#define OCFS2_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
+static ssize_t ocfs2_file_dedupe_range(struct file *src_file,
+				       u64 loff,
+				       u64 len,
+				       struct file *dst_file,
+				       u64 dst_loff)
+{
+	int error;
+
+	/*
+	 * Limit the total length we will dedupe for each operation.
+	 * This is intended to bound the total time spent in this
+	 * ioctl to something sane.
+	 */
+	if (len > OCFS2_MAX_DEDUPE_LEN)
+		len = OCFS2_MAX_DEDUPE_LEN;
+
+	error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff,
+					  len, true);
+	if (error)
+		return error;
+	return len;
+}
+
 const struct inode_operations ocfs2_file_iops = {
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
@@ -2479,6 +2529,9 @@  const struct file_operations ocfs2_fops = {
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= ocfs2_fallocate,
+	.copy_file_range = ocfs2_file_copy_range,
+	.clone_file_range = ocfs2_file_clone_range,
+	.dedupe_file_range = ocfs2_file_dedupe_range,
 };
 
 const struct file_operations ocfs2_dops = {
@@ -2524,6 +2577,9 @@  const struct file_operations ocfs2_fops_no_plocks = {
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= ocfs2_fallocate,
+	.copy_file_range = ocfs2_file_copy_range,
+	.clone_file_range = ocfs2_file_clone_range,
+	.dedupe_file_range = ocfs2_file_dedupe_range,
 };
 
 const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e8c62f2..897fd9a 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -82,4 +82,7 @@  int ocfs2_change_file_space(struct file *file, unsigned int cmd,
 
 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
 				   size_t count);
+int ocfs2_remove_inode_range(struct inode *inode,
+			     struct buffer_head *di_bh, u64 byte_start,
+			     u64 byte_len);
 #endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d92b6c6..3e2198c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -34,6 +34,7 @@ 
 #include "xattr.h"
 #include "namei.h"
 #include "ocfs2_trace.h"
+#include "file.h"
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
@@ -4447,3 +4448,621 @@  int ocfs2_reflink_ioctl(struct inode *inode,
 
 	return error;
 }
+
+/* Raise dest's in-core and on-disk i_size to newlen if newlen is larger. */
+static int ocfs2_reflink_update_dest(struct inode *dest,
+				     struct buffer_head *d_bh,
+				     loff_t newlen)
+{
+	handle_t *handle;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)d_bh->b_data;
+	int ret;
+
+	if (newlen <= i_size_read(dest))
+		return 0;
+
+	handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(dest), d_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	spin_lock(&OCFS2_I(dest)->ip_lock);
+	if (newlen > i_size_read(dest)) {
+		i_size_write(dest, newlen);
+		di->i_size = cpu_to_le64(newlen);
+	}
+	spin_unlock(&OCFS2_I(dest)->ip_lock);
+
+	ocfs2_journal_dirty(handle, d_bh);
+
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
+	return ret;
+}
+
+/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
+static int ocfs2_reflink_remap_extent(struct inode *s_inode,
+				      struct buffer_head *s_bh,
+				      loff_t pos_in,
+				      struct inode *t_inode,
+				      struct buffer_head *t_bh,
+				      loff_t pos_out,
+				      loff_t len,
+				      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	struct ocfs2_extent_tree s_et;
+	struct ocfs2_extent_tree t_et;
+	struct ocfs2_dinode *dis;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_tree *ref_tree;
+	struct ocfs2_super *osb;
+	loff_t pstart, plen;
+	u32 p_cluster, num_clusters, slast, spos, tpos;
+	unsigned int ext_flags;
+	int ret = 0;
+
+	osb = OCFS2_SB(s_inode->i_sb);
+	dis = (struct ocfs2_dinode *)s_bh->b_data;
+	ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
+	ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
+
+	spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
+	tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
+	slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
+
+	while (spos < slast) {
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			goto out;
+		}
+
+		/* Look up the extent. */
+		ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
+					 &num_clusters, &ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		num_clusters = min_t(u32, num_clusters, slast - spos);
+
+		/* Punch out the dest range. */
+		pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
+		plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
+		ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (p_cluster == 0)
+			goto next_loop;
+
+		/* Lock the refcount btree... */
+		ret = ocfs2_lock_refcount_tree(osb,
+					       le64_to_cpu(dis->i_refcount_loc),
+					       1, &ref_tree, &ref_root_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* Mark s_inode's extent as refcounted. */
+		if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
+			ret = ocfs2_add_refcount_flag(s_inode, &s_et,
+						      &ref_tree->rf_ci,
+						      ref_root_bh, spos,
+						      p_cluster, num_clusters,
+						      dealloc, NULL);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_unlock_refcount;
+			}
+		}
+
+		/* Map in the new extent. */
+		ext_flags |= OCFS2_EXT_REFCOUNTED;
+		ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
+						  &ref_tree->rf_ci,
+						  ref_root_bh,
+						  tpos, p_cluster,
+						  num_clusters,
+						  ext_flags,
+						  dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_unlock_refcount;
+		}
+
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+		brelse(ref_root_bh);
+next_loop:
+		spos += num_clusters;
+		tpos += num_clusters;
+	}
+
+out:
+	return ret;
+out_unlock_refcount:
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
+	return ret;
+}
+
+/* Set up refcount tree and remap s_inode to t_inode. */
+static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
+				      struct buffer_head *s_bh,
+				      loff_t pos_in,
+				      struct inode *t_inode,
+				      struct buffer_head *t_bh,
+				      loff_t pos_out,
+				      loff_t len)
+{
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_super *osb;
+	struct ocfs2_dinode *dis;
+	struct ocfs2_dinode *dit;
+	int ret;
+
+	osb = OCFS2_SB(s_inode->i_sb);
+	dis = (struct ocfs2_dinode *)s_bh->b_data;
+	dit = (struct ocfs2_dinode *)t_bh->b_data;
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	/*
+	 * If both inodes belong to two different refcount groups then
+	 * forget it because we don't know how (or want) to go merging
+	 * refcount trees.
+	 */
+	ret = -EOPNOTSUPP;
+	if (ocfs2_is_refcount_inode(s_inode) &&
+	    ocfs2_is_refcount_inode(t_inode) &&
+	    le64_to_cpu(dis->i_refcount_loc) !=
+	    le64_to_cpu(dit->i_refcount_loc))
+		goto out;
+
+	/* Neither inode has a refcount tree.  Add one to s_inode. */
+	if (!ocfs2_is_refcount_inode(s_inode) &&
+	    !ocfs2_is_refcount_inode(t_inode)) {
+		ret = ocfs2_create_refcount_tree(s_inode, s_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/* Ensure that both inodes end up with the same refcount tree. */
+	if (!ocfs2_is_refcount_inode(s_inode)) {
+		ret = ocfs2_set_refcount_tree(s_inode, s_bh,
+					      le64_to_cpu(dit->i_refcount_loc));
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+	if (!ocfs2_is_refcount_inode(t_inode)) {
+		ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+					      le64_to_cpu(dis->i_refcount_loc));
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * If we're reflinking the entire file and the source is inline
+	 * data, just copy the contents.
+	 */
+	if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
+	    i_size_read(t_inode) <= len &&
+	    (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
+		ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
+					 pos_out, len, &dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	if (ocfs2_dealloc_has_cluster(&dealloc)) {
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &dealloc);
+	}
+
+	return ret;
+}
+
+/* Lock both inodes; *bh1 gets s_inode's dinode bh and *bh2 gets t_inode's. */
+static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+				     struct buffer_head **bh1,
+				     struct inode *t_inode,
+				     struct buffer_head **bh2)
+{
+	struct inode *inode1 = s_inode;
+	struct inode *inode2 = t_inode;
+	struct ocfs2_inode_info *oi1;
+	struct ocfs2_inode_info *oi2;
+	bool same_inode = (s_inode == t_inode);
+	int status;
+
+	/* Lock in i_ino order; keep each bh slot paired with its inode. */
+	if (inode1->i_ino > inode2->i_ino) {
+		swap(inode1, inode2);
+		swap(bh1, bh2);
+	}
+
+	inode_lock(inode1);
+	status = ocfs2_rw_lock(inode1, 1);
+	if (status) {
+		mlog_errno(status);
+		goto out_i1;
+	}
+	if (!same_inode) {
+		inode_lock_nested(inode2, I_MUTEX_CHILD);
+		status = ocfs2_rw_lock(inode2, 1);
+		if (status) {
+			mlog_errno(status);
+			goto out_i2;
+		}
+	}
+
+	/* Now go for the cluster locks */
+	oi1 = OCFS2_I(inode1);
+	oi2 = OCFS2_I(inode2);
+
+	trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+				(unsigned long long)oi2->ip_blkno);
+
+	*bh1 = NULL;
+	*bh2 = NULL;
+
+	/* We always want to lock the one with the lower lockid first. */
+	if (oi1->ip_blkno > oi2->ip_blkno)
+		mlog_errno(-ENOLCK);
+
+	/* lock id1 */
+	status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto out_rw2;
+	}
+
+	/* lock id2 */
+	if (!same_inode) {
+		status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+						 OI_LS_REFLINK_TARGET);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto out_cl1;
+		}
+	} else
+		*bh2 = *bh1;
+
+	trace_ocfs2_double_lock_end(
+			(unsigned long long)OCFS2_I(inode1)->ip_blkno,
+			(unsigned long long)OCFS2_I(inode2)->ip_blkno);
+
+	return 0;
+
+out_cl1:
+	ocfs2_inode_unlock(inode1, 1);
+	brelse(*bh1);
+	*bh1 = NULL;
+out_rw2:
+	if (!same_inode)
+		ocfs2_rw_unlock(inode2, 1);
+out_i2:
+	if (!same_inode)
+		inode_unlock(inode2);
+	ocfs2_rw_unlock(inode1, 1);
+out_i1:
+	inode_unlock(inode1);
+	return status;
+}
+
+/* Unlock both inodes and release buffers. */
+static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+					struct buffer_head *s_bh,
+					struct inode *t_inode,
+					struct buffer_head *t_bh)
+{
+	ocfs2_inode_unlock(s_inode, 1);
+	ocfs2_rw_unlock(s_inode, 1);
+	inode_unlock(s_inode);
+	brelse(s_bh);
+
+	if (s_inode == t_inode)
+		return;
+
+	ocfs2_inode_unlock(t_inode, 1);
+	ocfs2_rw_unlock(t_inode, 1);
+	inode_unlock(t_inode);
+	brelse(t_bh);
+}
+
+/*
+ * Read a page's worth of file data into the page cache.  Return the page
+ * locked.
+ */
+static struct page *ocfs2_reflink_get_page(struct inode *inode,
+					   loff_t offset)
+{
+	struct address_space *mapping;
+	struct page *page;
+	pgoff_t n;
+
+	n = offset >> PAGE_SHIFT;
+	mapping = inode->i_mapping;
+	page = read_mapping_page(mapping, n, NULL);
+	if (IS_ERR(page))
+		return page;
+	if (!PageUptodate(page)) {
+		put_page(page);
+		return ERR_PTR(-EIO);
+	}
+	lock_page(page);
+	return page;
+}
+
+/*
+ * Compare len bytes page by page; 0 and *is_same on success, else -errno.
+ */
+static int ocfs2_reflink_compare_extents(struct inode *src,
+					 loff_t srcoff,
+					 struct inode *dest,
+					 loff_t destoff,
+					 loff_t len,
+					 bool *is_same)
+{
+	loff_t src_poff;
+	loff_t dest_poff;
+	void *src_addr;
+	void *dest_addr;
+	struct page *src_page;
+	struct page *dest_page;
+	loff_t cmp_len;
+	bool same;
+	int error;
+
+	error = -EINVAL;
+	same = true;
+	while (len) {
+		src_poff = srcoff & (PAGE_SIZE - 1);
+		dest_poff = destoff & (PAGE_SIZE - 1);
+		cmp_len = min(PAGE_SIZE - src_poff,
+			      PAGE_SIZE - dest_poff);
+		cmp_len = min(cmp_len, len);
+		if (cmp_len <= 0) {
+			mlog_errno(-EUCLEAN);
+			goto out_error;
+		}
+
+		src_page = ocfs2_reflink_get_page(src, srcoff);
+		if (IS_ERR(src_page)) {
+			error = PTR_ERR(src_page);
+			goto out_error;
+		}
+		dest_page = ocfs2_reflink_get_page(dest, destoff);
+		if (IS_ERR(dest_page)) {
+			error = PTR_ERR(dest_page);
+			unlock_page(src_page);
+			put_page(src_page);
+			goto out_error;
+		}
+		src_addr = kmap_atomic(src_page);
+		dest_addr = kmap_atomic(dest_page);
+
+		flush_dcache_page(src_page);
+		flush_dcache_page(dest_page);
+
+		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+			same = false;
+
+		kunmap_atomic(dest_addr);
+		kunmap_atomic(src_addr);
+		unlock_page(dest_page);
+		unlock_page(src_page);
+		put_page(dest_page);
+		put_page(src_page);
+
+		if (!same)
+			break;
+
+		srcoff += cmp_len;
+		destoff += cmp_len;
+		len -= cmp_len;
+	}
+
+	*is_same = same;
+	return 0;
+
+out_error:
+	return error;
+}
+
+/*
+ * Link a range of blocks from one file to another.
+ *
+ * @file_in, @pos_in:   source file and starting byte offset
+ * @file_out, @pos_out: destination file and starting byte offset
+ * @len:                number of bytes to link; 0 means "from pos_in to the
+ *                      end of the source file"
+ * @is_dedupe:          if true, the destination range must lie inside the
+ *                      destination's i_size and its contents must already
+ *                      match the source range
+ *
+ * Both inodes are locked with ocfs2_reflink_inodes_lock() for the whole
+ * operation and released with ocfs2_reflink_inodes_unlock() on every exit
+ * path after that.
+ *
+ * Returns 0 on success (including an empty source file, where nothing is
+ * done) or a negative errno:
+ *   -EOPNOTSUPP  ocfs2_refcount_tree(osb) is false
+ *   -EROFS       the filesystem is hard or soft read-only
+ *   -EINVAL      a system file, FIFO or other non-regular file is involved;
+ *                the offsets wrap; the source range ends past i_size; a
+ *                dedupe destination range ends past its i_size; the ranges
+ *                are not aligned to the cluster size; or the ranges overlap
+ *                within one file
+ *   -EPERM       the destination inode is immutable
+ *   -ETXTBSY     either inode is a swap file
+ *   -EISDIR      either inode is a directory
+ *   -EBADE       dedupe was requested and the contents differ
+ * plus whatever the locking, writeback, compare, remap and update helpers
+ * return.
+ */
+int ocfs2_reflink_remap_range(struct file *file_in,
+			      loff_t pos_in,
+			      struct file *file_out,
+			      loff_t pos_out,
+			      u64 len,
+			      bool is_dedupe)
+{
+	struct inode *inode_in = file_inode(file_in);
+	struct inode *inode_out = file_inode(file_out);
+	struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
+	struct buffer_head *in_bh = NULL, *out_bh = NULL;
+	loff_t bs = 1 << OCFS2_SB(inode_in->i_sb)->s_clustersize_bits;
+	bool same_inode = (inode_in == inode_out);
+	bool is_same = false;
+	loff_t isize;
+	ssize_t ret;
+	loff_t blen;
+
+	if (!ocfs2_refcount_tree(osb))
+		return -EOPNOTSUPP;
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
+	/*
+	 * Lock both files against IO.  From here on every exit must go
+	 * through ocfs2_reflink_inodes_unlock().
+	 */
+	ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+	if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
+	    (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
+		goto out_unlock;
+
+	/* Don't touch certain kinds of inodes */
+	ret = -EPERM;
+	if (IS_IMMUTABLE(inode_out))
+		goto out_unlock;
+
+	ret = -ETXTBSY;
+	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+		goto out_unlock;
+
+	/*
+	 * Don't reflink dirs, pipes, sockets...
+	 *
+	 * The -EINVAL set after the directory check stays in ret for the
+	 * FIFO and regular-file checks and for all of the offset, size,
+	 * alignment and overlap checks further down.
+	 */
+	ret = -EISDIR;
+	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+		goto out_unlock;
+	ret = -EINVAL;
+	if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
+		goto out_unlock;
+	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+		goto out_unlock;
+
+	/*
+	 * Are we going all the way to the end?
+	 *
+	 * An empty source file leaves nothing to link, so that case returns
+	 * success right away.  A len of zero is replaced by the distance
+	 * from pos_in to the end of the source file.
+	 */
+	isize = i_size_read(inode_in);
+	if (isize == 0) {
+		ret = 0;
+		goto out_unlock;
+	}
+
+	if (len == 0)
+		len = isize - pos_in;
+
+	/* Ensure offsets don't wrap and the input is inside i_size */
+	if (pos_in + len < pos_in || pos_out + len < pos_out ||
+	    pos_in + len > isize)
+		goto out_unlock;
+
+	/* Don't allow dedupe past EOF in the dest file */
+	if (is_dedupe) {
+		loff_t	disize;
+
+		disize = i_size_read(inode_out);
+		if (pos_out >= disize || pos_out + len > disize)
+			goto out_unlock;
+	}
+
+	/*
+	 * If we're linking to EOF, continue to the block boundary.
+	 *
+	 * blen is only used for the alignment and overlap checks below; the
+	 * flush, compare, truncate, remap and size update all use len.
+	 */
+	if (pos_in + len == isize)
+		blen = ALIGN(isize, bs) - pos_in;
+	else
+		blen = len;
+
+	/*
+	 * Only reflink if we're aligned to block boundaries.  The unit, bs,
+	 * is derived from s_clustersize_bits.
+	 */
+	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+		goto out_unlock;
+
+	/* Don't allow overlapped reflink within the same file */
+	if (same_inode) {
+		if (pos_out + blen > pos_in && pos_out < pos_in + blen)
+			goto out_unlock;
+	}
+
+	/*
+	 * Wait for the completion of any pending IOs on both files, then
+	 * write back and wait on the page cache for both byte ranges.
+	 */
+	inode_dio_wait(inode_in);
+	if (!same_inode)
+		inode_dio_wait(inode_out);
+
+	ret = filemap_write_and_wait_range(inode_in->i_mapping,
+			pos_in, pos_in + len - 1);
+	if (ret)
+		goto out_unlock;
+
+	ret = filemap_write_and_wait_range(inode_out->i_mapping,
+			pos_out, pos_out + len - 1);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * Check that the extents are the same.  This is only done for
+	 * dedupe; a mismatch is reported as -EBADE.
+	 */
+	if (is_dedupe) {
+		ret = ocfs2_reflink_compare_extents(inode_in, pos_in,
+						    inode_out, pos_out,
+						    len, &is_same);
+		if (ret)
+			goto out_unlock;
+		if (!is_same) {
+			ret = -EBADE;
+			goto out_unlock;
+		}
+	}
+
+	/*
+	 * Lock out changes to the allocation maps.  The source's
+	 * ip_alloc_sem is taken first; the destination's is taken nested,
+	 * and only when it is a different inode.
+	 */
+	down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+	if (!same_inode)
+		down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
+				  SINGLE_DEPTH_NESTING);
+
+	/*
+	 * Invalidate the page cache so that we can clear any CoW mappings
+	 * in the destination file.
+	 */
+	truncate_inode_pages_range(&inode_out->i_data, pos_out,
+				   PAGE_ALIGN(pos_out + len) - 1);
+
+	ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
+					 out_bh, pos_out, len);
+
+	up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+	if (!same_inode)
+		up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	/*
+	 * Empty the extent map so that we may get the right extent
+	 * record from the disk.
+	 */
+	ocfs2_extent_map_trunc(inode_in, 0);
+	ocfs2_extent_map_trunc(inode_out, 0);
+
+	ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+	return 0;
+
+out_unlock:
+	ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+	return ret;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 553edfb..c023e88 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -117,4 +117,11 @@  int ocfs2_reflink_ioctl(struct inode *inode,
 			const char __user *oldname,
 			const char __user *newname,
 			bool preserve);
+int ocfs2_reflink_remap_range(struct file *file_in,
+			      loff_t pos_in,
+			      struct file *file_out,
+			      loff_t pos_out,
+			      u64 len,
+			      bool is_dedupe);
+
 #endif /* OCFS2_REFCOUNTTREE_H */