[02/15] xfs: refactor clonerange preparation into a separate helper
diff mbox series

Message ID 153870028762.29072.5369530877410002226.stgit@magnolia
State New
Headers show
Series
  • fs: fixes for serious clone/dedupe problems
Related show

Commit Message

Darrick J. Wong Oct. 5, 2018, 12:44 a.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Refactor all the reflink preparation steps into a separate helper that
we'll use to land all the upcoming fixes for insufficient input checks.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_reflink.c |   96 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 71 insertions(+), 25 deletions(-)

Comments

Dave Chinner Oct. 5, 2018, 5:28 a.m. UTC | #1
On Thu, Oct 04, 2018 at 05:44:47PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Refactor all the reflink preparation steps into a separate helper that
> we'll use to land all the upcoming fixes for insufficient input checks.

If I've read the patch right, this also changes the location of the
page cache truncation, right?  i.e. it now happens in the
xfs_reflink_remap_prep() function instead of after the remap?  I
think the commit message needs to mention that because it's a fix to
incorrect behaviour....

I've added:

--
This rework also moves the invalidation of the destination range to
the prep function so that it is done before the range is remapped.
This ensures that nobody can access the data in range being remapped
until the remap is complete.
--

Sound OK?

Otherwise this looks fine.

Reviewed-by: Dave Chinner <dchinner@redhat.com>

-Dave.

> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_reflink.c |   96 +++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 71 insertions(+), 25 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 38f405415b88..80ca9b6793cd 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -1195,11 +1195,33 @@ xfs_iolock_two_inodes_and_break_layout(
>  	return 0;
>  }
>  
> +/* Unlock both inodes after they've been prepped for a range clone. */
> +STATIC void
> +xfs_reflink_remap_unlock(
> +	struct file		*file_in,
> +	struct file		*file_out)
> +{
> +	struct inode		*inode_in = file_inode(file_in);
> +	struct xfs_inode	*src = XFS_I(inode_in);
> +	struct inode		*inode_out = file_inode(file_out);
> +	struct xfs_inode	*dest = XFS_I(inode_out);
> +	bool			same_inode = (inode_in == inode_out);
> +
> +	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
> +	if (!same_inode)
> +		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
> +	inode_unlock(inode_out);
> +	if (!same_inode)
> +		inode_unlock_shared(inode_in);
> +}
> +
>  /*
> - * Link a range of blocks from one file to another.
> + * Prepare two files for range cloning.  Upon a successful return both inodes
> + * will have the iolock and mmaplock held, the page cache of the out file
> + * will be truncated, and any leases on the out file will have been broken.
>   */
> -int
> -xfs_reflink_remap_range(
> +STATIC int
> +xfs_reflink_remap_prep(
>  	struct file		*file_in,
>  	loff_t			pos_in,
>  	struct file		*file_out,
> @@ -1211,19 +1233,9 @@ xfs_reflink_remap_range(
>  	struct xfs_inode	*src = XFS_I(inode_in);
>  	struct inode		*inode_out = file_inode(file_out);
>  	struct xfs_inode	*dest = XFS_I(inode_out);
> -	struct xfs_mount	*mp = src->i_mount;
>  	bool			same_inode = (inode_in == inode_out);
> -	xfs_fileoff_t		sfsbno, dfsbno;
> -	xfs_filblks_t		fsblen;
> -	xfs_extlen_t		cowextsize;
>  	ssize_t			ret;
>  
> -	if (!xfs_sb_version_hasreflink(&mp->m_sb))
> -		return -EOPNOTSUPP;
> -
> -	if (XFS_FORCED_SHUTDOWN(mp))
> -		return -EIO;
> -
>  	/* Lock both files against IO */
>  	ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
>  	if (ret)
> @@ -1254,8 +1266,6 @@ xfs_reflink_remap_range(
>  	if (ret)
>  		goto out_unlock;
>  
> -	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
> -
>  	/*
>  	 * Clear out post-eof preallocations because we don't have page cache
>  	 * backing the delayed allocations and they'll never get freed on
> @@ -1272,6 +1282,51 @@ xfs_reflink_remap_range(
>  	if (ret)
>  		goto out_unlock;
>  
> +	/* Zap any page cache for the destination file's range. */
> +	truncate_inode_pages_range(&inode_out->i_data, pos_out,
> +				   PAGE_ALIGN(pos_out + len) - 1);
> +	return 0;
> +out_unlock:
> +	xfs_reflink_remap_unlock(file_in, file_out);
> +	return ret;
> +}
> +
> +/*
> + * Link a range of blocks from one file to another.
> + */
> +int
> +xfs_reflink_remap_range(
> +	struct file		*file_in,
> +	loff_t			pos_in,
> +	struct file		*file_out,
> +	loff_t			pos_out,
> +	u64			len,
> +	bool			is_dedupe)
> +{
> +	struct inode		*inode_in = file_inode(file_in);
> +	struct xfs_inode	*src = XFS_I(inode_in);
> +	struct inode		*inode_out = file_inode(file_out);
> +	struct xfs_inode	*dest = XFS_I(inode_out);
> +	struct xfs_mount	*mp = src->i_mount;
> +	xfs_fileoff_t		sfsbno, dfsbno;
> +	xfs_filblks_t		fsblen;
> +	xfs_extlen_t		cowextsize;
> +	ssize_t			ret;
> +
> +	if (!xfs_sb_version_hasreflink(&mp->m_sb))
> +		return -EOPNOTSUPP;
> +
> +	if (XFS_FORCED_SHUTDOWN(mp))
> +		return -EIO;
> +
> +	/* Prepare and then clone file data. */
> +	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
> +			len, is_dedupe);
> +	if (ret)
> +		return ret;
> +
> +	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
> +
>  	dfsbno = XFS_B_TO_FSBT(mp, pos_out);
>  	sfsbno = XFS_B_TO_FSBT(mp, pos_in);
>  	fsblen = XFS_B_TO_FSB(mp, len);
> @@ -1280,10 +1335,6 @@ xfs_reflink_remap_range(
>  	if (ret)
>  		goto out_unlock;
>  
> -	/* Zap any page cache for the destination file's range. */
> -	truncate_inode_pages_range(&inode_out->i_data, pos_out,
> -				   PAGE_ALIGN(pos_out + len) - 1);
> -
>  	/*
>  	 * Carry the cowextsize hint from src to dest if we're sharing the
>  	 * entire source file to the entire destination file, the source file
> @@ -1300,12 +1351,7 @@ xfs_reflink_remap_range(
>  			is_dedupe);
>  
>  out_unlock:
> -	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
> -	if (!same_inode)
> -		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
> -	inode_unlock(inode_out);
> -	if (!same_inode)
> -		inode_unlock_shared(inode_in);
> +	xfs_reflink_remap_unlock(file_in, file_out);
>  	if (ret)
>  		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
>  	return ret;
> 
>
Dave Chinner Oct. 5, 2018, 7:02 a.m. UTC | #2
On Thu, Oct 04, 2018 at 05:44:47PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Refactor all the reflink preparation steps into a separate helper that
> we'll use to land all the upcoming fixes for insufficient input checks.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
.....

> +xfs_reflink_remap_range(
> +	struct file		*file_in,
> +	loff_t			pos_in,
> +	struct file		*file_out,
> +	loff_t			pos_out,
> +	u64			len,
> +	bool			is_dedupe)
> +{
> +	struct inode		*inode_in = file_inode(file_in);
> +	struct xfs_inode	*src = XFS_I(inode_in);
> +	struct inode		*inode_out = file_inode(file_out);
> +	struct xfs_inode	*dest = XFS_I(inode_out);
> +	struct xfs_mount	*mp = src->i_mount;
> +	xfs_fileoff_t		sfsbno, dfsbno;
> +	xfs_filblks_t		fsblen;
> +	xfs_extlen_t		cowextsize;
> +	ssize_t			ret;
> +
> +	if (!xfs_sb_version_hasreflink(&mp->m_sb))
> +		return -EOPNOTSUPP;
> +
> +	if (XFS_FORCED_SHUTDOWN(mp))
> +		return -EIO;
> +
> +	/* Prepare and then clone file data. */
> +	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
> +			len, is_dedupe);
> +	if (ret)
> +		return ret;

generic/013 indicates there's a double unlock bug here.	

vfs_clone_file_prep_inodes() can return zero (do nothing, but don't
fail!), and when that happens xfs_reflink_remap_prep() unlocks the
inodes and returns 0.  This new code doesn't catch it, we do the
remap on unlocked inodes, and then trip lock debugging bugs 

> @@ -1300,12 +1351,7 @@ xfs_reflink_remap_range(
>  			is_dedupe);
>  
>  out_unlock:
> -	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
> -	if (!same_inode)
> -		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
> -	inode_unlock(inode_out);
> -	if (!same_inode)
> -		inode_unlock_shared(inode_in);
> +	xfs_reflink_remap_unlock(file_in, file_out);

here:

 DEBUG_LOCKS_WARN_ON(sem->owner != get_current())
 WARNING: CPU: 3 PID: 4766 at kernel/locking/rwsem.c:133 up_write+0x66/0x70
 CPU: 3 PID: 4766 Comm: fsstress Not tainted 4.19.0-rc6-dgc+ #671
....
 Call Trace:
  xfs_iunlock+0x152/0x220
  xfs_reflink_remap_unlock+0x22/0x70
  xfs_reflink_remap_range+0x129/0x2a0
  do_clone_file_range+0x119/0x200
  vfs_clone_file_range+0x35/0xa0
  ioctl_file_clone+0x8a/0xa0
  do_vfs_ioctl+0x2e1/0x6c0
  ksys_ioctl+0x70/0x80
  __x64_sys_ioctl+0x16/0x20
  do_syscall_64+0x5a/0x180
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

I'll fix it for the moment by making xfs_reflink_remap_prep() behave
like vfs_clone_file_prep_inodes() - it will return 1 on success,
0 for nothing to do and < 0 for an error and catch it in this code.

I note that later patches in the series change the
vfs_clone_file_prep_inodes() behaviour so this behaviour is probably
masked by those later changes. It's still a nasty bisect landmine,
though, so I'll fix it here.

Cheers,

Dave.
Dave Chinner Oct. 5, 2018, 9:02 a.m. UTC | #3
On Fri, Oct 05, 2018 at 05:02:28PM +1000, Dave Chinner wrote:
> On Thu, Oct 04, 2018 at 05:44:47PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Refactor all the reflink preparation steps into a separate helper that
> > we'll use to land all the upcoming fixes for insufficient input checks.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> .....
> 
> > +xfs_reflink_remap_range(
> > +	struct file		*file_in,
> > +	loff_t			pos_in,
> > +	struct file		*file_out,
> > +	loff_t			pos_out,
> > +	u64			len,
> > +	bool			is_dedupe)
> > +{
> > +	struct inode		*inode_in = file_inode(file_in);
> > +	struct xfs_inode	*src = XFS_I(inode_in);
> > +	struct inode		*inode_out = file_inode(file_out);
> > +	struct xfs_inode	*dest = XFS_I(inode_out);
> > +	struct xfs_mount	*mp = src->i_mount;
> > +	xfs_fileoff_t		sfsbno, dfsbno;
> > +	xfs_filblks_t		fsblen;
> > +	xfs_extlen_t		cowextsize;
> > +	ssize_t			ret;
> > +
> > +	if (!xfs_sb_version_hasreflink(&mp->m_sb))
> > +		return -EOPNOTSUPP;
> > +
> > +	if (XFS_FORCED_SHUTDOWN(mp))
> > +		return -EIO;
> > +
> > +	/* Prepare and then clone file data. */
> > +	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
> > +			len, is_dedupe);

More than one bug. vfs_clone_file_prep_inodes() modifes the length
parameter in the case of whole file reflink by way of "len == 0"
on a non-zero length file. So I fixed this, too.

-Dave.
Darrick J. Wong Oct. 5, 2018, 5:06 p.m. UTC | #4
On Fri, Oct 05, 2018 at 03:28:09PM +1000, Dave Chinner wrote:
> On Thu, Oct 04, 2018 at 05:44:47PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Refactor all the reflink preparation steps into a separate helper that
> > we'll use to land all the upcoming fixes for insufficient input checks.
> 
> If I've read the patch right, this also changes the location of the
> page cache truncation, right?  i.e. it now happens in the
> xfs_reflink_remap_prep() function instead of after the remap?  I
> think the commit message needs to mention that because it's a fix to
> incorrect behaviour....

Right.  Sorry I forgot to put that in the changelog.

> I've added:
> 
> --
> This rework also moves the invalidation of the destination range to
> the prep function so that it is done before the range is remapped.
> This ensures that nobody can access the data in range being remapped
> until the remap is complete.
> --
> 
> Sound OK?

Yep.

--D


> Otherwise this looks fine.
> 
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
> 
> -Dave.
> 
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_reflink.c |   96 +++++++++++++++++++++++++++++++++++++-------------
> >  1 file changed, 71 insertions(+), 25 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > index 38f405415b88..80ca9b6793cd 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> > @@ -1195,11 +1195,33 @@ xfs_iolock_two_inodes_and_break_layout(
> >  	return 0;
> >  }
> >  
> > +/* Unlock both inodes after they've been prepped for a range clone. */
> > +STATIC void
> > +xfs_reflink_remap_unlock(
> > +	struct file		*file_in,
> > +	struct file		*file_out)
> > +{
> > +	struct inode		*inode_in = file_inode(file_in);
> > +	struct xfs_inode	*src = XFS_I(inode_in);
> > +	struct inode		*inode_out = file_inode(file_out);
> > +	struct xfs_inode	*dest = XFS_I(inode_out);
> > +	bool			same_inode = (inode_in == inode_out);
> > +
> > +	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
> > +	if (!same_inode)
> > +		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
> > +	inode_unlock(inode_out);
> > +	if (!same_inode)
> > +		inode_unlock_shared(inode_in);
> > +}
> > +
> >  /*
> > - * Link a range of blocks from one file to another.
> > + * Prepare two files for range cloning.  Upon a successful return both inodes
> > + * will have the iolock and mmaplock held, the page cache of the out file
> > + * will be truncated, and any leases on the out file will have been broken.
> >   */
> > -int
> > -xfs_reflink_remap_range(
> > +STATIC int
> > +xfs_reflink_remap_prep(
> >  	struct file		*file_in,
> >  	loff_t			pos_in,
> >  	struct file		*file_out,
> > @@ -1211,19 +1233,9 @@ xfs_reflink_remap_range(
> >  	struct xfs_inode	*src = XFS_I(inode_in);
> >  	struct inode		*inode_out = file_inode(file_out);
> >  	struct xfs_inode	*dest = XFS_I(inode_out);
> > -	struct xfs_mount	*mp = src->i_mount;
> >  	bool			same_inode = (inode_in == inode_out);
> > -	xfs_fileoff_t		sfsbno, dfsbno;
> > -	xfs_filblks_t		fsblen;
> > -	xfs_extlen_t		cowextsize;
> >  	ssize_t			ret;
> >  
> > -	if (!xfs_sb_version_hasreflink(&mp->m_sb))
> > -		return -EOPNOTSUPP;
> > -
> > -	if (XFS_FORCED_SHUTDOWN(mp))
> > -		return -EIO;
> > -
> >  	/* Lock both files against IO */
> >  	ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
> >  	if (ret)
> > @@ -1254,8 +1266,6 @@ xfs_reflink_remap_range(
> >  	if (ret)
> >  		goto out_unlock;
> >  
> > -	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
> > -
> >  	/*
> >  	 * Clear out post-eof preallocations because we don't have page cache
> >  	 * backing the delayed allocations and they'll never get freed on
> > @@ -1272,6 +1282,51 @@ xfs_reflink_remap_range(
> >  	if (ret)
> >  		goto out_unlock;
> >  
> > +	/* Zap any page cache for the destination file's range. */
> > +	truncate_inode_pages_range(&inode_out->i_data, pos_out,
> > +				   PAGE_ALIGN(pos_out + len) - 1);
> > +	return 0;
> > +out_unlock:
> > +	xfs_reflink_remap_unlock(file_in, file_out);
> > +	return ret;
> > +}
> > +
> > +/*
> > + * Link a range of blocks from one file to another.
> > + */
> > +int
> > +xfs_reflink_remap_range(
> > +	struct file		*file_in,
> > +	loff_t			pos_in,
> > +	struct file		*file_out,
> > +	loff_t			pos_out,
> > +	u64			len,
> > +	bool			is_dedupe)
> > +{
> > +	struct inode		*inode_in = file_inode(file_in);
> > +	struct xfs_inode	*src = XFS_I(inode_in);
> > +	struct inode		*inode_out = file_inode(file_out);
> > +	struct xfs_inode	*dest = XFS_I(inode_out);
> > +	struct xfs_mount	*mp = src->i_mount;
> > +	xfs_fileoff_t		sfsbno, dfsbno;
> > +	xfs_filblks_t		fsblen;
> > +	xfs_extlen_t		cowextsize;
> > +	ssize_t			ret;
> > +
> > +	if (!xfs_sb_version_hasreflink(&mp->m_sb))
> > +		return -EOPNOTSUPP;
> > +
> > +	if (XFS_FORCED_SHUTDOWN(mp))
> > +		return -EIO;
> > +
> > +	/* Prepare and then clone file data. */
> > +	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
> > +			len, is_dedupe);
> > +	if (ret)
> > +		return ret;
> > +
> > +	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
> > +
> >  	dfsbno = XFS_B_TO_FSBT(mp, pos_out);
> >  	sfsbno = XFS_B_TO_FSBT(mp, pos_in);
> >  	fsblen = XFS_B_TO_FSB(mp, len);
> > @@ -1280,10 +1335,6 @@ xfs_reflink_remap_range(
> >  	if (ret)
> >  		goto out_unlock;
> >  
> > -	/* Zap any page cache for the destination file's range. */
> > -	truncate_inode_pages_range(&inode_out->i_data, pos_out,
> > -				   PAGE_ALIGN(pos_out + len) - 1);
> > -
> >  	/*
> >  	 * Carry the cowextsize hint from src to dest if we're sharing the
> >  	 * entire source file to the entire destination file, the source file
> > @@ -1300,12 +1351,7 @@ xfs_reflink_remap_range(
> >  			is_dedupe);
> >  
> >  out_unlock:
> > -	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
> > -	if (!same_inode)
> > -		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
> > -	inode_unlock(inode_out);
> > -	if (!same_inode)
> > -		inode_unlock_shared(inode_in);
> > +	xfs_reflink_remap_unlock(file_in, file_out);
> >  	if (ret)
> >  		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
> >  	return ret;
> > 
> > 
> 
> -- 
> Dave Chinner
> david@fromorbit.com
Darrick J. Wong Oct. 5, 2018, 5:21 p.m. UTC | #5
On Fri, Oct 05, 2018 at 07:02:42PM +1000, Dave Chinner wrote:
> On Fri, Oct 05, 2018 at 05:02:28PM +1000, Dave Chinner wrote:
> > On Thu, Oct 04, 2018 at 05:44:47PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Refactor all the reflink preparation steps into a separate helper that
> > > we'll use to land all the upcoming fixes for insufficient input checks.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > .....
> > 
> > > +xfs_reflink_remap_range(
> > > +	struct file		*file_in,
> > > +	loff_t			pos_in,
> > > +	struct file		*file_out,
> > > +	loff_t			pos_out,
> > > +	u64			len,
> > > +	bool			is_dedupe)
> > > +{
> > > +	struct inode		*inode_in = file_inode(file_in);
> > > +	struct xfs_inode	*src = XFS_I(inode_in);
> > > +	struct inode		*inode_out = file_inode(file_out);
> > > +	struct xfs_inode	*dest = XFS_I(inode_out);
> > > +	struct xfs_mount	*mp = src->i_mount;
> > > +	xfs_fileoff_t		sfsbno, dfsbno;
> > > +	xfs_filblks_t		fsblen;
> > > +	xfs_extlen_t		cowextsize;
> > > +	ssize_t			ret;
> > > +
> > > +	if (!xfs_sb_version_hasreflink(&mp->m_sb))
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	if (XFS_FORCED_SHUTDOWN(mp))
> > > +		return -EIO;
> > > +
> > > +	/* Prepare and then clone file data. */
> > > +	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
> > > +			len, is_dedupe);
> 
> More than one bug. vfs_clone_file_prep_inodes() modifes the length
> parameter in the case of whole file reflink by way of "len == 0"
> on a non-zero length file. So I fixed this, too.

Did your patch look something like the attached?

--D

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 80ca9b6793cd..53158bdb1105 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1219,6 +1219,7 @@ xfs_reflink_remap_unlock(
  * Prepare two files for range cloning.  Upon a successful return both inodes
  * will have the iolock and mmaplock held, the page cache of the out file
  * will be truncated, and any leases on the out file will have been broken.
+ * Returns negative for error, 0 for nothing to do, and 1 for success.
  */
 STATIC int
 xfs_reflink_remap_prep(
@@ -1226,7 +1227,7 @@ xfs_reflink_remap_prep(
 	loff_t			pos_in,
 	struct file		*file_out,
 	loff_t			pos_out,
-	u64			len,
+	u64			*len,
 	bool			is_dedupe)
 {
 	struct inode		*inode_in = file_inode(file_in);
@@ -1257,7 +1258,7 @@ xfs_reflink_remap_prep(
 		goto out_unlock;
 
 	ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
-			&len, is_dedupe);
+			len, is_dedupe);
 	if (ret <= 0)
 		goto out_unlock;
 
@@ -1284,8 +1285,8 @@ xfs_reflink_remap_prep(
 
 	/* Zap any page cache for the destination file's range. */
 	truncate_inode_pages_range(&inode_out->i_data, pos_out,
-				   PAGE_ALIGN(pos_out + len) - 1);
-	return 0;
+				   PAGE_ALIGN(pos_out + *len) - 1);
+	return 1;
 out_unlock:
 	xfs_reflink_remap_unlock(file_in, file_out);
 	return ret;
@@ -1321,8 +1322,8 @@ xfs_reflink_remap_range(
 
 	/* Prepare and then clone file data. */
 	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
-			len, is_dedupe);
-	if (ret)
+			&len, is_dedupe);
+	if (ret <= 0)
 		return ret;
 
 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner Oct. 5, 2018, 11:42 p.m. UTC | #6
On Fri, Oct 05, 2018 at 10:21:59AM -0700, Darrick J. Wong wrote:
> On Fri, Oct 05, 2018 at 07:02:42PM +1000, Dave Chinner wrote:
> > On Fri, Oct 05, 2018 at 05:02:28PM +1000, Dave Chinner wrote:
> > > On Thu, Oct 04, 2018 at 05:44:47PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Refactor all the reflink preparation steps into a separate helper that
> > > > we'll use to land all the upcoming fixes for insufficient input checks.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > .....
> > > 
> > > > +xfs_reflink_remap_range(
> > > > +	struct file		*file_in,
> > > > +	loff_t			pos_in,
> > > > +	struct file		*file_out,
> > > > +	loff_t			pos_out,
> > > > +	u64			len,
> > > > +	bool			is_dedupe)
> > > > +{
> > > > +	struct inode		*inode_in = file_inode(file_in);
> > > > +	struct xfs_inode	*src = XFS_I(inode_in);
> > > > +	struct inode		*inode_out = file_inode(file_out);
> > > > +	struct xfs_inode	*dest = XFS_I(inode_out);
> > > > +	struct xfs_mount	*mp = src->i_mount;
> > > > +	xfs_fileoff_t		sfsbno, dfsbno;
> > > > +	xfs_filblks_t		fsblen;
> > > > +	xfs_extlen_t		cowextsize;
> > > > +	ssize_t			ret;
> > > > +
> > > > +	if (!xfs_sb_version_hasreflink(&mp->m_sb))
> > > > +		return -EOPNOTSUPP;
> > > > +
> > > > +	if (XFS_FORCED_SHUTDOWN(mp))
> > > > +		return -EIO;
> > > > +
> > > > +	/* Prepare and then clone file data. */
> > > > +	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
> > > > +			len, is_dedupe);
> > 
> > More than one bug. vfs_clone_file_prep_inodes() modifes the length
> > parameter in the case of whole file reflink by way of "len == 0"
> > on a non-zero length file. So I fixed this, too.
> 
> Did your patch look something like the attached?

Identical. I folded it back in once it tested ok, though.

Cheers,

Dave.
Christoph Hellwig Oct. 6, 2018, 10:30 a.m. UTC | #7
On Fri, Oct 05, 2018 at 03:28:09PM +1000, Dave Chinner wrote:
> I've added:
> 
> --
> This rework also moves the invalidation of the destination range to
> the prep function so that it is done before the range is remapped.
> This ensures that nobody can access the data in range being remapped
> until the remap is complete.
> --
> 
> Sound OK?
> 
> Otherwise this looks fine.

Given that the patch will need a respin anyway I'd rather split the
move of the pagecache invalidation into a separate patch.

Patch
diff mbox series

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 38f405415b88..80ca9b6793cd 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1195,11 +1195,33 @@  xfs_iolock_two_inodes_and_break_layout(
 	return 0;
 }
 
+/* Unlock both inodes after they've been prepped for a range clone. */
+STATIC void
+xfs_reflink_remap_unlock(
+	struct file		*file_in,
+	struct file		*file_out)
+{
+	struct inode		*inode_in = file_inode(file_in);
+	struct xfs_inode	*src = XFS_I(inode_in);
+	struct inode		*inode_out = file_inode(file_out);
+	struct xfs_inode	*dest = XFS_I(inode_out);
+	bool			same_inode = (inode_in == inode_out);
+
+	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
+	if (!same_inode)
+		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
+	inode_unlock(inode_out);
+	if (!same_inode)
+		inode_unlock_shared(inode_in);
+}
+
 /*
- * Link a range of blocks from one file to another.
+ * Prepare two files for range cloning.  Upon a successful return both inodes
+ * will have the iolock and mmaplock held, the page cache of the out file
+ * will be truncated, and any leases on the out file will have been broken.
  */
-int
-xfs_reflink_remap_range(
+STATIC int
+xfs_reflink_remap_prep(
 	struct file		*file_in,
 	loff_t			pos_in,
 	struct file		*file_out,
@@ -1211,19 +1233,9 @@  xfs_reflink_remap_range(
 	struct xfs_inode	*src = XFS_I(inode_in);
 	struct inode		*inode_out = file_inode(file_out);
 	struct xfs_inode	*dest = XFS_I(inode_out);
-	struct xfs_mount	*mp = src->i_mount;
 	bool			same_inode = (inode_in == inode_out);
-	xfs_fileoff_t		sfsbno, dfsbno;
-	xfs_filblks_t		fsblen;
-	xfs_extlen_t		cowextsize;
 	ssize_t			ret;
 
-	if (!xfs_sb_version_hasreflink(&mp->m_sb))
-		return -EOPNOTSUPP;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
 	/* Lock both files against IO */
 	ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
 	if (ret)
@@ -1254,8 +1266,6 @@  xfs_reflink_remap_range(
 	if (ret)
 		goto out_unlock;
 
-	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
-
 	/*
 	 * Clear out post-eof preallocations because we don't have page cache
 	 * backing the delayed allocations and they'll never get freed on
@@ -1272,6 +1282,51 @@  xfs_reflink_remap_range(
 	if (ret)
 		goto out_unlock;
 
+	/* Zap any page cache for the destination file's range. */
+	truncate_inode_pages_range(&inode_out->i_data, pos_out,
+				   PAGE_ALIGN(pos_out + len) - 1);
+	return 0;
+out_unlock:
+	xfs_reflink_remap_unlock(file_in, file_out);
+	return ret;
+}
+
+/*
+ * Link a range of blocks from one file to another.
+ */
+int
+xfs_reflink_remap_range(
+	struct file		*file_in,
+	loff_t			pos_in,
+	struct file		*file_out,
+	loff_t			pos_out,
+	u64			len,
+	bool			is_dedupe)
+{
+	struct inode		*inode_in = file_inode(file_in);
+	struct xfs_inode	*src = XFS_I(inode_in);
+	struct inode		*inode_out = file_inode(file_out);
+	struct xfs_inode	*dest = XFS_I(inode_out);
+	struct xfs_mount	*mp = src->i_mount;
+	xfs_fileoff_t		sfsbno, dfsbno;
+	xfs_filblks_t		fsblen;
+	xfs_extlen_t		cowextsize;
+	ssize_t			ret;
+
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return -EOPNOTSUPP;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/* Prepare and then clone file data. */
+	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
+			len, is_dedupe);
+	if (ret)
+		return ret;
+
+	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
 	dfsbno = XFS_B_TO_FSBT(mp, pos_out);
 	sfsbno = XFS_B_TO_FSBT(mp, pos_in);
 	fsblen = XFS_B_TO_FSB(mp, len);
@@ -1280,10 +1335,6 @@  xfs_reflink_remap_range(
 	if (ret)
 		goto out_unlock;
 
-	/* Zap any page cache for the destination file's range. */
-	truncate_inode_pages_range(&inode_out->i_data, pos_out,
-				   PAGE_ALIGN(pos_out + len) - 1);
-
 	/*
 	 * Carry the cowextsize hint from src to dest if we're sharing the
 	 * entire source file to the entire destination file, the source file
@@ -1300,12 +1351,7 @@  xfs_reflink_remap_range(
 			is_dedupe);
 
 out_unlock:
-	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
-	if (!same_inode)
-		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
-	inode_unlock(inode_out);
-	if (!same_inode)
-		inode_unlock_shared(inode_in);
+	xfs_reflink_remap_unlock(file_in, file_out);
 	if (ret)
 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
 	return ret;