diff mbox series

[5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay

Message ID 20190218091827.12619-6-hch@lst.de (mailing list archive)
State Accepted
Headers show
Series [1/8] xfs: make xfs_bmbt_to_iomap more useful | expand

Commit Message

Christoph Hellwig Feb. 18, 2019, 9:18 a.m. UTC
Besides simplifying the code a bit this allows us to actually implement
the behavior of using COW preallocation for non-COW data mentioned
in the current comments.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
 fs/xfs/xfs_reflink.c |  67 ----------------------
 fs/xfs/xfs_reflink.h |   2 -
 fs/xfs/xfs_trace.h   |   3 -
 4 files changed, 94 insertions(+), 111 deletions(-)

Comments

Darrick J. Wong Feb. 19, 2019, 6:12 p.m. UTC | #1
On Mon, Feb 18, 2019 at 10:18:24AM +0100, Christoph Hellwig wrote:
> Besides simplifying the code a bit this allows to actually implement
> the behavior of using COW preallocation for non-COW data mentioned
> in the current comments.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks ok,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

> ---
>  fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
>  fs/xfs/xfs_reflink.c |  67 ----------------------
>  fs/xfs/xfs_reflink.h |   2 -
>  fs/xfs/xfs_trace.h   |   3 -
>  4 files changed, 94 insertions(+), 111 deletions(-)
> 
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 19a3331b4a56..c9fd1e4a1f99 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -534,15 +534,16 @@ xfs_file_iomap_begin_delay(
>  {
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	struct xfs_mount	*mp = ip->i_mount;
> -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
>  	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
>  	xfs_fileoff_t		maxbytes_fsb =
>  		XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
>  	xfs_fileoff_t		end_fsb;
> -	int			error = 0, eof = 0;
> -	struct xfs_bmbt_irec	got;
> -	struct xfs_iext_cursor	icur;
> +	struct xfs_bmbt_irec	imap, cmap;
> +	struct xfs_iext_cursor	icur, ccur;
>  	xfs_fsblock_t		prealloc_blocks = 0;
> +	bool			eof = false, cow_eof = false, shared;
> +	int			whichfork = XFS_DATA_FORK;
> +	int			error = 0;
>  
>  	ASSERT(!XFS_IS_REALTIME_INODE(ip));
>  	ASSERT(!xfs_get_extsz_hint(ip));
> @@ -560,7 +561,7 @@ xfs_file_iomap_begin_delay(
>  
>  	XFS_STATS_INC(mp, xs_blk_mapw);
>  
> -	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
> +	if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
>  		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
>  		if (error)
>  			goto out_unlock;
> @@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay(
>  
>  	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
>  
> -	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
> +	/*
> +	 * Search the data fork first to look up our source mapping.  We
> +	 * always need the data fork map, as we have to return it to the
> +	 * iomap code so that the higher level write code can read data in to
> +	 * perform read-modify-write cycles for unaligned writes.
> +	 */
> +	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
>  	if (eof)
> -		got.br_startoff = end_fsb; /* fake hole until the end */
> +		imap.br_startoff = end_fsb; /* fake hole until the end */
> +
> +	/* We never need to allocate blocks for zeroing a hole. */
> +	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
> +		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
> +		goto out_unlock;
> +	}
> +
> +	/*
> +	 * Search the COW fork extent list even if we did not find a data fork
> +	 * extent.  This serves two purposes: first this implements the
> +	 * speculative preallocation using cowextsize, so that we also unshare
> +	 * blocks adjacent to shared blocks instead of just the shared blocks
> +	 * themselves.  Second the lookup in the extent list is generally faster
> +	 * than going out to the shared extent tree.
> +	 */
> +	if (xfs_is_reflink_inode(ip)) {
> +		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
> +				&ccur, &cmap);
> +		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
> +			trace_xfs_reflink_cow_found(ip, &cmap);
> +			whichfork = XFS_COW_FORK;
> +			goto done;
> +		}
> +	}
>  
> -	if (got.br_startoff <= offset_fsb) {
> +	if (imap.br_startoff <= offset_fsb) {
>  		/*
>  		 * For reflink files we may need a delalloc reservation when
>  		 * overwriting shared extents.   This includes zeroing of
>  		 * existing extents that contain data.
>  		 */
> -		if (xfs_is_reflink_inode(ip) &&
> -		    ((flags & IOMAP_WRITE) ||
> -		     got.br_state != XFS_EXT_UNWRITTEN)) {
> -			xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
> -			error = xfs_reflink_reserve_cow(ip, &got);
> -			if (error)
> -				goto out_unlock;
> +		if (!xfs_is_reflink_inode(ip) ||
> +		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
> +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> +					&imap);
> +			goto done;
>  		}
>  
> -		trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &got);
> -		goto done;
> -	}
> +		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
>  
> -	if (flags & IOMAP_ZERO) {
> -		xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
> -		goto out_unlock;
> +		/* Trim the mapping to the nearest shared extent boundary. */
> +		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> +		if (error)
> +			goto out_unlock;
> +
> +		/* Not shared?  Just report the (potentially capped) extent. */
> +		if (!shared) {
> +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> +					&imap);
> +			goto done;
> +		}
> +
> +		/*
> +		 * Fork all the shared blocks from our write offset until the
> +		 * end of the extent.
> +		 */
> +		whichfork = XFS_COW_FORK;
> +		end_fsb = imap.br_startoff + imap.br_blockcount;
> +	} else {
> +		/*
> +		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
> +		 * pages to keep the chunks of work done where somewhat
> +		 * symmetric with the work writeback does.  This is a completely
> +		 * arbitrary number pulled out of thin air.
> +		 *
> +	 * Note that the value needs to be less than 32-bits wide until
> +		 * the lower level functions are updated.
> +		 */
> +		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> +		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
>  	}
>  
>  	error = xfs_qm_dqattach_locked(ip, false);
>  	if (error)
>  		goto out_unlock;
>  
> -	/*
> -	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
> -	 * to keep the chunks of work done where somewhat symmetric with the
> -	 * work writeback does. This is a completely arbitrary number pulled
> -	 * out of thin air as a best guess for initial testing.
> -	 *
> -	 * Note that the values needs to be less than 32-bits wide until
> -	 * the lower level functions are updated.
> -	 */
> -	count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> -	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> -
> -	if (eof) {
> +	if (eof && whichfork == XFS_DATA_FORK) {
>  		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
>  				&icur);
>  		if (prealloc_blocks) {
> @@ -635,9 +677,11 @@ xfs_file_iomap_begin_delay(
>  	}
>  
>  retry:
> -	error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
> -			end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
> -			eof);
> +	error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
> +			end_fsb - offset_fsb, prealloc_blocks,
> +			whichfork == XFS_DATA_FORK ? &imap : &cmap,
> +			whichfork == XFS_DATA_FORK ? &icur : &ccur,
> +			whichfork == XFS_DATA_FORK ? eof : cow_eof);
>  	switch (error) {
>  	case 0:
>  		break;
> @@ -659,9 +703,20 @@ xfs_file_iomap_begin_delay(
>  	 * them out if the write happens to fail.
>  	 */
>  	iomap->flags |= IOMAP_F_NEW;
> -	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
> +	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
> +			whichfork == XFS_DATA_FORK ? &imap : &cmap);
>  done:
> -	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
> +	if (whichfork == XFS_COW_FORK) {
> +		if (imap.br_startoff > offset_fsb) {
> +			xfs_trim_extent(&cmap, offset_fsb,
> +					imap.br_startoff - offset_fsb);
> +			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
> +			goto out_unlock;
> +		}
> +		/* ensure we only report blocks we have a reservation for */
> +		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
> +	}
> +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
>  out_unlock:
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  	return error;
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 8a5353daf9ab..9ef1f79cb3ae 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -234,73 +234,6 @@ xfs_reflink_trim_around_shared(
>  	}
>  }
>  
> -/*
> - * Trim the passed in imap to the next shared/unshared extent boundary, and
> - * if imap->br_startoff points to a shared extent reserve space for it in the
> - * COW fork.
> - *
> - * Note that imap will always contain the block numbers for the existing blocks
> - * in the data fork, as the upper layers need them for read-modify-write
> - * operations.
> - */
> -int
> -xfs_reflink_reserve_cow(
> -	struct xfs_inode	*ip,
> -	struct xfs_bmbt_irec	*imap)
> -{
> -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
> -	struct xfs_bmbt_irec	got;
> -	int			error = 0;
> -	bool			eof = false;
> -	struct xfs_iext_cursor	icur;
> -	bool			shared;
> -
> -	/*
> -	 * Search the COW fork extent list first.  This serves two purposes:
> -	 * first this implement the speculative preallocation using cowextisze,
> -	 * so that we also unshared block adjacent to shared blocks instead
> -	 * of just the shared blocks themselves.  Second the lookup in the
> -	 * extent list is generally faster than going out to the shared extent
> -	 * tree.
> -	 */
> -
> -	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
> -		eof = true;
> -	if (!eof && got.br_startoff <= imap->br_startoff) {
> -		trace_xfs_reflink_cow_found(ip, imap);
> -		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> -		return 0;
> -	}
> -
> -	/* Trim the mapping to the nearest shared extent boundary. */
> -	error = xfs_reflink_trim_around_shared(ip, imap, &shared);
> -	if (error)
> -		return error;
> -
> -	/* Not shared?  Just report the (potentially capped) extent. */
> -	if (!shared)
> -		return 0;
> -
> -	/*
> -	 * Fork all the shared blocks from our write offset until the end of
> -	 * the extent.
> -	 */
> -	error = xfs_qm_dqattach_locked(ip, false);
> -	if (error)
> -		return error;
> -
> -	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
> -			imap->br_blockcount, 0, &got, &icur, eof);
> -	if (error == -ENOSPC || error == -EDQUOT)
> -		trace_xfs_reflink_cow_enospc(ip, imap);
> -	if (error)
> -		return error;
> -
> -	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> -	trace_xfs_reflink_cow_alloc(ip, &got);
> -	return 0;
> -}
> -
>  /* Convert part of an unwritten CoW extent to a real one. */
>  STATIC int
>  xfs_reflink_convert_cow_extent(
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 70d68a1a9b49..4a9e3cd4768a 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -12,8 +12,6 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
>  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *irec, bool *shared);
>  
> -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
> -		struct xfs_bmbt_irec *imap);
>  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
>  		unsigned iomap_flags);
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index f1e18ae8a209..47fb07d86efd 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3196,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
>  
>  /* copy on write */
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
> -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
>  
> -DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
> -
>  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
>  
>  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
> -- 
> 2.20.1
>
Brian Foster Feb. 21, 2019, 5:59 p.m. UTC | #2
On Mon, Feb 18, 2019 at 10:18:24AM +0100, Christoph Hellwig wrote:
> Besides simplifying the code a bit this allows to actually implement
> the behavior of using COW preallocation for non-COW data mentioned
> in the current comments.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---

Heh, I'm somewhat amused by the fact that I sent a variant of this patch
two years ago (for a different purpose) and you explicitly complained
about the factoring. I'm glad you've finally come around. ;)

https://marc.info/?l=linux-xfs&m=149498124730442&w=2

>  fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
>  fs/xfs/xfs_reflink.c |  67 ----------------------
>  fs/xfs/xfs_reflink.h |   2 -
>  fs/xfs/xfs_trace.h   |   3 -
>  4 files changed, 94 insertions(+), 111 deletions(-)
> 
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 19a3331b4a56..c9fd1e4a1f99 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
...
> @@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay(
>  
>  	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
>  
> -	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
> +	/*
> +	 * Search the data fork first to look up our source mapping.  We
> +	 * always need the data fork map, as we have to return it to the
> +	 * iomap code so that the higher level write code can read data in to
> +	 * perform read-modify-write cycles for unaligned writes.
> +	 */
> +	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
>  	if (eof)
> -		got.br_startoff = end_fsb; /* fake hole until the end */
> +		imap.br_startoff = end_fsb; /* fake hole until the end */
> +
> +	/* We never need to allocate blocks for zeroing a hole. */
> +	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
> +		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
> +		goto out_unlock;
> +	}
> +

So does this need to account for the case of an overlapping cow block
over a hole in the data fork (with cached data, if that is possible)?
IIUC we introduce that possibility just below.

> +	/*
> +	 * Search the COW fork extent list even if we did not find a data fork
> +	 * extent.  This serves two purposes: first this implements the
> +	 * speculative preallocation using cowextsize, so that we also unshare
> +	 * block adjacent to shared blocks instead of just the shared blocks
> +	 * themselves.  Second the lookup in the extent list is generally faster
> +	 * than going out to the shared extent tree.
> +	 */
> +	if (xfs_is_reflink_inode(ip)) {
> +		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
> +				&ccur, &cmap);
> +		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
> +			trace_xfs_reflink_cow_found(ip, &cmap);
> +			whichfork = XFS_COW_FORK;
> +			goto done;
> +		}
> +	}
>  
> -	if (got.br_startoff <= offset_fsb) {
> +	if (imap.br_startoff <= offset_fsb) {
>  		/*
>  		 * For reflink files we may need a delalloc reservation when
>  		 * overwriting shared extents.   This includes zeroing of
>  		 * existing extents that contain data.
>  		 */
> -		if (xfs_is_reflink_inode(ip) &&
> -		    ((flags & IOMAP_WRITE) ||
> -		     got.br_state != XFS_EXT_UNWRITTEN)) {
> -			xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
> -			error = xfs_reflink_reserve_cow(ip, &got);
> -			if (error)
> -				goto out_unlock;
> +		if (!xfs_is_reflink_inode(ip) ||
> +		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
> +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> +					&imap);
> +			goto done;
>  		}
>  
> -		trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &got);
> -		goto done;
> -	}
> +		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
>  
> -	if (flags & IOMAP_ZERO) {
> -		xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
> -		goto out_unlock;
> +		/* Trim the mapping to the nearest shared extent boundary. */
> +		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> +		if (error)
> +			goto out_unlock;
> +
> +		/* Not shared?  Just report the (potentially capped) extent. */
> +		if (!shared) {
> +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> +					&imap);
> +			goto done;
> +		}
> +
> +		/*
> +		 * Fork all the shared blocks from our write offset until the
> +		 * end of the extent.
> +		 */
> +		whichfork = XFS_COW_FORK;
> +		end_fsb = imap.br_startoff + imap.br_blockcount;
> +	} else {
> +		/*
> +		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
> +		 * pages to keep the chunks of work done where somewhat
> +		 * symmetric with the work writeback does.  This is a completely
> +		 * arbitrary number pulled out of thin air.
> +		 *
> +		 * Note that the values needs to be less than 32-bits wide until
> +		 * the lower level functions are updated.
> +		 */
> +		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> +		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);

The existing code doesn't currently do this but ISTM this should apply
to either allocation case, not just data fork delalloc. That could be
something for a separate patch though.

>  	}
>  
>  	error = xfs_qm_dqattach_locked(ip, false);
>  	if (error)
>  		goto out_unlock;
>  
> -	/*
> -	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
> -	 * to keep the chunks of work done where somewhat symmetric with the
> -	 * work writeback does. This is a completely arbitrary number pulled
> -	 * out of thin air as a best guess for initial testing.
> -	 *
> -	 * Note that the values needs to be less than 32-bits wide until
> -	 * the lower level functions are updated.
> -	 */
> -	count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> -	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> -
> -	if (eof) {
> +	if (eof && whichfork == XFS_DATA_FORK) {
>  		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
>  				&icur);
>  		if (prealloc_blocks) {
...
> @@ -659,9 +703,20 @@ xfs_file_iomap_begin_delay(
>  	 * them out if the write happens to fail.
>  	 */
>  	iomap->flags |= IOMAP_F_NEW;

This looks like it flags the mapping new if we reserve cow blocks, which
I don't think is quite right.

> -	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
> +	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
> +			whichfork == XFS_DATA_FORK ? &imap : &cmap);
>  done:
> -	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
> +	if (whichfork == XFS_COW_FORK) {
> +		if (imap.br_startoff > offset_fsb) {
> +			xfs_trim_extent(&cmap, offset_fsb,
> +					imap.br_startoff - offset_fsb);
> +			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
> +			goto out_unlock;
> +		}

Hmm, so this looks like it is in fact handling a COW blocks over a hole
case, pushing the COW mapping into the iomap. We never accounted for
that case before where we'd always just allocate to the data fork. The
change in behavior probably makes sense, but this really should be
separate from the refactoring bits to reuse the data fork delalloc code.
Beyond making this a bit easier to follow, it warrants its own commit
log description and this one makes no mention of it at all.

Brian

> +		/* ensure we only report blocks we have a reservation for */
> +		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
> +	}
> +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
>  out_unlock:
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  	return error;
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 8a5353daf9ab..9ef1f79cb3ae 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -234,73 +234,6 @@ xfs_reflink_trim_around_shared(
>  	}
>  }
>  
> -/*
> - * Trim the passed in imap to the next shared/unshared extent boundary, and
> - * if imap->br_startoff points to a shared extent reserve space for it in the
> - * COW fork.
> - *
> - * Note that imap will always contain the block numbers for the existing blocks
> - * in the data fork, as the upper layers need them for read-modify-write
> - * operations.
> - */
> -int
> -xfs_reflink_reserve_cow(
> -	struct xfs_inode	*ip,
> -	struct xfs_bmbt_irec	*imap)
> -{
> -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
> -	struct xfs_bmbt_irec	got;
> -	int			error = 0;
> -	bool			eof = false;
> -	struct xfs_iext_cursor	icur;
> -	bool			shared;
> -
> -	/*
> -	 * Search the COW fork extent list first.  This serves two purposes:
> -	 * first this implement the speculative preallocation using cowextisze,
> -	 * so that we also unshared block adjacent to shared blocks instead
> -	 * of just the shared blocks themselves.  Second the lookup in the
> -	 * extent list is generally faster than going out to the shared extent
> -	 * tree.
> -	 */
> -
> -	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
> -		eof = true;
> -	if (!eof && got.br_startoff <= imap->br_startoff) {
> -		trace_xfs_reflink_cow_found(ip, imap);
> -		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> -		return 0;
> -	}
> -
> -	/* Trim the mapping to the nearest shared extent boundary. */
> -	error = xfs_reflink_trim_around_shared(ip, imap, &shared);
> -	if (error)
> -		return error;
> -
> -	/* Not shared?  Just report the (potentially capped) extent. */
> -	if (!shared)
> -		return 0;
> -
> -	/*
> -	 * Fork all the shared blocks from our write offset until the end of
> -	 * the extent.
> -	 */
> -	error = xfs_qm_dqattach_locked(ip, false);
> -	if (error)
> -		return error;
> -
> -	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
> -			imap->br_blockcount, 0, &got, &icur, eof);
> -	if (error == -ENOSPC || error == -EDQUOT)
> -		trace_xfs_reflink_cow_enospc(ip, imap);
> -	if (error)
> -		return error;
> -
> -	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> -	trace_xfs_reflink_cow_alloc(ip, &got);
> -	return 0;
> -}
> -
>  /* Convert part of an unwritten CoW extent to a real one. */
>  STATIC int
>  xfs_reflink_convert_cow_extent(
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 70d68a1a9b49..4a9e3cd4768a 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -12,8 +12,6 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
>  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *irec, bool *shared);
>  
> -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
> -		struct xfs_bmbt_irec *imap);
>  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
>  		unsigned iomap_flags);
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index f1e18ae8a209..47fb07d86efd 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3196,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
>  
>  /* copy on write */
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
> -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
>  
> -DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
> -
>  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
>  
>  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
> -- 
> 2.20.1
>
Darrick J. Wong Feb. 21, 2019, 9:30 p.m. UTC | #3
On Thu, Feb 21, 2019 at 12:59:03PM -0500, Brian Foster wrote:
> On Mon, Feb 18, 2019 at 10:18:24AM +0100, Christoph Hellwig wrote:
> > Besides simplifying the code a bit this allows to actually implement
> > the behavior of using COW preallocation for non-COW data mentioned
> > in the current comments.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> 
> Heh, I'm somewhat amused by the fact that I sent a variant of this patch
> two years ago (for a different purpose) and you explicitly complained
> about the factoring. I'm glad you've finally come around. ;)
> 
> https://marc.info/?l=linux-xfs&m=149498124730442&w=2
> 
> >  fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
> >  fs/xfs/xfs_reflink.c |  67 ----------------------
> >  fs/xfs/xfs_reflink.h |   2 -
> >  fs/xfs/xfs_trace.h   |   3 -
> >  4 files changed, 94 insertions(+), 111 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > index 19a3331b4a56..c9fd1e4a1f99 100644
> > --- a/fs/xfs/xfs_iomap.c
> > +++ b/fs/xfs/xfs_iomap.c
> ...
> > @@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay(
> >  
> >  	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> >  
> > -	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
> > +	/*
> > +	 * Search the data fork first to look up our source mapping.  We
> > +	 * always need the data fork map, as we have to return it to the
> > +	 * iomap code so that the higher level write code can read data in to
> > +	 * perform read-modify-write cycles for unaligned writes.
> > +	 */
> > +	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
> >  	if (eof)
> > -		got.br_startoff = end_fsb; /* fake hole until the end */
> > +		imap.br_startoff = end_fsb; /* fake hole until the end */
> > +
> > +	/* We never need to allocate blocks for zeroing a hole. */
> > +	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
> > +		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
> > +		goto out_unlock;
> > +	}
> > +
> 
> So does this need to account for the case of an overlapping cow block
> over a hole in the data fork (with cached data, if that is possible)?
> IIUC we introduce that possibility just below.

I think it makes sense to ignore overlapping cow blocks for zeroing a
hole in the data fork -- the user told us to zero part of a file that
didn't have nonzero contents in it, so we just leave the speculative cow
allocation for that block alone.

> > +	/*
> > +	 * Search the COW fork extent list even if we did not find a data fork
> > +	 * extent.  This serves two purposes: first this implements the
> > +	 * speculative preallocation using cowextsize, so that we also unshare
> > +	 * block adjacent to shared blocks instead of just the shared blocks
> > +	 * themselves.  Second the lookup in the extent list is generally faster
> > +	 * than going out to the shared extent tree.
> > +	 */
> > +	if (xfs_is_reflink_inode(ip)) {
> > +		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
> > +				&ccur, &cmap);
> > +		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
> > +			trace_xfs_reflink_cow_found(ip, &cmap);
> > +			whichfork = XFS_COW_FORK;
> > +			goto done;
> > +		}
> > +	}
> >  
> > -	if (got.br_startoff <= offset_fsb) {
> > +	if (imap.br_startoff <= offset_fsb) {
> >  		/*
> >  		 * For reflink files we may need a delalloc reservation when
> >  		 * overwriting shared extents.   This includes zeroing of
> >  		 * existing extents that contain data.
> >  		 */
> > -		if (xfs_is_reflink_inode(ip) &&
> > -		    ((flags & IOMAP_WRITE) ||
> > -		     got.br_state != XFS_EXT_UNWRITTEN)) {
> > -			xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
> > -			error = xfs_reflink_reserve_cow(ip, &got);
> > -			if (error)
> > -				goto out_unlock;
> > +		if (!xfs_is_reflink_inode(ip) ||
> > +		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
> > +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> > +					&imap);
> > +			goto done;
> >  		}
> >  
> > -		trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &got);
> > -		goto done;
> > -	}
> > +		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
> >  
> > -	if (flags & IOMAP_ZERO) {
> > -		xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
> > -		goto out_unlock;
> > +		/* Trim the mapping to the nearest shared extent boundary. */
> > +		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> > +		if (error)
> > +			goto out_unlock;
> > +
> > +		/* Not shared?  Just report the (potentially capped) extent. */
> > +		if (!shared) {
> > +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> > +					&imap);
> > +			goto done;
> > +		}
> > +
> > +		/*
> > +		 * Fork all the shared blocks from our write offset until the
> > +		 * end of the extent.
> > +		 */
> > +		whichfork = XFS_COW_FORK;
> > +		end_fsb = imap.br_startoff + imap.br_blockcount;
> > +	} else {
> > +		/*
> > +		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
> > +		 * pages to keep the chunks of work done where somewhat
> > +		 * symmetric with the work writeback does.  This is a completely
> > +		 * arbitrary number pulled out of thin air.
> > +		 *
> > +		 * Note that the values needs to be less than 32-bits wide until
> > +		 * the lower level functions are updated.
> > +		 */
> > +		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> > +		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> 
> The existing code doesn't currently do this but ISTM this should apply
> to either allocation case, not just data fork delalloc. That could be
> something for a separate patch though.
> 
> >  	}
> >  
> >  	error = xfs_qm_dqattach_locked(ip, false);
> >  	if (error)
> >  		goto out_unlock;
> >  
> > -	/*
> > -	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
> > -	 * to keep the chunks of work done where somewhat symmetric with the
> > -	 * work writeback does. This is a completely arbitrary number pulled
> > -	 * out of thin air as a best guess for initial testing.
> > -	 *
> > -	 * Note that the values needs to be less than 32-bits wide until
> > -	 * the lower level functions are updated.
> > -	 */
> > -	count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> > -	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> > -
> > -	if (eof) {
> > +	if (eof && whichfork == XFS_DATA_FORK) {
> >  		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
> >  				&icur);
> >  		if (prealloc_blocks) {
> ...
> > @@ -659,9 +703,20 @@ xfs_file_iomap_begin_delay(
> >  	 * them out if the write happens to fail.
> >  	 */
> >  	iomap->flags |= IOMAP_F_NEW;
> 
> This looks like it flags the mapping new if we reserve cow blocks, which
> I don't think is quite right.

Hmmm.  I thought it was correct -- if the write fails, we punch out the
pagecache and trim any delalloc blocks in the data fork.  If the user
tries to reread the area of the failed write we'll just read them back
in from disk...?

--D

> > -	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
> > +	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
> > +			whichfork == XFS_DATA_FORK ? &imap : &cmap);
> >  done:
> > -	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
> > +	if (whichfork == XFS_COW_FORK) {
> > +		if (imap.br_startoff > offset_fsb) {
> > +			xfs_trim_extent(&cmap, offset_fsb,
> > +					imap.br_startoff - offset_fsb);
> > +			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
> > +			goto out_unlock;
> > +		}
> 
> Hmm, so this looks like it is in fact handling a COW blocks over a hole
> case, pushing the COW mapping into the iomap. We never accounted for
> that case before where we'd always just allocate to the data fork. The
> change in behavior probably makes sense, but this really should be
> separate from the refactoring bits to reuse the data fork delalloc code.
> Beyond making this a bit easier to follow, it warrants its own commit
> log description and this one makes no mention of it at all.
> 
> Brian
> 
> > +		/* ensure we only report blocks we have a reservation for */
> > +		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
> > +	}
> > +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
> >  out_unlock:
> >  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> >  	return error;
> > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > index 8a5353daf9ab..9ef1f79cb3ae 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> > @@ -234,73 +234,6 @@ xfs_reflink_trim_around_shared(
> >  	}
> >  }
> >  
> > -/*
> > - * Trim the passed in imap to the next shared/unshared extent boundary, and
> > - * if imap->br_startoff points to a shared extent reserve space for it in the
> > - * COW fork.
> > - *
> > - * Note that imap will always contain the block numbers for the existing blocks
> > - * in the data fork, as the upper layers need them for read-modify-write
> > - * operations.
> > - */
> > -int
> > -xfs_reflink_reserve_cow(
> > -	struct xfs_inode	*ip,
> > -	struct xfs_bmbt_irec	*imap)
> > -{
> > -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
> > -	struct xfs_bmbt_irec	got;
> > -	int			error = 0;
> > -	bool			eof = false;
> > -	struct xfs_iext_cursor	icur;
> > -	bool			shared;
> > -
> > -	/*
> > -	 * Search the COW fork extent list first.  This serves two purposes:
> > -	 * first this implement the speculative preallocation using cowextisze,
> > -	 * so that we also unshared block adjacent to shared blocks instead
> > -	 * of just the shared blocks themselves.  Second the lookup in the
> > -	 * extent list is generally faster than going out to the shared extent
> > -	 * tree.
> > -	 */
> > -
> > -	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
> > -		eof = true;
> > -	if (!eof && got.br_startoff <= imap->br_startoff) {
> > -		trace_xfs_reflink_cow_found(ip, imap);
> > -		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> > -		return 0;
> > -	}
> > -
> > -	/* Trim the mapping to the nearest shared extent boundary. */
> > -	error = xfs_reflink_trim_around_shared(ip, imap, &shared);
> > -	if (error)
> > -		return error;
> > -
> > -	/* Not shared?  Just report the (potentially capped) extent. */
> > -	if (!shared)
> > -		return 0;
> > -
> > -	/*
> > -	 * Fork all the shared blocks from our write offset until the end of
> > -	 * the extent.
> > -	 */
> > -	error = xfs_qm_dqattach_locked(ip, false);
> > -	if (error)
> > -		return error;
> > -
> > -	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
> > -			imap->br_blockcount, 0, &got, &icur, eof);
> > -	if (error == -ENOSPC || error == -EDQUOT)
> > -		trace_xfs_reflink_cow_enospc(ip, imap);
> > -	if (error)
> > -		return error;
> > -
> > -	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> > -	trace_xfs_reflink_cow_alloc(ip, &got);
> > -	return 0;
> > -}
> > -
> >  /* Convert part of an unwritten CoW extent to a real one. */
> >  STATIC int
> >  xfs_reflink_convert_cow_extent(
> > diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> > index 70d68a1a9b49..4a9e3cd4768a 100644
> > --- a/fs/xfs/xfs_reflink.h
> > +++ b/fs/xfs/xfs_reflink.h
> > @@ -12,8 +12,6 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
> >  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
> >  		struct xfs_bmbt_irec *irec, bool *shared);
> >  
> > -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
> > -		struct xfs_bmbt_irec *imap);
> >  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
> >  		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> >  		unsigned iomap_flags);
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index f1e18ae8a209..47fb07d86efd 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -3196,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
> >  
> >  /* copy on write */
> >  DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
> > -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
> >  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
> >  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
> >  DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
> >  
> > -DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
> > -
> >  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
> >  
> >  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
> > -- 
> > 2.20.1
> >
Brian Foster Feb. 22, 2019, 12:31 p.m. UTC | #4
On Thu, Feb 21, 2019 at 01:30:52PM -0800, Darrick J. Wong wrote:
> On Thu, Feb 21, 2019 at 12:59:03PM -0500, Brian Foster wrote:
> > On Mon, Feb 18, 2019 at 10:18:24AM +0100, Christoph Hellwig wrote:
> > > Besides simplifying the code a bit this allows to actually implement
> > > the behavior of using COW preallocation for non-COW data mentioned
> > > in the current comments.
> > > 
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > > ---
> > 
> > Heh, I'm somewhat amused by the fact that I sent a variant of this patch
> > two years ago (for a different purpose) and you explicitly complained
> > about the factoring. I'm glad you've finally come around. ;)
> > 
> > https://marc.info/?l=linux-xfs&m=149498124730442&w=2
> > 
> > >  fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
> > >  fs/xfs/xfs_reflink.c |  67 ----------------------
> > >  fs/xfs/xfs_reflink.h |   2 -
> > >  fs/xfs/xfs_trace.h   |   3 -
> > >  4 files changed, 94 insertions(+), 111 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > > index 19a3331b4a56..c9fd1e4a1f99 100644
> > > --- a/fs/xfs/xfs_iomap.c
> > > +++ b/fs/xfs/xfs_iomap.c
> > ...
> > > @@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay(
> > >  
> > >  	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> > >  
> > > -	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
> > > +	/*
> > > +	 * Search the data fork fork first to look up our source mapping.  We
> > > +	 * always need the data fork map, as we have to return it to the
> > > +	 * iomap code so that the higher level write code can read data in to
> > > +	 * perform read-modify-write cycles for unaligned writes.
> > > +	 */
> > > +	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
> > >  	if (eof)
> > > -		got.br_startoff = end_fsb; /* fake hole until the end */
> > > +		imap.br_startoff = end_fsb; /* fake hole until the end */
> > > +
> > > +	/* We never need to allocate blocks for zeroing a hole. */
> > > +	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
> > > +		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
> > > +		goto out_unlock;
> > > +	}
> > > +
> > 
> > So does this need to account for the case of an overlapping cow block
> > over a hole in the data fork (with cached data, if that is possible)?
> > IIUC we introduce that possibility just below.
> 
> I think it makes sense to ignore overlapping cow blocks for zeroing a
> hole in the data fork -- the user told us to zero part of a file that
> didn't have nonzero contents in it, so we just leave the speculative cow
> allocation for that block alone.
> 

That makes sense, but with this patch does a hole in the data fork still
always imply a hole in the file content?

We know the current code allows for COW fork overlap of non-shared data
fork blocks due to the cowextsz hint stuff. I'm assuming that means said
COW blocks could just as well overlap a hole in the data fork, but I
could be mistaken on that. If we can and do overlap a hole and then see
a buffered write at the associated file offset, the
(trace_xfs_reflink_cow_found()) case below looks like it pushes the cow
mapping into the iomap and skips the data fork (del)allocation case.
Thus it seems to me that we're now able to have a dirty pagecache page
over a COW delalloc block over a hole in the data fork. If so and we
receive a zero range over this offset, this hunk sees a hole in the data
fork, reports it as such to iomap and thus skips zeroing the page.

Note that I wasn't necessarily sure this could happen. It seems not
given most direct usages of iomap_zero_range() in the current code, at
least. The iomap_truncate_page() -> iomap_zero_range() case looked
suspicious so I just tried to exploit that, but then ran into the fact
that truncate_setsize() also does sub-page zeroing after we call
iomap_truncate_page(). Out of curiosity, I open-coded the
truncate_setsize() call with an i_size_write() and page size aligned
pagecache truncate to bypass truncate_setsize() subpage zeroing and
tried again:

xfs_io -fc "truncate 128k" -c "pwrite 64k" <file>
cp --reflink <file> <file.2>
xfs_io -c "mmap 0 128k" -c "pwrite 64k 4k" -c "pwrite 76k 4k" \
	-c "truncate 78k" -c "mread -v 76k 4k" /mnt/file

... and with that I do see different behavior with and without this
patchset.

The intent here is to speculatively preallocate in the COW fork,
buffered write to a preallocated COW block over a data fork hole,
truncate partially over that block and then inspect whether
iomap_zero_range() locates the block and zeroes the post-eof portion of
the page. Without this patch series, iomap_zero_range() zeroes that
portion of the page, with this patch series it does not.

> > > +	/*
> > > +	 * Search the COW fork extent list even if we did not find a data fork
> > > +	 * extent.  This serves two purposes: first this implements the
> > > +	 * speculative preallocation using cowextsize, so that we also unshare
> > > +	 * block adjacent to shared blocks instead of just the shared blocks
> > > +	 * themselves.  Second the lookup in the extent list is generally faster
> > > +	 * than going out to the shared extent tree.
> > > +	 */
> > > +	if (xfs_is_reflink_inode(ip)) {
> > > +		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
> > > +				&ccur, &cmap);
> > > +		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
> > > +			trace_xfs_reflink_cow_found(ip, &cmap);
> > > +			whichfork = XFS_COW_FORK;
> > > +			goto done;
> > > +		}
> > > +	}
> > >  
> > > -	if (got.br_startoff <= offset_fsb) {
> > > +	if (imap.br_startoff <= offset_fsb) {
> > >  		/*
> > >  		 * For reflink files we may need a delalloc reservation when
> > >  		 * overwriting shared extents.   This includes zeroing of
> > >  		 * existing extents that contain data.
> > >  		 */
> > > -		if (xfs_is_reflink_inode(ip) &&
> > > -		    ((flags & IOMAP_WRITE) ||
> > > -		     got.br_state != XFS_EXT_UNWRITTEN)) {
> > > -			xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
> > > -			error = xfs_reflink_reserve_cow(ip, &got);
> > > -			if (error)
> > > -				goto out_unlock;
> > > +		if (!xfs_is_reflink_inode(ip) ||
> > > +		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
> > > +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> > > +					&imap);
> > > +			goto done;
> > >  		}
> > >  
> > > -		trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &got);
> > > -		goto done;
> > > -	}
> > > +		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
> > >  
> > > -	if (flags & IOMAP_ZERO) {
> > > -		xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
> > > -		goto out_unlock;
> > > +		/* Trim the mapping to the nearest shared extent boundary. */
> > > +		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> > > +		if (error)
> > > +			goto out_unlock;
> > > +
> > > +		/* Not shared?  Just report the (potentially capped) extent. */
> > > +		if (!shared) {
> > > +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> > > +					&imap);
> > > +			goto done;
> > > +		}
> > > +
> > > +		/*
> > > +		 * Fork all the shared blocks from our write offset until the
> > > +		 * end of the extent.
> > > +		 */
> > > +		whichfork = XFS_COW_FORK;
> > > +		end_fsb = imap.br_startoff + imap.br_blockcount;
> > > +	} else {
> > > +		/*
> > > +		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
> > > +		 * pages to keep the chunks of work done where somewhat
> > > +		 * symmetric with the work writeback does.  This is a completely
> > > +		 * arbitrary number pulled out of thin air.
> > > +		 *
> > > +		 * Note that the values needs to be less than 32-bits wide until
> > > +		 * the lower level functions are updated.
> > > +		 */
> > > +		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> > > +		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> > 
> > The existing code doesn't currently do this but ISTM this should apply
> > to either allocation case, not just data fork delalloc. That could be
> > something for a separate patch though.
> > 
> > >  	}
> > >  
> > >  	error = xfs_qm_dqattach_locked(ip, false);
> > >  	if (error)
> > >  		goto out_unlock;
> > >  
> > > -	/*
> > > -	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
> > > -	 * to keep the chunks of work done where somewhat symmetric with the
> > > -	 * work writeback does. This is a completely arbitrary number pulled
> > > -	 * out of thin air as a best guess for initial testing.
> > > -	 *
> > > -	 * Note that the values needs to be less than 32-bits wide until
> > > -	 * the lower level functions are updated.
> > > -	 */
> > > -	count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> > > -	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> > > -
> > > -	if (eof) {
> > > +	if (eof && whichfork == XFS_DATA_FORK) {
> > >  		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
> > >  				&icur);
> > >  		if (prealloc_blocks) {
> > ...
> > > @@ -659,9 +703,20 @@ xfs_file_iomap_begin_delay(
> > >  	 * them out if the write happens to fail.
> > >  	 */
> > >  	iomap->flags |= IOMAP_F_NEW;
> > 
> > This looks like it flags the mapping new if we reserve cow blocks, which
> > I don't think is quite right.
> 
> Hmmm.  I thought it was correct -- if the write fails, we punch out the
> pagecache and trim any delalloc blocks in the data fork.  If they user
> tries to reread the area of the failed write we'll just read them back
> in from disk...?
> 

For one, I'm not totally clear what the expected behavior is supposed to
be here. The pagecache truncate probably makes sense regardless, but as
you said, the error case punches delalloc blocks out of the data fork.
What if the blocks were allocated in the COW fork?

Note again that I'm not sure there's reproducible bad behavior here. We
probably shouldn't expect delalloc COW reservation over data fork
delalloc (unless always_cow I suppose), but shared data fork blocks
certainly aren't "new" just because we allocated COW reservation (and
don't happen to currently care about non-delalloc blocks in the error
handling implementation).

Brian

> --D
> 
> > > -	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
> > > +	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
> > > +			whichfork == XFS_DATA_FORK ? &imap : &cmap);
> > >  done:
> > > -	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
> > > +	if (whichfork == XFS_COW_FORK) {
> > > +		if (imap.br_startoff > offset_fsb) {
> > > +			xfs_trim_extent(&cmap, offset_fsb,
> > > +					imap.br_startoff - offset_fsb);
> > > +			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
> > > +			goto out_unlock;
> > > +		}
> > 
> > Hmm, so this looks like it is in fact handling a COW blocks over a hole
> > case, pushing the COW mapping into the iomap. We never accounted for
> > that case before where we'd always just allocate to the data fork. The
> > change in behavior probably makes sense, but this really should be
> > separate from the refactoring bits to reuse the data fork delalloc code.
> > Beyond making this a bit easier to follow, it warrants its own commit
> > log description and this one makes no mention of it at all.
> > 
> > Brian
> > 
> > > +		/* ensure we only report blocks we have a reservation for */
> > > +		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
> > > +	}
> > > +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
> > >  out_unlock:
> > >  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > >  	return error;
> > > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > > index 8a5353daf9ab..9ef1f79cb3ae 100644
> > > --- a/fs/xfs/xfs_reflink.c
> > > +++ b/fs/xfs/xfs_reflink.c
> > > @@ -234,73 +234,6 @@ xfs_reflink_trim_around_shared(
> > >  	}
> > >  }
> > >  
> > > -/*
> > > - * Trim the passed in imap to the next shared/unshared extent boundary, and
> > > - * if imap->br_startoff points to a shared extent reserve space for it in the
> > > - * COW fork.
> > > - *
> > > - * Note that imap will always contain the block numbers for the existing blocks
> > > - * in the data fork, as the upper layers need them for read-modify-write
> > > - * operations.
> > > - */
> > > -int
> > > -xfs_reflink_reserve_cow(
> > > -	struct xfs_inode	*ip,
> > > -	struct xfs_bmbt_irec	*imap)
> > > -{
> > > -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
> > > -	struct xfs_bmbt_irec	got;
> > > -	int			error = 0;
> > > -	bool			eof = false;
> > > -	struct xfs_iext_cursor	icur;
> > > -	bool			shared;
> > > -
> > > -	/*
> > > -	 * Search the COW fork extent list first.  This serves two purposes:
> > > -	 * first this implement the speculative preallocation using cowextisze,
> > > -	 * so that we also unshared block adjacent to shared blocks instead
> > > -	 * of just the shared blocks themselves.  Second the lookup in the
> > > -	 * extent list is generally faster than going out to the shared extent
> > > -	 * tree.
> > > -	 */
> > > -
> > > -	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
> > > -		eof = true;
> > > -	if (!eof && got.br_startoff <= imap->br_startoff) {
> > > -		trace_xfs_reflink_cow_found(ip, imap);
> > > -		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> > > -		return 0;
> > > -	}
> > > -
> > > -	/* Trim the mapping to the nearest shared extent boundary. */
> > > -	error = xfs_reflink_trim_around_shared(ip, imap, &shared);
> > > -	if (error)
> > > -		return error;
> > > -
> > > -	/* Not shared?  Just report the (potentially capped) extent. */
> > > -	if (!shared)
> > > -		return 0;
> > > -
> > > -	/*
> > > -	 * Fork all the shared blocks from our write offset until the end of
> > > -	 * the extent.
> > > -	 */
> > > -	error = xfs_qm_dqattach_locked(ip, false);
> > > -	if (error)
> > > -		return error;
> > > -
> > > -	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
> > > -			imap->br_blockcount, 0, &got, &icur, eof);
> > > -	if (error == -ENOSPC || error == -EDQUOT)
> > > -		trace_xfs_reflink_cow_enospc(ip, imap);
> > > -	if (error)
> > > -		return error;
> > > -
> > > -	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> > > -	trace_xfs_reflink_cow_alloc(ip, &got);
> > > -	return 0;
> > > -}
> > > -
> > >  /* Convert part of an unwritten CoW extent to a real one. */
> > >  STATIC int
> > >  xfs_reflink_convert_cow_extent(
> > > diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> > > index 70d68a1a9b49..4a9e3cd4768a 100644
> > > --- a/fs/xfs/xfs_reflink.h
> > > +++ b/fs/xfs/xfs_reflink.h
> > > @@ -12,8 +12,6 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
> > >  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
> > >  		struct xfs_bmbt_irec *irec, bool *shared);
> > >  
> > > -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
> > > -		struct xfs_bmbt_irec *imap);
> > >  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
> > >  		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> > >  		unsigned iomap_flags);
> > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > index f1e18ae8a209..47fb07d86efd 100644
> > > --- a/fs/xfs/xfs_trace.h
> > > +++ b/fs/xfs/xfs_trace.h
> > > @@ -3196,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
> > >  
> > >  /* copy on write */
> > >  DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
> > > -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
> > >  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
> > >  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
> > >  DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
> > >  
> > > -DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
> > > -
> > >  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
> > >  
> > >  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
> > > -- 
> > > 2.20.1
> > >
Christoph Hellwig Feb. 22, 2019, 2:20 p.m. UTC | #5
On Thu, Feb 21, 2019 at 12:59:03PM -0500, Brian Foster wrote:
> On Mon, Feb 18, 2019 at 10:18:24AM +0100, Christoph Hellwig wrote:
> > Besides simplifying the code a bit this allows to actually implement
> > the behavior of using COW preallocation for non-COW data mentioned
> > in the current comments.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> 
> Heh, I'm somewhat amused by the fact that I sent a variant of this patch
> two years ago (for a different purpose) and you explicitly complained
> about the factoring. I'm glad you've finally come around. ;)
> 
> https://marc.info/?l=linux-xfs&m=149498124730442&w=2



> 
> >  fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
> >  fs/xfs/xfs_reflink.c |  67 ----------------------
> >  fs/xfs/xfs_reflink.h |   2 -
> >  fs/xfs/xfs_trace.h   |   3 -
> >  4 files changed, 94 insertions(+), 111 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > index 19a3331b4a56..c9fd1e4a1f99 100644
> > --- a/fs/xfs/xfs_iomap.c
> > +++ b/fs/xfs/xfs_iomap.c
> ...
> > @@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay(
> >  
> >  	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> >  
> > -	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
> > +	/*
> > +	 * Search the data fork fork first to look up our source mapping.  We
> > +	 * always need the data fork map, as we have to return it to the
> > +	 * iomap code so that the higher level write code can read data in to
> > +	 * perform read-modify-write cycles for unaligned writes.
> > +	 */
> > +	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
> >  	if (eof)
> > -		got.br_startoff = end_fsb; /* fake hole until the end */
> > +		imap.br_startoff = end_fsb; /* fake hole until the end */
> > +
> > +	/* We never need to allocate blocks for zeroing a hole. */
> > +	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
> > +		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
> > +		goto out_unlock;
> > +	}
> > +
> 
> So does this need to account for the case of an overlapping cow block
> over a hole in the data fork (with cached data, if that is possible)?
> IIUC we introduce that possibility just below.

Yes, it probably should, although I need to find a reproducer for that
first. 

> > +		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
> > +		 * pages to keep the chunks of work done where somewhat
> > +		 * symmetric with the work writeback does.  This is a completely
> > +		 * arbitrary number pulled out of thin air.
> > +		 *
> > +		 * Note that the values needs to be less than 32-bits wide until
> > +		 * the lower level functions are updated.
> > +		 */
> > +		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> > +		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> 
> The existing code doesn't currently do this but ISTM this should apply
> to either allocation case, not just data fork delalloc. That could be
> something for a separate patch though.

I wonder if we need to keep this cap at all; we apply it very
inconsistently through the writeback path.

> > @@ -659,9 +703,20 @@ xfs_file_iomap_begin_delay(
> >  	 * them out if the write happens to fail.
> >  	 */
> >  	iomap->flags |= IOMAP_F_NEW;
> 
> This looks like it flags the mapping new if we reserve cow blocks, which
> I don't think is quite right.

To some extent marking it as new makes a lot of sense, especially if
allocating to a hole.  But we probably only want it for that latter
case.

> 
> > -	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
> > +	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
> > +			whichfork == XFS_DATA_FORK ? &imap : &cmap);
> >  done:
> > -	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
> > +	if (whichfork == XFS_COW_FORK) {
> > +		if (imap.br_startoff > offset_fsb) {
> > +			xfs_trim_extent(&cmap, offset_fsb,
> > +					imap.br_startoff - offset_fsb);
> > +			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
> > +			goto out_unlock;
> > +		}
> 
> Hmm, so this looks like it is in fact handling a COW blocks over a hole
> case, pushing the COW mapping into the iomap. We never accounted for
> that case before where we'd always just allocate to the data fork. The
> change in behavior probably makes sense, but this really should be
> separate from the refactoring bits to reuse the data fork delalloc code.
> Beyond making this a bit easier to follow, it warrants its own commit
> log description and this one makes no mention of it at all.

Look at the last sentence of the commit log..
Christoph Hellwig Feb. 22, 2019, 2:22 p.m. UTC | #6
On Thu, Feb 21, 2019 at 01:30:52PM -0800, Darrick J. Wong wrote:
> > So does this need to account for the case of an overlapping cow block
> > over a hole in the data fork (with cached data, if that is possible)?
> > IIUC we introduce that possibility just below.
> 
> I think it makes sense to ignore overlapping cow blocks for zeroing a
> hole in the data fork -- the user told us to zero part of a file that
> didn't have nonzero contents in it, so we just leave the speculative cow
> allocation for that block alone.

For a speculative preallocation I agree.  But if we have valid data
in there due to a cowextsize preallocation being used for data we
should handle it properly.

> > >  	iomap->flags |= IOMAP_F_NEW;
> > 
> > This looks like it flags the mapping new if we reserve cow blocks, which
> > I don't think is quite right.
> 
> Hmmm.  I thought it was correct -- if the write fails, we punch out the
> pagecache and trim any delalloc blocks in the data fork.  If they user
> tries to reread the area of the failed write we'll just read them back
> in from disk...?

Yes, I don't think it is actively harmful, but we don't really need
it either.
Brian Foster Feb. 22, 2019, 3:20 p.m. UTC | #7
On Fri, Feb 22, 2019 at 03:20:58PM +0100, Christoph Hellwig wrote:
> On Thu, Feb 21, 2019 at 12:59:03PM -0500, Brian Foster wrote:
> > On Mon, Feb 18, 2019 at 10:18:24AM +0100, Christoph Hellwig wrote:
> > > Besides simplifying the code a bit this allows to actually implement
> > > the behavior of using COW preallocation for non-COW data mentioned
> > > in the current comments.
> > > 
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > > ---
> > 
> > Heh, I'm somewhat amused by the fact that I sent a variant of this patch
> > two years ago (for a different purpose) and you explicitly complained
> > about the factoring. I'm glad you've finally come around. ;)
> > 
> > https://marc.info/?l=linux-xfs&m=149498124730442&w=2
> 
> 
> 
> > 
> > >  fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
> > >  fs/xfs/xfs_reflink.c |  67 ----------------------
> > >  fs/xfs/xfs_reflink.h |   2 -
> > >  fs/xfs/xfs_trace.h   |   3 -
> > >  4 files changed, 94 insertions(+), 111 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > > index 19a3331b4a56..c9fd1e4a1f99 100644
> > > --- a/fs/xfs/xfs_iomap.c
> > > +++ b/fs/xfs/xfs_iomap.c
> > ...
> > > @@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay(
> > >  
> > >  	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> > >  
> > > -	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
> > > +	/*
> > > +	 * Search the data fork fork first to look up our source mapping.  We
> > > +	 * always need the data fork map, as we have to return it to the
> > > +	 * iomap code so that the higher level write code can read data in to
> > > +	 * perform read-modify-write cycles for unaligned writes.
> > > +	 */
> > > +	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
> > >  	if (eof)
> > > -		got.br_startoff = end_fsb; /* fake hole until the end */
> > > +		imap.br_startoff = end_fsb; /* fake hole until the end */
> > > +
> > > +	/* We never need to allocate blocks for zeroing a hole. */
> > > +	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
> > > +		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
> > > +		goto out_unlock;
> > > +	}
> > > +
> > 
> > So does this need to account for the case of an overlapping cow block
> > over a hole in the data fork (with cached data, if that is possible)?
> > IIUC we introduce that possibility just below.
> 
> Yes, it probably should, although I need to find a reproducer for that
> first. 
> 

See my other reply. I don't think it's currently reproducible, but it
does technically break the iomap_zero_range() mechanism.

> > > +		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
> > > +		 * pages to keep the chunks of work done where somewhat
> > > +		 * symmetric with the work writeback does.  This is a completely
> > > +		 * arbitrary number pulled out of thin air.
> > > +		 *
> > > +		 * Note that the values needs to be less than 32-bits wide until
> > > +		 * the lower level functions are updated.
> > > +		 */
> > > +		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> > > +		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> > 
> > The existing code doesn't currently do this but ISTM this should apply
> > to either allocation case, not just data fork delalloc. That could be
> > something for a separate patch though.
> 
> I wonder if we need to keep this cap at all, we apply it very
> inconsistently through the writeback path.
> 

Perhaps we can just kill it off then.

> > > @@ -659,9 +703,20 @@ xfs_file_iomap_begin_delay(
> > >  	 * them out if the write happens to fail.
> > >  	 */
> > >  	iomap->flags |= IOMAP_F_NEW;
> > 
> > This looks like it flags the mapping new if we reserve cow blocks, which
> > I don't think is quite right.
> 
> To some extent marking it as new makes a lot of sense, especially if
> allocating to a hole.  But we probably only want it for that latter
> case.
> 

The allocation over a hole case makes more sense to me, but there's also
the case of cow fork delalloc over data fork delalloc. I think we need
some explicit definition of expected behavior here, not to just set the
flag based on what the current error handler happens to do. Perhaps that
might involve fixing up the error handling context to deal with the cow
fork as well.

> > 
> > > -	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
> > > +	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
> > > +			whichfork == XFS_DATA_FORK ? &imap : &cmap);
> > >  done:
> > > -	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
> > > +	if (whichfork == XFS_COW_FORK) {
> > > +		if (imap.br_startoff > offset_fsb) {
> > > +			xfs_trim_extent(&cmap, offset_fsb,
> > > +					imap.br_startoff - offset_fsb);
> > > +			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
> > > +			goto out_unlock;
> > > +		}
> > 
> > Hmm, so this looks like it is in fact handling a COW blocks over a hole
> > case, pushing the COW mapping into the iomap. We never accounted for
> > that case before where we'd always just allocate to the data fork. The
> > change in behavior probably makes sense, but this really should be
> > separate from the refactoring bits to reuse the data fork delalloc code.
> > Beyond making this a bit easier to follow, it warrants its own commit
> > log description and this one makes no mention of it at all.
> 
> Look at the last sentence of the commit log..

Ok. I didn't follow that the first time I read it because I thought it
referred to handling COW overlap in writeback, which we already do. It
wasn't until seeing the code (and the in-line comment) and
distinguishing that from the refactoring bits that I realized this
allows for use of existing cow blocks over data fork holes.

So it's not fair to say the commit log doesn't mention it at all, but I
still think that this should have been separate from code reuse
refactoring and warrants more explanation and description than a single
sentence. At minimum, the commit log should describe the current
behavior, the change in behavior and the reason for changing it.

Of course, I guess this is merged now so it doesn't really matter..

Brian
diff mbox series

Patch

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 19a3331b4a56..c9fd1e4a1f99 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -534,15 +534,16 @@  xfs_file_iomap_begin_delay(
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	xfs_fileoff_t		maxbytes_fsb =
 		XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
 	xfs_fileoff_t		end_fsb;
-	int			error = 0, eof = 0;
-	struct xfs_bmbt_irec	got;
-	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	imap, cmap;
+	struct xfs_iext_cursor	icur, ccur;
 	xfs_fsblock_t		prealloc_blocks = 0;
+	bool			eof = false, cow_eof = false, shared;
+	int			whichfork = XFS_DATA_FORK;
+	int			error = 0;
 
 	ASSERT(!XFS_IS_REALTIME_INODE(ip));
 	ASSERT(!xfs_get_extsz_hint(ip));
@@ -560,7 +561,7 @@  xfs_file_iomap_begin_delay(
 
 	XFS_STATS_INC(mp, xs_blk_mapw);
 
-	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+	if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
 		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
 		if (error)
 			goto out_unlock;
@@ -568,51 +569,92 @@  xfs_file_iomap_begin_delay(
 
 	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
 
-	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
+	/*
+	 * Search the data fork fork first to look up our source mapping.  We
+	 * always need the data fork map, as we have to return it to the
+	 * iomap code so that the higher level write code can read data in to
+	 * perform read-modify-write cycles for unaligned writes.
+	 */
+	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
 	if (eof)
-		got.br_startoff = end_fsb; /* fake hole until the end */
+		imap.br_startoff = end_fsb; /* fake hole until the end */
+
+	/* We never need to allocate blocks for zeroing a hole. */
+	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+		goto out_unlock;
+	}
+
+	/*
+	 * Search the COW fork extent list even if we did not find a data fork
+	 * extent.  This serves two purposes: first this implements the
+	 * speculative preallocation using cowextsize, so that we also unshare
+	 * block adjacent to shared blocks instead of just the shared blocks
+	 * themselves.  Second the lookup in the extent list is generally faster
+	 * than going out to the shared extent tree.
+	 */
+	if (xfs_is_reflink_inode(ip)) {
+		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+				&ccur, &cmap);
+		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+			trace_xfs_reflink_cow_found(ip, &cmap);
+			whichfork = XFS_COW_FORK;
+			goto done;
+		}
+	}
 
-	if (got.br_startoff <= offset_fsb) {
+	if (imap.br_startoff <= offset_fsb) {
 		/*
 		 * For reflink files we may need a delalloc reservation when
 		 * overwriting shared extents.   This includes zeroing of
 		 * existing extents that contain data.
 		 */
-		if (xfs_is_reflink_inode(ip) &&
-		    ((flags & IOMAP_WRITE) ||
-		     got.br_state != XFS_EXT_UNWRITTEN)) {
-			xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
-			error = xfs_reflink_reserve_cow(ip, &got);
-			if (error)
-				goto out_unlock;
+		if (!xfs_is_reflink_inode(ip) ||
+		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+					&imap);
+			goto done;
 		}
 
-		trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &got);
-		goto done;
-	}
+		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
 
-	if (flags & IOMAP_ZERO) {
-		xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
-		goto out_unlock;
+		/* Trim the mapping to the nearest shared extent boundary. */
+		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
+		if (error)
+			goto out_unlock;
+
+		/* Not shared?  Just report the (potentially capped) extent. */
+		if (!shared) {
+			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+					&imap);
+			goto done;
+		}
+
+		/*
+		 * Fork all the shared blocks from our write offset until the
+		 * end of the extent.
+		 */
+		whichfork = XFS_COW_FORK;
+		end_fsb = imap.br_startoff + imap.br_blockcount;
+	} else {
+		/*
+		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+		 * pages to keep the chunks of work done where somewhat
+		 * symmetric with the work writeback does.  This is a completely
+		 * arbitrary number pulled out of thin air.
+		 *
+		 * Note that the values needs to be less than 32-bits wide until
+		 * the lower level functions are updated.
+		 */
+		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
 	}
 
 	error = xfs_qm_dqattach_locked(ip, false);
 	if (error)
 		goto out_unlock;
 
-	/*
-	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
-	 * to keep the chunks of work done where somewhat symmetric with the
-	 * work writeback does. This is a completely arbitrary number pulled
-	 * out of thin air as a best guess for initial testing.
-	 *
-	 * Note that the values needs to be less than 32-bits wide until
-	 * the lower level functions are updated.
-	 */
-	count = min_t(loff_t, count, 1024 * PAGE_SIZE);
-	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
-	if (eof) {
+	if (eof && whichfork == XFS_DATA_FORK) {
 		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
 				&icur);
 		if (prealloc_blocks) {
@@ -635,9 +677,11 @@  xfs_file_iomap_begin_delay(
 	}
 
 retry:
-	error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
-			end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
-			eof);
+	error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
+			end_fsb - offset_fsb, prealloc_blocks,
+			whichfork == XFS_DATA_FORK ? &imap : &cmap,
+			whichfork == XFS_DATA_FORK ? &icur : &ccur,
+			whichfork == XFS_DATA_FORK ? eof : cow_eof);
 	switch (error) {
 	case 0:
 		break;
@@ -659,9 +703,20 @@  xfs_file_iomap_begin_delay(
 	 * them out if the write happens to fail.
 	 */
 	iomap->flags |= IOMAP_F_NEW;
-	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
+	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
+			whichfork == XFS_DATA_FORK ? &imap : &cmap);
 done:
-	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
+	if (whichfork == XFS_COW_FORK) {
+		if (imap.br_startoff > offset_fsb) {
+			xfs_trim_extent(&cmap, offset_fsb,
+					imap.br_startoff - offset_fsb);
+			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
+			goto out_unlock;
+		}
+		/* ensure we only report blocks we have a reservation for */
+		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
+	}
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
 out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 8a5353daf9ab..9ef1f79cb3ae 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -234,73 +234,6 @@  xfs_reflink_trim_around_shared(
 	}
 }
 
-/*
- * Trim the passed in imap to the next shared/unshared extent boundary, and
- * if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork.
- *
- * Note that imap will always contain the block numbers for the existing blocks
- * in the data fork, as the upper layers need them for read-modify-write
- * operations.
- */
-int
-xfs_reflink_reserve_cow(
-	struct xfs_inode	*ip,
-	struct xfs_bmbt_irec	*imap)
-{
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-	struct xfs_bmbt_irec	got;
-	int			error = 0;
-	bool			eof = false;
-	struct xfs_iext_cursor	icur;
-	bool			shared;
-
-	/*
-	 * Search the COW fork extent list first.  This serves two purposes:
-	 * first this implement the speculative preallocation using cowextisze,
-	 * so that we also unshared block adjacent to shared blocks instead
-	 * of just the shared blocks themselves.  Second the lookup in the
-	 * extent list is generally faster than going out to the shared extent
-	 * tree.
-	 */
-
-	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
-		eof = true;
-	if (!eof && got.br_startoff <= imap->br_startoff) {
-		trace_xfs_reflink_cow_found(ip, imap);
-		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-		return 0;
-	}
-
-	/* Trim the mapping to the nearest shared extent boundary. */
-	error = xfs_reflink_trim_around_shared(ip, imap, &shared);
-	if (error)
-		return error;
-
-	/* Not shared?  Just report the (potentially capped) extent. */
-	if (!shared)
-		return 0;
-
-	/*
-	 * Fork all the shared blocks from our write offset until the end of
-	 * the extent.
-	 */
-	error = xfs_qm_dqattach_locked(ip, false);
-	if (error)
-		return error;
-
-	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
-			imap->br_blockcount, 0, &got, &icur, eof);
-	if (error == -ENOSPC || error == -EDQUOT)
-		trace_xfs_reflink_cow_enospc(ip, imap);
-	if (error)
-		return error;
-
-	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-	trace_xfs_reflink_cow_alloc(ip, &got);
-	return 0;
-}
-
 /* Convert part of an unwritten CoW extent to a real one. */
 STATIC int
 xfs_reflink_convert_cow_extent(
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 70d68a1a9b49..4a9e3cd4768a 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -12,8 +12,6 @@  extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
 extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *irec, bool *shared);
 
-extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
-		struct xfs_bmbt_irec *imap);
 extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
 		unsigned iomap_flags);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f1e18ae8a209..47fb07d86efd 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3196,13 +3196,10 @@  DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
 
 /* copy on write */
 DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
 
-DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
-
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
 
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);