diff mbox series

xfs: fix transaction leak in xfs_reflink_allocate_cow()

Message ID 20180907015113.32448-1-david@fromorbit.com (mailing list archive)
State Superseded
Headers show
Series xfs: fix transaction leak in xfs_reflink_allocate_cow() | expand

Commit Message

Dave Chinner Sept. 7, 2018, 1:51 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

When xfs_reflink_allocate_cow() allocates a transaction, it drops
the ILOCK to perform the operation. This introduces a race condition
where another thread modifying the file can perform the COW
allocation operation underneath us. This results in the retry loop
finding an allocated block and jumping straight to the conversion
code. It does not, however, cancel the transaction it holds and so
this gets leaked. This results in a lockdep warning:

Comments

Darrick J. Wong Sept. 7, 2018, 2:40 a.m. UTC | #1
On Fri, Sep 07, 2018 at 11:51:13AM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> When xfs_reflink_allocate_cow() allocates a transaction, it drops
> the ILOCK to perform the operation. This introduces a race condition
> where another thread modifying the file can perform the COW
> allocation operation underneath us. This results in the retry loop
> finding an allocated block and jumping straight to the conversion
> code. It does not, however, cancel the transaction it holds and so
> this gets leaked. This results in a lockdep warning:
> 
> ================================================
> WARNING: lock held when returning to user space!
> 4.18.5 #1 Not tainted
> ------------------------------------------------
> worker/6123 is leaving the kernel with locks still held!
> 1 lock held by worker/6123:
>  #0: 000000009eab4f1b (sb_internal#2){.+.+}, at: xfs_trans_alloc+0x17c/0x220
> 
> And eventually the filesystem deadlocks because it runs out of log
> space that is reserved by the leaked transaction and never gets
> released.
> 
> The logic flow in xfs_reflink_allocate_cow() is a convoluted mess of
> gotos - it's no surprise that it has a bug where the flow through
> several goto jumps then fails to clean up context from a non-obvious
> logic path. Clean up the logic flow and make sure every path does
> the right thing.

Ugh, sorry about leaving that mess.

I haven't tested this at all, but it looks right.  If the original
reporter tests it and the problem goes away I'm willing to attach a:

Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

> Reported-by: Alexander Y. Fomichev <git.user@gmail.com>
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=200981
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_reflink.c | 98 +++++++++++++++++++++++++++++---------------
>  1 file changed, 66 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 38f405415b88..b39f5afa57aa 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -352,6 +352,47 @@ xfs_reflink_convert_cow(
>  	return error;
>  }
>  
> +/*
> + * Find the extent that maps the given range in the COW fork. Even if the extent
> + * is not shared we might have a preallocation for it in the COW fork. If so we
> + * use it that rather than trigger a new allocation.
> + */
> +static int
> +find_trim_cow_extent(
> +	struct xfs_inode	*ip,
> +	struct xfs_bmbt_irec	*imap,
> +	bool			*shared,
> +	bool			*found)
> +{
> +	xfs_fileoff_t		offset_fsb = imap->br_startoff;
> +	xfs_filblks_t		count_fsb = imap->br_blockcount;
> +	struct xfs_iext_cursor	icur;
> +	struct xfs_bmbt_irec	got;
> +	bool			trimmed;
> +
> +	*found = false;
> +
> +	/*
> +	 * if we don't find an overlapping extent, trim the range we need to
> +	 * allocate to fit the hole we found.
> +	 */
> +	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
> +	    got.br_startoff > offset_fsb)
> +		return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
> +
> +	*shared = true;
> +	if (isnullstartblock(got.br_startblock)) {
> +		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> +		return 0;
> +	}
> +
> +	/* real extent found - no need to allocate */
> +	xfs_trim_extent(&got, offset_fsb, count_fsb);
> +	*imap = got;
> +	*found = true;
> +	return 0;
> +}
> +
>  /* Allocate all CoW reservations covering a range of blocks in a file. */
>  int
>  xfs_reflink_allocate_cow(
> @@ -363,41 +404,37 @@ xfs_reflink_allocate_cow(
>  	struct xfs_mount	*mp = ip->i_mount;
>  	xfs_fileoff_t		offset_fsb = imap->br_startoff;
>  	xfs_filblks_t		count_fsb = imap->br_blockcount;
> -	struct xfs_bmbt_irec	got;
>  	struct xfs_trans	*tp = NULL;
>  	int			nimaps, error = 0;
> -	bool			trimmed;
> +	bool			found;
>  	xfs_filblks_t		resaligned;
>  	xfs_extlen_t		resblks = 0;
> -	struct xfs_iext_cursor	icur;
>  
> -retry:
> -	ASSERT(xfs_is_reflink_inode(ip));
>  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
>  
>  	/*
> -	 * Even if the extent is not shared we might have a preallocation for
> -	 * it in the COW fork.  If so use it.
> +	 * do-while to run a single lookup retry after transaction allocation.
> +	 * All exits from the loop need to check if the transaction needs
> +	 * cancelling. Dropping the ILOCK to allocate the transaction allows
> +	 * races with other COW fork allocations, so we need to start the extent
> +	 * lookup from scratch once we have the ILOCK again.
>  	 */
> -	if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) &&
> -	    got.br_startoff <= offset_fsb) {
> -		*shared = true;
> +	do {
> +		ASSERT(xfs_is_reflink_inode(ip));
>  
> -		/* If we have a real allocation in the COW fork we're done. */
> -		if (!isnullstartblock(got.br_startblock)) {
> -			xfs_trim_extent(&got, offset_fsb, count_fsb);
> -			*imap = got;
> +		error = find_trim_cow_extent(ip, imap, shared, &found);
> +		if (error || !*shared)
> +			goto out_trans_cancel;
> +		if (found) {
> +			if (tp)
> +				xfs_trans_cancel(tp);
>  			goto convert;
>  		}
>  
> -		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> -	} else {
> -		error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
> -		if (error || !*shared)
> -			goto out;
> -	}
> +		/* if we have a transaction, it's time to allocate */
> +		if (tp)
> +			break;
>  
> -	if (!tp) {
>  		resaligned = xfs_aligned_fsb_count(imap->br_startoff,
>  			imap->br_blockcount, xfs_get_cowextsz_hint(ip));
>  		resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
> @@ -412,29 +449,25 @@ xfs_reflink_allocate_cow(
>  
>  		error = xfs_qm_dqattach_locked(ip, false);
>  		if (error)
> -			goto out;
> -		goto retry;
> -	}
> +			goto out_trans_cancel;
> +	} while (tp);
>  
>  	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
>  			XFS_QMOPT_RES_REGBLKS);
>  	if (error)
> -		goto out;
> +		goto out_trans_cancel;
>  
>  	xfs_trans_ijoin(tp, ip, 0);
>  
> -	nimaps = 1;
> -
>  	/* Allocate the entire reservation as unwritten blocks. */
> +	nimaps = 1;
>  	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
>  			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
>  			resblks, imap, &nimaps);
>  	if (error)
> -		goto out_trans_cancel;
> +		goto out_unreserve;
>  
>  	xfs_inode_set_cowblocks_tag(ip);
> -
> -	/* Finish up. */
>  	error = xfs_trans_commit(tp);
>  	if (error)
>  		return error;
> @@ -447,10 +480,11 @@ xfs_reflink_allocate_cow(
>  		return -ENOSPC;
>  convert:
>  	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
> -out_trans_cancel:
> +
> +out_unreserve:
>  	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
>  			XFS_QMOPT_RES_REGBLKS);
> -out:
> +out_trans_cancel:
>  	if (tp)
>  		xfs_trans_cancel(tp);
>  	return error;
> -- 
> 2.17.0
>
Alexander Y. Fomichev Sept. 7, 2018, 1:11 p.m. UTC | #2
On Thu, 6 Sep 2018 19:40:47 -0700
"Darrick J. Wong" <darrick.wong@oracle.com> wrote:

> On Fri, Sep 07, 2018 at 11:51:13AM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > When xfs_reflink_allocate_cow() allocates a transaction, it drops
> > the ILOCK to perform the operation. This introduces a race condition
> > where another thread modifying the file can perform the COW
> > allocation operation underneath us. This results in the retry loop
> > finding an allocated block and jumping straight to the conversion
> > code. It does not, however, cancel the transaction it holds and so
> > this gets leaked. This results in a lockdep warning:
> > 
> > ================================================
> > WARNING: lock held when returning to user space!
> > 4.18.5 #1 Not tainted
> > ------------------------------------------------
> > worker/6123 is leaving the kernel with locks still held!
> > 1 lock held by worker/6123:
> >  #0: 000000009eab4f1b (sb_internal#2){.+.+}, at: xfs_trans_alloc+0x17c/0x220
> > 
> > And eventually the filesystem deadlocks because it runs out of log
> > space that is reserved by the leaked transaction and never gets
> > released.
> > 
> > The logic flow in xfs_reflink_allocate_cow() is a convoluted mess of
> > gotos - it's no surprise that it has a bug where the flow through
> > several goto jumps then fails to clean up context from a non-obvious
> > logic path. Clean up the logic flow and make sure every path does
> > the right thing.
> 
> Ugh, sorry about leaving that mess.
> 
> I haven't tested this at all, but it looks right.  If the original
> reporter tests it and the problem goes away I'm willing to attach a:
> 
> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> 
> --D
>

Tested and I can confirm.
Patch works as expected for v4.19 and (with minimal modifications) for 4.18.
Thanks.

> > Reported-by: Alexander Y. Fomichev <git.user@gmail.com>
> > Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=200981
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/xfs_reflink.c | 98 +++++++++++++++++++++++++++++---------------
> >  1 file changed, 66 insertions(+), 32 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > index 38f405415b88..b39f5afa57aa 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> > @@ -352,6 +352,47 @@ xfs_reflink_convert_cow(
> >  	return error;
> >  }
> >  
> > +/*
> > + * Find the extent that maps the given range in the COW fork. Even if the extent
> > + * is not shared we might have a preallocation for it in the COW fork. If so we
> > + * use it that rather than trigger a new allocation.
> > + */
> > +static int
> > +find_trim_cow_extent(
> > +	struct xfs_inode	*ip,
> > +	struct xfs_bmbt_irec	*imap,
> > +	bool			*shared,
> > +	bool			*found)
> > +{
> > +	xfs_fileoff_t		offset_fsb = imap->br_startoff;
> > +	xfs_filblks_t		count_fsb = imap->br_blockcount;
> > +	struct xfs_iext_cursor	icur;
> > +	struct xfs_bmbt_irec	got;
> > +	bool			trimmed;
> > +
> > +	*found = false;
> > +
> > +	/*
> > +	 * if we don't find an overlapping extent, trim the range we need to
> > +	 * allocate to fit the hole we found.
> > +	 */
> > +	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
> > +	    got.br_startoff > offset_fsb)
> > +		return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
> > +
> > +	*shared = true;
> > +	if (isnullstartblock(got.br_startblock)) {
> > +		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> > +		return 0;
> > +	}
> > +
> > +	/* real extent found - no need to allocate */
> > +	xfs_trim_extent(&got, offset_fsb, count_fsb);
> > +	*imap = got;
> > +	*found = true;
> > +	return 0;
> > +}
> > +
> >  /* Allocate all CoW reservations covering a range of blocks in a file. */
> >  int
> >  xfs_reflink_allocate_cow(
> > @@ -363,41 +404,37 @@ xfs_reflink_allocate_cow(
> >  	struct xfs_mount	*mp = ip->i_mount;
> >  	xfs_fileoff_t		offset_fsb = imap->br_startoff;
> >  	xfs_filblks_t		count_fsb = imap->br_blockcount;
> > -	struct xfs_bmbt_irec	got;
> >  	struct xfs_trans	*tp = NULL;
> >  	int			nimaps, error = 0;
> > -	bool			trimmed;
> > +	bool			found;
> >  	xfs_filblks_t		resaligned;
> >  	xfs_extlen_t		resblks = 0;
> > -	struct xfs_iext_cursor	icur;
> >  
> > -retry:
> > -	ASSERT(xfs_is_reflink_inode(ip));
> >  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> >  
> >  	/*
> > -	 * Even if the extent is not shared we might have a preallocation for
> > -	 * it in the COW fork.  If so use it.
> > +	 * do-while to run a single lookup retry after transaction allocation.
> > +	 * All exits from the loop need to check if the transaction needs
> > +	 * cancelling. Dropping the ILOCK to allocate the transaction allows
> > +	 * races with other COW fork allocations, so we need to start the extent
> > +	 * lookup from scratch once we have the ILOCK again.
> >  	 */
> > -	if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) &&
> > -	    got.br_startoff <= offset_fsb) {
> > -		*shared = true;
> > +	do {
> > +		ASSERT(xfs_is_reflink_inode(ip));
> >  
> > -		/* If we have a real allocation in the COW fork we're done. */
> > -		if (!isnullstartblock(got.br_startblock)) {
> > -			xfs_trim_extent(&got, offset_fsb, count_fsb);
> > -			*imap = got;
> > +		error = find_trim_cow_extent(ip, imap, shared, &found);
> > +		if (error || !*shared)
> > +			goto out_trans_cancel;
> > +		if (found) {
> > +			if (tp)
> > +				xfs_trans_cancel(tp);
> >  			goto convert;
> >  		}
> >  
> > -		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> > -	} else {
> > -		error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
> > -		if (error || !*shared)
> > -			goto out;
> > -	}
> > +		/* if we have a transaction, it's time to allocate */
> > +		if (tp)
> > +			break;
> >  
> > -	if (!tp) {
> >  		resaligned = xfs_aligned_fsb_count(imap->br_startoff,
> >  			imap->br_blockcount, xfs_get_cowextsz_hint(ip));
> >  		resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
> > @@ -412,29 +449,25 @@ xfs_reflink_allocate_cow(
> >  
> >  		error = xfs_qm_dqattach_locked(ip, false);
> >  		if (error)
> > -			goto out;
> > -		goto retry;
> > -	}
> > +			goto out_trans_cancel;
> > +	} while (tp);
> >  
> >  	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
> >  			XFS_QMOPT_RES_REGBLKS);
> >  	if (error)
> > -		goto out;
> > +		goto out_trans_cancel;
> >  
> >  	xfs_trans_ijoin(tp, ip, 0);
> >  
> > -	nimaps = 1;
> > -
> >  	/* Allocate the entire reservation as unwritten blocks. */
> > +	nimaps = 1;
> >  	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
> >  			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
> >  			resblks, imap, &nimaps);
> >  	if (error)
> > -		goto out_trans_cancel;
> > +		goto out_unreserve;
> >  
> >  	xfs_inode_set_cowblocks_tag(ip);
> > -
> > -	/* Finish up. */
> >  	error = xfs_trans_commit(tp);
> >  	if (error)
> >  		return error;
> > @@ -447,10 +480,11 @@ xfs_reflink_allocate_cow(
> >  		return -ENOSPC;
> >  convert:
> >  	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
> > -out_trans_cancel:
> > +
> > +out_unreserve:
> >  	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
> >  			XFS_QMOPT_RES_REGBLKS);
> > -out:
> > +out_trans_cancel:
> >  	if (tp)
> >  		xfs_trans_cancel(tp);
> >  	return error;
> > -- 
> > 2.17.0
> >
Brian Foster Sept. 7, 2018, 2:03 p.m. UTC | #3
On Fri, Sep 07, 2018 at 11:51:13AM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> When xfs_reflink_allocate_cow() allocates a transaction, it drops
> the ILOCK to perform the operation. This introduces a race condition
> where another thread modifying the file can perform the COW
> allocation operation underneath us. This results in the retry loop
> finding an allocated block and jumping straight to the conversion
> code. It does not, however, cancel the transaction it holds and so
> this gets leaked. This results in a lockdep warning:
> 
> ================================================
> WARNING: lock held when returning to user space!
> 4.18.5 #1 Not tainted
> ------------------------------------------------
> worker/6123 is leaving the kernel with locks still held!
> 1 lock held by worker/6123:
>  #0: 000000009eab4f1b (sb_internal#2){.+.+}, at: xfs_trans_alloc+0x17c/0x220
> 
> And eventually the filesystem deadlocks because it runs out of log
> space that is reserved by the leaked transaction and never gets
> released.
> 
> The logic flow in xfs_reflink_allocate_cow() is a convoluted mess of
> gotos - it's no surprise that it has a bug where the flow through
> several goto jumps then fails to clean up context from a non-obvious
> logic path. Clean up the logic flow and make sure every path does
> the right thing.
> 
> Reported-by: Alexander Y. Fomichev <git.user@gmail.com>
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=200981
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---

Looks good to me:

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_reflink.c | 98 +++++++++++++++++++++++++++++---------------
>  1 file changed, 66 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 38f405415b88..b39f5afa57aa 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -352,6 +352,47 @@ xfs_reflink_convert_cow(
>  	return error;
>  }
>  
> +/*
> + * Find the extent that maps the given range in the COW fork. Even if the extent
> + * is not shared we might have a preallocation for it in the COW fork. If so we
> + * use it that rather than trigger a new allocation.
> + */
> +static int
> +find_trim_cow_extent(
> +	struct xfs_inode	*ip,
> +	struct xfs_bmbt_irec	*imap,
> +	bool			*shared,
> +	bool			*found)
> +{
> +	xfs_fileoff_t		offset_fsb = imap->br_startoff;
> +	xfs_filblks_t		count_fsb = imap->br_blockcount;
> +	struct xfs_iext_cursor	icur;
> +	struct xfs_bmbt_irec	got;
> +	bool			trimmed;
> +
> +	*found = false;
> +
> +	/*
> +	 * if we don't find an overlapping extent, trim the range we need to
> +	 * allocate to fit the hole we found.
> +	 */
> +	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
> +	    got.br_startoff > offset_fsb)
> +		return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
> +
> +	*shared = true;
> +	if (isnullstartblock(got.br_startblock)) {
> +		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> +		return 0;
> +	}
> +
> +	/* real extent found - no need to allocate */
> +	xfs_trim_extent(&got, offset_fsb, count_fsb);
> +	*imap = got;
> +	*found = true;
> +	return 0;
> +}
> +
>  /* Allocate all CoW reservations covering a range of blocks in a file. */
>  int
>  xfs_reflink_allocate_cow(
> @@ -363,41 +404,37 @@ xfs_reflink_allocate_cow(
>  	struct xfs_mount	*mp = ip->i_mount;
>  	xfs_fileoff_t		offset_fsb = imap->br_startoff;
>  	xfs_filblks_t		count_fsb = imap->br_blockcount;
> -	struct xfs_bmbt_irec	got;
>  	struct xfs_trans	*tp = NULL;
>  	int			nimaps, error = 0;
> -	bool			trimmed;
> +	bool			found;
>  	xfs_filblks_t		resaligned;
>  	xfs_extlen_t		resblks = 0;
> -	struct xfs_iext_cursor	icur;
>  
> -retry:
> -	ASSERT(xfs_is_reflink_inode(ip));
>  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
>  
>  	/*
> -	 * Even if the extent is not shared we might have a preallocation for
> -	 * it in the COW fork.  If so use it.
> +	 * do-while to run a single lookup retry after transaction allocation.
> +	 * All exits from the loop need to check if the transaction needs
> +	 * cancelling. Dropping the ILOCK to allocate the transaction allows
> +	 * races with other COW fork allocations, so we need to start the extent
> +	 * lookup from scratch once we have the ILOCK again.
>  	 */
> -	if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) &&
> -	    got.br_startoff <= offset_fsb) {
> -		*shared = true;
> +	do {
> +		ASSERT(xfs_is_reflink_inode(ip));
>  
> -		/* If we have a real allocation in the COW fork we're done. */
> -		if (!isnullstartblock(got.br_startblock)) {
> -			xfs_trim_extent(&got, offset_fsb, count_fsb);
> -			*imap = got;
> +		error = find_trim_cow_extent(ip, imap, shared, &found);
> +		if (error || !*shared)
> +			goto out_trans_cancel;
> +		if (found) {
> +			if (tp)
> +				xfs_trans_cancel(tp);
>  			goto convert;
>  		}
>  
> -		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> -	} else {
> -		error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
> -		if (error || !*shared)
> -			goto out;
> -	}
> +		/* if we have a transaction, it's time to allocate */
> +		if (tp)
> +			break;
>  
> -	if (!tp) {
>  		resaligned = xfs_aligned_fsb_count(imap->br_startoff,
>  			imap->br_blockcount, xfs_get_cowextsz_hint(ip));
>  		resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
> @@ -412,29 +449,25 @@ xfs_reflink_allocate_cow(
>  
>  		error = xfs_qm_dqattach_locked(ip, false);
>  		if (error)
> -			goto out;
> -		goto retry;
> -	}
> +			goto out_trans_cancel;
> +	} while (tp);
>  
>  	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
>  			XFS_QMOPT_RES_REGBLKS);
>  	if (error)
> -		goto out;
> +		goto out_trans_cancel;
>  
>  	xfs_trans_ijoin(tp, ip, 0);
>  
> -	nimaps = 1;
> -
>  	/* Allocate the entire reservation as unwritten blocks. */
> +	nimaps = 1;
>  	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
>  			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
>  			resblks, imap, &nimaps);
>  	if (error)
> -		goto out_trans_cancel;
> +		goto out_unreserve;
>  
>  	xfs_inode_set_cowblocks_tag(ip);
> -
> -	/* Finish up. */
>  	error = xfs_trans_commit(tp);
>  	if (error)
>  		return error;
> @@ -447,10 +480,11 @@ xfs_reflink_allocate_cow(
>  		return -ENOSPC;
>  convert:
>  	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
> -out_trans_cancel:
> +
> +out_unreserve:
>  	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
>  			XFS_QMOPT_RES_REGBLKS);
> -out:
> +out_trans_cancel:
>  	if (tp)
>  		xfs_trans_cancel(tp);
>  	return error;
> -- 
> 2.17.0
>
Christoph Hellwig Sept. 10, 2018, 7:02 a.m. UTC | #4
> +	/*
> +	 * if we don't find an overlapping extent, trim the range we need to
> +	 * allocate to fit the hole we found.
> +	 */

Please capitalize the first letter in each sentence.

> +	*shared = true;
> +	if (isnullstartblock(got.br_startblock)) {
> +		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> +		return 0;
> +	}

Taken out of the context of the bigger function this conditional
could really use a comment as well.

> +	} while (tp);

This looks a little odd as tp will always be true if we reach this
point.  I'd suggest to switch to a do { } while (1) or for (;;) style
loop.

Alternatively we could just skip the loop entirely now that we have
the lookup + trim helper.  Untested patch below:

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 38f405415b88..79e2279d8e15 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -352,6 +352,47 @@ xfs_reflink_convert_cow(
 	return error;
 }
 
+/*
+ * Find the extent that maps the given range in the COW fork. Even if the extent
+ * is not shared we might have a preallocation for it in the COW fork. If so we
+ * use that rather than trigger a new allocation.
+ */
+static int
+find_trim_cow_extent(
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*imap,
+	bool			*shared,
+	bool			*found)
+{
+	xfs_fileoff_t		offset_fsb = imap->br_startoff;
+	xfs_filblks_t		count_fsb = imap->br_blockcount;
+	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	got;
+	bool			trimmed;
+
+	*found = false;
+
+	/*
+	 * If we don't find an overlapping extent, trim the range we need to
+	 * allocate to fit the hole we found.
+	 */
+	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
+	    got.br_startoff > offset_fsb)
+		return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
+
+	*shared = true;
+	if (isnullstartblock(got.br_startblock)) {
+		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+		return 0;
+	}
+
+	/* real extent found - no need to allocate */
+	xfs_trim_extent(&got, offset_fsb, count_fsb);
+	*imap = got;
+	*found = true;
+	return 0;
+}
+
 /* Allocate all CoW reservations covering a range of blocks in a file. */
 int
 xfs_reflink_allocate_cow(
@@ -363,78 +404,62 @@ xfs_reflink_allocate_cow(
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fileoff_t		offset_fsb = imap->br_startoff;
 	xfs_filblks_t		count_fsb = imap->br_blockcount;
-	struct xfs_bmbt_irec	got;
-	struct xfs_trans	*tp = NULL;
+	struct xfs_trans	*tp;
 	int			nimaps, error = 0;
-	bool			trimmed;
+	bool			found;
 	xfs_filblks_t		resaligned;
 	xfs_extlen_t		resblks = 0;
-	struct xfs_iext_cursor	icur;
 
-retry:
-	ASSERT(xfs_is_reflink_inode(ip));
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(xfs_is_reflink_inode(ip));
 
-	/*
-	 * Even if the extent is not shared we might have a preallocation for
-	 * it in the COW fork.  If so use it.
-	 */
-	if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) &&
-	    got.br_startoff <= offset_fsb) {
-		*shared = true;
-
-		/* If we have a real allocation in the COW fork we're done. */
-		if (!isnullstartblock(got.br_startblock)) {
-			xfs_trim_extent(&got, offset_fsb, count_fsb);
-			*imap = got;
-			goto convert;
-		}
+	error = find_trim_cow_extent(ip, imap, shared, &found);
+	if (error || !*shared)
+		return error;
+	if (found)
+		goto convert;
 
-		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-	} else {
-		error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
-		if (error || !*shared)
-			goto out;
-	}
+	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
+		imap->br_blockcount, xfs_get_cowextsz_hint(ip));
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
 
-	if (!tp) {
-		resaligned = xfs_aligned_fsb_count(imap->br_startoff,
-			imap->br_blockcount, xfs_get_cowextsz_hint(ip));
-		resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+	xfs_iunlock(ip, *lockmode);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+	*lockmode = XFS_ILOCK_EXCL;
+	xfs_ilock(ip, *lockmode);
 
-		xfs_iunlock(ip, *lockmode);
-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
-		*lockmode = XFS_ILOCK_EXCL;
-		xfs_ilock(ip, *lockmode);
+	if (error)
+		return error;
 
-		if (error)
-			return error;
+	error = xfs_qm_dqattach_locked(ip, false);
+	if (error)
+		goto out_trans_cancel;
 
-		error = xfs_qm_dqattach_locked(ip, false);
-		if (error)
-			goto out;
-		goto retry;
+	/* check for an overlapping extent again now that we dropped the ilock */
+	error = find_trim_cow_extent(ip, imap, shared, &found);
+	if (error || !*shared)
+		goto out_trans_cancel;
+	if (found) {
+		xfs_trans_cancel(tp);
+		goto convert;
 	}
 
 	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
 			XFS_QMOPT_RES_REGBLKS);
 	if (error)
-		goto out;
+		goto out_trans_cancel;
 
 	xfs_trans_ijoin(tp, ip, 0);
 
-	nimaps = 1;
-
 	/* Allocate the entire reservation as unwritten blocks. */
+	nimaps = 1;
 	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
 			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
 			resblks, imap, &nimaps);
 	if (error)
-		goto out_trans_cancel;
+		goto out_unreserve;
 
 	xfs_inode_set_cowblocks_tag(ip);
-
-	/* Finish up. */
 	error = xfs_trans_commit(tp);
 	if (error)
 		return error;
@@ -447,12 +472,12 @@ xfs_reflink_allocate_cow(
 		return -ENOSPC;
 convert:
 	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
-out_trans_cancel:
+
+out_unreserve:
 	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
 			XFS_QMOPT_RES_REGBLKS);
-out:
-	if (tp)
-		xfs_trans_cancel(tp);
+out_trans_cancel:
+	xfs_trans_cancel(tp);
 	return error;
 }
Alexander Y. Fomichev Sept. 12, 2018, 9:24 a.m. UTC | #5
On Mon, 10 Sep 2018 00:02:41 -0700
Christoph Hellwig <hch@infradead.org> wrote:

> > +	/*
> > +	 * if we don't find an overlapping extent, trim the range
> > we need to
> > +	 * allocate to fit the hole we found.
> > +	 */  
> 
> Please capitalize the first letter in each sentence.
> 
> > +	*shared = true;
> > +	if (isnullstartblock(got.br_startblock)) {
> > +		xfs_trim_extent(imap, got.br_startoff,
> > got.br_blockcount);
> > +		return 0;
> > +	}  
> 
> Taken out of the context of the bigger function this conditional
> could really use a comment as well.
> 
> > +	} while (tp);  
> 
> This looks a little odd as tp will always be true if we reach this
> point.  I'd suggest to switch to a do { } while (1) or for (;;) style
> loop.
> 
> Alternatively we could just skip the loop entirely now that we have
> the lookup + trim helper.  Untested patch below:

I have just tested and it looks ok.
 
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 38f405415b88..79e2279d8e15 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -352,6 +352,47 @@ xfs_reflink_convert_cow(
>  	return error;
>  }
>  
> +/*
> + * Find the extent that maps the given range in the COW fork. Even
> if the extent
> + * is not shared we might have a preallocation for it in the COW
> fork. If so we
> + * use it that rather than trigger a new allocation.
> + */
> +static int
> +find_trim_cow_extent(
> +	struct xfs_inode	*ip,
> +	struct xfs_bmbt_irec	*imap,
> +	bool			*shared,
> +	bool			*found)
> +{
> +	xfs_fileoff_t		offset_fsb = imap->br_startoff;
> +	xfs_filblks_t		count_fsb = imap->br_blockcount;
> +	struct xfs_iext_cursor	icur;
> +	struct xfs_bmbt_irec	got;
> +	bool			trimmed;
> +
> +	*found = false;
> +
> +	/*
> +	 * If we don't find an overlapping extent, trim the range we
> need to
> +	 * allocate to fit the hole we found.
> +	 */
> +	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
> &icur, &got) ||
> +	    got.br_startoff > offset_fsb)
> +		return xfs_reflink_trim_around_shared(ip, imap,
> shared, &trimmed); +
> +	*shared = true;
> +	if (isnullstartblock(got.br_startblock)) {
> +		xfs_trim_extent(imap, got.br_startoff,
> got.br_blockcount);
> +		return 0;
> +	}
> +
> +	/* real extent found - no need to allocate */
> +	xfs_trim_extent(&got, offset_fsb, count_fsb);
> +	*imap = got;
> +	*found = true;
> +	return 0;
> +}
> +
>  /* Allocate all CoW reservations covering a range of blocks in a
> file. */ int
>  xfs_reflink_allocate_cow(
> @@ -363,78 +404,62 @@ xfs_reflink_allocate_cow(
>  	struct xfs_mount	*mp = ip->i_mount;
>  	xfs_fileoff_t		offset_fsb = imap->br_startoff;
>  	xfs_filblks_t		count_fsb = imap->br_blockcount;
> -	struct xfs_bmbt_irec	got;
> -	struct xfs_trans	*tp = NULL;
> +	struct xfs_trans	*tp;
>  	int			nimaps, error = 0;
> -	bool			trimmed;
> +	bool			found;
>  	xfs_filblks_t		resaligned;
>  	xfs_extlen_t		resblks = 0;
> -	struct xfs_iext_cursor	icur;
>  
> -retry:
> -	ASSERT(xfs_is_reflink_inode(ip));
>  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> +	ASSERT(xfs_is_reflink_inode(ip));
>  
> -	/*
> -	 * Even if the extent is not shared we might have a
> preallocation for
> -	 * it in the COW fork.  If so use it.
> -	 */
> -	if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
> &icur, &got) &&
> -	    got.br_startoff <= offset_fsb) {
> -		*shared = true;
> -
> -		/* If we have a real allocation in the COW fork
> we're done. */
> -		if (!isnullstartblock(got.br_startblock)) {
> -			xfs_trim_extent(&got, offset_fsb, count_fsb);
> -			*imap = got;
> -			goto convert;
> -		}
> +	error = find_trim_cow_extent(ip, imap, shared, &found);
> +	if (error || !*shared)
> +		return error;
> +	if (found)
> +		goto convert;
>  
> -		xfs_trim_extent(imap, got.br_startoff,
> got.br_blockcount);
> -	} else {
> -		error = xfs_reflink_trim_around_shared(ip, imap,
> shared, &trimmed);
> -		if (error || !*shared)
> -			goto out;
> -	}
> +	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
> +		imap->br_blockcount, xfs_get_cowextsz_hint(ip));
> +	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
>  
> -	if (!tp) {
> -		resaligned = xfs_aligned_fsb_count(imap->br_startoff,
> -			imap->br_blockcount,
> xfs_get_cowextsz_hint(ip));
> -		resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
> +	xfs_iunlock(ip, *lockmode);
> +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
> 0, 0, &tp);
> +	*lockmode = XFS_ILOCK_EXCL;
> +	xfs_ilock(ip, *lockmode);
>  
> -		xfs_iunlock(ip, *lockmode);
> -		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
> resblks, 0, 0, &tp);
> -		*lockmode = XFS_ILOCK_EXCL;
> -		xfs_ilock(ip, *lockmode);
> +	if (error)
> +		return error;
>  
> -		if (error)
> -			return error;
> +	error = xfs_qm_dqattach_locked(ip, false);
> +	if (error)
> +		goto out_trans_cancel;
>  
> -		error = xfs_qm_dqattach_locked(ip, false);
> -		if (error)
> -			goto out;
> -		goto retry;
> +	/* check for an overlapping extent again no that we dropped
> the ilock */
> +	error = find_trim_cow_extent(ip, imap, shared, &found);
> +	if (error || !*shared)
> +		goto out_trans_cancel;
> +	if (found) {
> +		xfs_trans_cancel(tp);
> +		goto convert;
>  	}
>  
>  	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
>  			XFS_QMOPT_RES_REGBLKS);
>  	if (error)
> -		goto out;
> +		goto out_trans_cancel;
>  
>  	xfs_trans_ijoin(tp, ip, 0);
>  
> -	nimaps = 1;
> -
>  	/* Allocate the entire reservation as unwritten blocks. */
> +	nimaps = 1;
>  	error = xfs_bmapi_write(tp, ip, imap->br_startoff,
> imap->br_blockcount, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
>  			resblks, imap, &nimaps);
>  	if (error)
> -		goto out_trans_cancel;
> +		goto out_unreserve;
>  
>  	xfs_inode_set_cowblocks_tag(ip);
> -
> -	/* Finish up. */
>  	error = xfs_trans_commit(tp);
>  	if (error)
>  		return error;
> @@ -447,12 +472,12 @@ xfs_reflink_allocate_cow(
>  		return -ENOSPC;
>  convert:
>  	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb,
> count_fsb); -out_trans_cancel:
> +
> +out_unreserve:
>  	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
>  			XFS_QMOPT_RES_REGBLKS);
> -out:
> -	if (tp)
> -		xfs_trans_cancel(tp);
> +out_trans_cancel:
> +	xfs_trans_cancel(tp);
>  	return error;
>  }
>
Darrick J. Wong Sept. 17, 2018, 4:29 p.m. UTC | #6
On Mon, Sep 10, 2018 at 12:02:41AM -0700, Christoph Hellwig wrote:
> > +	/*
> > +	 * if we don't find an overlapping extent, trim the range we need to
> > +	 * allocate to fit the hole we found.
> > +	 */
> 
> Please capitalize the first letter in each sentence.
> 
> > +	*shared = true;
> > +	if (isnullstartblock(got.br_startblock)) {
> > +		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> > +		return 0;
> > +	}
> 
> Taken out of the context of the bigger function this conditional
> could really use a comment as well.
> 
> > +	} while (tp);
> 
> This looks a little odd as tp will always be true if we reach this
> point.  I'd suggest to switch to a do { } while (1) or for (;;) style
> loop.
> 
> Alternatively we could just skip the loop entirely now that we have
> the lookup + trim helper.  Untested patch below:
> 
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 38f405415b88..79e2279d8e15 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -352,6 +352,47 @@ xfs_reflink_convert_cow(
>  	return error;
>  }
>  
> +/*
> + * Find the extent that maps the given range in the COW fork. Even if the extent
> + * is not shared we might have a preallocation for it in the COW fork. If so we
> + * use it that rather than trigger a new allocation.
> + */
> +static int
> +find_trim_cow_extent(

/me prefers that we try to prefix all the xfs functions, even if they
are static...

> +	struct xfs_inode	*ip,
> +	struct xfs_bmbt_irec	*imap,
> +	bool			*shared,
> +	bool			*found)
> +{
> +	xfs_fileoff_t		offset_fsb = imap->br_startoff;
> +	xfs_filblks_t		count_fsb = imap->br_blockcount;
> +	struct xfs_iext_cursor	icur;
> +	struct xfs_bmbt_irec	got;
> +	bool			trimmed;
> +
> +	*found = false;
> +
> +	/*
> +	 * If we don't find an overlapping extent, trim the range we need to
> +	 * allocate to fit the hole we found.
> +	 */
> +	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
> +	    got.br_startoff > offset_fsb)
> +		return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
> +
> +	*shared = true;
> +	if (isnullstartblock(got.br_startblock)) {
> +		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> +		return 0;
> +	}
> +
> +	/* real extent found - no need to allocate */
> +	xfs_trim_extent(&got, offset_fsb, count_fsb);
> +	*imap = got;
> +	*found = true;
> +	return 0;
> +}
> +
>  /* Allocate all CoW reservations covering a range of blocks in a file. */
>  int
>  xfs_reflink_allocate_cow(
> @@ -363,78 +404,62 @@ xfs_reflink_allocate_cow(
>  	struct xfs_mount	*mp = ip->i_mount;
>  	xfs_fileoff_t		offset_fsb = imap->br_startoff;
>  	xfs_filblks_t		count_fsb = imap->br_blockcount;
> -	struct xfs_bmbt_irec	got;
> -	struct xfs_trans	*tp = NULL;
> +	struct xfs_trans	*tp;
>  	int			nimaps, error = 0;
> -	bool			trimmed;
> +	bool			found;
>  	xfs_filblks_t		resaligned;
>  	xfs_extlen_t		resblks = 0;
> -	struct xfs_iext_cursor	icur;
>  
> -retry:
> -	ASSERT(xfs_is_reflink_inode(ip));
>  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> +	ASSERT(xfs_is_reflink_inode(ip));
>  
> -	/*
> -	 * Even if the extent is not shared we might have a preallocation for
> -	 * it in the COW fork.  If so use it.
> -	 */
> -	if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) &&
> -	    got.br_startoff <= offset_fsb) {
> -		*shared = true;
> -
> -		/* If we have a real allocation in the COW fork we're done. */
> -		if (!isnullstartblock(got.br_startblock)) {
> -			xfs_trim_extent(&got, offset_fsb, count_fsb);
> -			*imap = got;
> -			goto convert;
> -		}
> +	error = find_trim_cow_extent(ip, imap, shared, &found);
> +	if (error || !*shared)
> +		return error;
> +	if (found)
> +		goto convert;
>  
> -		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> -	} else {
> -		error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
> -		if (error || !*shared)
> -			goto out;
> -	}
> +	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
> +		imap->br_blockcount, xfs_get_cowextsz_hint(ip));
> +	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
>  
> -	if (!tp) {
> -		resaligned = xfs_aligned_fsb_count(imap->br_startoff,
> -			imap->br_blockcount, xfs_get_cowextsz_hint(ip));
> -		resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
> +	xfs_iunlock(ip, *lockmode);
> +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
> +	*lockmode = XFS_ILOCK_EXCL;
> +	xfs_ilock(ip, *lockmode);
>  
> -		xfs_iunlock(ip, *lockmode);
> -		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
> -		*lockmode = XFS_ILOCK_EXCL;
> -		xfs_ilock(ip, *lockmode);
> +	if (error)
> +		return error;
>  
> -		if (error)
> -			return error;
> +	error = xfs_qm_dqattach_locked(ip, false);
> +	if (error)
> +		goto out_trans_cancel;
>  
> -		error = xfs_qm_dqattach_locked(ip, false);
> -		if (error)
> -			goto out;
> -		goto retry;
> +	/* check for an overlapping extent again no that we dropped the ilock */

"...again now that we dropped..."

Otherwise this also looks fine to me.

--D

> +	error = find_trim_cow_extent(ip, imap, shared, &found);
> +	if (error || !*shared)
> +		goto out_trans_cancel;
> +	if (found) {
> +		xfs_trans_cancel(tp);
> +		goto convert;
>  	}
>  
>  	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
>  			XFS_QMOPT_RES_REGBLKS);
>  	if (error)
> -		goto out;
> +		goto out_trans_cancel;
>  
>  	xfs_trans_ijoin(tp, ip, 0);
>  
> -	nimaps = 1;
> -
>  	/* Allocate the entire reservation as unwritten blocks. */
> +	nimaps = 1;
>  	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
>  			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
>  			resblks, imap, &nimaps);
>  	if (error)
> -		goto out_trans_cancel;
> +		goto out_unreserve;
>  
>  	xfs_inode_set_cowblocks_tag(ip);
> -
> -	/* Finish up. */
>  	error = xfs_trans_commit(tp);
>  	if (error)
>  		return error;
> @@ -447,12 +472,12 @@ xfs_reflink_allocate_cow(
>  		return -ENOSPC;
>  convert:
>  	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
> -out_trans_cancel:
> +
> +out_unreserve:
>  	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
>  			XFS_QMOPT_RES_REGBLKS);
> -out:
> -	if (tp)
> -		xfs_trans_cancel(tp);
> +out_trans_cancel:
> +	xfs_trans_cancel(tp);
>  	return error;
>  }
>
Christoph Hellwig Sept. 17, 2018, 4:30 p.m. UTC | #7
On Mon, Sep 17, 2018 at 09:29:12AM -0700, Darrick J. Wong wrote:
> > +find_trim_cow_extent(
> 
> /me prefers that we try to prefix all the xfs functions, even if they
> are static...

Heh, I actually just did that.  I have a series that starts with this
respin that I plan to post later today.
diff mbox series

Patch

================================================
WARNING: lock held when returning to user space!
4.18.5 #1 Not tainted
------------------------------------------------
worker/6123 is leaving the kernel with locks still held!
1 lock held by worker/6123:
 #0: 000000009eab4f1b (sb_internal#2){.+.+}, at: xfs_trans_alloc+0x17c/0x220

And eventually the filesystem deadlocks because it runs out of log
space that is reserved by the leaked transaction and never gets
released.

The logic flow in xfs_reflink_allocate_cow() is a convoluted mess of
gotos - it's no surprise that it has a bug where the flow through
several goto jumps then fails to clean up context from a non-obvious
logic path. Clean up the logic flow and make sure every path does
the right thing.

Reported-by: Alexander Y. Fomichev <git.user@gmail.com>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=200981
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_reflink.c | 98 +++++++++++++++++++++++++++++---------------
 1 file changed, 66 insertions(+), 32 deletions(-)

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 38f405415b88..b39f5afa57aa 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -352,6 +352,47 @@  xfs_reflink_convert_cow(
 	return error;
 }
 
+/*
+ * Find the extent that maps the given range in the COW fork. Even if the extent
+ * is not shared we might have a preallocation for it in the COW fork. If so we
+ * use it that rather than trigger a new allocation.
+ */
+static int
+find_trim_cow_extent(
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*imap,
+	bool			*shared,
+	bool			*found)
+{
+	xfs_fileoff_t		offset_fsb = imap->br_startoff;
+	xfs_filblks_t		count_fsb = imap->br_blockcount;
+	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	got;
+	bool			trimmed;
+
+	*found = false;
+
+	/*
+	 * if we don't find an overlapping extent, trim the range we need to
+	 * allocate to fit the hole we found.
+	 */
+	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
+	    got.br_startoff > offset_fsb)
+		return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
+
+	*shared = true;
+	if (isnullstartblock(got.br_startblock)) {
+		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+		return 0;
+	}
+
+	/* real extent found - no need to allocate */
+	xfs_trim_extent(&got, offset_fsb, count_fsb);
+	*imap = got;
+	*found = true;
+	return 0;
+}
+
 /* Allocate all CoW reservations covering a range of blocks in a file. */
 int
 xfs_reflink_allocate_cow(
@@ -363,41 +404,37 @@  xfs_reflink_allocate_cow(
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fileoff_t		offset_fsb = imap->br_startoff;
 	xfs_filblks_t		count_fsb = imap->br_blockcount;
-	struct xfs_bmbt_irec	got;
 	struct xfs_trans	*tp = NULL;
 	int			nimaps, error = 0;
-	bool			trimmed;
+	bool			found;
 	xfs_filblks_t		resaligned;
 	xfs_extlen_t		resblks = 0;
-	struct xfs_iext_cursor	icur;
 
-retry:
-	ASSERT(xfs_is_reflink_inode(ip));
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
 	/*
-	 * Even if the extent is not shared we might have a preallocation for
-	 * it in the COW fork.  If so use it.
+	 * do-while to run a single lookup retry after transaction allocation.
+	 * All exits from the loop need to check if the transaction needs
+	 * cancelling. Dropping the ILOCK to allocate the transaction allows
+	 * races with other COW fork allocations, so we need to start the extent
+	 * lookup from scratch once we have the ILOCK again.
 	 */
-	if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) &&
-	    got.br_startoff <= offset_fsb) {
-		*shared = true;
+	do {
+		ASSERT(xfs_is_reflink_inode(ip));
 
-		/* If we have a real allocation in the COW fork we're done. */
-		if (!isnullstartblock(got.br_startblock)) {
-			xfs_trim_extent(&got, offset_fsb, count_fsb);
-			*imap = got;
+		error = find_trim_cow_extent(ip, imap, shared, &found);
+		if (error || !*shared)
+			goto out_trans_cancel;
+		if (found) {
+			if (tp)
+				xfs_trans_cancel(tp);
 			goto convert;
 		}
 
-		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-	} else {
-		error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
-		if (error || !*shared)
-			goto out;
-	}
+		/* if we have a transaction, it's time to allocate */
+		if (tp)
+			break;
 
-	if (!tp) {
 		resaligned = xfs_aligned_fsb_count(imap->br_startoff,
 			imap->br_blockcount, xfs_get_cowextsz_hint(ip));
 		resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
@@ -412,29 +449,25 @@  xfs_reflink_allocate_cow(
 
 		error = xfs_qm_dqattach_locked(ip, false);
 		if (error)
-			goto out;
-		goto retry;
-	}
+			goto out_trans_cancel;
+	} while (tp);
 
 	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
 			XFS_QMOPT_RES_REGBLKS);
 	if (error)
-		goto out;
+		goto out_trans_cancel;
 
 	xfs_trans_ijoin(tp, ip, 0);
 
-	nimaps = 1;
-
 	/* Allocate the entire reservation as unwritten blocks. */
+	nimaps = 1;
 	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
 			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
 			resblks, imap, &nimaps);
 	if (error)
-		goto out_trans_cancel;
+		goto out_unreserve;
 
 	xfs_inode_set_cowblocks_tag(ip);
-
-	/* Finish up. */
 	error = xfs_trans_commit(tp);
 	if (error)
 		return error;
@@ -447,10 +480,11 @@  xfs_reflink_allocate_cow(
 		return -ENOSPC;
 convert:
 	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
-out_trans_cancel:
+
+out_unreserve:
 	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
 			XFS_QMOPT_RES_REGBLKS);
-out:
+out_trans_cancel:
 	if (tp)
 		xfs_trans_cancel(tp);
 	return error;