
[RFC] xfs: try to avoid blowing out the transaction reservation when bunmaping a shared extent

Message ID 20170425020954.GV23371@birch.djwong.org (mailing list archive)
State Superseded

Commit Message

Darrick J. Wong April 25, 2017, 2:09 a.m. UTC
In a pathological scenario where we are trying to bunmapi a single
extent in which every other block is shared, it's possible that trying
to unmap the entire large extent in a single transaction can generate so
many EFIs that we overflow the transaction reservation.

Therefore, use a heuristic to guess at the number of blocks we can
safely unmap from a reflink file's data fork in a single transaction.
This should prevent problems such as the log head slamming into the tail
and ASSERTs that trigger because we've exceeded the transaction
reservation.

Note that since bunmapi can fail to unmap the entire range, we must also
teach the deferred unmap code to roll into a new transaction whenever we
get low on reservation.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c     |   38 ++++++++++++++++++++++++++++++--------
 fs/xfs/libxfs/xfs_bmap.h     |    2 +-
 fs/xfs/libxfs/xfs_refcount.c |   10 +---------
 fs/xfs/libxfs/xfs_refcount.h |   16 ++++++++++++++++
 fs/xfs/xfs_bmap_item.c       |   17 +++++++++++++++--
 fs/xfs/xfs_trans.h           |    2 +-
 fs/xfs/xfs_trans_bmap.c      |   11 +++++++++--
 7 files changed, 73 insertions(+), 23 deletions(-)
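
For a sense of scale, xfs_refcount_max_unmap() below boils down to this
standalone sketch (the 3/4 factor and the 32-byte per-record overhead
are the patch's own conservative estimates; the example reservation
value is made up):

#include <stdio.h>

/* Conservative per-refcount-record log overhead, taken from the patch. */
#define XFS_REFCOUNT_ITEM_OVERHEAD	32

/*
 * Hold back a quarter of the log reservation for btree splits and the
 * CUI/CUD items, then convert the remainder into a block-count cap.
 */
static unsigned long long refcount_max_unmap(int log_res)
{
	return (unsigned long long)(log_res * 3 / 4) /
			XFS_REFCOUNT_ITEM_OVERHEAD;
}

int main(void)
{
	/* A ~128KiB log reservation caps a single unmap at 3072 blocks. */
	printf("max unmap: %llu blocks\n", refcount_max_unmap(131072));
	return 0;
}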


Comments

Brian Foster April 26, 2017, 1:59 p.m. UTC | #1
On Mon, Apr 24, 2017 at 07:09:54PM -0700, Darrick J. Wong wrote:
> In a pathological scenario where we are trying to bunmapi a single
> extent in which every other block is shared, it's possible that trying
> to unmap the entire large extent in a single transaction can generate so
> many EFIs that we overflow the transaction reservation.
> 
> Therefore, use a heuristic to guess at the number of blocks we can
> safely unmap from a reflink file's data fork in a single transaction.
> This should prevent problems such as the log head slamming into the tail
> and ASSERTs that trigger because we've exceeded the transaction
> reservation.
> 
> Note that since bunmapi can fail to unmap the entire range, we must also
> teach the deferred unmap code to roll into a new transaction whenever we
> get low on reservation.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/libxfs/xfs_bmap.c     |   38 ++++++++++++++++++++++++++++++--------
>  fs/xfs/libxfs/xfs_bmap.h     |    2 +-
>  fs/xfs/libxfs/xfs_refcount.c |   10 +---------
>  fs/xfs/libxfs/xfs_refcount.h |   16 ++++++++++++++++
>  fs/xfs/xfs_bmap_item.c       |   17 +++++++++++++++--
>  fs/xfs/xfs_trans.h           |    2 +-
>  fs/xfs/xfs_trans_bmap.c      |   11 +++++++++--
>  7 files changed, 73 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index f02eb76..1e79fb5 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -5435,6 +5435,7 @@ __xfs_bunmapi(
>  	int			whichfork;	/* data or attribute fork */
>  	xfs_fsblock_t		sum;
>  	xfs_filblks_t		len = *rlen;	/* length to unmap in file */
> +	xfs_fileoff_t		max_len;
>  
>  	trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
>  
> @@ -5456,6 +5457,16 @@ __xfs_bunmapi(
>  	ASSERT(len > 0);
>  	ASSERT(nexts >= 0);
>  
> +	/*
> +	 * Guesstimate how many blocks we can unmap without running the risk of
> +	 * blowing out the transaction with a mix of EFIs and reflink
> +	 * adjustments.
> +	 */
> +	if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
> +		max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
> +	else
> +		max_len = len;
> +

Hmm... at first glance, this seems a bit overcomplicated, particularly
the non-determinism of basing how many blocks we can free on an
estimate derived from an already-estimated transaction reservation.

We already have a transaction reservation that is calculated based
on a fixed number of modifications (i.e., 2 extent removals) and an
associated extent count parameter. Would it address the problem if we
considered shared extent boundaries as physical extent boundaries for
reflinked files? E.g., unmapping an extent with two total blocks, one
of them shared, is effectively equivalent to unmapping a file with two
(discontiguous) single-block extents..? Just a thought.

Brian
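
A rough sketch of what that could look like in __xfs_bunmapi()
(xfs_refcount_find_boundary() is a hypothetical helper, standing in
for a lookup that returns the distance to the next refcount-record
edge; the startoff/startblock adjustment is elided):

	/*
	 * Hypothetical: trim the deletion so it stops at the next
	 * refcount btree record boundary, i.e. treat shared extent
	 * boundaries like physical extent boundaries so that each
	 * unmap covers a uniformly-shared range.
	 */
	if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
		xfs_extlen_t		flen;

		error = xfs_refcount_find_boundary(mp, del.br_startblock,
				del.br_blockcount, &flen);
		if (error)
			return error;
		if (flen < del.br_blockcount)
			del.br_blockcount = flen;
	}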

>  	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
>  	    (error = xfs_iread_extents(tp, ip, whichfork)))
>  		return error;
> @@ -5500,7 +5511,7 @@ __xfs_bunmapi(
>  
>  	extno = 0;
>  	while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
> -	       (nexts == 0 || extno < nexts)) {
> +	       (nexts == 0 || extno < nexts) && max_len > 0) {
>  		/*
>  		 * Is the found extent after a hole in which bno lives?
>  		 * Just back up to the previous extent, if so.
> @@ -5532,6 +5543,15 @@ __xfs_bunmapi(
>  		}
>  		if (del.br_startoff + del.br_blockcount > bno + 1)
>  			del.br_blockcount = bno + 1 - del.br_startoff;
> +
> +		/* How much can we safely unmap? */
> +		if (max_len < del.br_blockcount) {
> +			del.br_startoff += del.br_blockcount - max_len;
> +			if (!wasdel)
> +				del.br_startblock += del.br_blockcount - max_len;
> +			del.br_blockcount = max_len;
> +		}
> +
>  		sum = del.br_startblock + del.br_blockcount;
>  		if (isrt &&
>  		    (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
> @@ -5708,6 +5728,7 @@ __xfs_bunmapi(
>  		if (!isrt && wasdel)
>  			xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
>  
> +		max_len -= del.br_blockcount;
>  		bno = del.br_startoff - 1;
>  nodelete:
>  		/*
> @@ -6473,15 +6494,16 @@ xfs_bmap_finish_one(
>  	int				whichfork,
>  	xfs_fileoff_t			startoff,
>  	xfs_fsblock_t			startblock,
> -	xfs_filblks_t			blockcount,
> +	xfs_filblks_t			*blockcount,
>  	xfs_exntst_t			state)
>  {
> -	int				error = 0, done;
> +	xfs_fsblock_t			firstfsb;
> +	int				error = 0;
>  
>  	trace_xfs_bmap_deferred(tp->t_mountp,
>  			XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
>  			XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
> -			ip->i_ino, whichfork, startoff, blockcount, state);
> +			ip->i_ino, whichfork, startoff, *blockcount, state);
>  
>  	if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK))
>  		return -EFSCORRUPTED;
> @@ -6493,13 +6515,13 @@ xfs_bmap_finish_one(
>  
>  	switch (type) {
>  	case XFS_BMAP_MAP:
> -		error = xfs_bmapi_remap(tp, ip, startoff, blockcount,
> +		error = xfs_bmapi_remap(tp, ip, startoff, *blockcount,
>  				startblock, dfops);
> +		*blockcount = 0;
>  		break;
>  	case XFS_BMAP_UNMAP:
> -		error = xfs_bunmapi(tp, ip, startoff, blockcount,
> -				XFS_BMAPI_REMAP, 1, &startblock, dfops, &done);
> -		ASSERT(done);
> +		error = __xfs_bunmapi(tp, ip, startoff, blockcount,
> +				XFS_BMAPI_REMAP, 1, &firstfsb, dfops);
>  		break;
>  	default:
>  		ASSERT(0);
> diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> index c35a14f..851982a 100644
> --- a/fs/xfs/libxfs/xfs_bmap.h
> +++ b/fs/xfs/libxfs/xfs_bmap.h
> @@ -271,7 +271,7 @@ struct xfs_bmap_intent {
>  int	xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
>  		struct xfs_inode *ip, enum xfs_bmap_intent_type type,
>  		int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
> -		xfs_filblks_t blockcount, xfs_exntst_t state);
> +		xfs_filblks_t *blockcount, xfs_exntst_t state);
>  int	xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
>  		struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
>  int	xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
> diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
> index b177ef3..29dcde3 100644
> --- a/fs/xfs/libxfs/xfs_refcount.c
> +++ b/fs/xfs/libxfs/xfs_refcount.c
> @@ -784,14 +784,6 @@ xfs_refcount_merge_extents(
>  }
>  
>  /*
> - * While we're adjusting the refcounts records of an extent, we have
> - * to keep an eye on the number of extents we're dirtying -- run too
> - * many in a single transaction and we'll exceed the transaction's
> - * reservation and crash the fs.  Each record adds 12 bytes to the
> - * log (plus any key updates) so we'll conservatively assume 24 bytes
> - * per record.  We must also leave space for btree splits on both ends
> - * of the range and space for the CUD and a new CUI.
> - *
>   * XXX: This is a pretty hand-wavy estimate.  The penalty for guessing
>   * true incorrectly is a shutdown FS; the penalty for guessing false
>   * incorrectly is more transaction rolls than might be necessary.
> @@ -822,7 +814,7 @@ xfs_refcount_still_have_space(
>  	else if (overhead > cur->bc_tp->t_log_res)
>  		return false;
>  	return  cur->bc_tp->t_log_res - overhead >
> -		cur->bc_private.a.priv.refc.nr_ops * 32;
> +		cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
>  }
>  
>  /*
> diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
> index 098dc66..eafb9d1 100644
> --- a/fs/xfs/libxfs/xfs_refcount.h
> +++ b/fs/xfs/libxfs/xfs_refcount.h
> @@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp,
>  extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
>  		xfs_agnumber_t agno);
>  
> +/*
> + * While we're adjusting the refcounts records of an extent, we have
> + * to keep an eye on the number of extents we're dirtying -- run too
> + * many in a single transaction and we'll exceed the transaction's
> + * reservation and crash the fs.  Each record adds 12 bytes to the
> + * log (plus any key updates) so we'll conservatively assume 32 bytes
> + * per record.  We must also leave space for btree splits on both ends
> + * of the range and space for the CUD and a new CUI.
> + */
> +#define XFS_REFCOUNT_ITEM_OVERHEAD	32
> +
> +static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
> +{
> +	return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
> +}
> +
>  #endif	/* __XFS_REFCOUNT_H__ */
> diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
> index 378f144..89908bb 100644
> --- a/fs/xfs/xfs_bmap_item.c
> +++ b/fs/xfs/xfs_bmap_item.c
> @@ -396,6 +396,7 @@ xfs_bui_recover(
>  	struct xfs_map_extent		*bmap;
>  	xfs_fsblock_t			startblock_fsb;
>  	xfs_fsblock_t			inode_fsb;
> +	xfs_filblks_t			count;
>  	bool				op_ok;
>  	struct xfs_bud_log_item		*budp;
>  	enum xfs_bmap_intent_type	type;
> @@ -404,6 +405,7 @@ xfs_bui_recover(
>  	struct xfs_trans		*tp;
>  	struct xfs_inode		*ip = NULL;
>  	struct xfs_defer_ops		dfops;
> +	struct xfs_bmbt_irec		irec;
>  	xfs_fsblock_t			firstfsb;
>  	unsigned int			resblks;
>  
> @@ -483,13 +485,24 @@ xfs_bui_recover(
>  	}
>  	xfs_trans_ijoin(tp, ip, 0);
>  
> +	count = bmap->me_len;
>  	error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
>  			ip, whichfork, bmap->me_startoff,
> -			bmap->me_startblock, bmap->me_len,
> -			state);
> +			bmap->me_startblock, &count, state);
>  	if (error)
>  		goto err_dfops;
>  
> +	if (count > 0) {
> +		ASSERT(type == XFS_BMAP_UNMAP);
> +		irec.br_startblock = bmap->me_startblock;
> +		irec.br_blockcount = count;
> +		irec.br_startoff = bmap->me_startoff;
> +		irec.br_state = state;
> +		error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
> +		if (error)
> +			goto err_dfops;
> +	}
> +
>  	/* Finish transaction, free inodes. */
>  	error = xfs_defer_finish(&tp, &dfops, NULL);
>  	if (error)
> diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> index a07acbf..08923e5 100644
> --- a/fs/xfs/xfs_trans.h
> +++ b/fs/xfs/xfs_trans.h
> @@ -275,6 +275,6 @@ int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
>  		struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops,
>  		enum xfs_bmap_intent_type type, struct xfs_inode *ip,
>  		int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
> -		xfs_filblks_t blockcount, xfs_exntst_t state);
> +		xfs_filblks_t *blockcount, xfs_exntst_t state);
>  
>  #endif	/* __XFS_TRANS_H__ */
> diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
> index 6408e7d..14543d9 100644
> --- a/fs/xfs/xfs_trans_bmap.c
> +++ b/fs/xfs/xfs_trans_bmap.c
> @@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update(
>  	int				whichfork,
>  	xfs_fileoff_t			startoff,
>  	xfs_fsblock_t			startblock,
> -	xfs_filblks_t			blockcount,
> +	xfs_filblks_t			*blockcount,
>  	xfs_exntst_t			state)
>  {
>  	int				error;
> @@ -196,16 +196,23 @@ xfs_bmap_update_finish_item(
>  	void				**state)
>  {
>  	struct xfs_bmap_intent		*bmap;
> +	xfs_filblks_t			count;
>  	int				error;
>  
>  	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
> +	count = bmap->bi_bmap.br_blockcount;
>  	error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
>  			bmap->bi_type,
>  			bmap->bi_owner, bmap->bi_whichfork,
>  			bmap->bi_bmap.br_startoff,
>  			bmap->bi_bmap.br_startblock,
> -			bmap->bi_bmap.br_blockcount,
> +			&count,
>  			bmap->bi_bmap.br_state);
> +	if (!error && count > 0) {
> +		ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
> +		bmap->bi_bmap.br_blockcount = count;
> +		return -EAGAIN;
> +	}
>  	kmem_free(bmap);
>  	return error;
>  }
Darrick J. Wong April 26, 2017, 9:37 p.m. UTC | #2
On Wed, Apr 26, 2017 at 09:59:14AM -0400, Brian Foster wrote:
> On Mon, Apr 24, 2017 at 07:09:54PM -0700, Darrick J. Wong wrote:
> > In a pathological scenario where we are trying to bunmapi a single
> > extent in which every other block is shared, it's possible that trying
> > to unmap the entire large extent in a single transaction can generate so
> > many EFIs that we overflow the transaction reservation.
> > 
> > Therefore, use a heuristic to guess at the number of blocks we can
> > safely unmap from a reflink file's data fork in a single transaction.
> > This should prevent problems such as the log head slamming into the tail
> > and ASSERTs that trigger because we've exceeded the transaction
> > reservation.
> > 
> > Note that since bunmapi can fail to unmap the entire range, we must also
> > teach the deferred unmap code to roll into a new transaction whenever we
> > get low on reservation.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/libxfs/xfs_bmap.c     |   38 ++++++++++++++++++++++++++++++--------
> >  fs/xfs/libxfs/xfs_bmap.h     |    2 +-
> >  fs/xfs/libxfs/xfs_refcount.c |   10 +---------
> >  fs/xfs/libxfs/xfs_refcount.h |   16 ++++++++++++++++
> >  fs/xfs/xfs_bmap_item.c       |   17 +++++++++++++++--
> >  fs/xfs/xfs_trans.h           |    2 +-
> >  fs/xfs/xfs_trans_bmap.c      |   11 +++++++++--
> >  7 files changed, 73 insertions(+), 23 deletions(-)
> > 
> > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> > index f02eb76..1e79fb5 100644
> > --- a/fs/xfs/libxfs/xfs_bmap.c
> > +++ b/fs/xfs/libxfs/xfs_bmap.c
> > @@ -5435,6 +5435,7 @@ __xfs_bunmapi(
> >  	int			whichfork;	/* data or attribute fork */
> >  	xfs_fsblock_t		sum;
> >  	xfs_filblks_t		len = *rlen;	/* length to unmap in file */
> > +	xfs_fileoff_t		max_len;
> >  
> >  	trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
> >  
> > @@ -5456,6 +5457,16 @@ __xfs_bunmapi(
> >  	ASSERT(len > 0);
> >  	ASSERT(nexts >= 0);
> >  
> > +	/*
> > +	 * Guesstimate how many blocks we can unmap without running the risk of
> > +	 * blowing out the transaction with a mix of EFIs and reflink
> > +	 * adjustments.
> > +	 */
> > +	if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
> > +		max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
> > +	else
> > +		max_len = len;
> > +
> 
> Hmm... at first glance, this seems a bit overcomplicated, particularly
> the non-determinism of basing how many blocks we can free on an
> estimate derived from an already-estimated transaction reservation.
> 
> We already have a transaction reservation that is calculated based
> on a fixed number of modifications (i.e., 2 extent removals) and an
> associated extent count parameter. Would it address the problem if we
> considered shared extent boundaries as physical extent boundaries for
> reflinked files? E.g., unmapping an extent with two total blocks, one
> of them shared, is effectively equivalent to unmapping a file with two
> (discontiguous) single-block extents..? Just a thought.

That's so shockingly obvious I don't know why it didn't occur to me.
Uh, I'll go give that a try with that horrid generic/931 testcase that
I sent to the list a couple of weeks ago. :)

--D

> 
> Brian
> 
> >  	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
> >  	    (error = xfs_iread_extents(tp, ip, whichfork)))
> >  		return error;
> > @@ -5500,7 +5511,7 @@ __xfs_bunmapi(
> >  
> >  	extno = 0;
> >  	while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
> > -	       (nexts == 0 || extno < nexts)) {
> > +	       (nexts == 0 || extno < nexts) && max_len > 0) {
> >  		/*
> >  		 * Is the found extent after a hole in which bno lives?
> >  		 * Just back up to the previous extent, if so.
> > @@ -5532,6 +5543,15 @@ __xfs_bunmapi(
> >  		}
> >  		if (del.br_startoff + del.br_blockcount > bno + 1)
> >  			del.br_blockcount = bno + 1 - del.br_startoff;
> > +
> > +		/* How much can we safely unmap? */
> > +		if (max_len < del.br_blockcount) {
> > +			del.br_startoff += del.br_blockcount - max_len;
> > +			if (!wasdel)
> > +				del.br_startblock += del.br_blockcount - max_len;
> > +			del.br_blockcount = max_len;
> > +		}
> > +
> >  		sum = del.br_startblock + del.br_blockcount;
> >  		if (isrt &&
> >  		    (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
> > @@ -5708,6 +5728,7 @@ __xfs_bunmapi(
> >  		if (!isrt && wasdel)
> >  			xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
> >  
> > +		max_len -= del.br_blockcount;
> >  		bno = del.br_startoff - 1;
> >  nodelete:
> >  		/*
> > @@ -6473,15 +6494,16 @@ xfs_bmap_finish_one(
> >  	int				whichfork,
> >  	xfs_fileoff_t			startoff,
> >  	xfs_fsblock_t			startblock,
> > -	xfs_filblks_t			blockcount,
> > +	xfs_filblks_t			*blockcount,
> >  	xfs_exntst_t			state)
> >  {
> > -	int				error = 0, done;
> > +	xfs_fsblock_t			firstfsb;
> > +	int				error = 0;
> >  
> >  	trace_xfs_bmap_deferred(tp->t_mountp,
> >  			XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
> >  			XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
> > -			ip->i_ino, whichfork, startoff, blockcount, state);
> > +			ip->i_ino, whichfork, startoff, *blockcount, state);
> >  
> >  	if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK))
> >  		return -EFSCORRUPTED;
> > @@ -6493,13 +6515,13 @@ xfs_bmap_finish_one(
> >  
> >  	switch (type) {
> >  	case XFS_BMAP_MAP:
> > -		error = xfs_bmapi_remap(tp, ip, startoff, blockcount,
> > +		error = xfs_bmapi_remap(tp, ip, startoff, *blockcount,
> >  				startblock, dfops);
> > +		*blockcount = 0;
> >  		break;
> >  	case XFS_BMAP_UNMAP:
> > -		error = xfs_bunmapi(tp, ip, startoff, blockcount,
> > -				XFS_BMAPI_REMAP, 1, &startblock, dfops, &done);
> > -		ASSERT(done);
> > +		error = __xfs_bunmapi(tp, ip, startoff, blockcount,
> > +				XFS_BMAPI_REMAP, 1, &firstfsb, dfops);
> >  		break;
> >  	default:
> >  		ASSERT(0);
> > diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> > index c35a14f..851982a 100644
> > --- a/fs/xfs/libxfs/xfs_bmap.h
> > +++ b/fs/xfs/libxfs/xfs_bmap.h
> > @@ -271,7 +271,7 @@ struct xfs_bmap_intent {
> >  int	xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
> >  		struct xfs_inode *ip, enum xfs_bmap_intent_type type,
> >  		int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
> > -		xfs_filblks_t blockcount, xfs_exntst_t state);
> > +		xfs_filblks_t *blockcount, xfs_exntst_t state);
> >  int	xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
> >  		struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
> >  int	xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
> > diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
> > index b177ef3..29dcde3 100644
> > --- a/fs/xfs/libxfs/xfs_refcount.c
> > +++ b/fs/xfs/libxfs/xfs_refcount.c
> > @@ -784,14 +784,6 @@ xfs_refcount_merge_extents(
> >  }
> >  
> >  /*
> > - * While we're adjusting the refcounts records of an extent, we have
> > - * to keep an eye on the number of extents we're dirtying -- run too
> > - * many in a single transaction and we'll exceed the transaction's
> > - * reservation and crash the fs.  Each record adds 12 bytes to the
> > - * log (plus any key updates) so we'll conservatively assume 24 bytes
> > - * per record.  We must also leave space for btree splits on both ends
> > - * of the range and space for the CUD and a new CUI.
> > - *
> >   * XXX: This is a pretty hand-wavy estimate.  The penalty for guessing
> >   * true incorrectly is a shutdown FS; the penalty for guessing false
> >   * incorrectly is more transaction rolls than might be necessary.
> > @@ -822,7 +814,7 @@ xfs_refcount_still_have_space(
> >  	else if (overhead > cur->bc_tp->t_log_res)
> >  		return false;
> >  	return  cur->bc_tp->t_log_res - overhead >
> > -		cur->bc_private.a.priv.refc.nr_ops * 32;
> > +		cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
> >  }
> >  
> >  /*
> > diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
> > index 098dc66..eafb9d1 100644
> > --- a/fs/xfs/libxfs/xfs_refcount.h
> > +++ b/fs/xfs/libxfs/xfs_refcount.h
> > @@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp,
> >  extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
> >  		xfs_agnumber_t agno);
> >  
> > +/*
> > + * While we're adjusting the refcounts records of an extent, we have
> > + * to keep an eye on the number of extents we're dirtying -- run too
> > + * many in a single transaction and we'll exceed the transaction's
> > + * reservation and crash the fs.  Each record adds 12 bytes to the
> > + * log (plus any key updates) so we'll conservatively assume 32 bytes
> > + * per record.  We must also leave space for btree splits on both ends
> > + * of the range and space for the CUD and a new CUI.
> > + */
> > +#define XFS_REFCOUNT_ITEM_OVERHEAD	32
> > +
> > +static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
> > +{
> > +	return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
> > +}
> > +
> >  #endif	/* __XFS_REFCOUNT_H__ */
> > diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
> > index 378f144..89908bb 100644
> > --- a/fs/xfs/xfs_bmap_item.c
> > +++ b/fs/xfs/xfs_bmap_item.c
> > @@ -396,6 +396,7 @@ xfs_bui_recover(
> >  	struct xfs_map_extent		*bmap;
> >  	xfs_fsblock_t			startblock_fsb;
> >  	xfs_fsblock_t			inode_fsb;
> > +	xfs_filblks_t			count;
> >  	bool				op_ok;
> >  	struct xfs_bud_log_item		*budp;
> >  	enum xfs_bmap_intent_type	type;
> > @@ -404,6 +405,7 @@ xfs_bui_recover(
> >  	struct xfs_trans		*tp;
> >  	struct xfs_inode		*ip = NULL;
> >  	struct xfs_defer_ops		dfops;
> > +	struct xfs_bmbt_irec		irec;
> >  	xfs_fsblock_t			firstfsb;
> >  	unsigned int			resblks;
> >  
> > @@ -483,13 +485,24 @@ xfs_bui_recover(
> >  	}
> >  	xfs_trans_ijoin(tp, ip, 0);
> >  
> > +	count = bmap->me_len;
> >  	error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
> >  			ip, whichfork, bmap->me_startoff,
> > -			bmap->me_startblock, bmap->me_len,
> > -			state);
> > +			bmap->me_startblock, &count, state);
> >  	if (error)
> >  		goto err_dfops;
> >  
> > +	if (count > 0) {
> > +		ASSERT(type == XFS_BMAP_UNMAP);
> > +		irec.br_startblock = bmap->me_startblock;
> > +		irec.br_blockcount = count;
> > +		irec.br_startoff = bmap->me_startoff;
> > +		irec.br_state = state;
> > +		error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
> > +		if (error)
> > +			goto err_dfops;
> > +	}
> > +
> >  	/* Finish transaction, free inodes. */
> >  	error = xfs_defer_finish(&tp, &dfops, NULL);
> >  	if (error)
> > diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> > index a07acbf..08923e5 100644
> > --- a/fs/xfs/xfs_trans.h
> > +++ b/fs/xfs/xfs_trans.h
> > @@ -275,6 +275,6 @@ int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
> >  		struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops,
> >  		enum xfs_bmap_intent_type type, struct xfs_inode *ip,
> >  		int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
> > -		xfs_filblks_t blockcount, xfs_exntst_t state);
> > +		xfs_filblks_t *blockcount, xfs_exntst_t state);
> >  
> >  #endif	/* __XFS_TRANS_H__ */
> > diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
> > index 6408e7d..14543d9 100644
> > --- a/fs/xfs/xfs_trans_bmap.c
> > +++ b/fs/xfs/xfs_trans_bmap.c
> > @@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update(
> >  	int				whichfork,
> >  	xfs_fileoff_t			startoff,
> >  	xfs_fsblock_t			startblock,
> > -	xfs_filblks_t			blockcount,
> > +	xfs_filblks_t			*blockcount,
> >  	xfs_exntst_t			state)
> >  {
> >  	int				error;
> > @@ -196,16 +196,23 @@ xfs_bmap_update_finish_item(
> >  	void				**state)
> >  {
> >  	struct xfs_bmap_intent		*bmap;
> > +	xfs_filblks_t			count;
> >  	int				error;
> >  
> >  	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
> > +	count = bmap->bi_bmap.br_blockcount;
> >  	error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
> >  			bmap->bi_type,
> >  			bmap->bi_owner, bmap->bi_whichfork,
> >  			bmap->bi_bmap.br_startoff,
> >  			bmap->bi_bmap.br_startblock,
> > -			bmap->bi_bmap.br_blockcount,
> > +			&count,
> >  			bmap->bi_bmap.br_state);
> > +	if (!error && count > 0) {
> > +		ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
> > +		bmap->bi_bmap.br_blockcount = count;
> > +		return -EAGAIN;
> > +	}
> >  	kmem_free(bmap);
> >  	return error;
> >  }
Christoph Hellwig April 27, 2017, 7:35 a.m. UTC | #3
On Wed, Apr 26, 2017 at 02:37:31PM -0700, Darrick J. Wong wrote:
> That's so shockingly obvious I don't know why it didn't occur to me.
> Uh, I'll go give that a try with that horrid generic/931 testcase that
> I sent to the list a couple of weeks ago. :)

I like the idea too.  But one thing to remember is that freeing
blocks from two AGs in the same transaction is deadlock prone, as the
other thread shows.  I've been doing some work to mitigate that, but
I guess I'll just wait for your new patch and will rebase on top of
that.
Christoph Hellwig April 27, 2017, 7:40 a.m. UTC | #4
> @@ -483,13 +485,24 @@ xfs_bui_recover(
>  	}
>  	xfs_trans_ijoin(tp, ip, 0);
>  
> +	count = bmap->me_len;
>  	error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
>  			ip, whichfork, bmap->me_startoff,
> -			bmap->me_startblock, bmap->me_len,
> -			state);
> +			bmap->me_startblock, &count, state);
>  	if (error)
>  		goto err_dfops;
>  
> +	if (count > 0) {
> +		ASSERT(type == XFS_BMAP_UNMAP);
> +		irec.br_startblock = bmap->me_startblock;
> +		irec.br_blockcount = count;
> +		irec.br_startoff = bmap->me_startoff;
> +		irec.br_state = state;
> +		error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
> +		if (error)
> +			goto err_dfops;
> +	}

Aren't we always returning -EAGAIN from xfs_trans_log_finish_bmap_update
if count is non-zero?  Seems like this path isn't currently hit by
testing.

> +
>  	/* Finish transaction, free inodes. */
>  	error = xfs_defer_finish(&tp, &dfops, NULL);
>  	if (error)

> diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
> index 6408e7d..14543d9 100644
> --- a/fs/xfs/xfs_trans_bmap.c
> +++ b/fs/xfs/xfs_trans_bmap.c
> @@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update(
>  	int				whichfork,
>  	xfs_fileoff_t			startoff,
>  	xfs_fsblock_t			startblock,
> -	xfs_filblks_t			blockcount,
> +	xfs_filblks_t			*blockcount,
>  	xfs_exntst_t			state)
>  {
>  	int				error;
> @@ -196,16 +196,23 @@ xfs_bmap_update_finish_item(
>  	void				**state)
>  {
>  	struct xfs_bmap_intent		*bmap;
> +	xfs_filblks_t			count;
>  	int				error;
>  
>  	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
> +	count = bmap->bi_bmap.br_blockcount;
>  	error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
>  			bmap->bi_type,
>  			bmap->bi_owner, bmap->bi_whichfork,
>  			bmap->bi_bmap.br_startoff,
>  			bmap->bi_bmap.br_startblock,
> -			bmap->bi_bmap.br_blockcount,
> +			&count,
>  			bmap->bi_bmap.br_state);
> +	if (!error && count > 0) {
> +		ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
> +		bmap->bi_bmap.br_blockcount = count;
> +		return -EAGAIN;
> +	}

Can we just kill off xfs_trans_log_finish_bmap_update, move the
code here and avoid the weird calling conventions?
Darrick J. Wong April 27, 2017, 3:58 p.m. UTC | #5
On Thu, Apr 27, 2017 at 12:40:42AM -0700, Christoph Hellwig wrote:
> > @@ -483,13 +485,24 @@ xfs_bui_recover(
> >  	}
> >  	xfs_trans_ijoin(tp, ip, 0);
> >  
> > +	count = bmap->me_len;
> >  	error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
> >  			ip, whichfork, bmap->me_startoff,
> > -			bmap->me_startblock, bmap->me_len,
> > -			state);
> > +			bmap->me_startblock, &count, state);
> >  	if (error)
> >  		goto err_dfops;
> >  
> > +	if (count > 0) {
> > +		ASSERT(type == XFS_BMAP_UNMAP);
> > +		irec.br_startblock = bmap->me_startblock;
> > +		irec.br_blockcount = count;
> > +		irec.br_startoff = bmap->me_startoff;
> > +		irec.br_state = state;
> > +		error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
> > +		if (error)
> > +			goto err_dfops;
> > +	}
> 
> Aren't we always returning -EAGAIN from xfs_trans_log_finish_bmap_update
> if count is non-zero?  Seems like this path isn't currently hit by
> testing.

I hit it on generic/187 with a 1k block size.

> 
> > +
> >  	/* Finish transaction, free inodes. */
> >  	error = xfs_defer_finish(&tp, &dfops, NULL);
> >  	if (error)
> 
> > diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
> > index 6408e7d..14543d9 100644
> > --- a/fs/xfs/xfs_trans_bmap.c
> > +++ b/fs/xfs/xfs_trans_bmap.c
> > @@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update(
> >  	int				whichfork,
> >  	xfs_fileoff_t			startoff,
> >  	xfs_fsblock_t			startblock,
> > -	xfs_filblks_t			blockcount,
> > +	xfs_filblks_t			*blockcount,
> >  	xfs_exntst_t			state)
> >  {
> >  	int				error;
> > @@ -196,16 +196,23 @@ xfs_bmap_update_finish_item(
> >  	void				**state)
> >  {
> >  	struct xfs_bmap_intent		*bmap;
> > +	xfs_filblks_t			count;
> >  	int				error;
> >  
> >  	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
> > +	count = bmap->bi_bmap.br_blockcount;
> >  	error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
> >  			bmap->bi_type,
> >  			bmap->bi_owner, bmap->bi_whichfork,
> >  			bmap->bi_bmap.br_startoff,
> >  			bmap->bi_bmap.br_startblock,
> > -			bmap->bi_bmap.br_blockcount,
> > +			&count,
> >  			bmap->bi_bmap.br_state);
> > +	if (!error && count > 0) {
> > +		ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
> > +		bmap->bi_bmap.br_blockcount = count;
> > +		return -EAGAIN;
> > +	}
> 
> Can we just kill off xfs_trans_log_finish_bmap_update, move the
> code here and avoid the weird calling conventions?

I guess we could open-code all the stuff that it does in
xfs_bmap_update_finish_item and xfs_bui_recover, but I'd hate to have to
remember to update both copies.

--D

Darrick J. Wong April 28, 2017, 7:40 p.m. UTC | #6
On Thu, Apr 27, 2017 at 12:35:19AM -0700, Christoph Hellwig wrote:
> On Wed, Apr 26, 2017 at 02:37:31PM -0700, Darrick J. Wong wrote:
> > That's so shockingly obvious I don't know why it didn't occur to me.
> > Uh, I'll go give that a try with that horrid generic/931 testcase that
> > I sent to the list a couple of weeks ago. :)

...and now I've realized why I never thought of it -- it's unfortunately
racy.  We want to constrain the number of blocks that we unmap from the
file based on the refcount, but because the transaction rolling unlocks
the AGF in between the unmap and decrement-refcount transactions,
another process can swoop in and make changes to the sharedness that
screw us over.  Let's say files A and B both share the same 1000 blocks,
and threads A and B are modifying files A and B, respectively.

Thread A:                            Thread B:
1. punch blocks backed by (0-1000)   1. punch blocks backed by (500-600)
2. grab agf to read refcbt  
3. unmap blocks 0-1000; no
   refcount edges so we can
   unmap everything at once
4. roll transaction, unlock agf
                                     2. grab agf to read refcbt
                                     3. unmap blocks 500-600; no refcount
                                        edges so we can unmap everything in
                                        a single operation
                                     4. roll transaction, unlock agf
                                     5. grab agf again. lucky!
                                     6. split refcbt record because 500-600
                                        is now only owned by file A.
                                     7. complete transaction, unlock agf
5. grab agf to read refcbt
6. whoah!  why are there two
   edges in the refcount tree at
   500 and 600???

So either we have to keep the AGF buffer locked and attached across
all the transaction rolls until we get around to processing the refcount
decrement deferred op, or figure out something else clever...?

--D

> I like the idea too.  But one thing to remember is that freeing
> blocks from two AGs in the same transaction is deadlock prone, as the
> other thread shows.  I've been doing some work to mitigate that, but
> I guess I'll just wait for your new patch and will rebase on top of
> that.
Brian Foster May 1, 2017, 2:58 p.m. UTC | #7
On Fri, Apr 28, 2017 at 12:40:32PM -0700, Darrick J. Wong wrote:
> On Thu, Apr 27, 2017 at 12:35:19AM -0700, Christoph Hellwig wrote:
> > On Wed, Apr 26, 2017 at 02:37:31PM -0700, Darrick J. Wong wrote:
> > > That's so shockingly obvious I don't know why it didn't occur to me.
> > > Uh, I'll go give that a try with that horrid generic/931 testcase that
> > > I sent to the list a couple of weeks ago. :)
> 
> ...and now I've realized why I never thought of it -- it's unfortunately
> racy.  We want to constrain the number of blocks that we unmap from the
> file based on the refcount, but because the transaction rolling unlocks
> the AGF in between the unmap and decrement-refcount transactions,
> another process can swoop in and make changes to the sharedness that
> screw us over.  Let's say files A and B both share the same 1000 blocks,
> and threads A and B are modifying files A and B, respectively.
> 
> Thread A:                            Thread B:
> 1. punch blocks backed by (0-1000)   1. punch blocks backed by (500-600)
> 2. grab agf to read refcbt  
> 3. unmap blocks 0-1000; no
>    refcount edges so we can
>    unmap everything at once
> 4. roll transaction, unlock agf
>                                      2. grab agf to read refcbt
>                                      3. unmap blocks 500-600; no refcount
>                                         edges so we can unmap everything in
>                                         a single operation
>                                      4. roll transaction, unlock agf
>                                      5. grab agf again. lucky!
>                                      6. split refcbt record because 500-600
>                                         is now only owned by file A.
>                                      7. complete transaction, unlock agf
> 5. grab agf to read refcbt
> 6. whoah!  why are there two
>    edges in the refcount tree at
>    500 and 600???
> 

Ugh, ok. So we basically can't make any assumptions about shared extent
state across the transactions, because that can change as soon as we
start processing deferred actions. This is presumably why we have high
level refcount inc/dec deferred ops such that the specific shared state
of a range of blocks isn't really assessed until the deferred op runs.

Therefore, this patch limits the high level unmap operation based on a
worst case assumption that every other block is shared..? If we do end
up going with this approach, I think the comment where we calculate the
max unmap block count could be slightly improved to point out the
specifics of the issue here. E.g., each individual block of a
contiguous extent could result in an independent EFI or reflink
adjustment, so we need to convert the available log reservation into a
block-based unmap limit.
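
Something like this, perhaps (suggested wording, not from the patch):

	/*
	 * In the worst case every block of this extent is shared
	 * differently from its neighbours, so each block we unmap can
	 * cost an EFI and/or a refcount adjustment in the log.
	 * Convert the available log reservation into a block-count
	 * cap so we can't blow the reservation.
	 */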

> So either we have to keep the AGF buffer locked and attached across
> all the transaction rolls until we get around to processing the refcount
> decrement deferred op, or figure out something else clever...?
> 

Keeping the AGF locked still sounds like the most preferable approach to
me, provided there are no other dragons lurking around locking or
whatnot and we can do so cleanly. Perhaps we could plumb in a mechanism
to xfs_trans_bhold() a buffer across a dop list and then
_bhold_release() it for the last transaction returned by
xfs_defer_finish()..?
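
For reference, the single-roll version of that pattern is the standard
xfs_trans_bhold()/xfs_trans_bjoin() dance (acquisition of agbp elided);
the new plumbing would be stretching it across an entire dop list:

	/* Keep the AGF buffer locked while the transaction commits. */
	xfs_trans_bhold(tp, agbp);
	error = xfs_trans_roll(&tp, ip);
	if (error)
		return error;
	/* Rejoin the still-locked buffer to the new transaction. */
	xfs_trans_bjoin(tp, agbp);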

Otherwise, another question I had more related to the approach of the
current patch... could we use a deterministic unmap length baked into
the transaction rather than a dynamic assumption that a certain
percentage of the transaction is designated for reflink adjustments?
E.g., use a transaction that is explicitly designed for a max extent
modification count or a max block adjustment (based on the worst-case
reflink requirements) and fix xfs_bunmapi() to use that max count. What
concerns me about the dynamic approach is that if we ever had to
increase or modify this transaction res for something else that happens
to also affect unmap (*handwave*), we'd have to also know to go and
possibly reverse engineer/adjust this reflink heuristic appropriately
(or more likely, not do so and set a landmine).
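
A hypothetical sketch of that deterministic variant (the constant name
and value are invented; the real number would fall out of the
worst-case reservation math in xfs_trans_resv.c):

	/*
	 * Invented for illustration: a fixed per-transaction unmap cap
	 * that the reservation is explicitly sized for, instead of a
	 * cap reverse-engineered from t_log_res at run time.
	 */
	#define XFS_BUNMAPI_REFLINK_MAX_BLOCKS	1024

	if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
		max_len = min_t(xfs_filblks_t, len,
				XFS_BUNMAPI_REFLINK_MAX_BLOCKS);
	else
		max_len = len;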

Brian

> --D
> 
> > I like the idea too.  But one thing to remember is that freeing
> > blocks from two AGs in the same transaction is deadlock prone, as the
> > other thread shows.  I've been doing some work to mitigate that, but
> > I guess I'll just wait for your new patch and will rebase on top of
> > that.
Christoph Hellwig May 2, 2017, 8 a.m. UTC | #8
On Fri, Apr 28, 2017 at 12:40:32PM -0700, Darrick J. Wong wrote:
> So either we have to keep the AGF buffer locked and attached across
> all the transaction rolls until we get around to processing the refcount
> decrement deferred op, or figure out something else clever...?

We can't.  We need to unlock the AGF buffer over rolls because we
must not lock two AGF buffers unless we can ensure they are in
increasing agno order.  For the alloc path we go to great lengths to
ensure this, but we will also need to do that for the free path,
which currently is a problem due to the two extents we free per
transaction.

Maybe we need to move away from the defer everything in one defer_ops
scheme to targeted defers instead so that we can keep the AGF locked
where needed and release it when we can't keep it locked.
Brian Foster May 2, 2017, 12:05 p.m. UTC | #9
On Tue, May 02, 2017 at 01:00:23AM -0700, Christoph Hellwig wrote:
> On Fri, Apr 28, 2017 at 12:40:32PM -0700, Darrick J. Wong wrote:
> > So either we have to keep the AGF buffer locked and attached across
> > all the transaction rolls until we get around to processing the refcount
> > decrement deferred op, or figure out something else clever...?
> 
> We can't.  We need to unlock the AGF buffer over rolls because we
> must not lock two AGF buffers unless we can ensure they are in
> increasing agno order.  For the alloc path we go to great lengths to
> ensure this, but we will also need to do that for the free path,
> which currently is a problem due to the two extents we free per
> transaction.
> 

But doesn't that mean this wouldn't be a new problem introduced by such
a change?

Stepping back and taking a look at the code, I realize we don't
currently lock the agf until xfs_free_extent() or
xfs_refcount_finish_one(), which are the deferred ops. I assume Darrick's
previous example thus requires that we introduce locking of the AGF
during the unmap for reflinked files, but it seems to me we can end up
locking two AGFs as such whether we do the locking in the unmap or in
the deferred ops (as we do today). IOW, the only difference here seems
to be whether we do the locking before or after the transaction roll..?

> Maybe we need to move away from the defer everything in one defer_ops
> scheme to targeted defers instead so that we can keep the AGF locked
> where needed and release it when we can't keep it locked.

Yeah, I'm very much in favor of targeted use of such mechanisms for
situations where there is a specific purpose, whether it be lock order
management or atomicity of operations too large for a single
transaction. That being said, I'm not following your thought wrt this
particular situation. Are you suggesting that we not defer the reflink
adjustment in particular unmap cases, or that we just limit the number
of extent unmaps per-tp based on crossing an AG boundary, or something
else entirely?

Brian

Christoph Hellwig May 4, 2017, 11:59 a.m. UTC | #10
On Tue, May 02, 2017 at 08:05:25AM -0400, Brian Foster wrote:
> But doesn't that mean this wouldn't be a new problem introduced by such
> a change?

It would make the existing problem a lot easier to hit, I think.

> transaction. That being said, I'm not following your thought wrt this
> particular situation. Are you suggesting that we not defer the reflink
> adjustment in particular unmap cases, or that we just limit the number
> of extent unmaps per-tp based on crossing an AG boundary, or something
> else entirely?

To me it seems like we should try to do the extent count adjustments
in the current transaction for a given extent if we can, but give me
a little more time to think about how best to do that.  I'm travelling at the
moment and don't have much quiet time to actually engage my brain.
Brian Foster May 4, 2017, 1:40 p.m. UTC | #11
On Thu, May 04, 2017 at 04:59:43AM -0700, Christoph Hellwig wrote:
> On Tue, May 02, 2017 at 08:05:25AM -0400, Brian Foster wrote:
> > But doesn't that mean this wouldn't be a new problem introduced by such
> > a change?
> 
> It would make the existing problem a lot easier to hit, I think.
> 

I don't really see how. As it is, we currently do something like unmap
extent 1, unmap extent 2, roll transaction, free extent 1, free extent 2
(where the extent frees lock the agf). As I understand it, the
prospective change would move the agf locking up into the unmap
transaction and hold it across the roll.

I suppose you could make the case that the AGFs are held locked for a
longer period of time, but it's hard to characterize whether that would
make a deadlock more likely or not. Even if it did, couldn't we either
just fix the unmap count to one extent (unconditionally or in just the
reflink case), or stop once we hit an extent that is in decreasing ag
order from one that has already been unmapped in the current tp..?
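
A sketch of that second option (the prev_agno tracking is invented
here, added to the extent loop in __xfs_bunmapi()):

	/*
	 * Hypothetical guard: stop unmapping once the next extent's AG
	 * number drops below one we've already touched, so the
	 * deferred frees never need AGF locks in decreasing agno order.
	 */
	agno = XFS_FSB_TO_AGNO(mp, del.br_startblock);
	if (prev_agno != NULLAGNUMBER && agno < prev_agno)
		break;
	prev_agno = agno;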

> > transaction. That being said, I'm not following your thought wrt to this
> > particular situation. Are you suggesting that we not defer the reflink
> > adjustment in particular unmap cases, or that we just limit the number
> > of extent unmaps per-tp based on crossing an AG boundary, or something
> > else entirely?
> 
> To me it seems like we should try to do the extent count adjustments
> in the current transaction for a given extent if we can, but give me
> a little more time to think about how best to do that.  I'm travelling at the
> moment and don't have much quiet time to actually engage my brain.

Sure, that sounds potentially interesting.

Brian


Patch

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index f02eb76..1e79fb5 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5435,6 +5435,7 @@  __xfs_bunmapi(
 	int			whichfork;	/* data or attribute fork */
 	xfs_fsblock_t		sum;
 	xfs_filblks_t		len = *rlen;	/* length to unmap in file */
+	xfs_fileoff_t		max_len;
 
 	trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
 
@@ -5456,6 +5457,16 @@  __xfs_bunmapi(
 	ASSERT(len > 0);
 	ASSERT(nexts >= 0);
 
+	/*
+	 * Guesstimate how many blocks we can unmap without running the risk of
+	 * blowing out the transaction with a mix of EFIs and reflink
+	 * adjustments.
+	 */
+	if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
+		max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
+	else
+		max_len = len;
+
 	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
 	    (error = xfs_iread_extents(tp, ip, whichfork)))
 		return error;
@@ -5500,7 +5511,7 @@  __xfs_bunmapi(
 
 	extno = 0;
 	while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
-	       (nexts == 0 || extno < nexts)) {
+	       (nexts == 0 || extno < nexts) && max_len > 0) {
 		/*
 		 * Is the found extent after a hole in which bno lives?
 		 * Just back up to the previous extent, if so.
@@ -5532,6 +5543,15 @@  __xfs_bunmapi(
 		}
 		if (del.br_startoff + del.br_blockcount > bno + 1)
 			del.br_blockcount = bno + 1 - del.br_startoff;
+
+		/* How much can we safely unmap? */
+		if (max_len < del.br_blockcount) {
+			del.br_startoff += del.br_blockcount - max_len;
+			if (!wasdel)
+				del.br_startblock += del.br_blockcount - max_len;
+			del.br_blockcount = max_len;
+		}
+
 		sum = del.br_startblock + del.br_blockcount;
 		if (isrt &&
 		    (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
@@ -5708,6 +5728,7 @@  __xfs_bunmapi(
 		if (!isrt && wasdel)
 			xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
 
+		max_len -= del.br_blockcount;
 		bno = del.br_startoff - 1;
 nodelete:
 		/*
@@ -6473,15 +6494,16 @@  xfs_bmap_finish_one(
 	int				whichfork,
 	xfs_fileoff_t			startoff,
 	xfs_fsblock_t			startblock,
-	xfs_filblks_t			blockcount,
+	xfs_filblks_t			*blockcount,
 	xfs_exntst_t			state)
 {
-	int				error = 0, done;
+	xfs_fsblock_t			firstfsb;
+	int				error = 0;
 
 	trace_xfs_bmap_deferred(tp->t_mountp,
 			XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
 			XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
-			ip->i_ino, whichfork, startoff, blockcount, state);
+			ip->i_ino, whichfork, startoff, *blockcount, state);
 
 	if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK))
 		return -EFSCORRUPTED;
@@ -6493,13 +6515,13 @@  xfs_bmap_finish_one(
 
 	switch (type) {
 	case XFS_BMAP_MAP:
-		error = xfs_bmapi_remap(tp, ip, startoff, blockcount,
+		error = xfs_bmapi_remap(tp, ip, startoff, *blockcount,
 				startblock, dfops);
+		*blockcount = 0;
 		break;
 	case XFS_BMAP_UNMAP:
-		error = xfs_bunmapi(tp, ip, startoff, blockcount,
-				XFS_BMAPI_REMAP, 1, &startblock, dfops, &done);
-		ASSERT(done);
+		error = __xfs_bunmapi(tp, ip, startoff, blockcount,
+				XFS_BMAPI_REMAP, 1, &firstfsb, dfops);
 		break;
 	default:
 		ASSERT(0);
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index c35a14f..851982a 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -271,7 +271,7 @@  struct xfs_bmap_intent {
 int	xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
 		struct xfs_inode *ip, enum xfs_bmap_intent_type type,
 		int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
-		xfs_filblks_t blockcount, xfs_exntst_t state);
+		xfs_filblks_t *blockcount, xfs_exntst_t state);
 int	xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
 		struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
 int	xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index b177ef3..29dcde3 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -784,14 +784,6 @@  xfs_refcount_merge_extents(
 }
 
 /*
- * While we're adjusting the refcounts records of an extent, we have
- * to keep an eye on the number of extents we're dirtying -- run too
- * many in a single transaction and we'll exceed the transaction's
- * reservation and crash the fs.  Each record adds 12 bytes to the
- * log (plus any key updates) so we'll conservatively assume 24 bytes
- * per record.  We must also leave space for btree splits on both ends
- * of the range and space for the CUD and a new CUI.
- *
  * XXX: This is a pretty hand-wavy estimate.  The penalty for guessing
  * true incorrectly is a shutdown FS; the penalty for guessing false
  * incorrectly is more transaction rolls than might be necessary.
@@ -822,7 +814,7 @@  xfs_refcount_still_have_space(
 	else if (overhead > cur->bc_tp->t_log_res)
 		return false;
 	return  cur->bc_tp->t_log_res - overhead >
-		cur->bc_private.a.priv.refc.nr_ops * 32;
+		cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 098dc66..eafb9d1 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -67,4 +67,20 @@  extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp,
 extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
 		xfs_agnumber_t agno);
 
+/*
+ * While we're adjusting the refcounts records of an extent, we have
+ * to keep an eye on the number of extents we're dirtying -- run too
+ * many in a single transaction and we'll exceed the transaction's
+ * reservation and crash the fs.  Each record adds 12 bytes to the
+ * log (plus any key updates) so we'll conservatively assume 32 bytes
+ * per record.  We must also leave space for btree splits on both ends
+ * of the range and space for the CUD and a new CUI.
+ */
+#define XFS_REFCOUNT_ITEM_OVERHEAD	32
+
+static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
+{
+	return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
+}
+
 #endif	/* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 378f144..89908bb 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -396,6 +396,7 @@  xfs_bui_recover(
 	struct xfs_map_extent		*bmap;
 	xfs_fsblock_t			startblock_fsb;
 	xfs_fsblock_t			inode_fsb;
+	xfs_filblks_t			count;
 	bool				op_ok;
 	struct xfs_bud_log_item		*budp;
 	enum xfs_bmap_intent_type	type;
@@ -404,6 +405,7 @@  xfs_bui_recover(
 	struct xfs_trans		*tp;
 	struct xfs_inode		*ip = NULL;
 	struct xfs_defer_ops		dfops;
+	struct xfs_bmbt_irec		irec;
 	xfs_fsblock_t			firstfsb;
 	unsigned int			resblks;
 
@@ -483,13 +485,24 @@  xfs_bui_recover(
 	}
 	xfs_trans_ijoin(tp, ip, 0);
 
+	count = bmap->me_len;
 	error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
 			ip, whichfork, bmap->me_startoff,
-			bmap->me_startblock, bmap->me_len,
-			state);
+			bmap->me_startblock, &count, state);
 	if (error)
 		goto err_dfops;
 
+	if (count > 0) {
+		ASSERT(type == XFS_BMAP_UNMAP);
+		irec.br_startblock = bmap->me_startblock;
+		irec.br_blockcount = count;
+		irec.br_startoff = bmap->me_startoff;
+		irec.br_state = state;
+		error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
+		if (error)
+			goto err_dfops;
+	}
+
 	/* Finish transaction, free inodes. */
 	error = xfs_defer_finish(&tp, &dfops, NULL);
 	if (error)
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a07acbf..08923e5 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -275,6 +275,6 @@  int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
 		struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops,
 		enum xfs_bmap_intent_type type, struct xfs_inode *ip,
 		int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
-		xfs_filblks_t blockcount, xfs_exntst_t state);
+		xfs_filblks_t *blockcount, xfs_exntst_t state);
 
 #endif	/* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
index 6408e7d..14543d9 100644
--- a/fs/xfs/xfs_trans_bmap.c
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -63,7 +63,7 @@  xfs_trans_log_finish_bmap_update(
 	int				whichfork,
 	xfs_fileoff_t			startoff,
 	xfs_fsblock_t			startblock,
-	xfs_filblks_t			blockcount,
+	xfs_filblks_t			*blockcount,
 	xfs_exntst_t			state)
 {
 	int				error;
@@ -196,16 +196,23 @@  xfs_bmap_update_finish_item(
 	void				**state)
 {
 	struct xfs_bmap_intent		*bmap;
+	xfs_filblks_t			count;
 	int				error;
 
 	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+	count = bmap->bi_bmap.br_blockcount;
 	error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
 			bmap->bi_type,
 			bmap->bi_owner, bmap->bi_whichfork,
 			bmap->bi_bmap.br_startoff,
 			bmap->bi_bmap.br_startblock,
-			bmap->bi_bmap.br_blockcount,
+			&count,
 			bmap->bi_bmap.br_state);
+	if (!error && count > 0) {
+		ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
+		bmap->bi_bmap.br_blockcount = count;
+		return -EAGAIN;
+	}
 	kmem_free(bmap);
 	return error;
 }