diff mbox series

[15/42] xfs: rework xfs_alloc_vextent()

Message ID 20230118224505.1964941-16-david@fromorbit.com (mailing list archive)
State Superseded
Headers show
Series xfs: per-ag centric allocation alogrithms | expand

Commit Message

Dave Chinner Jan. 18, 2023, 10:44 p.m. UTC
From: Dave Chinner <dchinner@redhat.com>

It's a multiplexing mess that can be greatly simplified, and really
needs to be simplified to allow active per-ag references to
propagate from initial AG selection code the the bmapi code.

This splits the code out into separate a parameter checking
function, an iterator function, and allocation completion functions
and then implements the individual policies using these functions.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_alloc.c | 464 +++++++++++++++++++++++---------------
 1 file changed, 285 insertions(+), 179 deletions(-)

Comments

Darrick J. Wong Feb. 1, 2023, 7:39 p.m. UTC | #1
On Thu, Jan 19, 2023 at 09:44:38AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> It's a multiplexing mess that can be greatly simplified, and really
> needs to be simplified to allow active per-ag references to
> propagate from initial AG selection code the the bmapi code.
> 
> This splits the code out into separate a parameter checking
> function, an iterator function, and allocation completion functions
> and then implements the individual policies using these functions.

This patch was **so** much easier to read once I imported it and
re-exported it with git set to patience diff mode.  With that in hand
it's far easier to see that the diff really does break up an overlong
function and nothing else.

$ cat /etc/gitconfig
[diff]
        algorithm = patience

Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D


> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/libxfs/xfs_alloc.c | 464 +++++++++++++++++++++++---------------
>  1 file changed, 285 insertions(+), 179 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index 246c2e7d9e7a..39e34a1bfa31 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -3151,29 +3151,20 @@ xfs_alloc_read_agf(
>  }
>  
>  /*
> - * Allocate an extent (variable-size).
> - * Depending on the allocation type, we either look in a single allocation
> - * group or loop over the allocation groups to find the result.
> + * Pre-proces allocation arguments to set initial state that we don't require
> + * callers to set up correctly, as well as bounds check the allocation args
> + * that are set up.
>   */
> -int				/* error */
> -xfs_alloc_vextent(
> -	struct xfs_alloc_arg	*args)	/* allocation argument structure */
> +static int
> +xfs_alloc_vextent_check_args(
> +	struct xfs_alloc_arg	*args)
>  {
> -	xfs_agblock_t		agsize;	/* allocation group size */
> -	int			error;
> -	int			flags;	/* XFS_ALLOC_FLAG_... locking flags */
> -	struct xfs_mount	*mp;	/* mount structure pointer */
> -	xfs_agnumber_t		sagno;	/* starting allocation group number */
> -	xfs_alloctype_t		type;	/* input allocation type */
> -	int			bump_rotor = 0;
> -	xfs_agnumber_t		rotorstep = xfs_rotorstep; /* inode32 agf stepper */
> -	xfs_agnumber_t		minimum_agno = 0;
> +	struct xfs_mount	*mp = args->mp;
> +	xfs_agblock_t		agsize;
>  
> -	mp = args->mp;
> -	type = args->otype = args->type;
> +	args->otype = args->type;
>  	args->agbno = NULLAGBLOCK;
> -	if (args->tp->t_highest_agno != NULLAGNUMBER)
> -		minimum_agno = args->tp->t_highest_agno;
> +
>  	/*
>  	 * Just fix this up, for the case where the last a.g. is shorter
>  	 * (or there's only one a.g.) and the caller couldn't easily figure
> @@ -3195,199 +3186,314 @@ xfs_alloc_vextent(
>  	    args->mod >= args->prod) {
>  		args->fsbno = NULLFSBLOCK;
>  		trace_xfs_alloc_vextent_badargs(args);
> +		return -ENOSPC;
> +	}
> +	return 0;
> +}
> +
> +/*
> + * Post-process allocation results to set the allocated block number correctly
> + * for the caller.
> + *
> + * XXX: xfs_alloc_vextent() should really be returning ENOSPC for ENOSPC, not
> + * hiding it behind a "successful" NULLFSBLOCK allocation.
> + */
> +static void
> +xfs_alloc_vextent_set_fsbno(
> +	struct xfs_alloc_arg	*args,
> +	xfs_agnumber_t		minimum_agno)
> +{
> +	struct xfs_mount	*mp = args->mp;
> +
> +	/*
> +	 * We can end up here with a locked AGF. If we failed, the caller is
> +	 * likely going to try to allocate again with different parameters, and
> +	 * that can widen the AGs that are searched for free space. If we have
> +	 * to do BMBT block allocation, we have to do a new allocation.
> +	 *
> +	 * Hence leaving this function with the AGF locked opens up potential
> +	 * ABBA AGF deadlocks because a future allocation attempt in this
> +	 * transaction may attempt to lock a lower number AGF.
> +	 *
> +	 * We can't release the AGF until the transaction is commited, so at
> +	 * this point we must update the "first allocation" tracker to point at
> +	 * this AG if the tracker is empty or points to a lower AG. This allows
> +	 * the next allocation attempt to be modified appropriately to avoid
> +	 * deadlocks.
> +	 */
> +	if (args->agbp &&
> +	    (args->tp->t_highest_agno == NULLAGNUMBER ||
> +	     args->agno > minimum_agno))
> +		args->tp->t_highest_agno = args->agno;
> +
> +	/* Allocation failed with ENOSPC if NULLAGBLOCK was returned. */
> +	if (args->agbno == NULLAGBLOCK) {
> +		args->fsbno = NULLFSBLOCK;
> +		return;
> +	}
> +
> +	args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
> +#ifdef DEBUG
> +	ASSERT(args->len >= args->minlen);
> +	ASSERT(args->len <= args->maxlen);
> +	ASSERT(args->agbno % args->alignment == 0);
> +	XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);
> +#endif
> +}
> +
> +/*
> + * Allocate within a single AG only.
> + */
> +static int
> +xfs_alloc_vextent_this_ag(
> +	struct xfs_alloc_arg	*args,
> +	xfs_agnumber_t		minimum_agno)
> +{
> +	struct xfs_mount	*mp = args->mp;
> +	int			error;
> +
> +	error = xfs_alloc_vextent_check_args(args);
> +	if (error) {
> +		if (error == -ENOSPC)
> +			return 0;
> +		return error;
> +	}
> +
> +	args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
> +	if (minimum_agno > args->agno) {
> +		trace_xfs_alloc_vextent_skip_deadlock(args);
> +		args->fsbno = NULLFSBLOCK;
>  		return 0;
>  	}
>  
> -	switch (type) {
> -	case XFS_ALLOCTYPE_THIS_AG:
> -	case XFS_ALLOCTYPE_NEAR_BNO:
> -	case XFS_ALLOCTYPE_THIS_BNO:
> -		/*
> -		 * These three force us into a single a.g.
> -		 */
> -		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
> -		args->pag = xfs_perag_get(mp, args->agno);
> +	args->pag = xfs_perag_get(mp, args->agno);
> +	error = xfs_alloc_fix_freelist(args, 0);
> +	if (error) {
> +		trace_xfs_alloc_vextent_nofix(args);
> +		goto out_error;
> +	}
> +	if (!args->agbp) {
> +		trace_xfs_alloc_vextent_noagbp(args);
> +		args->fsbno = NULLFSBLOCK;
> +		goto out_error;
> +	}
> +	args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
> +	error = xfs_alloc_ag_vextent(args);
>  
> -		if (minimum_agno > args->agno) {
> -			trace_xfs_alloc_vextent_skip_deadlock(args);
> -			error = 0;
> -			break;
> -		}
> +	xfs_alloc_vextent_set_fsbno(args, minimum_agno);
> +out_error:
> +	xfs_perag_put(args->pag);
> +	return error;
> +}
> +
> +/*
> + * Iterate all AGs trying to allocate an extent starting from @start_ag.
> + *
> + * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the
> + * allocation attempts in @start_agno have locality information. If we fail to
> + * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs
> + * we attempt to allocation in as there is no locality optimisation possible for
> + * those allocations.
> + *
> + * When we wrap the AG iteration at the end of the filesystem, we have to be
> + * careful not to wrap into AGs below ones we already have locked in the
> + * transaction if we are doing a blocking iteration. This will result in an
> + * out-of-order locking of AGFs and hence can cause deadlocks.
> + */
> +static int
> +xfs_alloc_vextent_iterate_ags(
> +	struct xfs_alloc_arg	*args,
> +	xfs_agnumber_t		minimum_agno,
> +	xfs_agnumber_t		start_agno,
> +	uint32_t		flags)
> +{
> +	struct xfs_mount	*mp = args->mp;
> +	int			error = 0;
>  
> -		error = xfs_alloc_fix_freelist(args, 0);
> +	ASSERT(start_agno >= minimum_agno);
> +
> +	/*
> +	 * Loop over allocation groups twice; first time with
> +	 * trylock set, second time without.
> +	 */
> +	args->agno = start_agno;
> +	for (;;) {
> +		args->pag = xfs_perag_get(mp, args->agno);
> +		error = xfs_alloc_fix_freelist(args, flags);
>  		if (error) {
>  			trace_xfs_alloc_vextent_nofix(args);
> -			goto error0;
> -		}
> -		if (!args->agbp) {
> -			trace_xfs_alloc_vextent_noagbp(args);
>  			break;
>  		}
> -		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
> -		if ((error = xfs_alloc_ag_vextent(args)))
> -			goto error0;
> -		break;
> -	case XFS_ALLOCTYPE_START_BNO:
>  		/*
> -		 * Try near allocation first, then anywhere-in-ag after
> -		 * the first a.g. fails.
> +		 * If we get a buffer back then the allocation will fly.
>  		 */
> -		if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
> -		    xfs_is_inode32(mp)) {
> -			args->fsbno = XFS_AGB_TO_FSB(mp,
> -					((mp->m_agfrotor / rotorstep) %
> -					mp->m_sb.sb_agcount), 0);
> -			bump_rotor = 1;
> +		if (args->agbp) {
> +			error = xfs_alloc_ag_vextent(args);
> +			break;
>  		}
> -		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
> -		args->type = XFS_ALLOCTYPE_NEAR_BNO;
> -		fallthrough;
> -	case XFS_ALLOCTYPE_FIRST_AG:
> +
> +		trace_xfs_alloc_vextent_loopfailed(args);
> +
>  		/*
> -		 * Rotate through the allocation groups looking for a winner.
> -		 * If we are blocking, we must obey minimum_agno contraints for
> -		 * avoiding ABBA deadlocks on AGF locking.
> +		 * Didn't work, figure out the next iteration.
>  		 */
> -		if (type == XFS_ALLOCTYPE_FIRST_AG) {
> -			/*
> -			 * Start with allocation group given by bno.
> -			 */
> -			args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
> +		if (args->agno == start_agno &&
> +		    args->otype == XFS_ALLOCTYPE_START_BNO)
>  			args->type = XFS_ALLOCTYPE_THIS_AG;
> -			sagno = minimum_agno;
> -			flags = 0;
> -		} else {
> -			/*
> -			 * Start with the given allocation group.
> -			 */
> -			args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
> -			flags = XFS_ALLOC_FLAG_TRYLOCK;
> +
> +		/*
> +		 * If we are try-locking, we can't deadlock on AGF locks so we
> +		 * can wrap all the way back to the first AG. Otherwise, wrap
> +		 * back to the start AG so we can't deadlock and let the end of
> +		 * scan handler decide what to do next.
> +		 */
> +		if (++(args->agno) == mp->m_sb.sb_agcount) {
> +			if (flags & XFS_ALLOC_FLAG_TRYLOCK)
> +				args->agno = 0;
> +			else
> +				args->agno = minimum_agno;
>  		}
>  
>  		/*
> -		 * Loop over allocation groups twice; first time with
> -		 * trylock set, second time without.
> +		 * Reached the starting a.g., must either be done
> +		 * or switch to non-trylock mode.
>  		 */
> -		for (;;) {
> -			args->pag = xfs_perag_get(mp, args->agno);
> -			error = xfs_alloc_fix_freelist(args, flags);
> -			if (error) {
> -				trace_xfs_alloc_vextent_nofix(args);
> -				goto error0;
> -			}
> -			/*
> -			 * If we get a buffer back then the allocation will fly.
> -			 */
> -			if (args->agbp) {
> -				if ((error = xfs_alloc_ag_vextent(args)))
> -					goto error0;
> +		if (args->agno == start_agno) {
> +			if (flags == 0) {
> +				args->agbno = NULLAGBLOCK;
> +				trace_xfs_alloc_vextent_allfailed(args);
>  				break;
>  			}
>  
> -			trace_xfs_alloc_vextent_loopfailed(args);
> +			flags = 0;
> +			if (args->otype == XFS_ALLOCTYPE_START_BNO) {
> +				args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
> +				args->type = XFS_ALLOCTYPE_NEAR_BNO;
> +			}
> +		}
> +		xfs_perag_put(args->pag);
> +		args->pag = NULL;
> +	}
> +	if (args->pag) {
> +		xfs_perag_put(args->pag);
> +		args->pag = NULL;
> +	}
> +	return error;
> +}
>  
> -			/*
> -			 * Didn't work, figure out the next iteration.
> -			 */
> -			if (args->agno == sagno &&
> -			    type == XFS_ALLOCTYPE_START_BNO)
> -				args->type = XFS_ALLOCTYPE_THIS_AG;
> +/*
> + * Iterate from the AGs from the start AG to the end of the filesystem, trying
> + * to allocate blocks. It starts with a near allocation attempt in the initial
> + * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap
> + * back to zero if allowed by previous allocations in this transaction,
> + * otherwise will wrap back to the start AG and run a second blocking pass to
> + * the end of the filesystem.
> + */
> +static int
> +xfs_alloc_vextent_start_ag(
> +	struct xfs_alloc_arg	*args,
> +	xfs_agnumber_t		minimum_agno)
> +{
> +	struct xfs_mount	*mp = args->mp;
> +	xfs_agnumber_t		start_agno;
> +	xfs_agnumber_t		rotorstep = xfs_rotorstep;
> +	bool			bump_rotor = false;
> +	int			error;
>  
> -			/*
> -			 * If we are try-locking, we can't deadlock on AGF
> -			 * locks, so we can wrap all the way back to the first
> -			 * AG. Otherwise, wrap back to the start AG so we can't
> -			 * deadlock, and let the end of scan handler decide what
> -			 * to do next.
> -			 */
> -			if (++(args->agno) == mp->m_sb.sb_agcount) {
> -				if (flags & XFS_ALLOC_FLAG_TRYLOCK)
> -					args->agno = 0;
> -				else
> -					args->agno = sagno;
> -			}
> +	error = xfs_alloc_vextent_check_args(args);
> +	if (error) {
> +		if (error == -ENOSPC)
> +			return 0;
> +		return error;
> +	}
>  
> -			/*
> -			 * Reached the starting a.g., must either be done
> -			 * or switch to non-trylock mode.
> -			 */
> -			if (args->agno == sagno) {
> -				if (flags == 0) {
> -					args->agbno = NULLAGBLOCK;
> -					trace_xfs_alloc_vextent_allfailed(args);
> -					break;
> -				}
> +	if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
> +	    xfs_is_inode32(mp)) {
> +		args->fsbno = XFS_AGB_TO_FSB(mp,
> +				((mp->m_agfrotor / rotorstep) %
> +				mp->m_sb.sb_agcount), 0);
> +		bump_rotor = 1;
> +	}
> +	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, args->fsbno));
> +	args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
> +	args->type = XFS_ALLOCTYPE_NEAR_BNO;
>  
> -				/*
> -				 * Blocking pass next, so we must obey minimum
> -				 * agno constraints to avoid ABBA AGF deadlocks.
> -				 */
> -				flags = 0;
> -				if (minimum_agno > sagno)
> -					sagno = minimum_agno;
> -
> -				if (type == XFS_ALLOCTYPE_START_BNO) {
> -					args->agbno = XFS_FSB_TO_AGBNO(mp,
> -						args->fsbno);
> -					args->type = XFS_ALLOCTYPE_NEAR_BNO;
> -				}
> -			}
> -			xfs_perag_put(args->pag);
> -		}
> -		if (bump_rotor) {
> -			if (args->agno == sagno)
> -				mp->m_agfrotor = (mp->m_agfrotor + 1) %
> -					(mp->m_sb.sb_agcount * rotorstep);
> -			else
> -				mp->m_agfrotor = (args->agno * rotorstep + 1) %
> -					(mp->m_sb.sb_agcount * rotorstep);
> -		}
> -		break;
> -	default:
> -		ASSERT(0);
> -		/* NOTREACHED */
> +	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
> +			XFS_ALLOC_FLAG_TRYLOCK);
> +	if (bump_rotor) {
> +		if (args->agno == start_agno)
> +			mp->m_agfrotor = (mp->m_agfrotor + 1) %
> +				(mp->m_sb.sb_agcount * rotorstep);
> +		else
> +			mp->m_agfrotor = (args->agno * rotorstep + 1) %
> +				(mp->m_sb.sb_agcount * rotorstep);
>  	}
> -	if (args->agbno == NULLAGBLOCK) {
> -		args->fsbno = NULLFSBLOCK;
> -	} else {
> -		args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
> -#ifdef DEBUG
> -		ASSERT(args->len >= args->minlen);
> -		ASSERT(args->len <= args->maxlen);
> -		ASSERT(args->agbno % args->alignment == 0);
> -		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
> -			args->len);
> -#endif
>  
> +	xfs_alloc_vextent_set_fsbno(args, minimum_agno);
> +	return error;
> +}
> +
> +/*
> + * Iterate from the agno indicated from args->fsbno through to the end of the
> + * filesystem attempting blocking allocation. This does not wrap or try a second
> + * pass, so will not recurse into AGs lower than indicated by fsbno.
> + */
> +static int
> +xfs_alloc_vextent_first_ag(
> +	struct xfs_alloc_arg	*args,
> +	xfs_agnumber_t		minimum_agno)
> +{
> +	struct xfs_mount	*mp = args->mp;
> +	xfs_agnumber_t		start_agno;
> +	int			error;
> +
> +	error = xfs_alloc_vextent_check_args(args);
> +	if (error) {
> +		if (error == -ENOSPC)
> +			return 0;
> +		return error;
>  	}
>  
> -	/*
> -	 * We end up here with a locked AGF. If we failed, the caller is likely
> -	 * going to try to allocate again with different parameters, and that
> -	 * can widen the AGs that are searched for free space. If we have to do
> -	 * BMBT block allocation, we have to do a new allocation.
> -	 *
> -	 * Hence leaving this function with the AGF locked opens up potential
> -	 * ABBA AGF deadlocks because a future allocation attempt in this
> -	 * transaction may attempt to lock a lower number AGF.
> -	 *
> -	 * We can't release the AGF until the transaction is commited, so at
> -	 * this point we must update the "firstblock" tracker to point at this
> -	 * AG if the tracker is empty or points to a lower AG. This allows the
> -	 * next allocation attempt to be modified appropriately to avoid
> -	 * deadlocks.
> -	 */
> -	if (args->agbp &&
> -	    (args->tp->t_highest_agno == NULLAGNUMBER ||
> -	     args->pag->pag_agno > minimum_agno))
> -		args->tp->t_highest_agno = args->pag->pag_agno;
> -	xfs_perag_put(args->pag);
> -	return 0;
> -error0:
> -	xfs_perag_put(args->pag);
> +	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, args->fsbno));
> +
> +	args->type = XFS_ALLOCTYPE_THIS_AG;
> +	error =  xfs_alloc_vextent_iterate_ags(args, minimum_agno,
> +			start_agno, 0);
> +	xfs_alloc_vextent_set_fsbno(args, minimum_agno);
>  	return error;
>  }
>  
> +/*
> + * Allocate an extent (variable-size).
> + * Depending on the allocation type, we either look in a single allocation
> + * group or loop over the allocation groups to find the result.
> + */
> +int
> +xfs_alloc_vextent(
> +	struct xfs_alloc_arg	*args)
> +{
> +	xfs_agnumber_t		minimum_agno = 0;
> +
> +	if (args->tp->t_highest_agno != NULLAGNUMBER)
> +		minimum_agno = args->tp->t_highest_agno;
> +
> +	switch (args->type) {
> +	case XFS_ALLOCTYPE_THIS_AG:
> +	case XFS_ALLOCTYPE_NEAR_BNO:
> +	case XFS_ALLOCTYPE_THIS_BNO:
> +		return xfs_alloc_vextent_this_ag(args, minimum_agno);
> +	case XFS_ALLOCTYPE_START_BNO:
> +		return xfs_alloc_vextent_start_ag(args, minimum_agno);
> +	case XFS_ALLOCTYPE_FIRST_AG:
> +		return xfs_alloc_vextent_first_ag(args, minimum_agno);
> +	default:
> +		ASSERT(0);
> +		/* NOTREACHED */
> +	}
> +	/* Should never get here */
> +	return -EFSCORRUPTED;
> +}
> +
>  /* Ensure that the freelist is at full capacity. */
>  int
>  xfs_free_extent_fix_freelist(
> -- 
> 2.39.0
>
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 246c2e7d9e7a..39e34a1bfa31 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -3151,29 +3151,20 @@  xfs_alloc_read_agf(
 }
 
 /*
- * Allocate an extent (variable-size).
- * Depending on the allocation type, we either look in a single allocation
- * group or loop over the allocation groups to find the result.
+ * Pre-proces allocation arguments to set initial state that we don't require
+ * callers to set up correctly, as well as bounds check the allocation args
+ * that are set up.
  */
-int				/* error */
-xfs_alloc_vextent(
-	struct xfs_alloc_arg	*args)	/* allocation argument structure */
+static int
+xfs_alloc_vextent_check_args(
+	struct xfs_alloc_arg	*args)
 {
-	xfs_agblock_t		agsize;	/* allocation group size */
-	int			error;
-	int			flags;	/* XFS_ALLOC_FLAG_... locking flags */
-	struct xfs_mount	*mp;	/* mount structure pointer */
-	xfs_agnumber_t		sagno;	/* starting allocation group number */
-	xfs_alloctype_t		type;	/* input allocation type */
-	int			bump_rotor = 0;
-	xfs_agnumber_t		rotorstep = xfs_rotorstep; /* inode32 agf stepper */
-	xfs_agnumber_t		minimum_agno = 0;
+	struct xfs_mount	*mp = args->mp;
+	xfs_agblock_t		agsize;
 
-	mp = args->mp;
-	type = args->otype = args->type;
+	args->otype = args->type;
 	args->agbno = NULLAGBLOCK;
-	if (args->tp->t_highest_agno != NULLAGNUMBER)
-		minimum_agno = args->tp->t_highest_agno;
+
 	/*
 	 * Just fix this up, for the case where the last a.g. is shorter
 	 * (or there's only one a.g.) and the caller couldn't easily figure
@@ -3195,199 +3186,314 @@  xfs_alloc_vextent(
 	    args->mod >= args->prod) {
 		args->fsbno = NULLFSBLOCK;
 		trace_xfs_alloc_vextent_badargs(args);
+		return -ENOSPC;
+	}
+	return 0;
+}
+
+/*
+ * Post-process allocation results to set the allocated block number correctly
+ * for the caller.
+ *
+ * XXX: xfs_alloc_vextent() should really be returning ENOSPC for ENOSPC, not
+ * hiding it behind a "successful" NULLFSBLOCK allocation.
+ */
+static void
+xfs_alloc_vextent_set_fsbno(
+	struct xfs_alloc_arg	*args,
+	xfs_agnumber_t		minimum_agno)
+{
+	struct xfs_mount	*mp = args->mp;
+
+	/*
+	 * We can end up here with a locked AGF. If we failed, the caller is
+	 * likely going to try to allocate again with different parameters, and
+	 * that can widen the AGs that are searched for free space. If we have
+	 * to do BMBT block allocation, we have to do a new allocation.
+	 *
+	 * Hence leaving this function with the AGF locked opens up potential
+	 * ABBA AGF deadlocks because a future allocation attempt in this
+	 * transaction may attempt to lock a lower number AGF.
+	 *
+	 * We can't release the AGF until the transaction is commited, so at
+	 * this point we must update the "first allocation" tracker to point at
+	 * this AG if the tracker is empty or points to a lower AG. This allows
+	 * the next allocation attempt to be modified appropriately to avoid
+	 * deadlocks.
+	 */
+	if (args->agbp &&
+	    (args->tp->t_highest_agno == NULLAGNUMBER ||
+	     args->agno > minimum_agno))
+		args->tp->t_highest_agno = args->agno;
+
+	/* Allocation failed with ENOSPC if NULLAGBLOCK was returned. */
+	if (args->agbno == NULLAGBLOCK) {
+		args->fsbno = NULLFSBLOCK;
+		return;
+	}
+
+	args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+#ifdef DEBUG
+	ASSERT(args->len >= args->minlen);
+	ASSERT(args->len <= args->maxlen);
+	ASSERT(args->agbno % args->alignment == 0);
+	XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);
+#endif
+}
+
+/*
+ * Allocate within a single AG only.
+ */
+static int
+xfs_alloc_vextent_this_ag(
+	struct xfs_alloc_arg	*args,
+	xfs_agnumber_t		minimum_agno)
+{
+	struct xfs_mount	*mp = args->mp;
+	int			error;
+
+	error = xfs_alloc_vextent_check_args(args);
+	if (error) {
+		if (error == -ENOSPC)
+			return 0;
+		return error;
+	}
+
+	args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+	if (minimum_agno > args->agno) {
+		trace_xfs_alloc_vextent_skip_deadlock(args);
+		args->fsbno = NULLFSBLOCK;
 		return 0;
 	}
 
-	switch (type) {
-	case XFS_ALLOCTYPE_THIS_AG:
-	case XFS_ALLOCTYPE_NEAR_BNO:
-	case XFS_ALLOCTYPE_THIS_BNO:
-		/*
-		 * These three force us into a single a.g.
-		 */
-		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-		args->pag = xfs_perag_get(mp, args->agno);
+	args->pag = xfs_perag_get(mp, args->agno);
+	error = xfs_alloc_fix_freelist(args, 0);
+	if (error) {
+		trace_xfs_alloc_vextent_nofix(args);
+		goto out_error;
+	}
+	if (!args->agbp) {
+		trace_xfs_alloc_vextent_noagbp(args);
+		args->fsbno = NULLFSBLOCK;
+		goto out_error;
+	}
+	args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+	error = xfs_alloc_ag_vextent(args);
 
-		if (minimum_agno > args->agno) {
-			trace_xfs_alloc_vextent_skip_deadlock(args);
-			error = 0;
-			break;
-		}
+	xfs_alloc_vextent_set_fsbno(args, minimum_agno);
+out_error:
+	xfs_perag_put(args->pag);
+	return error;
+}
+
+/*
+ * Iterate all AGs trying to allocate an extent starting from @start_ag.
+ *
+ * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the
+ * allocation attempts in @start_agno have locality information. If we fail to
+ * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs
+ * we attempt to allocation in as there is no locality optimisation possible for
+ * those allocations.
+ *
+ * When we wrap the AG iteration at the end of the filesystem, we have to be
+ * careful not to wrap into AGs below ones we already have locked in the
+ * transaction if we are doing a blocking iteration. This will result in an
+ * out-of-order locking of AGFs and hence can cause deadlocks.
+ */
+static int
+xfs_alloc_vextent_iterate_ags(
+	struct xfs_alloc_arg	*args,
+	xfs_agnumber_t		minimum_agno,
+	xfs_agnumber_t		start_agno,
+	uint32_t		flags)
+{
+	struct xfs_mount	*mp = args->mp;
+	int			error = 0;
 
-		error = xfs_alloc_fix_freelist(args, 0);
+	ASSERT(start_agno >= minimum_agno);
+
+	/*
+	 * Loop over allocation groups twice; first time with
+	 * trylock set, second time without.
+	 */
+	args->agno = start_agno;
+	for (;;) {
+		args->pag = xfs_perag_get(mp, args->agno);
+		error = xfs_alloc_fix_freelist(args, flags);
 		if (error) {
 			trace_xfs_alloc_vextent_nofix(args);
-			goto error0;
-		}
-		if (!args->agbp) {
-			trace_xfs_alloc_vextent_noagbp(args);
 			break;
 		}
-		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
-		if ((error = xfs_alloc_ag_vextent(args)))
-			goto error0;
-		break;
-	case XFS_ALLOCTYPE_START_BNO:
 		/*
-		 * Try near allocation first, then anywhere-in-ag after
-		 * the first a.g. fails.
+		 * If we get a buffer back then the allocation will fly.
 		 */
-		if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
-		    xfs_is_inode32(mp)) {
-			args->fsbno = XFS_AGB_TO_FSB(mp,
-					((mp->m_agfrotor / rotorstep) %
-					mp->m_sb.sb_agcount), 0);
-			bump_rotor = 1;
+		if (args->agbp) {
+			error = xfs_alloc_ag_vextent(args);
+			break;
 		}
-		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
-		args->type = XFS_ALLOCTYPE_NEAR_BNO;
-		fallthrough;
-	case XFS_ALLOCTYPE_FIRST_AG:
+
+		trace_xfs_alloc_vextent_loopfailed(args);
+
 		/*
-		 * Rotate through the allocation groups looking for a winner.
-		 * If we are blocking, we must obey minimum_agno contraints for
-		 * avoiding ABBA deadlocks on AGF locking.
+		 * Didn't work, figure out the next iteration.
 		 */
-		if (type == XFS_ALLOCTYPE_FIRST_AG) {
-			/*
-			 * Start with allocation group given by bno.
-			 */
-			args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+		if (args->agno == start_agno &&
+		    args->otype == XFS_ALLOCTYPE_START_BNO)
 			args->type = XFS_ALLOCTYPE_THIS_AG;
-			sagno = minimum_agno;
-			flags = 0;
-		} else {
-			/*
-			 * Start with the given allocation group.
-			 */
-			args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-			flags = XFS_ALLOC_FLAG_TRYLOCK;
+
+		/*
+		 * If we are try-locking, we can't deadlock on AGF locks so we
+		 * can wrap all the way back to the first AG. Otherwise, wrap
+		 * back to the start AG so we can't deadlock and let the end of
+		 * scan handler decide what to do next.
+		 */
+		if (++(args->agno) == mp->m_sb.sb_agcount) {
+			if (flags & XFS_ALLOC_FLAG_TRYLOCK)
+				args->agno = 0;
+			else
+				args->agno = minimum_agno;
 		}
 
 		/*
-		 * Loop over allocation groups twice; first time with
-		 * trylock set, second time without.
+		 * Reached the starting a.g., must either be done
+		 * or switch to non-trylock mode.
 		 */
-		for (;;) {
-			args->pag = xfs_perag_get(mp, args->agno);
-			error = xfs_alloc_fix_freelist(args, flags);
-			if (error) {
-				trace_xfs_alloc_vextent_nofix(args);
-				goto error0;
-			}
-			/*
-			 * If we get a buffer back then the allocation will fly.
-			 */
-			if (args->agbp) {
-				if ((error = xfs_alloc_ag_vextent(args)))
-					goto error0;
+		if (args->agno == start_agno) {
+			if (flags == 0) {
+				args->agbno = NULLAGBLOCK;
+				trace_xfs_alloc_vextent_allfailed(args);
 				break;
 			}
 
-			trace_xfs_alloc_vextent_loopfailed(args);
+			flags = 0;
+			if (args->otype == XFS_ALLOCTYPE_START_BNO) {
+				args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+				args->type = XFS_ALLOCTYPE_NEAR_BNO;
+			}
+		}
+		xfs_perag_put(args->pag);
+		args->pag = NULL;
+	}
+	if (args->pag) {
+		xfs_perag_put(args->pag);
+		args->pag = NULL;
+	}
+	return error;
+}
 
-			/*
-			 * Didn't work, figure out the next iteration.
-			 */
-			if (args->agno == sagno &&
-			    type == XFS_ALLOCTYPE_START_BNO)
-				args->type = XFS_ALLOCTYPE_THIS_AG;
+/*
+ * Iterate from the AGs from the start AG to the end of the filesystem, trying
+ * to allocate blocks. It starts with a near allocation attempt in the initial
+ * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap
+ * back to zero if allowed by previous allocations in this transaction,
+ * otherwise will wrap back to the start AG and run a second blocking pass to
+ * the end of the filesystem.
+ */
+static int
+xfs_alloc_vextent_start_ag(
+	struct xfs_alloc_arg	*args,
+	xfs_agnumber_t		minimum_agno)
+{
+	struct xfs_mount	*mp = args->mp;
+	xfs_agnumber_t		start_agno;
+	xfs_agnumber_t		rotorstep = xfs_rotorstep;
+	bool			bump_rotor = false;
+	int			error;
 
-			/*
-			 * If we are try-locking, we can't deadlock on AGF
-			 * locks, so we can wrap all the way back to the first
-			 * AG. Otherwise, wrap back to the start AG so we can't
-			 * deadlock, and let the end of scan handler decide what
-			 * to do next.
-			 */
-			if (++(args->agno) == mp->m_sb.sb_agcount) {
-				if (flags & XFS_ALLOC_FLAG_TRYLOCK)
-					args->agno = 0;
-				else
-					args->agno = sagno;
-			}
+	error = xfs_alloc_vextent_check_args(args);
+	if (error) {
+		if (error == -ENOSPC)
+			return 0;
+		return error;
+	}
 
-			/*
-			 * Reached the starting a.g., must either be done
-			 * or switch to non-trylock mode.
-			 */
-			if (args->agno == sagno) {
-				if (flags == 0) {
-					args->agbno = NULLAGBLOCK;
-					trace_xfs_alloc_vextent_allfailed(args);
-					break;
-				}
+	if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
+	    xfs_is_inode32(mp)) {
+		args->fsbno = XFS_AGB_TO_FSB(mp,
+				((mp->m_agfrotor / rotorstep) %
+				mp->m_sb.sb_agcount), 0);
+		bump_rotor = 1;
+	}
+	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, args->fsbno));
+	args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+	args->type = XFS_ALLOCTYPE_NEAR_BNO;
 
-				/*
-				 * Blocking pass next, so we must obey minimum
-				 * agno constraints to avoid ABBA AGF deadlocks.
-				 */
-				flags = 0;
-				if (minimum_agno > sagno)
-					sagno = minimum_agno;
-
-				if (type == XFS_ALLOCTYPE_START_BNO) {
-					args->agbno = XFS_FSB_TO_AGBNO(mp,
-						args->fsbno);
-					args->type = XFS_ALLOCTYPE_NEAR_BNO;
-				}
-			}
-			xfs_perag_put(args->pag);
-		}
-		if (bump_rotor) {
-			if (args->agno == sagno)
-				mp->m_agfrotor = (mp->m_agfrotor + 1) %
-					(mp->m_sb.sb_agcount * rotorstep);
-			else
-				mp->m_agfrotor = (args->agno * rotorstep + 1) %
-					(mp->m_sb.sb_agcount * rotorstep);
-		}
-		break;
-	default:
-		ASSERT(0);
-		/* NOTREACHED */
+	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
+			XFS_ALLOC_FLAG_TRYLOCK);
+	if (bump_rotor) {
+		if (args->agno == start_agno)
+			mp->m_agfrotor = (mp->m_agfrotor + 1) %
+				(mp->m_sb.sb_agcount * rotorstep);
+		else
+			mp->m_agfrotor = (args->agno * rotorstep + 1) %
+				(mp->m_sb.sb_agcount * rotorstep);
 	}
-	if (args->agbno == NULLAGBLOCK) {
-		args->fsbno = NULLFSBLOCK;
-	} else {
-		args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
-#ifdef DEBUG
-		ASSERT(args->len >= args->minlen);
-		ASSERT(args->len <= args->maxlen);
-		ASSERT(args->agbno % args->alignment == 0);
-		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
-			args->len);
-#endif
 
+	xfs_alloc_vextent_set_fsbno(args, minimum_agno);
+	return error;
+}
+
+/*
+ * Iterate from the agno indicated from args->fsbno through to the end of the
+ * filesystem attempting blocking allocation. This does not wrap or try a second
+ * pass, so will not recurse into AGs lower than indicated by fsbno.
+ */
+static int
+xfs_alloc_vextent_first_ag(
+	struct xfs_alloc_arg	*args,
+	xfs_agnumber_t		minimum_agno)
+{
+	struct xfs_mount	*mp = args->mp;
+	xfs_agnumber_t		start_agno;
+	int			error;
+
+	error = xfs_alloc_vextent_check_args(args);
+	if (error) {
+		if (error == -ENOSPC)
+			return 0;
+		return error;
 	}
 
-	/*
-	 * We end up here with a locked AGF. If we failed, the caller is likely
-	 * going to try to allocate again with different parameters, and that
-	 * can widen the AGs that are searched for free space. If we have to do
-	 * BMBT block allocation, we have to do a new allocation.
-	 *
-	 * Hence leaving this function with the AGF locked opens up potential
-	 * ABBA AGF deadlocks because a future allocation attempt in this
-	 * transaction may attempt to lock a lower number AGF.
-	 *
-	 * We can't release the AGF until the transaction is commited, so at
-	 * this point we must update the "firstblock" tracker to point at this
-	 * AG if the tracker is empty or points to a lower AG. This allows the
-	 * next allocation attempt to be modified appropriately to avoid
-	 * deadlocks.
-	 */
-	if (args->agbp &&
-	    (args->tp->t_highest_agno == NULLAGNUMBER ||
-	     args->pag->pag_agno > minimum_agno))
-		args->tp->t_highest_agno = args->pag->pag_agno;
-	xfs_perag_put(args->pag);
-	return 0;
-error0:
-	xfs_perag_put(args->pag);
+	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, args->fsbno));
+
+	args->type = XFS_ALLOCTYPE_THIS_AG;
+	error =  xfs_alloc_vextent_iterate_ags(args, minimum_agno,
+			start_agno, 0);
+	xfs_alloc_vextent_set_fsbno(args, minimum_agno);
 	return error;
 }
 
+/*
+ * Allocate an extent (variable-size).
+ * Depending on the allocation type, we either look in a single allocation
+ * group or loop over the allocation groups to find the result.
+ */
+int
+xfs_alloc_vextent(
+	struct xfs_alloc_arg	*args)
+{
+	xfs_agnumber_t		minimum_agno = 0;
+
+	if (args->tp->t_highest_agno != NULLAGNUMBER)
+		minimum_agno = args->tp->t_highest_agno;
+
+	switch (args->type) {
+	case XFS_ALLOCTYPE_THIS_AG:
+	case XFS_ALLOCTYPE_NEAR_BNO:
+	case XFS_ALLOCTYPE_THIS_BNO:
+		return xfs_alloc_vextent_this_ag(args, minimum_agno);
+	case XFS_ALLOCTYPE_START_BNO:
+		return xfs_alloc_vextent_start_ag(args, minimum_agno);
+	case XFS_ALLOCTYPE_FIRST_AG:
+		return xfs_alloc_vextent_first_ag(args, minimum_agno);
+	default:
+		ASSERT(0);
+		/* NOTREACHED */
+	}
+	/* Should never get here */
+	return -EFSCORRUPTED;
+}
+
 /* Ensure that the freelist is at full capacity. */
 int
 xfs_free_extent_fix_freelist(