diff mbox series

[RFC,2/2] xfs: don't access AGI on unlinked inodes if it can

Message ID 20200707135741.487-3-hsiangkao@redhat.com (mailing list archive)
State Deferred, archived
Headers show
Series xfs: more unlinked inode list optimization v1 | expand

Commit Message

Gao Xiang July 7, 2020, 1:57 p.m. UTC
Currently, we use the AGI buffer lock to protect the in-memory linked list
for unlinked inodes, but it's not necessary to modify the AGI unless the
head of the unlinked list is modified. So let's remove the AGI buffer
modification dependency where possible, including 1) adding another per-AG
dedicated lock to protect the whole list and 2) inserting unlinked
inodes from the tail.

For 2), the tail of bucket 0 is now recorded in the perag for xfs_iunlink()
to use. xfs_iunlink_remove() still supports the old multiple short bucket
lists for the recovery code.

Note that some paths take the AGI lock in their transaction in advance,
so the only proper locking order is AGI lock -> unlinked list lock.

Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
---
 fs/xfs/xfs_inode.c       | 251 ++++++++++++++++++++-------------------
 fs/xfs/xfs_log_recover.c |   6 +
 fs/xfs/xfs_mount.c       |   3 +
 fs/xfs/xfs_mount.h       |   3 +
 4 files changed, 144 insertions(+), 119 deletions(-)

Comments

Darrick J. Wong July 8, 2020, 5:03 p.m. UTC | #1
On Tue, Jul 07, 2020 at 09:57:41PM +0800, Gao Xiang wrote:
> Currently, we use AGI buffer lock to protect in-memory linked list for
> unlinked inodes but since it's not necessary to modify AGI unless the
> head of the unlinked list is modified. So let's removing the AGI buffer
> modification dependency if possible, including 1) adding another per-AG
> dedicated lock to protect the whole list and 2) inserting unlinked
> inodes from tail.
> 
> For 2), the tail of bucket 0 is now recorded in perag for xfs_iunlink()
> to use. xfs_iunlink_remove() still support old multiple short bucket
> lists for recovery code.
> 
> Note that some paths take AGI lock in its transaction in advance,
> so the proper locking order is only AGI lock -> unlinked list lock.

How much agi buffer lock contention does this eliminate?

> Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> ---
>  fs/xfs/xfs_inode.c       | 251 ++++++++++++++++++++-------------------
>  fs/xfs/xfs_log_recover.c |   6 +
>  fs/xfs/xfs_mount.c       |   3 +
>  fs/xfs/xfs_mount.h       |   3 +
>  4 files changed, 144 insertions(+), 119 deletions(-)
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 10565fa5ace4..d33e5b198534 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1994,182 +1994,195 @@ xfs_iunlink_update_bucket(
>  }
>  
>  /*
> - * Always insert at the head, so we only have to do a next inode lookup to
> - * update it's prev pointer. The AGI bucket will point at the one we are
> - * inserting.
> + * This is called when the inode's link count has gone to 0 or we are creating
> + * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
> + *
> + * We place the on-disk inode on a list in the AGI.  It will be pulled from this
> + * list when the inode is freed.
>   */
> -static int
> -xfs_iunlink_insert_inode(
> +STATIC int
> +xfs_iunlink(
>  	struct xfs_trans	*tp,
> -	struct xfs_buf		*agibp,
>  	struct xfs_inode	*ip)
>  {
>  	struct xfs_mount	*mp = tp->t_mountp;
> -	struct xfs_agi		*agi = agibp->b_addr;
> -	xfs_agino_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> +	struct xfs_perag	*pag;
> +	struct xfs_inode	*pip;
>  	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> -	xfs_agino_t		next_agino;
> +	int			error;
> +
> +	ASSERT(VFS_I(ip)->i_nlink == 0);
> +	ASSERT(VFS_I(ip)->i_mode != 0);
> +	trace_xfs_iunlink(ip);
> +
> +	pag = xfs_perag_get(mp, agno);
>  
>  	/*
> -	 * Get the index into the agi hash table for the list this inode will
> -	 * go on.  Make sure the pointer isn't garbage and that this inode
> -	 * isn't already on the list.
> +	 * some paths (e.g. xfs_create_tmpfile) could take AGI lock
> +	 * in this transaction in advance and the only locking order
> +	 * AGI buf lock -> pag_unlinked_mutex is safe.
>  	 */
> -	next_agino = be32_to_cpu(agi->agi_unlinked[0]);
> -	if (next_agino == agino ||
> -	    !xfs_verify_agino_or_null(mp, agno, next_agino)) {
> -		xfs_buf_mark_corrupt(agibp);
> -		return -EFSCORRUPTED;
> -	}
> -
> -	ip->i_prev_unlinked = NULLAGINO;
> -	ip->i_next_unlinked = next_agino;
> -	if (ip->i_next_unlinked != NULLAGINO) {
> -		struct xfs_inode	*nip;
> +	mutex_lock(&pag->pag_unlinked_mutex);
> +	pip = pag->pag_unlinked_tail;
> +	if (!pip) {
> +		struct xfs_buf	*agibp;
>  
> -		nip = xfs_iunlink_lookup_next(agibp->b_pag, ip);
> -		if (IS_ERR_OR_NULL(nip))
> -			return -EFSCORRUPTED;
> +		mutex_unlock(&pag->pag_unlinked_mutex);
>  
> -		if (nip->i_prev_unlinked != NULLAGINO) {
> -			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
> -						NULL, 0, __this_address);
> -			return -EFSCORRUPTED;
> +		/*
> +		 * there could be some race, but it doesn't matter though
> +		 * since !pip doesn't happen frequently.
> +		 */
> +		error = xfs_read_agi(mp, tp, agno, &agibp);
> +		if (error) {
> +			xfs_perag_put(pag);
> +			return error;
>  		}
> -		nip->i_prev_unlinked = agino;
>  
> -		/* update the on disk inode now */
> -		xfs_iunlink_log(tp, ip);
> -	}
> +		mutex_lock(&pag->pag_unlinked_mutex);
> +		pip = pag->pag_unlinked_tail;
> +		if (!pip) {
> +			struct xfs_agi	*agi;
> +
> +			ip->i_prev_unlinked = NULLAGINO;
>  
> -	/* Point the head of the list to point to this inode. */
> -	return xfs_iunlink_update_bucket(tp, agno, agibp, next_agino, agino);
> +			agi = agibp->b_addr;
> +			ASSERT(be32_to_cpu(agi->agi_unlinked[0]) == NULLAGINO);
> +
> +			/* Point the head of the list to point to this inode. */
> +			error = xfs_iunlink_update_bucket(tp, agno,
> +					agibp, NULLAGINO, agino);
> +			goto out;
> +		}
> +	}
> +	ip->i_prev_unlinked = XFS_INO_TO_AGINO(mp, pip->i_ino);
> +	ASSERT(pip->i_next_unlinked == NULLAGINO);
> +	pip->i_next_unlinked = agino;
> +	xfs_iunlink_log(tp, pip);
> +out:
> +	ip->i_next_unlinked = NULLAGINO;
> +	pag->pag_unlinked_tail = ip;
> +	mutex_unlock(&pag->pag_unlinked_mutex);
> +	xfs_perag_put(pag);
> +	return 0;
>  }
>  
>  /*
> + * Pull the on-disk inode from the AGI unlinked list.
> + *
>   * Remove can be from anywhere in the list, so we have to do two adjacent inode
>   * lookups here so we can update list pointers. We may be at the head or the
>   * tail of the list, so we have to handle those cases as well.
>   */
> -static int
> -xfs_iunlink_remove_inode(
> +STATIC int
> +xfs_iunlink_remove(
>  	struct xfs_trans	*tp,
> -	struct xfs_buf		*agibp,
>  	struct xfs_inode	*ip)
>  {
>  	struct xfs_mount	*mp = tp->t_mountp;
> -	xfs_agino_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> +	struct xfs_perag	*pag;
> +	struct xfs_inode	*pip = NULL;
>  	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> -	xfs_agino_t		next_agino = ip->i_next_unlinked;
> +	xfs_agino_t		next_agino;
>  	int			error;
>  
> +	trace_xfs_iunlink_remove(ip);
> +
> +	pag = xfs_perag_get(mp, agno);
> +	mutex_lock(&pag->pag_unlinked_mutex);
> +
> +	/* see the comment in xfs_iunlink() on the only proper locking order */
>  	if (ip->i_prev_unlinked == NULLAGINO) {
> -		/* remove from head of list */
> -		if (next_agino == agino ||
> -		    !xfs_verify_agino_or_null(mp, agno, next_agino))
> -			return -EFSCORRUPTED;
> +		struct xfs_buf	*agibp;
>  
> -		error = xfs_iunlink_update_bucket(tp, agno, agibp, agino, next_agino);
> -		if (error)
> +		mutex_unlock(&pag->pag_unlinked_mutex);
> +
> +		error = xfs_read_agi(mp, tp, agno, &agibp);
> +		if (error) {
> +			xfs_perag_put(pag);
>  			return error;
> -	} else {
> -		/* lookup previous inode and update to point at next */
> -		struct xfs_inode	*pip;
> +		}
>  
> -		pip = xfs_iunlink_lookup_prev(agibp->b_pag, ip);
> -		if (IS_ERR_OR_NULL(pip))
> -			return -EFSCORRUPTED;
> +		mutex_lock(&pag->pag_unlinked_mutex);
> +		if (ip->i_prev_unlinked == NULLAGINO) {
> +			struct xfs_agi	*agi;
>  
> -		if (pip->i_next_unlinked != agino) {
> -			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
> -						NULL, 0, __this_address);
> -			return -EFSCORRUPTED;
> +			next_agino = ip->i_next_unlinked;
> +			agi = agibp->b_addr;
> +
> +			/* remove from head of list */
> +			if (next_agino == agino ||
> +			    !xfs_verify_agino_or_null(mp, agno, next_agino))
> +				goto err_fscorrupted;
> +
> +			error = xfs_iunlink_update_bucket(tp, agno,
> +				agibp, agino, next_agino);
> +			if (error)
> +				goto err_out;
> +
> +			goto next_fixup;
>  		}
> +	}
>  
> -		/* update the on disk inode now */
> -		pip->i_next_unlinked = next_agino;
> -		xfs_iunlink_log(tp, pip);
> +	next_agino = ip->i_next_unlinked;
> +
> +	/* lookup previous inode and update to point at next */
> +	pip = xfs_iunlink_lookup_prev(pag, ip);
> +	if (IS_ERR_OR_NULL(pip))
> +		goto err_fscorrupted;
> +
> +	if (pip->i_next_unlinked != agino) {
> +		xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
> +					NULL, 0, __this_address);
> +		goto err_fscorrupted;
>  	}
> +	pip->i_next_unlinked = next_agino;
> +	xfs_iunlink_log(tp, pip);
>  
> -	/* lookup the next inode and update to point at prev */
> -	if (ip->i_next_unlinked != NULLAGINO) {
> +next_fixup:
> +	if (next_agino == NULLAGINO) {
> +		/* so iunlink recovery can work here */
> +		if (pag->pag_unlinked_tail == ip)
> +			pag->pag_unlinked_tail = pip;
> +	} else {
> +		/* lookup the next inode and update to point at prev */
>  		struct xfs_inode	*nip;
>  
> -		nip = xfs_iunlink_lookup_next(agibp->b_pag, ip);
> +		nip = xfs_iunlink_lookup_next(pag, ip);
>  		if (IS_ERR_OR_NULL(nip))
> -			return -EFSCORRUPTED;
> +			goto err_fscorrupted;
>  
>  		if (nip->i_prev_unlinked != agino) {
>  			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
>  						NULL, 0, __this_address);
> -			return -EFSCORRUPTED;
> +			goto err_fscorrupted;
>  		}
>  		/* in memory update only */
>  		nip->i_prev_unlinked = ip->i_prev_unlinked;
>  	}
> +	mutex_unlock(&pag->pag_unlinked_mutex);
> +	xfs_perag_put(pag);
>  
> +	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
>  	/*
>  	 * Now clear prev/next from this inode and update on disk if we
>  	 * need to clear the on-disk link.
>  	 */
>  	ip->i_prev_unlinked = NULLAGINO;
> -	ip->i_next_unlinked = NULLAGINO;
> -	if (next_agino != NULLAGINO)
> +	if (next_agino != NULLAGINO) {
> +		ip->i_next_unlinked = NULLAGINO;
>  		xfs_iunlink_log(tp, ip);
> +	}
>  	return 0;
> -}
>  
> -/*
> - * This is called when the inode's link count has gone to 0 or we are creating
> - * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
> - *
> - * We place the on-disk inode on a list in the AGI.  It will be pulled from this
> - * list when the inode is freed.
> - */
> -STATIC int
> -xfs_iunlink(
> -	struct xfs_trans	*tp,
> -	struct xfs_inode	*ip)
> -{
> -	struct xfs_mount	*mp = tp->t_mountp;
> -	struct xfs_buf		*agibp;
> -	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> -	int			error;
> -
> -	ASSERT(ip->i_next_unlinked == NULLAGINO);
> -	ASSERT(VFS_I(ip)->i_nlink == 0);
> -	ASSERT(VFS_I(ip)->i_mode != 0);
> -	trace_xfs_iunlink(ip);
> -
> -	/* Get the agi buffer first.  It ensures lock ordering on the list. */
> -	error = xfs_read_agi(mp, tp, agno, &agibp);
> -	if (error)
> -		return error;
> -
> -	return xfs_iunlink_insert_inode(tp, agibp, ip);
> -}
> -
> -/*
> - * Pull the on-disk inode from the AGI unlinked list.
> - */
> -STATIC int
> -xfs_iunlink_remove(
> -	struct xfs_trans	*tp,
> -	struct xfs_inode	*ip)
> -{
> -	struct xfs_mount	*mp = tp->t_mountp;
> -	struct xfs_buf		*agibp;
> -	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> -	int			error;
> -
> -	trace_xfs_iunlink_remove(ip);
> -
> -	/* Get the agi buffer first.  It ensures lock ordering on the list. */
> -	error = xfs_read_agi(mp, tp, agno, &agibp);
> -	if (error)
> -		return error;
> -
> -	return xfs_iunlink_remove_inode(tp, agibp, ip);
> +err_fscorrupted:
> +	error = -EFSCORRUPTED;
> +err_out:
> +	mutex_unlock(&pag->pag_unlinked_mutex);
> +	xfs_perag_put(pag);
> +	return error;
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index d47eea31c165..3f2739316424 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -2801,6 +2801,11 @@ xlog_recover_iunlink_ag(
>  					prev_ip->i_next_unlinked = NULLAGINO;
>  				break;
>  			}
> +
> +			/* XXX: take pag_unlinked_mutex across the loop? */
> +			if (!bucket)
> +				agibp->b_pag->pag_unlinked_tail = ip;
> +
>  			if (prev_ip) {
>  				ip->i_prev_unlinked = prev_agino;
>  				xfs_irele(prev_ip);
> @@ -2812,6 +2817,7 @@ xlog_recover_iunlink_ag(
>  		}
>  		if (prev_ip)
>  			xfs_irele(prev_ip);
> +
>  		if (error) {
>  			/*
>  			 * We can't read an inode this bucket points to, or an
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 031e96ff022d..a9701f863e84 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -223,6 +223,9 @@ xfs_initialize_perag(
>  		if (first_initialised == NULLAGNUMBER)
>  			first_initialised = index;
>  		spin_lock_init(&pag->pag_state_lock);
> +
> +		mutex_init(&pag->pag_unlinked_mutex);
> +		pag->pag_unlinked_tail = NULL;
>  	}
>  
>  	index = xfs_set_inode_alloc(mp, agcount);
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index a72cfcaa4ad1..107a634a34f7 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -372,6 +372,9 @@ typedef struct xfs_perag {
>  	/* reference count */
>  	uint8_t			pagf_refcount_level;
>  
> +	struct mutex		pag_unlinked_mutex;
> +	struct xfs_inode	*pag_unlinked_tail;

What do these fields do?  There aren't any comments....

--D

> +
>  	/*
>  	 * Unlinked inode information.  This incore information reflects
>  	 * data stored in the AGI, so callers must hold the AGI buffer lock
> -- 
> 2.18.1
>
Dave Chinner July 8, 2020, 11:33 p.m. UTC | #2
On Tue, Jul 07, 2020 at 09:57:41PM +0800, Gao Xiang wrote:
> Currently, we use AGI buffer lock to protect in-memory linked list for
> unlinked inodes but since it's not necessary to modify AGI unless the
> head of the unlinked list is modified. So let's removing the AGI buffer
> modification dependency if possible, including 1) adding another per-AG
> dedicated lock to protect the whole list and 2) inserting unlinked
> inodes from tail.
> 
> For 2), the tail of bucket 0 is now recorded in perag for xfs_iunlink()
> to use. xfs_iunlink_remove() still support old multiple short bucket
> lists for recovery code.

I would split this into two separate patches. One to move to a perag
based locking strategy, another to change from head to tail
addition as they are largely independent algorithmic changes.

> Note that some paths take AGI lock in its transaction in advance,
> so the proper locking order is only AGI lock -> unlinked list lock.

These paths should be documented in the commit message as well
as in code comments so the reviewer is aware of those code paths
and can verify that your assumptions about locking order are
correct.

> 
> Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> ---
>  fs/xfs/xfs_inode.c       | 251 ++++++++++++++++++++-------------------
>  fs/xfs/xfs_log_recover.c |   6 +
>  fs/xfs/xfs_mount.c       |   3 +
>  fs/xfs/xfs_mount.h       |   3 +
>  4 files changed, 144 insertions(+), 119 deletions(-)
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 10565fa5ace4..d33e5b198534 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1994,182 +1994,195 @@ xfs_iunlink_update_bucket(
>  }
>  
>  /*
> - * Always insert at the head, so we only have to do a next inode lookup to
> - * update it's prev pointer. The AGI bucket will point at the one we are
> - * inserting.
> + * This is called when the inode's link count has gone to 0 or we are creating
> + * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
> + *
> + * We place the on-disk inode on a list in the AGI.  It will be pulled from this
> + * list when the inode is freed.
>   */
> -static int
> -xfs_iunlink_insert_inode(

Hmmm. Flattening the code also makes the patch harder to follow as it
combines code movement/rearrangement with algorithmic changes.  We
try to separate out code movement/rearrangement into its own
patches so that the movement is easy to verify by itself.

Also, the helper functions help document the separation of the
unlinked list manipulations from the setup and locking
requirements for the list manipulations, and this largely undoes all
that. I added these helpers because it completely untangled the mess
that was present before the RFC patchset I posted. That is, I
couldn't easily modify the existing code because it interleaved
the locking, the backref hash manipulations and
the on-disk list manipulations in ways I found difficult to
understand and manage. Short, simple, clear functions are much
better than long, multiple-operation functions...

i.e. this:

xfs_iunlink()
{

	get locks
	do list insert
	drop locks
}

Is better for understanding, maintenance and future modification
than:

xfs_iunlink()
{

	get perag
	lock perag
	look at tail of list
	if (empty) {
		unlock perag
		read/lock AGI
		lock perag
		look at tail of list
		if (empty)
			do head insert
			goto out
	}
	do tail insert
out:
	update inode/pag tails
	unlock
	drop perag
}

It's trivial for a reader to understand what the first version of
xfs_iunlink() is going to do without needing to understand the
intricacies of the locking strategies. However, it takes time and
effort to understand exactly what the second one is doing because
it's not clear where the locking ends and list modifications start, nor
what the locking rules are for the different modifications that are
being made. Essentially, it goes back to the complex
locking-intertwined-with-modification-algorithm problem the current
TOT code has.

I'd much prefer to see something like this:

/*
 * Inode allocation in the O_TMPFILE path defines the AGI/unlinked
 * list lock order as being AGI->perag unlinked list lock. We are
 * inverting it here as the fast path tail addition does not need to
 * modify the AGI at all. Hence we only need the AGI lock if the
 * tail is empty, but if we fail to get it without blocking then we
 * need to fall back to the slower, correct lock order.
 */
xfs_iunlink_insert_lock()
{
	get perag;
	lock_perag();
	if (!tail empty)
		return;
	if (trylock AGI)
		return;

	/*
	 * Slow path, need to lock AGI first. Don't even bother
	 * rechecking tail pointers or trying to optimise for
	 * minimal AGI lock hold time as racing unlink list mods
	 * will all block on the perag lock once we take that. They
	 * will then hit the !tail empty fast path and not require
	 * the AGI lock at all.
	 */
	lock AGI
	lock_perag()
	return;
}

The non-AGI locking fast path is slightly different in the remove
case, so we'll have a slightly different helper function in that
case which checks where the inode being removed is in the list.

In both cases, though, the unlock should be the same:

xfs_iunlink_unlock()
{
	/* Does not unlock AGI, ever. commit does that. */
	unlock perag
	put perag
}

This keeps the list locking completely separate from the list
manipulations and allows us to document the locking constraints and
reasons for why it is or isn't optimised for specific conditions
without cluttering up the list manipulations code.

Cheers,

Dave.
Gao Xiang July 8, 2020, 11:40 p.m. UTC | #3
Hi Darrick,

On Wed, Jul 08, 2020 at 10:03:07AM -0700, Darrick J. Wong wrote:
> On Tue, Jul 07, 2020 at 09:57:41PM +0800, Gao Xiang wrote:
> > Currently, we use AGI buffer lock to protect in-memory linked list for
> > unlinked inodes but since it's not necessary to modify AGI unless the
> > head of the unlinked list is modified. So let's removing the AGI buffer
> > modification dependency if possible, including 1) adding another per-AG
> > dedicated lock to protect the whole list and 2) inserting unlinked
> > inodes from tail.
> > 
> > For 2), the tail of bucket 0 is now recorded in perag for xfs_iunlink()
> > to use. xfs_iunlink_remove() still support old multiple short bucket
> > lists for recovery code.
> > 
> > Note that some paths take AGI lock in its transaction in advance,
> > so the proper locking order is only AGI lock -> unlinked list lock.
> 
> How much agi buffer lock contention does this eliminate?

Sorry, I haven't got the point.

Did you mean I should show some numbers in the commit message from some workload?
I'm not sure if you mean that; it depends on the specific workload indeed. I
could do that if needed (it would be of much help if you could give some more
hints here...)

Or I'm not sure if you mean the deadlock. There are some existing paths I've
found when running fsstress; for example, xfs_create_tmpfile() will read/lock
the AGI when preparing an orphan inode in this transaction. So if I read the
AGI again under a new lock unconditionally, it could cause an ABBA deadlock
(since another path takes the mutex first and then locks the AGI buffer).

> 
> > @@ -372,6 +372,9 @@ typedef struct xfs_perag {
> >  	/* reference count */
> >  	uint8_t			pagf_refcount_level;
> >  
> > +	struct mutex		pag_unlinked_mutex;
> > +	struct xfs_inode	*pag_unlinked_tail;
> 
> What do these fields do?  There aren't any comments....

Will add some comments in the next version, Thanks for the advice!

Thanks,
Gao Xiang

> 
> --D
> 
> > +
> >  	/*
> >  	 * Unlinked inode information.  This incore information reflects
> >  	 * data stored in the AGI, so callers must hold the AGI buffer lock
> > -- 
> > 2.18.1
> > 
>
Gao Xiang July 9, 2020, 12:55 a.m. UTC | #4
On Thu, Jul 09, 2020 at 09:33:11AM +1000, Dave Chinner wrote:
> On Tue, Jul 07, 2020 at 09:57:41PM +0800, Gao Xiang wrote:
> > Currently, we use AGI buffer lock to protect in-memory linked list for
> > unlinked inodes but since it's not necessary to modify AGI unless the
> > head of the unlinked list is modified. So let's removing the AGI buffer
> > modification dependency if possible, including 1) adding another per-AG
> > dedicated lock to protect the whole list and 2) inserting unlinked
> > inodes from tail.
> > 
> > For 2), the tail of bucket 0 is now recorded in perag for xfs_iunlink()
> > to use. xfs_iunlink_remove() still support old multiple short bucket
> > lists for recovery code.
> 
> I would split this into two separate patches. One to move to a perag
> based locking strategy, another to change from head to tail
> addition as they are largely independent algorithmic changes.

Yes, that is much better from the perspective of splitting patches, and
I thought about that before. It does look like 2 steps, but the proposed
target solution works as a whole (in other words, the 2 steps are
code-related), and I'm not sure how much of this code is sharable or can
be inherited, rather than introducing some code in patch 2 and then
removing it immediately and turning it into new code in patch 3. I'm not
sure how much logic could be shared between these 2 dependent steps, so
I didn't do that.

I will split the patches in the next RFC version and make a try.

> 
> > Note that some paths take AGI lock in its transaction in advance,
> > so the proper locking order is only AGI lock -> unlinked list lock.
> 
> These paths should be documented in the commit message as well
> as in code comments so the reviewer is aware of those code paths
> and can verify that your assumptions about locking order are
> correct.

Yeah, it has been documented in the following.

+xfs_iunlink(
 	struct xfs_trans	*tp,
-	struct xfs_buf		*agibp,
 	struct xfs_inode	*ip)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_agi		*agi = agibp->b_addr;
-	xfs_agino_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	struct xfs_perag	*pag;
+	struct xfs_inode	*pip;
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	xfs_agino_t		next_agino;
+	int			error;
+
+	ASSERT(VFS_I(ip)->i_nlink == 0);
+	ASSERT(VFS_I(ip)->i_mode != 0);
+	trace_xfs_iunlink(ip);
+
+	pag = xfs_perag_get(mp, agno);
 
 	/*
-	 * Get the index into the agi hash table for the list this inode will
-	 * go on.  Make sure the pointer isn't garbage and that this inode
-	 * isn't already on the list.
+	 * some paths (e.g. xfs_create_tmpfile) could take AGI lock
+	 * in this transaction in advance and the only locking order
+	 * AGI buf lock -> pag_unlinked_mutex is safe.

> 
> > 
> > Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> > ---
> >  fs/xfs/xfs_inode.c       | 251 ++++++++++++++++++++-------------------
> >  fs/xfs/xfs_log_recover.c |   6 +
> >  fs/xfs/xfs_mount.c       |   3 +
> >  fs/xfs/xfs_mount.h       |   3 +
> >  4 files changed, 144 insertions(+), 119 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 10565fa5ace4..d33e5b198534 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -1994,182 +1994,195 @@ xfs_iunlink_update_bucket(
> >  }
> >  
> >  /*
> > - * Always insert at the head, so we only have to do a next inode lookup to
> > - * update it's prev pointer. The AGI bucket will point at the one we are
> > - * inserting.
> > + * This is called when the inode's link count has gone to 0 or we are creating
> > + * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
> > + *
> > + * We place the on-disk inode on a list in the AGI.  It will be pulled from this
> > + * list when the inode is freed.
> >   */
> > -static int
> > -xfs_iunlink_insert_inode(
> 
> Hmmm. Flattening the code also make the patch harder to follow as it
> combines code movement/rearrangement with algorithmic changes.  We
> try to separate out code movement/rearrangement into their own
> patches so that the movement is easy to verify by itself.
> 
> Also, the helper functions help document the separation of the
> unlinked list manipulations from the from setup and locking
> requirements for the list manipulations, and this largely undoes all
> that. I added these helpers because it completely untangled the mess
> that was present before the RFC patchset I posted. THat is, I
> couldn't easily modify the existing code because it interleaved
> the locking, the backref hash manipulations and
> the on-disk list manipulations in ways I found difficult to
> understand and manage. Short, simple, clear functions are much
> better than long, multiple operation functions...
> 
> i.e. this:
> 
> xfs_iunlink()
> {
> 
> 	get locks
> 	do list insert
> 	drop locks
> }
> 
> Is better for understanding, maintenance and future modification
> than:
> 
> xfs_iunlink()
> {
> 
> 	get perag
> 	lock perag
> 	look at tail of list
> 	if (empty) {
> 		unlock perag
> 		read/lock AGI
> 		lock perag
> 		look at tail of list
> 		if (empty)
> 			do head insert
> 			goto out
> 	}
> 	do tail insert
> out:
> 	update inode/pag tails
> 	unlock
> 	drop perag
> }
> 
> It's trivial for a reader to understand what the first version of
> xfs_iunlink() is going to do without needing to understand the
> intraccies of the locking strategies. However, it takes time and
> effort to undestand exactly waht the second one is doing because
> it's not clear where lock ends and list modifications start, nor
> what the locking rules are for the different modifications that are
> being made. Essentially, it goes back to the complex
> locking-intertwined-with-modification-algorithm problem the current
> TOT code has.
> 
> I'd much prefer to see something like this:
> 
> /*
>  * Inode allocation in the O_TMPFILE path defines the AGI/unlinked
>  * list lock order as being AGI->perag unlinked list lock. We are
>  * inverting it here as the fast path tail addition does not need to
>  * modify the AGI at all. Hence we only need the AGI lock if the
>  * tail is empty, but if we fail to get it without blocking then we
>  * need to fall back to the slower, correct lock order.
>  */
> xfs_iunlink_insert_lock()
> {
> 	get perag;
> 	lock_perag();
> 	if (!tail empty)
> 		return;
> 	if (trylock AGI)
> 		return;

(adding some notes here: this patch ends up not using trylock here,
 but instead unlocks the perag, takes the AGI, relocks, and rechecks
 tail_empty... since that case is rare...)

> 
> 	/*
> 	 * Slow path, need to lock AGI first. Don't even bother
> 	 * rechecking tail pointers or trying to optimise for
> 	 * minimal AGI lock hold time as racing unlink list mods
> 	 * will all block on the perag lock once we take that. They
> 	 * will then hit the !tail empty fast path and not require
> 	 * the AGI lock at all.
> 	 */
> 	lock AGI
> 	lock_perag()
> 	return;
> }
> 
> The non-AGI locking fast path is slightly different in the remove
> case, so we'll have a slightly different helper function in that
> case which checks where the inode being removed is in the list.
> 
> In both cases, though, the unlock should be the same:
> 
> xfs_iunlink_unlock()
> {
> 	/* Does not unlock AGI, ever. commit does that. */
> 	unlock perag
> 	put perag
> }
> 
> This keeps the list locking completely separate from the list
> manipulations and allows us to document the locking constraints and
> reasons for why it is or isn't optimised for specific conditions
> without cluttering up the list manipulations code.

Yeah, the _lock and _unlock pair is helpful; I will go in this direction.

Thanks,
Gao Xiang

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Dave Chinner July 9, 2020, 2:32 a.m. UTC | #5
On Thu, Jul 09, 2020 at 08:55:26AM +0800, Gao Xiang wrote:
> On Thu, Jul 09, 2020 at 09:33:11AM +1000, Dave Chinner wrote:
> > On Tue, Jul 07, 2020 at 09:57:41PM +0800, Gao Xiang wrote:
> > > Currently, we use AGI buffer lock to protect in-memory linked list for
> > > unlinked inodes but since it's not necessary to modify AGI unless the
> > > head of the unlinked list is modified. So let's removing the AGI buffer
> > > modification dependency if possible, including 1) adding another per-AG
> > > dedicated lock to protect the whole list and 2) inserting unlinked
> > > inodes from tail.
> > > 
> > > For 2), the tail of bucket 0 is now recorded in perag for xfs_iunlink()
> > > to use. xfs_iunlink_remove() still support old multiple short bucket
> > > lists for recovery code.
> > 
> > I would split this into two separate patches. One to move to a perag
> > based locking strategy, another to change from head to tail
> > addition as they are largely independent algorithmic changes.
> 
> Yes, that is much better from the perspective of spilting patches and
> I thought that before. It seems that is like 2 steps but the proposed
> target solution is as a whole (in other words, 2 steps are code-related)
> and I'm not sure how large these code is sharable or can be inherited
> but rather than introduce some code in patch 2 and then remove immediately
> and turn into a new code in patch 3). I'm not sure how large logic could
> be sharable between these 2 dependent steps so I didn't do that.
> 
> I will spilt patches in the next RFC version to make a try.

Thanks!

[snip]


> > i.e. this:
> > 
> > xfs_iunlink()
> > {
> > 
> > 	get locks
> > 	do list insert
> > 	drop locks
> > }
> > 
> > Is better for understanding, maintenance and future modification
> > than:
> > 
> > xfs_iunlink()
> > {
> > 
> > 	get perag
> > 	lock perag
> > 	look at tail of list
> > 	if (empty) {
> > 		unlock perag
> > 		read/lock AGI
> > 		lock perag
> > 		look at tail of list
> > 		if (empty)
> > 			do head insert
> > 			goto out
> > 	}
> > 	do tail insert
> > out:
> > 	update inode/pag tails
> > 	unlock
> > 	drop perag
> > }
> > 
> > It's trivial for a reader to understand what the first version of
> > xfs_iunlink() is going to do without needing to understand the
> > intraccies of the locking strategies. However, it takes time and
> > effort to undestand exactly waht the second one is doing because
> > it's not clear where lock ends and list modifications start, nor
> > what the locking rules are for the different modifications that are
> > being made. Essentially, it goes back to the complex
> > locking-intertwined-with-modification-algorithm problem the current
> > TOT code has.
> > 
> > I'd much prefer to see something like this:
> > 
> > /*
> >  * Inode allocation in the O_TMPFILE path defines the AGI/unlinked
> >  * list lock order as being AGI->perag unlinked list lock. We are
> >  * inverting it here as the fast path tail addition does not need to
> >  * modify the AGI at all. Hence we only need the AGI lock if the
> >  * tail is empty, but if we fail to get it without blocking then we
> >  * need to fall back to the slower, correct lock order.
> >  */
> > xfs_iunlink_insert_lock()
> > {
> > 	get perag;
> > 	lock_perag();
> > 	if (!tail empty)
> > 		return;
> > 	if (trylock AGI)
> > 		return;
> 
> (adding some notes here, this patch doesn't use try lock here
>  finally but unlock perag and take AGI and relock and recheck tail_empty....
>  since the tail non-empty is rare...)

*nod*

My point was largely that this sort of thing is really obvious and
easy to optimise once the locking is cleanly separated. Adding a
trylock rather than drop/relock is another patch for the series :P

Cheers,

Dave.
Gao Xiang July 9, 2020, 10:36 a.m. UTC | #6
Hi Dave,

On Thu, Jul 09, 2020 at 12:32:46PM +1000, Dave Chinner wrote:

...

> > > 	if (trylock AGI)
> > > 		return;
> > 
> > (adding some notes here, this patch doesn't use try lock here
> >  finally but unlock perag and take AGI and relock and recheck tail_empty....
> >  since the tail non-empty is rare...)
> 
> *nod*
> 
> My point was largely that this sort of thing is really obvious and
> easy to optimise once the locking is cleanly separated. Adding a
> trylock rather than drop/relock is another patch for the series :P

I thought trylock way yesterday as well...
Apart from the fact that we don't have an AGI trylock method yet, it
seems it would not work either...

Think about it: one thread is holding an AGI lock and wants to lock the
perag, while another thread holds the perag; that thread's trylock will
fail and its second wait-lock will cause a deadlock... like below:

Thread 1			Thread 2
 lock AGI
                                 lock perag
 lock perag
                                 trylock AGI
                                 lock AGI           <- dead lock here

And trylocking the perag instead doesn't seem to work well either, since
it fails more frequently, so we would need to lock the AGI and then lock
the perag as a fallback.

So currently, I think it's better to use unlock, do something, regrab,
recheck method for now...

And yeah, that can be done in another patch if there is some better way
(e.g. moving the AGI modification into pre-commit, so we only need to
take one per-ag lock here...).

Thanks,
Gao Xiang

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Gao Xiang July 9, 2020, 10:47 a.m. UTC | #7
On Thu, Jul 09, 2020 at 06:36:21PM +0800, Gao Xiang wrote:
> Hi Dave,
> 
> On Thu, Jul 09, 2020 at 12:32:46PM +1000, Dave Chinner wrote:
> 
> ...
> 
> > > > 	if (trylock AGI)
> > > > 		return;
> > > 
> > > (adding some notes here, this patch doesn't use try lock here
> > >  finally but unlock perag and take AGI and relock and recheck tail_empty....
> > >  since the tail non-empty is rare...)
> > 
> > *nod*
> > 
> > My point was largely that this sort of thing is really obvious and
> > easy to optimise once the locking is cleanly separated. Adding a
> > trylock rather than drop/relock is another patch for the series :P
> 
> I thought trylock way yesterday as well...
> Apart from that we don't have the AGI trylock method yet, it seems
> not work as well...
> 
> Thinking about one thread is holding a AGI lock and wants to lock perag.
> And another thread holds a perag, but the trylock will fail and the second
> wait-lock will cause deadlock... like below:
> 
> Thread 1			Thread 2
>  lock AGI
>                                  lock perag
>  lock perag
>                                  trylock AGI
>                                  lock AGI           <- dead lock here
> 
> And trylock perag instead seems not work well as well, since it fails
> more frequently so we need lock AGI and then lock perag as a fallback.
> 
> So currently, I think it's better to use unlock, do something, regrab,
> recheck method for now...
> 
> And yeah, that can be done in another patch if some better way. (e.g.
> moving modifing AGI into pre-commit, so we can only take one per-ag
> lock here...)

Oh, sorry, I didn't notice the words

> 	/*
> 	 * Slow path, need to lock AGI first. Don't even bother
> 	 * rechecking tail pointers or trying to optimise for
> 	 * minimal AGI lock hold time as racing unlink list mods
> 	 * will all block on the perag lock once we take that. They
> 	 * will then hit the !tail empty fast path and not require
> 	 * the AGI lock at all.
> 	 */

If we only trylock the AGI, then release the perag and relock again,
that's fine. Sorry about the noise :)

btw, also add some other notes here. For AGI pre-commits, I'm not
quite sure if some users take another locked buffer first (but
without AGI), so it would have some potential locking order concern
as well... But let us think about them later :)

add agi to precommit
lock other buffers

precommit --- lock AGI


Thanks,
Gao Xiang

> 
> Thanks,
> Gao Xiang
> 
> > 
> > Cheers,
> > 
> > Dave.
> > -- 
> > Dave Chinner
> > david@fromorbit.com
> >
Dave Chinner July 9, 2020, 10:36 p.m. UTC | #8
On Thu, Jul 09, 2020 at 06:36:21PM +0800, Gao Xiang wrote:
> Hi Dave,
> 
> On Thu, Jul 09, 2020 at 12:32:46PM +1000, Dave Chinner wrote:
> 
> ...
> 
> > > > 	if (trylock AGI)
> > > > 		return;
> > > 
> > > (adding some notes here, this patch doesn't use try lock here
> > >  finally but unlock perag and take AGI and relock and recheck tail_empty....
> > >  since the tail non-empty is rare...)
> > 
> > *nod*
> > 
> > My point was largely that this sort of thing is really obvious and
> > easy to optimise once the locking is cleanly separated. Adding a
> > trylock rather than drop/relock is another patch for the series :P
> 
> I thought trylock way yesterday as well...
> Apart from that we don't have the AGI trylock method yet, it seems
> not work as well...
> 
> Thinking about one thread is holding a AGI lock and wants to lock perag.
> And another thread holds a perag, but the trylock will fail and the second
> wait-lock will cause deadlock... like below:
> 
> Thread 1			Thread 2
>  lock AGI
>                                  lock perag
>  lock perag
>                                  trylock AGI
>                                  lock AGI           <- dead lock here

You forgot to unlock the perag before locking the AGI, so of course
it will deadlock.

Yes, I know my psuedocode example didn't explicitly unlock the perag
because I simply forgot to write it in as I was going. It's pretty
common that quickly written psuedo code to explain a concept is not
perfect, so you still need to think about whether it is correct,
complete, etc.  i.e. if you do:

Thread 1			Thread 2
 lock AGI
                                 lock perag
 lock perag
                                 trylock AGI
				 unlock perag
                                 lock AGI
				 <blocks>

There is no deadlock in this algorithm.

> And trylock perag instead seems not work well as well, since it fails
> more frequently so we need lock AGI and then lock perag as a fallback.

That's kind of what I'd expect - if there is contention on the AGI,
the trylock will fail and we know we are about to sleep. The point
is that "trylock fails" now gives us a point at which we can measure
contention that puts us into the slow path. And because we are now
in the slow path, the overhead of backing out just doesn't matter
because we are going to sleep anyway.

We use this pattern all over XFS to attempt to get locks in reverse
order with minimal overhead in fast paths...

> So currently, I think it's better to use unlock, do something, regrab,
> recheck method for now...

I think that's a mistake - it's smells of trying to optimise the
implementation around immediately observed behaviour instead of
designing a clean, well structured locking scheme that will not need
to be re-optimised over and over again as the algorithm it protects
changes over time.

Cheers,

Dave.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 10565fa5ace4..d33e5b198534 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1994,182 +1994,195 @@  xfs_iunlink_update_bucket(
 }
 
 /*
- * Always insert at the head, so we only have to do a next inode lookup to
- * update it's prev pointer. The AGI bucket will point at the one we are
- * inserting.
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
+ *
+ * We place the on-disk inode on a list in the AGI.  It will be pulled from this
+ * list when the inode is freed.
  */
-static int
-xfs_iunlink_insert_inode(
+STATIC int
+xfs_iunlink(
 	struct xfs_trans	*tp,
-	struct xfs_buf		*agibp,
 	struct xfs_inode	*ip)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_agi		*agi = agibp->b_addr;
-	xfs_agino_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	struct xfs_perag	*pag;
+	struct xfs_inode	*pip;
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	xfs_agino_t		next_agino;
+	int			error;
+
+	ASSERT(VFS_I(ip)->i_nlink == 0);
+	ASSERT(VFS_I(ip)->i_mode != 0);
+	trace_xfs_iunlink(ip);
+
+	pag = xfs_perag_get(mp, agno);
 
 	/*
-	 * Get the index into the agi hash table for the list this inode will
-	 * go on.  Make sure the pointer isn't garbage and that this inode
-	 * isn't already on the list.
+	 * some paths (e.g. xfs_create_tmpfile) could take AGI lock
+	 * in this transaction in advance and the only locking order
+	 * AGI buf lock -> pag_unlinked_mutex is safe.
 	 */
-	next_agino = be32_to_cpu(agi->agi_unlinked[0]);
-	if (next_agino == agino ||
-	    !xfs_verify_agino_or_null(mp, agno, next_agino)) {
-		xfs_buf_mark_corrupt(agibp);
-		return -EFSCORRUPTED;
-	}
-
-	ip->i_prev_unlinked = NULLAGINO;
-	ip->i_next_unlinked = next_agino;
-	if (ip->i_next_unlinked != NULLAGINO) {
-		struct xfs_inode	*nip;
+	mutex_lock(&pag->pag_unlinked_mutex);
+	pip = pag->pag_unlinked_tail;
+	if (!pip) {
+		struct xfs_buf	*agibp;
 
-		nip = xfs_iunlink_lookup_next(agibp->b_pag, ip);
-		if (IS_ERR_OR_NULL(nip))
-			return -EFSCORRUPTED;
+		mutex_unlock(&pag->pag_unlinked_mutex);
 
-		if (nip->i_prev_unlinked != NULLAGINO) {
-			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
-						NULL, 0, __this_address);
-			return -EFSCORRUPTED;
+		/*
+		 * there could be some race, but it doesn't matter though
+		 * since !pip doesn't happen frequently.
+		 */
+		error = xfs_read_agi(mp, tp, agno, &agibp);
+		if (error) {
+			xfs_perag_put(pag);
+			return error;
 		}
-		nip->i_prev_unlinked = agino;
 
-		/* update the on disk inode now */
-		xfs_iunlink_log(tp, ip);
-	}
+		mutex_lock(&pag->pag_unlinked_mutex);
+		pip = pag->pag_unlinked_tail;
+		if (!pip) {
+			struct xfs_agi	*agi;
+
+			ip->i_prev_unlinked = NULLAGINO;
 
-	/* Point the head of the list to point to this inode. */
-	return xfs_iunlink_update_bucket(tp, agno, agibp, next_agino, agino);
+			agi = agibp->b_addr;
+			ASSERT(be32_to_cpu(agi->agi_unlinked[0]) == NULLAGINO);
+
+			/* Point the head of the list to point to this inode. */
+			error = xfs_iunlink_update_bucket(tp, agno,
+					agibp, NULLAGINO, agino);
+			goto out;
+		}
+	}
+	ip->i_prev_unlinked = XFS_INO_TO_AGINO(mp, pip->i_ino);
+	ASSERT(pip->i_next_unlinked == NULLAGINO);
+	pip->i_next_unlinked = agino;
+	xfs_iunlink_log(tp, pip);
+out:
+	ip->i_next_unlinked = NULLAGINO;
+	pag->pag_unlinked_tail = ip;
+	mutex_unlock(&pag->pag_unlinked_mutex);
+	xfs_perag_put(pag);
+	return 0;
 }
 
 /*
+ * Pull the on-disk inode from the AGI unlinked list.
+ *
  * Remove can be from anywhere in the list, so we have to do two adjacent inode
  * lookups here so we can update list pointers. We may be at the head or the
  * tail of the list, so we have to handle those cases as well.
  */
-static int
-xfs_iunlink_remove_inode(
+STATIC int
+xfs_iunlink_remove(
 	struct xfs_trans	*tp,
-	struct xfs_buf		*agibp,
 	struct xfs_inode	*ip)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
-	xfs_agino_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	struct xfs_perag	*pag;
+	struct xfs_inode	*pip = NULL;
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	xfs_agino_t		next_agino = ip->i_next_unlinked;
+	xfs_agino_t		next_agino;
 	int			error;
 
+	trace_xfs_iunlink_remove(ip);
+
+	pag = xfs_perag_get(mp, agno);
+	mutex_lock(&pag->pag_unlinked_mutex);
+
+	/* see the comment in xfs_iunlink() on the only proper locking order */
 	if (ip->i_prev_unlinked == NULLAGINO) {
-		/* remove from head of list */
-		if (next_agino == agino ||
-		    !xfs_verify_agino_or_null(mp, agno, next_agino))
-			return -EFSCORRUPTED;
+		struct xfs_buf	*agibp;
 
-		error = xfs_iunlink_update_bucket(tp, agno, agibp, agino, next_agino);
-		if (error)
+		mutex_unlock(&pag->pag_unlinked_mutex);
+
+		error = xfs_read_agi(mp, tp, agno, &agibp);
+		if (error) {
+			xfs_perag_put(pag);
 			return error;
-	} else {
-		/* lookup previous inode and update to point at next */
-		struct xfs_inode	*pip;
+		}
 
-		pip = xfs_iunlink_lookup_prev(agibp->b_pag, ip);
-		if (IS_ERR_OR_NULL(pip))
-			return -EFSCORRUPTED;
+		mutex_lock(&pag->pag_unlinked_mutex);
+		if (ip->i_prev_unlinked == NULLAGINO) {
+			struct xfs_agi	*agi;
 
-		if (pip->i_next_unlinked != agino) {
-			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
-						NULL, 0, __this_address);
-			return -EFSCORRUPTED;
+			next_agino = ip->i_next_unlinked;
+			agi = agibp->b_addr;
+
+			/* remove from head of list */
+			if (next_agino == agino ||
+			    !xfs_verify_agino_or_null(mp, agno, next_agino))
+				goto err_fscorrupted;
+
+			error = xfs_iunlink_update_bucket(tp, agno,
+				agibp, agino, next_agino);
+			if (error)
+				goto err_out;
+
+			goto next_fixup;
 		}
+	}
 
-		/* update the on disk inode now */
-		pip->i_next_unlinked = next_agino;
-		xfs_iunlink_log(tp, pip);
+	next_agino = ip->i_next_unlinked;
+
+	/* lookup previous inode and update to point at next */
+	pip = xfs_iunlink_lookup_prev(pag, ip);
+	if (IS_ERR_OR_NULL(pip))
+		goto err_fscorrupted;
+
+	if (pip->i_next_unlinked != agino) {
+		xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
+					NULL, 0, __this_address);
+		goto err_fscorrupted;
 	}
+	pip->i_next_unlinked = next_agino;
+	xfs_iunlink_log(tp, pip);
 
-	/* lookup the next inode and update to point at prev */
-	if (ip->i_next_unlinked != NULLAGINO) {
+next_fixup:
+	if (next_agino == NULLAGINO) {
+		/* so iunlink recovery can work here */
+		if (pag->pag_unlinked_tail == ip)
+			pag->pag_unlinked_tail = pip;
+	} else {
+		/* lookup the next inode and update to point at prev */
 		struct xfs_inode	*nip;
 
-		nip = xfs_iunlink_lookup_next(agibp->b_pag, ip);
+		nip = xfs_iunlink_lookup_next(pag, ip);
 		if (IS_ERR_OR_NULL(nip))
-			return -EFSCORRUPTED;
+			goto err_fscorrupted;
 
 		if (nip->i_prev_unlinked != agino) {
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
 						NULL, 0, __this_address);
-			return -EFSCORRUPTED;
+			goto err_fscorrupted;
 		}
 		/* in memory update only */
 		nip->i_prev_unlinked = ip->i_prev_unlinked;
 	}
+	mutex_unlock(&pag->pag_unlinked_mutex);
+	xfs_perag_put(pag);
 
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	/*
 	 * Now clear prev/next from this inode and update on disk if we
 	 * need to clear the on-disk link.
 	 */
 	ip->i_prev_unlinked = NULLAGINO;
-	ip->i_next_unlinked = NULLAGINO;
-	if (next_agino != NULLAGINO)
+	if (next_agino != NULLAGINO) {
+		ip->i_next_unlinked = NULLAGINO;
 		xfs_iunlink_log(tp, ip);
+	}
 	return 0;
-}
 
-/*
- * This is called when the inode's link count has gone to 0 or we are creating
- * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
- *
- * We place the on-disk inode on a list in the AGI.  It will be pulled from this
- * list when the inode is freed.
- */
-STATIC int
-xfs_iunlink(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_buf		*agibp;
-	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
-	int			error;
-
-	ASSERT(ip->i_next_unlinked == NULLAGINO);
-	ASSERT(VFS_I(ip)->i_nlink == 0);
-	ASSERT(VFS_I(ip)->i_mode != 0);
-	trace_xfs_iunlink(ip);
-
-	/* Get the agi buffer first.  It ensures lock ordering on the list. */
-	error = xfs_read_agi(mp, tp, agno, &agibp);
-	if (error)
-		return error;
-
-	return xfs_iunlink_insert_inode(tp, agibp, ip);
-}
-
-/*
- * Pull the on-disk inode from the AGI unlinked list.
- */
-STATIC int
-xfs_iunlink_remove(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_buf		*agibp;
-	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
-	int			error;
-
-	trace_xfs_iunlink_remove(ip);
-
-	/* Get the agi buffer first.  It ensures lock ordering on the list. */
-	error = xfs_read_agi(mp, tp, agno, &agibp);
-	if (error)
-		return error;
-
-	return xfs_iunlink_remove_inode(tp, agibp, ip);
+err_fscorrupted:
+	error = -EFSCORRUPTED;
+err_out:
+	mutex_unlock(&pag->pag_unlinked_mutex);
+	xfs_perag_put(pag);
+	return error;
 }
 
 /*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index d47eea31c165..3f2739316424 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2801,6 +2801,11 @@  xlog_recover_iunlink_ag(
 					prev_ip->i_next_unlinked = NULLAGINO;
 				break;
 			}
+
+			/* XXX: take pag_unlinked_mutex across the loop? */
+			if (!bucket)
+				agibp->b_pag->pag_unlinked_tail = ip;
+
 			if (prev_ip) {
 				ip->i_prev_unlinked = prev_agino;
 				xfs_irele(prev_ip);
@@ -2812,6 +2817,7 @@  xlog_recover_iunlink_ag(
 		}
 		if (prev_ip)
 			xfs_irele(prev_ip);
+
 		if (error) {
 			/*
 			 * We can't read an inode this bucket points to, or an
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 031e96ff022d..a9701f863e84 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -223,6 +223,9 @@  xfs_initialize_perag(
 		if (first_initialised == NULLAGNUMBER)
 			first_initialised = index;
 		spin_lock_init(&pag->pag_state_lock);
+
+		mutex_init(&pag->pag_unlinked_mutex);
+		pag->pag_unlinked_tail = NULL;
 	}
 
 	index = xfs_set_inode_alloc(mp, agcount);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a72cfcaa4ad1..107a634a34f7 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -372,6 +372,9 @@  typedef struct xfs_perag {
 	/* reference count */
 	uint8_t			pagf_refcount_level;
 
+	struct mutex		pag_unlinked_mutex;
+	struct xfs_inode	*pag_unlinked_tail;
+
 	/*
 	 * Unlinked inode information.  This incore information reflects
 	 * data stored in the AGI, so callers must hold the AGI buffer lock