[24/30] xfs: rework stale inodes in xfs_ifree_cluster

Message ID 20200604074606.266213-25-david@fromorbit.com (mailing list archive)
State Superseded, archived
Headers show
Series xfs: rework inode flushing to make inode reclaim fully asynchronous | expand

Commit Message

Dave Chinner June 4, 2020, 7:46 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Once we have inodes pinning the cluster buffer and attached whenever
they are dirty, we no longer have a guarantee that the items are
flush locked when we lock the cluster buffer. Hence we cannot just
walk the buffer log item list and modify the attached inodes.

If the inode is not flush locked, we have to ILOCK it first and then
flush lock it to do all the prerequisite checks needed to avoid
races with other code. This is already handled by
xfs_ifree_get_one_inode(), so rework the inode iteration loop and
function to update all inodes in cache whether they are attached to
the buffer or not.

Note: we also remove the copying of the log item lsn to the
ili_flush_lsn as xfs_iflush_done() now uses the XFS_ISTALE flag to
trigger aborts and so flush lsn matching is not needed in IO
completion for processing freed inodes.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c | 158 ++++++++++++++++++---------------------------
 1 file changed, 62 insertions(+), 96 deletions(-)

Comments

Brian Foster June 5, 2020, 6:27 p.m. UTC | #1
On Thu, Jun 04, 2020 at 05:46:00PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Once we have inodes pinning the cluster buffer and attached whenever
> they are dirty, we no longer have a guarantee that the items are
> flush locked when we lock the cluster buffer. Hence we cannot just
> walk the buffer log item list and modify the attached inodes.
> 
> If the inode is not flush locked, we have to ILOCK it first and then
> flush lock it to do all the prerequisite checks needed to avoid
> races with other code. This is already handled by
> xfs_ifree_get_one_inode(), so rework the inode iteration loop and
> function to update all inodes in cache whether they are attached to
> the buffer or not.
> 
> Note: we also remove the copying of the log item lsn to the
> ili_flush_lsn as xfs_iflush_done() now uses the XFS_ISTALE flag to
> trigger aborts and so flush lsn matching is not needed in IO
> completion for processing freed inodes.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_inode.c | 158 ++++++++++++++++++---------------------------
>  1 file changed, 62 insertions(+), 96 deletions(-)
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 272b54cf97000..fb4c614c64fda 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
...
> @@ -2559,43 +2563,53 @@ xfs_ifree_get_one_inode(
>  	 */
>  	if (ip != free_ip) {
>  		if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
> +			spin_unlock(&ip->i_flags_lock);
>  			rcu_read_unlock();
>  			delay(1);
>  			goto retry;
>  		}
> -
> -		/*
> -		 * Check the inode number again in case we're racing with
> -		 * freeing in xfs_reclaim_inode().  See the comments in that
> -		 * function for more information as to why the initial check is
> -		 * not sufficient.
> -		 */
> -		if (ip->i_ino != inum) {
> -			xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -			goto out_rcu_unlock;
> -		}

Why is the recheck under ILOCK_EXCL no longer necessary? It looks like
reclaim decides whether to proceed or not under the ilock and doesn't
acquire the spinlock until it decides to reclaim. Hm?

>  	}
> +	ip->i_flags |= XFS_ISTALE;
> +	spin_unlock(&ip->i_flags_lock);
>  	rcu_read_unlock();
>  
> -	xfs_iflock(ip);
> -	xfs_iflags_set(ip, XFS_ISTALE);
> +	/*
> +	 * If we can't get the flush lock, the inode is already attached.  All
> +	 * we needed to do here is mark the inode stale so buffer IO completion
> +	 * will remove it from the AIL.
> +	 */

To make sure I'm following this correctly, we can assume the inode is
attached based on an iflock_nowait() failure because we hold the ilock,
right? IOW, any other task doing a similar iflock check would have to do
so under ilock and release the flush lock first if the inode didn't end
up flushed, for whatever reason.

> +	iip = ip->i_itemp;
> +	if (!xfs_iflock_nowait(ip)) {
> +		ASSERT(!list_empty(&iip->ili_item.li_bio_list));
> +		ASSERT(iip->ili_last_fields);
> +		goto out_iunlock;
> +	}
> +	ASSERT(!iip || list_empty(&iip->ili_item.li_bio_list));
>  
>  	/*
> -	 * We don't need to attach clean inodes or those only with unlogged
> -	 * changes (which we throw away, anyway).
> +	 * Clean inodes can be released immediately.  Everything else has to go
> +	 * through xfs_iflush_abort() on journal commit as the flock
> +	 * synchronises removal of the inode from the cluster buffer against
> +	 * inode reclaim.
>  	 */
> -	if (!ip->i_itemp || xfs_inode_clean(ip)) {
> -		ASSERT(ip != free_ip);
> +	if (xfs_inode_clean(ip)) {
>  		xfs_ifunlock(ip);
> -		xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -		goto out_no_inode;
> +		goto out_iunlock;
>  	}
> -	return ip;
>  
> -out_rcu_unlock:
> -	rcu_read_unlock();
> -out_no_inode:
> -	return NULL;
> +	/* we have a dirty inode in memory that has not yet been flushed. */
> +	ASSERT(iip->ili_fields);
> +	spin_lock(&iip->ili_lock);
> +	iip->ili_last_fields = iip->ili_fields;
> +	iip->ili_fields = 0;
> +	iip->ili_fsync_fields = 0;
> +	spin_unlock(&iip->ili_lock);
> +	list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
> +	ASSERT(iip->ili_last_fields);

We already asserted ->ili_fields and assigned ->ili_fields to
->ili_last_fields, so this assert seems spurious.

Brian

> +
> +out_iunlock:
> +	if (ip != free_ip)
> +		xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  }
>  
>  /*
> @@ -2605,26 +2619,20 @@ xfs_ifree_get_one_inode(
>   */
>  STATIC int
>  xfs_ifree_cluster(
> -	xfs_inode_t		*free_ip,
> -	xfs_trans_t		*tp,
> +	struct xfs_inode	*free_ip,
> +	struct xfs_trans	*tp,
>  	struct xfs_icluster	*xic)
>  {
> -	xfs_mount_t		*mp = free_ip->i_mount;
> +	struct xfs_mount	*mp = free_ip->i_mount;
> +	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> +	struct xfs_buf		*bp;
> +	xfs_daddr_t		blkno;
> +	xfs_ino_t		inum = xic->first_ino;
>  	int			nbufs;
>  	int			i, j;
>  	int			ioffset;
> -	xfs_daddr_t		blkno;
> -	xfs_buf_t		*bp;
> -	xfs_inode_t		*ip;
> -	struct xfs_inode_log_item *iip;
> -	struct xfs_log_item	*lip;
> -	struct xfs_perag	*pag;
> -	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> -	xfs_ino_t		inum;
>  	int			error;
>  
> -	inum = xic->first_ino;
> -	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
>  	nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
>  
>  	for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
> @@ -2668,59 +2676,16 @@ xfs_ifree_cluster(
>  		bp->b_ops = &xfs_inode_buf_ops;
>  
>  		/*
> -		 * Walk the inodes already attached to the buffer and mark them
> -		 * stale. These will all have the flush locks held, so an
> -		 * in-memory inode walk can't lock them. By marking them all
> -		 * stale first, we will not attempt to lock them in the loop
> -		 * below as the XFS_ISTALE flag will be set.
> -		 */
> -		list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
> -			if (lip->li_type == XFS_LI_INODE) {
> -				iip = (struct xfs_inode_log_item *)lip;
> -				xfs_trans_ail_copy_lsn(mp->m_ail,
> -							&iip->ili_flush_lsn,
> -							&iip->ili_item.li_lsn);
> -				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
> -			}
> -		}
> -
> -
> -		/*
> -		 * For each inode in memory attempt to add it to the inode
> -		 * buffer and set it up for being staled on buffer IO
> -		 * completion.  This is safe as we've locked out tail pushing
> -		 * and flushing by locking the buffer.
> -		 *
> -		 * We have already marked every inode that was part of a
> -		 * transaction stale above, which means there is no point in
> -		 * even trying to lock them.
> +		 * Now we need to set all the cached clean inodes as XFS_ISTALE,
> +		 * too. This requires lookups, and will skip inodes that we've
> +		 * already marked XFS_ISTALE.
>  		 */
> -		for (i = 0; i < igeo->inodes_per_cluster; i++) {
> -			ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
> -			if (!ip)
> -				continue;
> -
> -			iip = ip->i_itemp;
> -			spin_lock(&iip->ili_lock);
> -			iip->ili_last_fields = iip->ili_fields;
> -			iip->ili_fields = 0;
> -			iip->ili_fsync_fields = 0;
> -			spin_unlock(&iip->ili_lock);
> -			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
> -						&iip->ili_item.li_lsn);
> -
> -			list_add_tail(&iip->ili_item.li_bio_list,
> -						&bp->b_li_list);
> -
> -			if (ip != free_ip)
> -				xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -		}
> +		for (i = 0; i < igeo->inodes_per_cluster; i++)
> +			xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
>  
>  		xfs_trans_stale_inode_buf(tp, bp);
>  		xfs_trans_binval(tp, bp);
>  	}
> -
> -	xfs_perag_put(pag);
>  	return 0;
>  }
>  
> @@ -3845,6 +3810,7 @@ xfs_iflush_int(
>  	iip->ili_fields = 0;
>  	iip->ili_fsync_fields = 0;
>  	spin_unlock(&iip->ili_lock);
> +	ASSERT(iip->ili_last_fields);
>  
>  	/*
>  	 * Store the current LSN of the inode so that we can tell whether the
> -- 
> 2.26.2.761.g0e0b3e54be
>
Dave Chinner June 5, 2020, 9:32 p.m. UTC | #2
On Fri, Jun 05, 2020 at 02:27:22PM -0400, Brian Foster wrote:
> On Thu, Jun 04, 2020 at 05:46:00PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Once we have inodes pinning the cluster buffer and attached whenever
> > they are dirty, we no longer have a guarantee that the items are
> > flush locked when we lock the cluster buffer. Hence we cannot just
> > walk the buffer log item list and modify the attached inodes.
> > 
> > If the inode is not flush locked, we have to ILOCK it first and then
> > flush lock it to do all the prerequisite checks needed to avoid
> > races with other code. This is already handled by
> > xfs_ifree_get_one_inode(), so rework the inode iteration loop and
> > function to update all inodes in cache whether they are attached to
> > the buffer or not.
> > 
> > Note: we also remove the copying of the log item lsn to the
> > ili_flush_lsn as xfs_iflush_done() now uses the XFS_ISTALE flag to
> > trigger aborts and so flush lsn matching is not needed in IO
> > completion for processing freed inodes.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_inode.c | 158 ++++++++++++++++++---------------------------
> >  1 file changed, 62 insertions(+), 96 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 272b54cf97000..fb4c614c64fda 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> ...
> > @@ -2559,43 +2563,53 @@ xfs_ifree_get_one_inode(
> >  	 */
> >  	if (ip != free_ip) {
> >  		if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
> > +			spin_unlock(&ip->i_flags_lock);
> >  			rcu_read_unlock();
> >  			delay(1);
> >  			goto retry;
> >  		}
> > -
> > -		/*
> > -		 * Check the inode number again in case we're racing with
> > -		 * freeing in xfs_reclaim_inode().  See the comments in that
> > -		 * function for more information as to why the initial check is
> > -		 * not sufficient.
> > -		 */
> > -		if (ip->i_ino != inum) {
> > -			xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > -			goto out_rcu_unlock;
> > -		}
> 
> Why is the recheck under ILOCK_EXCL no longer necessary? It looks like
> reclaim decides whether to proceed or not under the ilock and doesn't
> acquire the spinlock until it decides to reclaim. Hm?

Because we now take the ILOCK while still holding the i_flags_lock
instead of dropping the spin lock then trying to get the ILOCK.
Hence with this change, if we get the ILOCK we are guaranteed that
the inode number has not changed and don't need to recheck it.

This is guaranteed by xfs_reclaim_inode() because it locks in
the order of ILOCK -> i_flags_lock and it zeroes the ip->i_ino
while holding both these locks. Hence if we've got the i_flags_lock
and we try to get the ILOCK, the inode is either going to be valid and
reclaim will skip the inode (because we hold locks) or the inode
will already be in reclaim and the ip->i_ino will be zero....
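
A rough sketch of the two orderings, for illustration only (the lookup side
follows the new xfs_ifree_mark_inode_stale() code, the reclaim side is
paraphrased rather than quoted):

	/* lookup side: ILOCK attempted while i_flags_lock is still held */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		return;
	}
	if (ip != free_ip && !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		delay(1);
		goto retry;
	}
	/* i_ino was checked under i_flags_lock and cannot change now */

	/* reclaim side: i_ino is zeroed with both ILOCK and i_flags_lock held */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	spin_lock(&ip->i_flags_lock);
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);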


> >  	}
> > +	ip->i_flags |= XFS_ISTALE;
> > +	spin_unlock(&ip->i_flags_lock);
> >  	rcu_read_unlock();
> >  
> > -	xfs_iflock(ip);
> > -	xfs_iflags_set(ip, XFS_ISTALE);
> > +	/*
> > +	 * If we can't get the flush lock, the inode is already attached.  All
> > +	 * we needed to do here is mark the inode stale so buffer IO completion
> > +	 * will remove it from the AIL.
> > +	 */
> 
> To make sure I'm following this correctly, we can assume the inode is
> attached based on an iflock_nowait() failure because we hold the ilock,
> right?

Actually, because we hold the buffer lock. We only flush the inode
to the buffer when we are holding the buffer lock, so all flush
locking should nest inside the buffer lock. So for taking the flock,
the lock order is bp->b_sema -> ILOCK_EXCL -> iflock. We drop the
flush lock before we drop the buffer lock in IO completion, and
hence if we hold the buffer lock, nothing else can actually unlock
the inode flush lock.
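
In sketch form, based purely on the ordering stated above (not the complete
flush or IO completion paths):

	/* flushing an inode into its cluster buffer: b_sema -> ILOCK -> iflock */
	xfs_buf_lock(bp);
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (xfs_iflock_nowait(ip)) {
		/* copy the inode core into bp; flush lock held until IO completes */
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	/* IO completion: flush lock is dropped before the buffer lock */
	xfs_ifunlock(ip);
	xfs_buf_unlock(bp);

	/*
	 * Hence while xfs_ifree_mark_inode_stale() holds bp->b_sema, nobody
	 * can release an inode flush lock, so a failed xfs_iflock_nowait()
	 * can only mean the inode has already been flushed to this buffer.
	 */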

> IOW, any other task doing a similar iflock check would have to do
> so under ilock and release the flush lock first if the inode didn't end
> up flushed, for whatever reason.

Yes, anything taking the flush lock needs to first hold the ILOCK -
that's always been the case and we've always done that because the
ILOCK is needed to provides serialisation against a) other
modifications while we are accessing/flushing the inode, and b)
inode reclaim.

/me checks.

After this patchset nothing calls xfs_iflock() at all - everything
uses xfs_iflock_nowait(), so it might be time to turn this back into
a plain status flag and get rid of the iflock stuff altogether as
it's just a state flag now...
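
Purely to illustrate the idea (flag and helper names here are invented,
nothing like this exists in the tree yet):

	/* hypothetical: replace the flush "lock" with a plain state flag */
	static inline bool xfs_iflags_try_set_flushing(struct xfs_inode *ip)
	{
		bool	was_set;

		spin_lock(&ip->i_flags_lock);
		was_set = ip->i_flags & XFS_IFLUSHING;	/* invented flag */
		ip->i_flags |= XFS_IFLUSHING;
		spin_unlock(&ip->i_flags_lock);
		return !was_set;
	}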

> > +	ASSERT(iip->ili_fields);
> > +	spin_lock(&iip->ili_lock);
> > +	iip->ili_last_fields = iip->ili_fields;
> > +	iip->ili_fields = 0;
> > +	iip->ili_fsync_fields = 0;
> > +	spin_unlock(&iip->ili_lock);
> > +	list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
> > +	ASSERT(iip->ili_last_fields);
> 
> We already asserted ->ili_fields and assigned ->ili_fields to
> ->ili_last_fields, so this assert seems spurious.

Ah, the first ASSERT goes away in the next patch, I think. It was
debug, and I may have removed it from the wrong patch...

Cheers,

Dave.
Brian Foster June 8, 2020, 4:44 p.m. UTC | #3
On Sat, Jun 06, 2020 at 07:32:10AM +1000, Dave Chinner wrote:
> On Fri, Jun 05, 2020 at 02:27:22PM -0400, Brian Foster wrote:
> > On Thu, Jun 04, 2020 at 05:46:00PM +1000, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > Once we have inodes pinning the cluster buffer and attached whenever
> > > they are dirty, we no longer have a guarantee that the items are
> > > flush locked when we lock the cluster buffer. Hence we cannot just
> > > walk the buffer log item list and modify the attached inodes.
> > > 
> > > If the inode is not flush locked, we have to ILOCK it first and then
> > > flush lock it to do all the prerequisite checks needed to avoid
> > > races with other code. This is already handled by
> > > xfs_ifree_get_one_inode(), so rework the inode iteration loop and
> > > function to update all inodes in cache whether they are attached to
> > > the buffer or not.
> > > 
> > > Note: we also remove the copying of the log item lsn to the
> > > ili_flush_lsn as xfs_iflush_done() now uses the XFS_ISTALE flag to
> > > trigger aborts and so flush lsn matching is not needed in IO
> > > completion for processing freed inodes.
> > > 
> > > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > >  fs/xfs/xfs_inode.c | 158 ++++++++++++++++++---------------------------
> > >  1 file changed, 62 insertions(+), 96 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > index 272b54cf97000..fb4c614c64fda 100644
> > > --- a/fs/xfs/xfs_inode.c
> > > +++ b/fs/xfs/xfs_inode.c
> > ...
> > > @@ -2559,43 +2563,53 @@ xfs_ifree_get_one_inode(
> > >  	 */
> > >  	if (ip != free_ip) {
> > >  		if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
> > > +			spin_unlock(&ip->i_flags_lock);
> > >  			rcu_read_unlock();
> > >  			delay(1);
> > >  			goto retry;
> > >  		}
> > > -
> > > -		/*
> > > -		 * Check the inode number again in case we're racing with
> > > -		 * freeing in xfs_reclaim_inode().  See the comments in that
> > > -		 * function for more information as to why the initial check is
> > > -		 * not sufficient.
> > > -		 */
> > > -		if (ip->i_ino != inum) {
> > > -			xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > > -			goto out_rcu_unlock;
> > > -		}
> > 
> > Why is the recheck under ILOCK_EXCL no longer necessary? It looks like
> > reclaim decides whether to proceed or not under the ilock and doesn't
> > acquire the spinlock until it decides to reclaim. Hm?
> 
> Because we now take the ILOCK while still holding the i_flags_lock
> instead of dropping the spin lock then trying to get the ILOCK.
> Hence with this change, if we get the ILOCK we are guaranteed that
> the inode number has not changed and don't need to recheck it.
> 
> This is guaranteed by xfs_reclaim_inode() because it locks in
> the order of ILOCK -> i_flags_lock and it zeroes the ip->i_ino
> while holding both these locks. Hence if we've got the i_flags_lock
> and we try to get the ILOCK, the inode is either going to be valid and
> reclaim will skip the inode (because we hold locks) or the inode
> will already be in reclaim and the ip->i_ino will be zero....
> 

Got it, thanks.

> 
> > >  	}
> > > +	ip->i_flags |= XFS_ISTALE;
> > > +	spin_unlock(&ip->i_flags_lock);
> > >  	rcu_read_unlock();
> > >  
> > > -	xfs_iflock(ip);
> > > -	xfs_iflags_set(ip, XFS_ISTALE);
> > > +	/*
> > > +	 * If we can't get the flush lock, the inode is already attached.  All
> > > +	 * we needed to do here is mark the inode stale so buffer IO completion
> > > +	 * will remove it from the AIL.
> > > +	 */
> > 
> > To make sure I'm following this correctly, we can assume the inode is
> > attached based on an iflock_nowait() failure because we hold the ilock,
> > right?
> 
> Actually, because we hold the buffer lock. We only flush the inode
> to the buffer when we are holding the buffer lock, so all flush
> locking should nest inside the buffer lock. So for taking the flock,
> the lock order is bp->b_sema -> ILOCK_EXCL -> iflock. We drop the
> flush lock before we drop the buffer lock in IO completion, and
> hence if we hold the buffer lock, nothing else can actually unlock
> the inode flush lock.
> 

I was thinking about xfs_reclaim_inode() again where we trylock the
flush lock before checking clean state. There's no buffer involved in
that path (as of this patch, at least), so afaict the ILOCK is what
protects us in that general scenario.
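
Roughly, the reclaim-side sequence I mean (shape only, not the actual
xfs_reclaim_inode() code):

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		return false;			/* busy, revisit later */
	if (!xfs_iflock_nowait(ip)) {
		/* a flush is in progress elsewhere - back off */
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		return false;
	}
	if (xfs_inode_clean(ip)) {
		/* clean and flush-locked under ILOCK: safe to reclaim */
	} else {
		/* dirty: write back or requeue, not shown */
	}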

> > IOW, any other task doing a similar iflock check would have to do
> > so under ilock and release the flush lock first if the inode didn't end
> > up flushed, for whatever reason.
> 
> Yes, anything taking the flush lock needs to first hold the ILOCK -
> that's always been the case and we've always done that because the
> ILOCK is needed to provide serialisation against a) other
> modifications while we are accessing/flushing the inode, and b)
> inode reclaim.
> 

Right, Ok.

> /me checks.
> 
> After this patchset nothing calls xfs_iflock() at all - everything
> uses xfs_iflock_nowait(), so it might be time to turn this back into
> a plain status flag and get rid of the iflock stuff altogether as
> it's just a state flag now...
> 

Yeah, I think that would be a nice follow on cleanup. The whole "if
trylock fails, then assume we own the lock" type logic is really obscure
and confusing to read IMO. Treating it as the "inode is flushed" state
the lock fundamentally represents would be far more readable.

That addresses both my concerns, so with the nit below fixed:

Reviewed-by: Brian Foster <bfoster@redhat.com>

Brian

> > > +	ASSERT(iip->ili_fields);
> > > +	spin_lock(&iip->ili_lock);
> > > +	iip->ili_last_fields = iip->ili_fields;
> > > +	iip->ili_fields = 0;
> > > +	iip->ili_fsync_fields = 0;
> > > +	spin_unlock(&iip->ili_lock);
> > > +	list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
> > > +	ASSERT(iip->ili_last_fields);
> > 
> > We already asserted ->ili_fields and assigned ->ili_fields to
> > ->ili_last_fields, so this assert seems spurious.
> 
> Ah, the first ASSERT goes away in the next patch, I think. It was
> debug, and I may have removed it from the wrong patch...
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>

Patch

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 272b54cf97000..fb4c614c64fda 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2517,17 +2517,19 @@  xfs_iunlink_remove(
 }
 
 /*
- * Look up the inode number specified and mark it stale if it is found. If it is
- * dirty, return the inode so it can be attached to the cluster buffer so it can
- * be processed appropriately when the cluster free transaction completes.
+ * Look up the inode number specified and if it is not already marked XFS_ISTALE
+ * mark it stale. We should only find clean inodes in this lookup that aren't
+ * already stale.
  */
-static struct xfs_inode *
-xfs_ifree_get_one_inode(
-	struct xfs_perag	*pag,
+static void
+xfs_ifree_mark_inode_stale(
+	struct xfs_buf		*bp,
 	struct xfs_inode	*free_ip,
 	xfs_ino_t		inum)
 {
-	struct xfs_mount	*mp = pag->pag_mount;
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_perag	*pag = bp->b_pag;
+	struct xfs_inode_log_item *iip;
 	struct xfs_inode	*ip;
 
 retry:
@@ -2535,8 +2537,10 @@  xfs_ifree_get_one_inode(
 	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
 
 	/* Inode not in memory, nothing to do */
-	if (!ip)
-		goto out_rcu_unlock;
+	if (!ip) {
+		rcu_read_unlock();
+		return;
+	}
 
 	/*
 	 * because this is an RCU protected lookup, we could find a recently
@@ -2547,9 +2551,9 @@  xfs_ifree_get_one_inode(
 	spin_lock(&ip->i_flags_lock);
 	if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
 		spin_unlock(&ip->i_flags_lock);
-		goto out_rcu_unlock;
+		rcu_read_unlock();
+		return;
 	}
-	spin_unlock(&ip->i_flags_lock);
 
 	/*
 	 * Don't try to lock/unlock the current inode, but we _cannot_ skip the
@@ -2559,43 +2563,53 @@  xfs_ifree_get_one_inode(
 	 */
 	if (ip != free_ip) {
 		if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+			spin_unlock(&ip->i_flags_lock);
 			rcu_read_unlock();
 			delay(1);
 			goto retry;
 		}
-
-		/*
-		 * Check the inode number again in case we're racing with
-		 * freeing in xfs_reclaim_inode().  See the comments in that
-		 * function for more information as to why the initial check is
-		 * not sufficient.
-		 */
-		if (ip->i_ino != inum) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			goto out_rcu_unlock;
-		}
 	}
+	ip->i_flags |= XFS_ISTALE;
+	spin_unlock(&ip->i_flags_lock);
 	rcu_read_unlock();
 
-	xfs_iflock(ip);
-	xfs_iflags_set(ip, XFS_ISTALE);
+	/*
+	 * If we can't get the flush lock, the inode is already attached.  All
+	 * we needed to do here is mark the inode stale so buffer IO completion
+	 * will remove it from the AIL.
+	 */
+	iip = ip->i_itemp;
+	if (!xfs_iflock_nowait(ip)) {
+		ASSERT(!list_empty(&iip->ili_item.li_bio_list));
+		ASSERT(iip->ili_last_fields);
+		goto out_iunlock;
+	}
+	ASSERT(!iip || list_empty(&iip->ili_item.li_bio_list));
 
 	/*
-	 * We don't need to attach clean inodes or those only with unlogged
-	 * changes (which we throw away, anyway).
+	 * Clean inodes can be released immediately.  Everything else has to go
+	 * through xfs_iflush_abort() on journal commit as the flock
+	 * synchronises removal of the inode from the cluster buffer against
+	 * inode reclaim.
 	 */
-	if (!ip->i_itemp || xfs_inode_clean(ip)) {
-		ASSERT(ip != free_ip);
+	if (xfs_inode_clean(ip)) {
 		xfs_ifunlock(ip);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		goto out_no_inode;
+		goto out_iunlock;
 	}
-	return ip;
 
-out_rcu_unlock:
-	rcu_read_unlock();
-out_no_inode:
-	return NULL;
+	/* we have a dirty inode in memory that has not yet been flushed. */
+	ASSERT(iip->ili_fields);
+	spin_lock(&iip->ili_lock);
+	iip->ili_last_fields = iip->ili_fields;
+	iip->ili_fields = 0;
+	iip->ili_fsync_fields = 0;
+	spin_unlock(&iip->ili_lock);
+	list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
+	ASSERT(iip->ili_last_fields);
+
+out_iunlock:
+	if (ip != free_ip)
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 }
 
 /*
@@ -2605,26 +2619,20 @@  xfs_ifree_get_one_inode(
  */
 STATIC int
 xfs_ifree_cluster(
-	xfs_inode_t		*free_ip,
-	xfs_trans_t		*tp,
+	struct xfs_inode	*free_ip,
+	struct xfs_trans	*tp,
 	struct xfs_icluster	*xic)
 {
-	xfs_mount_t		*mp = free_ip->i_mount;
+	struct xfs_mount	*mp = free_ip->i_mount;
+	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
+	struct xfs_buf		*bp;
+	xfs_daddr_t		blkno;
+	xfs_ino_t		inum = xic->first_ino;
 	int			nbufs;
 	int			i, j;
 	int			ioffset;
-	xfs_daddr_t		blkno;
-	xfs_buf_t		*bp;
-	xfs_inode_t		*ip;
-	struct xfs_inode_log_item *iip;
-	struct xfs_log_item	*lip;
-	struct xfs_perag	*pag;
-	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
-	xfs_ino_t		inum;
 	int			error;
 
-	inum = xic->first_ino;
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
 	nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
 
 	for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
@@ -2668,59 +2676,16 @@  xfs_ifree_cluster(
 		bp->b_ops = &xfs_inode_buf_ops;
 
 		/*
-		 * Walk the inodes already attached to the buffer and mark them
-		 * stale. These will all have the flush locks held, so an
-		 * in-memory inode walk can't lock them. By marking them all
-		 * stale first, we will not attempt to lock them in the loop
-		 * below as the XFS_ISTALE flag will be set.
-		 */
-		list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
-			if (lip->li_type == XFS_LI_INODE) {
-				iip = (struct xfs_inode_log_item *)lip;
-				xfs_trans_ail_copy_lsn(mp->m_ail,
-							&iip->ili_flush_lsn,
-							&iip->ili_item.li_lsn);
-				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-			}
-		}
-
-
-		/*
-		 * For each inode in memory attempt to add it to the inode
-		 * buffer and set it up for being staled on buffer IO
-		 * completion.  This is safe as we've locked out tail pushing
-		 * and flushing by locking the buffer.
-		 *
-		 * We have already marked every inode that was part of a
-		 * transaction stale above, which means there is no point in
-		 * even trying to lock them.
+		 * Now we need to set all the cached clean inodes as XFS_ISTALE,
+		 * too. This requires lookups, and will skip inodes that we've
+		 * already marked XFS_ISTALE.
 		 */
-		for (i = 0; i < igeo->inodes_per_cluster; i++) {
-			ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
-			if (!ip)
-				continue;
-
-			iip = ip->i_itemp;
-			spin_lock(&iip->ili_lock);
-			iip->ili_last_fields = iip->ili_fields;
-			iip->ili_fields = 0;
-			iip->ili_fsync_fields = 0;
-			spin_unlock(&iip->ili_lock);
-			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
-						&iip->ili_item.li_lsn);
-
-			list_add_tail(&iip->ili_item.li_bio_list,
-						&bp->b_li_list);
-
-			if (ip != free_ip)
-				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
+		for (i = 0; i < igeo->inodes_per_cluster; i++)
+			xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
 
 		xfs_trans_stale_inode_buf(tp, bp);
 		xfs_trans_binval(tp, bp);
 	}
-
-	xfs_perag_put(pag);
 	return 0;
 }
 
@@ -3845,6 +3810,7 @@  xfs_iflush_int(
 	iip->ili_fields = 0;
 	iip->ili_fsync_fields = 0;
 	spin_unlock(&iip->ili_lock);
+	ASSERT(iip->ili_last_fields);
 
 	/*
 	 * Store the current LSN of the inode so that we can tell whether the