diff mbox series

[03/12] xfs: always attach iflush_done and simplify error handling

Message ID 20200417150859.14734-4-bfoster@redhat.com (mailing list archive)
State Deferred, archived
Headers show
Series xfs: flush related error handling cleanups | expand

Commit Message

Brian Foster April 17, 2020, 3:08 p.m. UTC
The inode flush code has several layers of error handling between
the inode and cluster flushing code. If the inode flush fails before
acquiring the backing buffer, the inode flush is aborted. If the
cluster flush fails, the current inode flush is aborted and the
cluster buffer is failed to handle the initial inode and any others
that might have been attached before the error.

Since xfs_iflush() is the only caller of xfs_iflush_cluser(), the
error handling between the two can be condensed in the top-level
function. If we update xfs_iflush_int() to attach the item
completion handler to the buffer first, any errors that occur after
the first call to xfs_iflush_int() can be handled with a buffer
I/O failure.

Lift the error handling from xfs_iflush_cluster() into xfs_iflush()
and consolidate with the existing error handling. This also replaces
the need to release the buffer because failing the buffer with
XBF_ASYNC drops the current reference.

Signed-off-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/xfs_inode.c | 94 +++++++++++++++-------------------------------
 1 file changed, 30 insertions(+), 64 deletions(-)

Comments

Allison Henderson April 18, 2020, 12:07 a.m. UTC | #1
On 4/17/20 8:08 AM, Brian Foster wrote:
> The inode flush code has several layers of error handling between
> the inode and cluster flushing code. If the inode flush fails before
> acquiring the backing buffer, the inode flush is aborted. If the
> cluster flush fails, the current inode flush is aborted and the
> cluster buffer is failed to handle the initial inode and any others
> that might have been attached before the error.
> 
> Since xfs_iflush() is the only caller of xfs_iflush_cluser(), the
> error handling between the two can be condensed in the top-level
> function. If we update xfs_iflush_int() to attach the item
> completion handler to the buffer first, any errors that occur after
> the first call to xfs_iflush_int() can be handled with a buffer
> I/O failure.
> 
> Lift the error handling from xfs_iflush_cluster() into xfs_iflush()
> and consolidate with the existing error handling. This also replaces
> the need to release the buffer because failing the buffer with
> XBF_ASYNC drops the current reference.
> 
> Signed-off-by: Brian Foster <bfoster@redhat.com>
> ---
>   fs/xfs/xfs_inode.c | 94 +++++++++++++++-------------------------------
>   1 file changed, 30 insertions(+), 64 deletions(-)
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index b539ee221ce5..4c9971ec6fa6 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -3496,6 +3496,7 @@ xfs_iflush_cluster(
>   	struct xfs_inode	**cilist;
>   	struct xfs_inode	*cip;
>   	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> +	int			error = 0;
>   	int			nr_found;
>   	int			clcount = 0;
>   	int			i;
> @@ -3588,11 +3589,10 @@ xfs_iflush_cluster(
>   		 * re-check that it's dirty before flushing.
>   		 */
>   		if (!xfs_inode_clean(cip)) {
> -			int	error;
>   			error = xfs_iflush_int(cip, bp);
>   			if (error) {
>   				xfs_iunlock(cip, XFS_ILOCK_SHARED);
> -				goto cluster_corrupt_out;
> +				goto out_free;
>   			}
>   			clcount++;
>   		} else {
> @@ -3611,32 +3611,7 @@ xfs_iflush_cluster(
>   	kmem_free(cilist);
>   out_put:
>   	xfs_perag_put(pag);
> -	return 0;
> -
> -
> -cluster_corrupt_out:
> -	/*
> -	 * Corruption detected in the clustering loop.  Invalidate the
> -	 * inode buffer and shut down the filesystem.
> -	 */
> -	rcu_read_unlock();
Hmm, I can't find where this unlock moved?  Is it not needed anymore?  I 
think I followed the rest of it though.

Reviewed-by: Allison Collins <allison.henderson@oracle.com>

Allison

> -
> -	/*
> -	 * We'll always have an inode attached to the buffer for completion
> -	 * process by the time we are called from xfs_iflush(). Hence we have
> -	 * always need to do IO completion processing to abort the inodes
> -	 * attached to the buffer.  handle them just like the shutdown case in
> -	 * xfs_buf_submit().
> -	 */
> -	ASSERT(bp->b_iodone);
> -	xfs_buf_iofail(bp, XBF_ASYNC);
> -	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
> -
> -	/* abort the corrupt inode, as it was not attached to the buffer */
> -	xfs_iflush_abort(cip, false);
> -	kmem_free(cilist);
> -	xfs_perag_put(pag);
> -	return -EFSCORRUPTED;
> +	return error;
>   }
>   
>   /*
> @@ -3692,17 +3667,16 @@ xfs_iflush(
>   	 */
>   	if (XFS_FORCED_SHUTDOWN(mp)) {
>   		error = -EIO;
> -		goto abort_out;
> +		goto abort;
>   	}
>   
>   	/*
>   	 * Get the buffer containing the on-disk inode. We are doing a try-lock
> -	 * operation here, so we may get  an EAGAIN error. In that case, we
> -	 * simply want to return with the inode still dirty.
> +	 * operation here, so we may get an EAGAIN error. In that case, return
> +	 * leaving the inode dirty.
>   	 *
>   	 * If we get any other error, we effectively have a corruption situation
> -	 * and we cannot flush the inode, so we treat it the same as failing
> -	 * xfs_iflush_int().
> +	 * and we cannot flush the inode. Abort the flush and shut down.
>   	 */
>   	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
>   			       0);
> @@ -3711,14 +3685,7 @@ xfs_iflush(
>   		return error;
>   	}
>   	if (error)
> -		goto corrupt_out;
> -
> -	/*
> -	 * First flush out the inode that xfs_iflush was called with.
> -	 */
> -	error = xfs_iflush_int(ip, bp);
> -	if (error)
> -		goto corrupt_out;
> +		goto abort;
>   
>   	/*
>   	 * If the buffer is pinned then push on the log now so we won't
> @@ -3728,28 +3695,28 @@ xfs_iflush(
>   		xfs_log_force(mp, 0);
>   
>   	/*
> -	 * inode clustering: try to gather other inodes into this write
> +	 * Flush the provided inode then attempt to gather others from the
> +	 * cluster into the write.
>   	 *
> -	 * Note: Any error during clustering will result in the filesystem
> -	 * being shut down and completion callbacks run on the cluster buffer.
> -	 * As we have already flushed and attached this inode to the buffer,
> -	 * it has already been aborted and released by xfs_iflush_cluster() and
> -	 * so we have no further error handling to do here.
> +	 * Note: Once we attempt to flush an inode, we must run buffer
> +	 * completion callbacks on any failure. If this fails, simulate an I/O
> +	 * failure on the buffer and shut down.
>   	 */
> -	error = xfs_iflush_cluster(ip, bp);
> -	if (error)
> -		return error;
> +	error = xfs_iflush_int(ip, bp);
> +	if (!error)
> +		error = xfs_iflush_cluster(ip, bp);
> +	if (error) {
> +		xfs_buf_iofail(bp, XBF_ASYNC);
> +		goto shutdown;
> +	}
>   
>   	*bpp = bp;
>   	return 0;
>   
> -corrupt_out:
> -	if (bp)
> -		xfs_buf_relse(bp);
> -	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
> -abort_out:
> -	/* abort the corrupt inode, as it was not attached to the buffer */
> +abort:
>   	xfs_iflush_abort(ip, false);
> +shutdown:
> +	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
>   	return error;
>   }
>   
> @@ -3798,6 +3765,13 @@ xfs_iflush_int(
>   	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
>   	ASSERT(iip != NULL && iip->ili_fields != 0);
>   
> +	/*
> +	 * Attach the inode item callback to the buffer. Whether the flush
> +	 * succeeds or not, buffer I/O completion processing is now required to
> +	 * remove the inode from the AIL and release the flush lock.
> +	 */
> +	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
> +
>   	/* set *dip = inode's place in the buffer */
>   	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
>   
> @@ -3913,14 +3887,6 @@ xfs_iflush_int(
>   	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
>   				&iip->ili_item.li_lsn);
>   
> -	/*
> -	 * Attach the function xfs_iflush_done to the inode's
> -	 * buffer.  This will remove the inode from the AIL
> -	 * and unlock the inode's flush lock when the inode is
> -	 * completely written to disk.
> -	 */
> -	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
> -
>   	/* generate the checksum. */
>   	xfs_dinode_calc_crc(mp, dip);
>   
>
Dave Chinner April 20, 2020, 3:08 a.m. UTC | #2
On Fri, Apr 17, 2020 at 11:08:50AM -0400, Brian Foster wrote:
> The inode flush code has several layers of error handling between
> the inode and cluster flushing code. If the inode flush fails before
> acquiring the backing buffer, the inode flush is aborted. If the
> cluster flush fails, the current inode flush is aborted and the
> cluster buffer is failed to handle the initial inode and any others
> that might have been attached before the error.
> 
> Since xfs_iflush() is the only caller of xfs_iflush_cluser(), the

xfs_iflush_cluster()

> error handling between the two can be condensed in the top-level
> function. If we update xfs_iflush_int() to attach the item
> completion handler to the buffer first, any errors that occur after
> the first call to xfs_iflush_int() can be handled with a buffer
> I/O failure.
> 
> Lift the error handling from xfs_iflush_cluster() into xfs_iflush()
> and consolidate with the existing error handling. This also replaces
> the need to release the buffer because failing the buffer with
> XBF_ASYNC drops the current reference.

Yeah, that makes sense. I've lifted the cluster flush error handling
into the callers, even though xfs_iflush() has gone away.

However...

> @@ -3798,6 +3765,13 @@ xfs_iflush_int(
>  	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
>  	ASSERT(iip != NULL && iip->ili_fields != 0);
>  
> +	/*
> +	 * Attach the inode item callback to the buffer. Whether the flush
> +	 * succeeds or not, buffer I/O completion processing is now required to
> +	 * remove the inode from the AIL and release the flush lock.
> +	 */
> +	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
> +
>  	/* set *dip = inode's place in the buffer */
>  	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);

...I'm not convinced this is a valid thing to do at this point. The
inode item has not been set up yet with the correct state that is
associated with the flushing of the inode (e.g. lsn, last_flags,
etc) and so this kinda leaves a landmine in the item IO completion
processing in that failure cannot rely on any of the inode log item
state to make condition decisions.

While it's technically not wrong, it just makes me uneasy, as in
future the flush abort code will have to be careful about using
inode state in making decisions, and there's not comments in the
abort code to indicate that the state may be invalid...

/me has chased several subtle issues through this code recently...

Cheers,

Dave.
Brian Foster April 20, 2020, 1:59 p.m. UTC | #3
On Fri, Apr 17, 2020 at 05:07:16PM -0700, Allison Collins wrote:
> On 4/17/20 8:08 AM, Brian Foster wrote:
> > The inode flush code has several layers of error handling between
> > the inode and cluster flushing code. If the inode flush fails before
> > acquiring the backing buffer, the inode flush is aborted. If the
> > cluster flush fails, the current inode flush is aborted and the
> > cluster buffer is failed to handle the initial inode and any others
> > that might have been attached before the error.
> > 
> > Since xfs_iflush() is the only caller of xfs_iflush_cluser(), the
> > error handling between the two can be condensed in the top-level
> > function. If we update xfs_iflush_int() to attach the item
> > completion handler to the buffer first, any errors that occur after
> > the first call to xfs_iflush_int() can be handled with a buffer
> > I/O failure.
> > 
> > Lift the error handling from xfs_iflush_cluster() into xfs_iflush()
> > and consolidate with the existing error handling. This also replaces
> > the need to release the buffer because failing the buffer with
> > XBF_ASYNC drops the current reference.
> > 
> > Signed-off-by: Brian Foster <bfoster@redhat.com>
> > ---
> >   fs/xfs/xfs_inode.c | 94 +++++++++++++++-------------------------------
> >   1 file changed, 30 insertions(+), 64 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index b539ee221ce5..4c9971ec6fa6 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -3496,6 +3496,7 @@ xfs_iflush_cluster(
> >   	struct xfs_inode	**cilist;
> >   	struct xfs_inode	*cip;
> >   	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> > +	int			error = 0;
> >   	int			nr_found;
> >   	int			clcount = 0;
> >   	int			i;
> > @@ -3588,11 +3589,10 @@ xfs_iflush_cluster(
> >   		 * re-check that it's dirty before flushing.
> >   		 */
> >   		if (!xfs_inode_clean(cip)) {
> > -			int	error;
> >   			error = xfs_iflush_int(cip, bp);
> >   			if (error) {
> >   				xfs_iunlock(cip, XFS_ILOCK_SHARED);
> > -				goto cluster_corrupt_out;
> > +				goto out_free;
> >   			}
> >   			clcount++;
> >   		} else {
> > @@ -3611,32 +3611,7 @@ xfs_iflush_cluster(
> >   	kmem_free(cilist);
> >   out_put:
> >   	xfs_perag_put(pag);
> > -	return 0;
> > -
> > -
> > -cluster_corrupt_out:
> > -	/*
> > -	 * Corruption detected in the clustering loop.  Invalidate the
> > -	 * inode buffer and shut down the filesystem.
> > -	 */
> > -	rcu_read_unlock();
> Hmm, I can't find where this unlock moved?  Is it not needed anymore?  I
> think I followed the rest of it though.
> 

It's not needed because the whole cluster_corrupt_out path goes away.
out_free is used instead, which also calls rcu_read_unlock() and returns
the error to the caller..

> Reviewed-by: Allison Collins <allison.henderson@oracle.com>
> 

Thanks!

Brian

> Allison
> 
> > -
> > -	/*
> > -	 * We'll always have an inode attached to the buffer for completion
> > -	 * process by the time we are called from xfs_iflush(). Hence we have
> > -	 * always need to do IO completion processing to abort the inodes
> > -	 * attached to the buffer.  handle them just like the shutdown case in
> > -	 * xfs_buf_submit().
> > -	 */
> > -	ASSERT(bp->b_iodone);
> > -	xfs_buf_iofail(bp, XBF_ASYNC);
> > -	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
> > -
> > -	/* abort the corrupt inode, as it was not attached to the buffer */
> > -	xfs_iflush_abort(cip, false);
> > -	kmem_free(cilist);
> > -	xfs_perag_put(pag);
> > -	return -EFSCORRUPTED;
> > +	return error;
> >   }
> >   /*
> > @@ -3692,17 +3667,16 @@ xfs_iflush(
> >   	 */
> >   	if (XFS_FORCED_SHUTDOWN(mp)) {
> >   		error = -EIO;
> > -		goto abort_out;
> > +		goto abort;
> >   	}
> >   	/*
> >   	 * Get the buffer containing the on-disk inode. We are doing a try-lock
> > -	 * operation here, so we may get  an EAGAIN error. In that case, we
> > -	 * simply want to return with the inode still dirty.
> > +	 * operation here, so we may get an EAGAIN error. In that case, return
> > +	 * leaving the inode dirty.
> >   	 *
> >   	 * If we get any other error, we effectively have a corruption situation
> > -	 * and we cannot flush the inode, so we treat it the same as failing
> > -	 * xfs_iflush_int().
> > +	 * and we cannot flush the inode. Abort the flush and shut down.
> >   	 */
> >   	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
> >   			       0);
> > @@ -3711,14 +3685,7 @@ xfs_iflush(
> >   		return error;
> >   	}
> >   	if (error)
> > -		goto corrupt_out;
> > -
> > -	/*
> > -	 * First flush out the inode that xfs_iflush was called with.
> > -	 */
> > -	error = xfs_iflush_int(ip, bp);
> > -	if (error)
> > -		goto corrupt_out;
> > +		goto abort;
> >   	/*
> >   	 * If the buffer is pinned then push on the log now so we won't
> > @@ -3728,28 +3695,28 @@ xfs_iflush(
> >   		xfs_log_force(mp, 0);
> >   	/*
> > -	 * inode clustering: try to gather other inodes into this write
> > +	 * Flush the provided inode then attempt to gather others from the
> > +	 * cluster into the write.
> >   	 *
> > -	 * Note: Any error during clustering will result in the filesystem
> > -	 * being shut down and completion callbacks run on the cluster buffer.
> > -	 * As we have already flushed and attached this inode to the buffer,
> > -	 * it has already been aborted and released by xfs_iflush_cluster() and
> > -	 * so we have no further error handling to do here.
> > +	 * Note: Once we attempt to flush an inode, we must run buffer
> > +	 * completion callbacks on any failure. If this fails, simulate an I/O
> > +	 * failure on the buffer and shut down.
> >   	 */
> > -	error = xfs_iflush_cluster(ip, bp);
> > -	if (error)
> > -		return error;
> > +	error = xfs_iflush_int(ip, bp);
> > +	if (!error)
> > +		error = xfs_iflush_cluster(ip, bp);
> > +	if (error) {
> > +		xfs_buf_iofail(bp, XBF_ASYNC);
> > +		goto shutdown;
> > +	}
> >   	*bpp = bp;
> >   	return 0;
> > -corrupt_out:
> > -	if (bp)
> > -		xfs_buf_relse(bp);
> > -	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
> > -abort_out:
> > -	/* abort the corrupt inode, as it was not attached to the buffer */
> > +abort:
> >   	xfs_iflush_abort(ip, false);
> > +shutdown:
> > +	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
> >   	return error;
> >   }
> > @@ -3798,6 +3765,13 @@ xfs_iflush_int(
> >   	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
> >   	ASSERT(iip != NULL && iip->ili_fields != 0);
> > +	/*
> > +	 * Attach the inode item callback to the buffer. Whether the flush
> > +	 * succeeds or not, buffer I/O completion processing is now required to
> > +	 * remove the inode from the AIL and release the flush lock.
> > +	 */
> > +	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
> > +
> >   	/* set *dip = inode's place in the buffer */
> >   	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
> > @@ -3913,14 +3887,6 @@ xfs_iflush_int(
> >   	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
> >   				&iip->ili_item.li_lsn);
> > -	/*
> > -	 * Attach the function xfs_iflush_done to the inode's
> > -	 * buffer.  This will remove the inode from the AIL
> > -	 * and unlock the inode's flush lock when the inode is
> > -	 * completely written to disk.
> > -	 */
> > -	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
> > -
> >   	/* generate the checksum. */
> >   	xfs_dinode_calc_crc(mp, dip);
> > 
>
Brian Foster April 20, 2020, 2 p.m. UTC | #4
On Mon, Apr 20, 2020 at 01:08:52PM +1000, Dave Chinner wrote:
> On Fri, Apr 17, 2020 at 11:08:50AM -0400, Brian Foster wrote:
> > The inode flush code has several layers of error handling between
> > the inode and cluster flushing code. If the inode flush fails before
> > acquiring the backing buffer, the inode flush is aborted. If the
> > cluster flush fails, the current inode flush is aborted and the
> > cluster buffer is failed to handle the initial inode and any others
> > that might have been attached before the error.
> > 
> > Since xfs_iflush() is the only caller of xfs_iflush_cluser(), the
> 
> xfs_iflush_cluster()
> 

Fixed.

> > error handling between the two can be condensed in the top-level
> > function. If we update xfs_iflush_int() to attach the item
> > completion handler to the buffer first, any errors that occur after
> > the first call to xfs_iflush_int() can be handled with a buffer
> > I/O failure.
> > 
> > Lift the error handling from xfs_iflush_cluster() into xfs_iflush()
> > and consolidate with the existing error handling. This also replaces
> > the need to release the buffer because failing the buffer with
> > XBF_ASYNC drops the current reference.
> 
> Yeah, that makes sense. I've lifted the cluster flush error handling
> into the callers, even though xfs_iflush() has gone away.
> 
> However...
> 
> > @@ -3798,6 +3765,13 @@ xfs_iflush_int(
> >  	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
> >  	ASSERT(iip != NULL && iip->ili_fields != 0);
> >  
> > +	/*
> > +	 * Attach the inode item callback to the buffer. Whether the flush
> > +	 * succeeds or not, buffer I/O completion processing is now required to
> > +	 * remove the inode from the AIL and release the flush lock.
> > +	 */
> > +	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
> > +
> >  	/* set *dip = inode's place in the buffer */
> >  	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
> 
> ...I'm not convinced this is a valid thing to do at this point. The
> inode item has not been set up yet with the correct state that is
> associated with the flushing of the inode (e.g. lsn, last_flags,
> etc) and so this kinda leaves a landmine in the item IO completion
> processing in that failure cannot rely on any of the inode log item
> state to make condition decisions.
> 

It is a bit ugly. I moved it just because it seemed like it could
facilitate enough simplification to be worthwhile. It's debatable
whether what ultimately fell out of that is worthwhile, of course. TBH,
I find the whole I/O approach to the error sequence rather rickety, even
though it's currently necessary, so I'm sympathetic to the argument of
not risking the common code path in service to the uncommon error path.

We could consider other tweaks like making flush imminent and leaving
the error checks at the end of the function (with documentation to
explain that we're shutting down, etc.), but I'd have to think about
that some more. It could be best just to leave this code as is if you
anticipate it going away relatively soon. This whole series was just
intended to be quick hit cleanups anyways...

Brian

> While it's technically not wrong, it just makes me uneasy, as in
> future the flush abort code will have to be careful about using
> inode state in making decisions, and there's not comments in the
> abort code to indicate that the state may be invalid...
> 
> /me has chased several subtle issues through this code recently...
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
diff mbox series

Patch

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b539ee221ce5..4c9971ec6fa6 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3496,6 +3496,7 @@  xfs_iflush_cluster(
 	struct xfs_inode	**cilist;
 	struct xfs_inode	*cip;
 	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
+	int			error = 0;
 	int			nr_found;
 	int			clcount = 0;
 	int			i;
@@ -3588,11 +3589,10 @@  xfs_iflush_cluster(
 		 * re-check that it's dirty before flushing.
 		 */
 		if (!xfs_inode_clean(cip)) {
-			int	error;
 			error = xfs_iflush_int(cip, bp);
 			if (error) {
 				xfs_iunlock(cip, XFS_ILOCK_SHARED);
-				goto cluster_corrupt_out;
+				goto out_free;
 			}
 			clcount++;
 		} else {
@@ -3611,32 +3611,7 @@  xfs_iflush_cluster(
 	kmem_free(cilist);
 out_put:
 	xfs_perag_put(pag);
-	return 0;
-
-
-cluster_corrupt_out:
-	/*
-	 * Corruption detected in the clustering loop.  Invalidate the
-	 * inode buffer and shut down the filesystem.
-	 */
-	rcu_read_unlock();
-
-	/*
-	 * We'll always have an inode attached to the buffer for completion
-	 * process by the time we are called from xfs_iflush(). Hence we have
-	 * always need to do IO completion processing to abort the inodes
-	 * attached to the buffer.  handle them just like the shutdown case in
-	 * xfs_buf_submit().
-	 */
-	ASSERT(bp->b_iodone);
-	xfs_buf_iofail(bp, XBF_ASYNC);
-	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-
-	/* abort the corrupt inode, as it was not attached to the buffer */
-	xfs_iflush_abort(cip, false);
-	kmem_free(cilist);
-	xfs_perag_put(pag);
-	return -EFSCORRUPTED;
+	return error;
 }
 
 /*
@@ -3692,17 +3667,16 @@  xfs_iflush(
 	 */
 	if (XFS_FORCED_SHUTDOWN(mp)) {
 		error = -EIO;
-		goto abort_out;
+		goto abort;
 	}
 
 	/*
 	 * Get the buffer containing the on-disk inode. We are doing a try-lock
-	 * operation here, so we may get  an EAGAIN error. In that case, we
-	 * simply want to return with the inode still dirty.
+	 * operation here, so we may get an EAGAIN error. In that case, return
+	 * leaving the inode dirty.
 	 *
 	 * If we get any other error, we effectively have a corruption situation
-	 * and we cannot flush the inode, so we treat it the same as failing
-	 * xfs_iflush_int().
+	 * and we cannot flush the inode. Abort the flush and shut down.
 	 */
 	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
 			       0);
@@ -3711,14 +3685,7 @@  xfs_iflush(
 		return error;
 	}
 	if (error)
-		goto corrupt_out;
-
-	/*
-	 * First flush out the inode that xfs_iflush was called with.
-	 */
-	error = xfs_iflush_int(ip, bp);
-	if (error)
-		goto corrupt_out;
+		goto abort;
 
 	/*
 	 * If the buffer is pinned then push on the log now so we won't
@@ -3728,28 +3695,28 @@  xfs_iflush(
 		xfs_log_force(mp, 0);
 
 	/*
-	 * inode clustering: try to gather other inodes into this write
+	 * Flush the provided inode then attempt to gather others from the
+	 * cluster into the write.
 	 *
-	 * Note: Any error during clustering will result in the filesystem
-	 * being shut down and completion callbacks run on the cluster buffer.
-	 * As we have already flushed and attached this inode to the buffer,
-	 * it has already been aborted and released by xfs_iflush_cluster() and
-	 * so we have no further error handling to do here.
+	 * Note: Once we attempt to flush an inode, we must run buffer
+	 * completion callbacks on any failure. If this fails, simulate an I/O
+	 * failure on the buffer and shut down.
 	 */
-	error = xfs_iflush_cluster(ip, bp);
-	if (error)
-		return error;
+	error = xfs_iflush_int(ip, bp);
+	if (!error)
+		error = xfs_iflush_cluster(ip, bp);
+	if (error) {
+		xfs_buf_iofail(bp, XBF_ASYNC);
+		goto shutdown;
+	}
 
 	*bpp = bp;
 	return 0;
 
-corrupt_out:
-	if (bp)
-		xfs_buf_relse(bp);
-	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-abort_out:
-	/* abort the corrupt inode, as it was not attached to the buffer */
+abort:
 	xfs_iflush_abort(ip, false);
+shutdown:
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 	return error;
 }
 
@@ -3798,6 +3765,13 @@  xfs_iflush_int(
 	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
 	ASSERT(iip != NULL && iip->ili_fields != 0);
 
+	/*
+	 * Attach the inode item callback to the buffer. Whether the flush
+	 * succeeds or not, buffer I/O completion processing is now required to
+	 * remove the inode from the AIL and release the flush lock.
+	 */
+	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
+
 	/* set *dip = inode's place in the buffer */
 	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
@@ -3913,14 +3887,6 @@  xfs_iflush_int(
 	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
 				&iip->ili_item.li_lsn);
 
-	/*
-	 * Attach the function xfs_iflush_done to the inode's
-	 * buffer.  This will remove the inode from the AIL
-	 * and unlock the inode's flush lock when the inode is
-	 * completely written to disk.
-	 */
-	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
-
 	/* generate the checksum. */
 	xfs_dinode_calc_crc(mp, dip);