[22/24] xfs: rework xfs_iflush_cluster() dirty inode iteration

Message ID 20200522035029.3022405-23-david@fromorbit.com
State Superseded
Series xfs: rework inode flushing to make inode reclaim fully asynchronous

Commit Message

Dave Chinner May 22, 2020, 3:50 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Now that we have all the dirty inodes attached to the cluster
buffer, we don't actually have to do radix tree lookups to find
them. Sure, the radix tree is efficient, but walking a linked list
of just the dirty inodes attached to the buffer is much better.

We are also no longer dependent on having a locked inode passed into
the function to determine where to start the lookup. This means we
can drop it from the function call and treat all inodes the same.

We also make xfs_iflush_cluster skip inodes marked with
XFS_IRECLAIM. This way we avoid races with inodes that reclaim is
actively referencing or that are being re-initialised by inode
lookup. If they are actually dirty, they'll get written by a future
cluster flush....

We also add a shutdown check after obtaining the flush lock so that
we catch inodes that are dirty in memory and may have inconsistent
state due to the shutdown in progress. We abort these inodes
directly, so they remove themselves from the buffer list and the
AIL immediately rather than waiting for the buffer to be failed and
the error callbacks to be run.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_inode.c      | 150 ++++++++++++++++------------------------
 fs/xfs/xfs_inode.h      |   2 +-
 fs/xfs/xfs_inode_item.c |  15 +++-
 3 files changed, 74 insertions(+), 93 deletions(-)
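
In rough terms, the reworked xfs_iflush_cluster() reduces to the
following sketch (condensed from the diff below, using the names from
the patch; the ilock/iflock dance, pin and shutdown checks, stats and
error handling are all elided):

	int
	xfs_iflush_cluster(
		struct xfs_buf		*bp)
	{
		struct xfs_log_item	*lip, *n;

		/*
		 * Walk only the dirty inodes attached to the cluster
		 * buffer. The _safe variant is required because
		 * xfs_iflush_abort() can remove entries from the list
		 * on shutdown.
		 */
		list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
			struct xfs_inode *ip =
				((struct xfs_inode_log_item *)lip)->ili_inode;

			/* skip inodes owned by reclaim or already flushing */
			if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
				continue;

			/* lock, flush-lock and recheck, then write it back */
			xfs_iflush(ip, bp);
		}
		return 0;
	}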

Comments

Darrick J. Wong May 23, 2020, 12:13 a.m. UTC | #1
On Fri, May 22, 2020 at 01:50:27PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Now that we have all the dirty inodes attached to the cluster
> buffer, we don't actually have to do radix tree lookups to find
> them. Sure, the radix tree is efficient, but walking a linked list
> of just the dirty inodes attached to the buffer is much better.
> 
> We are also no longer dependent on having a locked inode passed into
> the function to determine where to start the lookup. This means we
> can drop it from the function call and treat all inodes the same.
> 
> We also make xfs_iflush_cluster skip inodes marked with
> XFS_IRECLAIM. This way we avoid races with inodes that reclaim is
> actively referencing or that are being re-initialised by inode
> lookup. If they are actually dirty, they'll get written by a future
> cluster flush....
> 
> We also add a shutdown check after obtaining the flush lock so that
> we catch inodes that are dirty in memory and may have inconsistent
> state due to the shutdown in progress. We abort these inodes
> directly, so they remove themselves from the buffer list and the
> AIL immediately rather than waiting for the buffer to be failed and
> the error callbacks to be run.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_inode.c      | 150 ++++++++++++++++------------------------
>  fs/xfs/xfs_inode.h      |   2 +-
>  fs/xfs/xfs_inode_item.c |  15 +++-
>  3 files changed, 74 insertions(+), 93 deletions(-)
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index cbf8edf62d102..7db0f97e537e3 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -3428,7 +3428,7 @@ xfs_rename(
>  	return error;
>  }
>  
> -static int
> +int
>  xfs_iflush(

Not sure why this drops the static?

>  	struct xfs_inode	*ip,
>  	struct xfs_buf		*bp)
> @@ -3590,117 +3590,94 @@ xfs_iflush(
>   */
>  int
>  xfs_iflush_cluster(
> -	struct xfs_inode	*ip,
>  	struct xfs_buf		*bp)
>  {
> -	struct xfs_mount	*mp = ip->i_mount;
> -	struct xfs_perag	*pag;
> -	unsigned long		first_index, mask;
> -	int			cilist_size;
> -	struct xfs_inode	**cilist;
> -	struct xfs_inode	*cip;
> -	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> -	int			error = 0;
> -	int			nr_found;
> +	struct xfs_mount	*mp = bp->b_mount;
> +	struct xfs_log_item	*lip, *n;
> +	struct xfs_inode	*ip;
> +	struct xfs_inode_log_item *iip;
>  	int			clcount = 0;
> -	int			i;
> -
> -	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
> -
> -	cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
> -	cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
> -	if (!cilist)
> -		goto out_put;
> -
> -	mask = ~(igeo->inodes_per_cluster - 1);
> -	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
> -	rcu_read_lock();
> -	/* really need a gang lookup range call here */
> -	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
> -					first_index, igeo->inodes_per_cluster);
> -	if (nr_found == 0)
> -		goto out_free;
> +	int			error = 0;
>  
> -	for (i = 0; i < nr_found; i++) {
> -		cip = cilist[i];
> +	/*
> +	 * We must use the safe variant here as on shutdown xfs_iflush_abort()
> +	 * can remove itself from the list.
> +	 */
> +	list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
> +		iip = (struct xfs_inode_log_item *)lip;
> +		ip = iip->ili_inode;
>  
>  		/*
> -		 * because this is an RCU protected lookup, we could find a
> -		 * recently freed or even reallocated inode during the lookup.
> -		 * We need to check under the i_flags_lock for a valid inode
> -		 * here. Skip it if it is not valid or the wrong inode.
> +		 * Quick and dirty check to avoid locks if possible.
>  		 */
> -		spin_lock(&cip->i_flags_lock);
> -		if (!cip->i_ino ||
> -		    __xfs_iflags_test(cip, XFS_ISTALE)) {
> -			spin_unlock(&cip->i_flags_lock);
> +		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
> +			continue;
> +		if (xfs_ipincount(ip))
>  			continue;
> -		}
>  
>  		/*
> -		 * Once we fall off the end of the cluster, no point checking
> -		 * any more inodes in the list because they will also all be
> -		 * outside the cluster.
> +		 * The inode is still attached to the buffer, which means it is
> +		 * dirty but reclaim might try to grab it. Check carefully for
> +		 * that, and grab the ilock while still holding the i_flags_lock
> +		 * to guarantee reclaim will not be able to reclaim this inode
> +		 * once we drop the i_flags_lock.
>  		 */
> -		if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
> -			spin_unlock(&cip->i_flags_lock);
> -			break;
> +		spin_lock(&ip->i_flags_lock);
> +		ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
> +		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
> +			spin_unlock(&ip->i_flags_lock);
> +			continue;
>  		}
> -		spin_unlock(&cip->i_flags_lock);
>  
>  		/*
> -		 * Do an un-protected check to see if the inode is dirty and
> -		 * is a candidate for flushing.  These checks will be repeated
> -		 * later after the appropriate locks are acquired.
> +		 * ILOCK will pin the inode against reclaim and prevent
> +		 * concurrent transactions modifying the inode while we are
> +		 * flushing the inode.
>  		 */
> -		if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
> +		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
> +			spin_unlock(&ip->i_flags_lock);
>  			continue;
> +		}
> +		spin_unlock(&ip->i_flags_lock);
>  
>  		/*
> -		 * Try to get locks.  If any are unavailable or it is pinned,
> -		 * then this inode cannot be flushed and is skipped.
> +		 * Skip inodes that are already flush locked as they have
> +		 * already been written to the buffer.
>  		 */
> -
> -		if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
> -			continue;
> -		if (!xfs_iflock_nowait(cip)) {
> -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> +		if (!xfs_iflock_nowait(ip)) {
> +			xfs_iunlock(ip, XFS_ILOCK_SHARED);
>  			continue;
>  		}
> -		if (xfs_ipincount(cip)) {
> -			xfs_ifunlock(cip);
> -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> -			continue;
> -		}
> -
>  
>  		/*
> -		 * Check the inode number again, just to be certain we are not
> -		 * racing with freeing in xfs_reclaim_inode(). See the comments
> -		 * in that function for more information as to why the initial
> -		 * check is not sufficient.
> +		 * If we are shut down, unpin and abort the inode now as there
> +		 * is no point in flushing it to the buffer just to get an IO
> +		 * completion to abort the buffer and remove it from the AIL.
>  		 */
> -		if (!cip->i_ino) {
> -			xfs_ifunlock(cip);
> -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> +		if (XFS_FORCED_SHUTDOWN(mp)) {
> +			xfs_iunpin_wait(ip);
> +			/* xfs_iflush_abort() drops the flush lock */
> +			xfs_iflush_abort(ip);
> +			xfs_iunlock(ip, XFS_ILOCK_SHARED);
> +			error = EIO;

error = -EIO?

>  			continue;
>  		}
>  
> -		/*
> -		 * arriving here means that this inode can be flushed.  First
> -		 * re-check that it's dirty before flushing.
> -		 */
> -		if (!xfs_inode_clean(cip)) {
> -			error = xfs_iflush(cip, bp);
> -			if (error) {
> -				xfs_iunlock(cip, XFS_ILOCK_SHARED);
> -				goto out_free;
> -			}
> -			clcount++;
> -		} else {
> -			xfs_ifunlock(cip);
> +		/* don't block waiting on a log force to unpin dirty inodes */
> +		if (xfs_ipincount(ip)) {
> +			xfs_ifunlock(ip);
> +			xfs_iunlock(ip, XFS_ILOCK_SHARED);
> +			continue;
>  		}
> -		xfs_iunlock(cip, XFS_ILOCK_SHARED);
> +
> +		if (!xfs_inode_clean(ip))
> +			error = xfs_iflush(ip, bp);
> +		else
> +			xfs_ifunlock(ip);
> +		xfs_iunlock(ip, XFS_ILOCK_SHARED);
> +		if (error)
> +			break;
> +		clcount++;
>  	}
>  
>  	if (clcount) {
> @@ -3708,11 +3685,6 @@ xfs_iflush_cluster(
>  		XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
>  	}
>  
> -out_free:
> -	rcu_read_unlock();
> -	kmem_free(cilist);
> -out_put:
> -	xfs_perag_put(pag);
>  	if (error) {
>  		bp->b_flags |= XBF_ASYNC;
>  		xfs_buf_ioend_fail(bp);
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index d1109eb13ba2e..b93cf9076df8a 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -427,7 +427,7 @@ int		xfs_log_force_inode(struct xfs_inode *ip);
>  void		xfs_iunpin_wait(xfs_inode_t *);
>  #define xfs_ipincount(ip)	((unsigned int) atomic_read(&ip->i_pincount))
>  
> -int		xfs_iflush_cluster(struct xfs_inode *, struct xfs_buf *);
> +int		xfs_iflush_cluster(struct xfs_buf *);
>  void		xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
>  				struct xfs_inode *ip1, uint ip1_mode);
>  
> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> index 42b4b5fe1e2a9..af4764f97a339 100644
> --- a/fs/xfs/xfs_inode_item.c
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -523,11 +523,20 @@ xfs_inode_item_push(
>  	}
>  	spin_unlock(&lip->li_ailp->ail_lock);
>  
> -	error = xfs_iflush_cluster(ip, bp);
> +	/*
> +	 * We need to hold a reference for flushing the cluster buffer as it may
> +	 * fail the buffer without IO submission. In which case, we better have
> +	 * a reference for that completion as otherwise we don't get a reference
> +	 * for IO until we queue it for delwri submission.

<confused>

What completion are we talking about?  Does this refer to the fact that
xfs_iflush_cluster handles a flush failure by simulating an async write
failure which could result in us giving away the inode log item's
reference to the buffer?

--D

> +	 */
> +	xfs_buf_hold(bp);
> +	error = xfs_iflush_cluster(bp);
>  	if (!error) {
> -		if (!xfs_buf_delwri_queue(bp, buffer_list))
> +		if (!xfs_buf_delwri_queue(bp, buffer_list)) {
> +			ASSERT(0);
>  			rval = XFS_ITEM_FLUSHING;
> -		xfs_buf_unlock(bp);
> +		}
> +		xfs_buf_relse(bp);
>  	} else {
>  		rval = XFS_ITEM_LOCKED;
>  	}
> -- 
> 2.26.2.761.g0e0b3e54be
>
Christoph Hellwig May 23, 2020, 11:31 a.m. UTC | #2
On Fri, May 22, 2020 at 01:50:27PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Now that we have all the dirty inodes attached to the cluster
> buffer, we don't actually have to do radix tree lookups to find
> them. Sure, the radix tree is efficient, but walking a linked list
> of just the dirty inodes attached to the buffer is much better.
> 
> We are also no longer dependent on having a locked inode passed into
> the function to determine where to start the lookup. This means we
> can drop it from the function call and treat all inodes the same.
> 
> We also make xfs_iflush_cluster skip inodes marked with
> XFS_IRECLAIM. This way we avoid races with inodes that reclaim is
> actively referencing or that are being re-initialised by inode
> lookup. If they are actually dirty, they'll get written by a future
> cluster flush....
> 
> We also add a shutdown check after obtaining the flush lock so that
> we catch inodes that are dirty in memory and may have inconsistent
> state due to the shutdown in progress. We abort these inodes
> directly, so they remove themselves from the buffer list and the
> AIL immediately rather than waiting for the buffer to be failed and
> the error callbacks to be run.

I suspect we should just kill off xfs_iflush_cluster with this, as it
is best split between xfs_iflush and xfs_inode_item_push.  POC patch
below, but as part of that I noticed some really odd error handling,
which I'll bring up separately.

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 6f873eca8c916..8784e70626403 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1102,12 +1102,12 @@ xfs_reclaim_inode(
 	/*
 	 * Because we use RCU freeing we need to ensure the inode always appears
 	 * to be reclaimed with an invalid inode number when in the free state.
-	 * We do this as early as possible under the ILOCK so that
-	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
-	 * detect races with us here. By doing this, we guarantee that once
-	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
-	 * it will see either a valid inode that will serialise correctly, or it
-	 * will see an invalid inode that it can skip.
+	 * We do this as early as possible under the ILOCK so that xfs_iflush()
+	 * and xfs_ifree_cluster() can be guaranteed to detect races with us
+	 * here. By doing this, we guarantee that once xfs_iflush() or
+	 * xfs_ifree_cluster() has locked XFS_ILOCK that it will see either a
+	 * valid inode that will serialise correctly, or it will see an invalid
+	 * inode that it can skip.
 	 */
 	spin_lock(&ip->i_flags_lock);
 	ip->i_flags = XFS_IRECLAIM;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5bbdb0c5ed172..8b0197a386a0f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3431,19 +3431,70 @@ xfs_rename(
 
 int
 xfs_iflush(
-	struct xfs_inode	*ip,
+	struct xfs_inode_log_item *iip,
 	struct xfs_buf		*bp)
 {
-	struct xfs_inode_log_item *iip = ip->i_itemp;
-	struct xfs_dinode	*dip;
+	struct xfs_inode	*ip = iip->ili_inode;
 	struct xfs_mount	*mp = ip->i_mount;
-	int			error;
+	struct xfs_dinode	*dip;
+	int			error = 0;
+
+	/*
+	 * Quick and dirty check to avoid locks if possible.
+	 */
+	if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
+		return 0;
+	if (xfs_ipincount(ip))
+		return 0;
+
+	/*
+	 * The inode is still attached to the buffer, which means it is dirty
+	 * but reclaim might try to grab it.  Check carefully for that, and grab
+	 * the ilock while still holding the i_flags_lock to guarantee reclaim
+	 * will not be able to reclaim this inode once we drop the i_flags_lock.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
+	if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
+		goto out_unlock_flags_lock;
+
+	/*
+	 * ILOCK will pin the inode against reclaim and prevent concurrent
+	 * transactions modifying the inode while we are flushing the inode.
+	 */
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+		goto out_unlock_flags_lock;
+	spin_unlock(&ip->i_flags_lock);
+
+	/*
+	 * Skip inodes that are already flush locked as they have already been
+	 * written to the buffer.
+	 */
+	if (!xfs_iflock_nowait(ip))
+		goto out_unlock_ilock;
+
+	/*
+	 * If we are shut down, unpin and abort the inode now as there is no
+	 * point in flushing it to the buffer just to get an IO completion to
+	 * abort the buffer and remove it from the AIL.
+	 */
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_iunpin_wait(ip);
+		/* xfs_iflush_abort() drops the flush lock */
+		xfs_iflush_abort(ip);
+		error = -EIO;
+		goto out_unlock_ilock;
+	}
+
+	/* don't block waiting on a log force to unpin dirty inodes */
+	if (xfs_ipincount(ip) || xfs_inode_clean(ip)) {
+		xfs_ifunlock(ip);
+		goto out_unlock_ilock;
+	}
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-	ASSERT(xfs_isiflocked(ip));
 	ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
 	       ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
-	ASSERT(iip != NULL && iip->ili_fields != 0);
+	ASSERT(iip->ili_fields != 0);
 	ASSERT(iip->ili_item.li_buf == bp);
 
 	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -3574,122 +3625,13 @@ xfs_iflush(
 
 	/* generate the checksum. */
 	xfs_dinode_calc_crc(mp, dip);
-	return error;
-}
-
-/*
- * Non-blocking flush of dirty inode metadata into the backing buffer.
- *
- * The caller must have a reference to the inode and hold the cluster buffer
- * locked. The function will walk across all the inodes on the cluster buffer it
- * can find and lock without blocking, and flush them to the cluster buffer.
- *
- * On success, the caller must write out the buffer returned in *bp and
- * release it. On failure, the filesystem will be shut down, the buffer will
- * have been unlocked and released, and EFSCORRUPTED will be returned.
- */
-int
-xfs_iflush_cluster(
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = bp->b_mount;
-	struct xfs_log_item	*lip, *n;
-	struct xfs_inode	*ip;
-	struct xfs_inode_log_item *iip;
-	int			clcount = 0;
-	int			error = 0;
 
-	/*
-	 * We must use the safe variant here as on shutdown xfs_iflush_abort()
-	 * can remove itself from the list.
-	 */
-	list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
-		iip = (struct xfs_inode_log_item *)lip;
-		ip = iip->ili_inode;
-
-		/*
-		 * Quick and dirty check to avoid locks if possible.
-		 */
-		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
-			continue;
-		if (xfs_ipincount(ip))
-			continue;
-
-		/*
-		 * The inode is still attached to the buffer, which means it is
-		 * dirty but reclaim might try to grab it. Check carefully for
-		 * that, and grab the ilock while still holding the i_flags_lock
-		 * to guarantee reclaim will not be able to reclaim this inode
-		 * once we drop the i_flags_lock.
-		 */
-		spin_lock(&ip->i_flags_lock);
-		ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
-		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
-			spin_unlock(&ip->i_flags_lock);
-			continue;
-		}
-
-		/*
-		 * ILOCK will pin the inode against reclaim and prevent
-		 * concurrent transactions modifying the inode while we are
-		 * flushing the inode.
-		 */
-		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-			spin_unlock(&ip->i_flags_lock);
-			continue;
-		}
-		spin_unlock(&ip->i_flags_lock);
-
-		/*
-		 * Skip inodes that are already flush locked as they have
-		 * already been written to the buffer.
-		 */
-		if (!xfs_iflock_nowait(ip)) {
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-			continue;
-		}
-
-		/*
-		 * If we are shut down, unpin and abort the inode now as there
-		 * is no point in flushing it to the buffer just to get an IO
-		 * completion to abort the buffer and remove it from the AIL.
-		 */
-		if (XFS_FORCED_SHUTDOWN(mp)) {
-			xfs_iunpin_wait(ip);
-			/* xfs_iflush_abort() drops the flush lock */
-			xfs_iflush_abort(ip);
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-			error = EIO;
-			continue;
-		}
-
-		/* don't block waiting on a log force to unpin dirty inodes */
-		if (xfs_ipincount(ip)) {
-			xfs_ifunlock(ip);
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-			continue;
-		}
-
-		if (!xfs_inode_clean(ip))
-			error = xfs_iflush(ip, bp);
-		else
-			xfs_ifunlock(ip);
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		if (error)
-			break;
-		clcount++;
-	}
-
-	if (clcount) {
-		XFS_STATS_INC(mp, xs_icluster_flushcnt);
-		XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
-	}
+out_unlock_ilock:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return error;
 
-	if (error) {
-		bp->b_flags |= XBF_ASYNC;
-		xfs_buf_ioend_fail(bp);
-		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-	}
+out_unlock_flags_lock:
+	spin_unlock(&ip->i_flags_lock);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index b93cf9076df8a..870e6768e119e 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -427,7 +427,7 @@ int		xfs_log_force_inode(struct xfs_inode *ip);
 void		xfs_iunpin_wait(xfs_inode_t *);
 #define xfs_ipincount(ip)	((unsigned int) atomic_read(&ip->i_pincount))
 
-int		xfs_iflush_cluster(struct xfs_buf *);
+int		xfs_iflush(struct xfs_inode_log_item *iip, struct xfs_buf *bp);
 void		xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
 				struct xfs_inode *ip1, uint ip1_mode);
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 4dd4f45dcc46e..b82d7f3628745 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -501,8 +501,10 @@ xfs_inode_item_push(
 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
 	struct xfs_inode	*ip = iip->ili_inode;
 	struct xfs_buf		*bp = lip->li_buf;
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_log_item	*l, *n;
 	uint			rval = XFS_ITEM_SUCCESS;
-	int			error;
+	int			clcount = 0;
 
 	ASSERT(iip->ili_item.li_buf);
 
@@ -530,17 +532,34 @@ xfs_inode_item_push(
 	 * for IO until we queue it for delwri submission.
 	 */
 	xfs_buf_hold(bp);
-	error = xfs_iflush_cluster(bp);
-	if (!error) {
-		if (!xfs_buf_delwri_queue(bp, buffer_list)) {
-			ASSERT(0);
-			rval = XFS_ITEM_FLUSHING;
+
+	/*
+	 * We must use the safe variant here as on shutdown xfs_iflush_abort()
+	 * can remove itself from the list.
+	 */
+	list_for_each_entry_safe(l, n, &bp->b_li_list, li_bio_list) {
+		if (xfs_iflush(INODE_ITEM(l), bp)) {
+			bp->b_flags |= XBF_ASYNC;
+			xfs_buf_ioend_fail(bp);
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+			rval = XFS_ITEM_LOCKED;
+			goto out_unlock;
 		}
-		xfs_buf_relse(bp);
-	} else {
-		rval = XFS_ITEM_LOCKED;
+		clcount++;
+	}
+
+	if (clcount) {
+		XFS_STATS_INC(mp, xs_icluster_flushcnt);
+		XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
+	}
+
+	if (!xfs_buf_delwri_queue(bp, buffer_list)) {
+		ASSERT(0);
+		rval = XFS_ITEM_FLUSHING;
 	}
+	xfs_buf_relse(bp);
 
+out_unlock:
 	spin_lock(&lip->li_ailp->ail_lock);
 	return rval;
 }
Christoph Hellwig May 23, 2020, 11:39 a.m. UTC | #3
On Fri, May 22, 2020 at 01:50:27PM +1000, Dave Chinner wrote:
> +		if (XFS_FORCED_SHUTDOWN(mp)) {
> +			xfs_iunpin_wait(ip);
> +			/* xfs_iflush_abort() drops the flush lock */
> +			xfs_iflush_abort(ip);
> +			xfs_iunlock(ip, XFS_ILOCK_SHARED);
> +			error = EIO;
>  			continue;
>  		}

This looks suspicious.  For one, it assigns a positive errno value to
error. It also continues into the next iteration, which will then
usually overwrite error again, unless this was the last inode to be
written.
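
Concretely, a stripped-down view of the loop (names as in the patch)
shows the overwrite:

	list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
		...
		if (XFS_FORCED_SHUTDOWN(mp)) {
			...
			error = EIO;	/* positive errno value */
			continue;	/* on to the next inode... */
		}
		...
		if (!xfs_inode_clean(ip))
			error = xfs_iflush(ip, bp);	/* ...which resets error */
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		if (error)
			break;
		...
	}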
Dave Chinner May 23, 2020, 11:14 p.m. UTC | #4
On Fri, May 22, 2020 at 05:13:34PM -0700, Darrick J. Wong wrote:
> On Fri, May 22, 2020 at 01:50:27PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Now that we have all the dirty inodes attached to the cluster
> > buffer, we don't actually have to do radix tree lookups to find
> > them. Sure, the radix tree is efficient, but walking a linked list
> > of just the dirty inodes attached to the buffer is much better.
> > 
> > We are also no longer dependent on having a locked inode passed into
> > the function to determine where to start the lookup. This means we
> > can drop it from the function call and treat all inodes the same.
> > 
> > We also make xfs_iflush_cluster skip inodes marked with
> > XFS_IRECLAIM. This way we avoid races with inodes that reclaim is
> > actively referencing or that are being re-initialised by inode
> > lookup. If they are actually dirty, they'll get written by a future
> > cluster flush....
> > 
> > We also add a shutdown check after obtaining the flush lock so that
> > we catch inodes that are dirty in memory and may have inconsistent
> > state due to the shutdown in progress. We abort these inodes
> > directly, so they remove themselves from the buffer list and the
> > AIL immediately rather than waiting for the buffer to be failed and
> > the error callbacks to be run.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/xfs_inode.c      | 150 ++++++++++++++++------------------------
> >  fs/xfs/xfs_inode.h      |   2 +-
> >  fs/xfs/xfs_inode_item.c |  15 +++-
> >  3 files changed, 74 insertions(+), 93 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index cbf8edf62d102..7db0f97e537e3 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -3428,7 +3428,7 @@ xfs_rename(
> >  	return error;
> >  }
> >  
> > -static int
> > +int
> >  xfs_iflush(
> 
> Not sure why this drops the static?

Stray hunk from reordering the patchset. This used to be before the
removal of writeback from reclaim, so it needed to be called from
there. But that ordering made no sense as it required heaps of
temporary changes to the reclaim code, so I put it first and got
rid of all the reclaim writeback futzing. Clearly I forgot to
remove this hunk when I cleared xfs_iflush() out of the header file.

> > -		if (!cip->i_ino) {
> > -			xfs_ifunlock(cip);
> > -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> > +		if (XFS_FORCED_SHUTDOWN(mp)) {
> > +			xfs_iunpin_wait(ip);
> > +			/* xfs_iflush_abort() drops the flush lock */
> > +			xfs_iflush_abort(ip);
> > +			xfs_iunlock(ip, XFS_ILOCK_SHARED);
> > +			error = EIO;
> 
> error = -EIO?

Yup, good catch.

> > -	error = xfs_iflush_cluster(ip, bp);
> > +	/*
> > +	 * We need to hold a reference for flushing the cluster buffer as it may
> > +	 * fail the buffer without IO submission. In which case, we better have
> > +	 * a reference for that completion as otherwise we don't get a reference
> > +	 * for IO until we queue it for delwri submission.
> 
> <confused>
> 
> What completion are we talking about?  Does this refer to the fact that
> xfs_iflush_cluster handles a flush failure by simulating an async write
> failure which could result in us giving away the inode log item's
> reference to the buffer?

Yes. This is the way the code currently works before this patchset.
The IO reference used by the AIL writeback comes from
xfs_imap_to_bp(), and getting rid of that from the writeback path
means we no longer have an IO reference to the buffer before calling
xfs_iflush_cluster(). And, yes, the xfs_buf_ioend_fail() call in
xfs_iflush_cluster() implicitly relies on this reference existing.
Hence we have to take one ourselves and we cannot rely on the
IO reference we get from adding the buffer to the delwri list.
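
In code terms, the xfs_inode_item_push() hunk above does roughly this,
with the reference accounting spelled out in comments (a sketch, not
the exact patch):

	xfs_buf_hold(bp);		/* our reference for the failure case */
	error = xfs_iflush_cluster(bp);
	if (!error) {
		if (!xfs_buf_delwri_queue(bp, buffer_list)) {
			ASSERT(0);
			rval = XFS_ITEM_FLUSHING;
		}
		/* delwri queueing took its own IO reference above */
		xfs_buf_relse(bp);	/* drops our hold and the buffer lock */
	} else {
		/*
		 * xfs_iflush_cluster() has already failed the buffer via
		 * xfs_buf_ioend_fail(), which consumed the reference taken
		 * by xfs_buf_hold() - without it, the failure path would
		 * consume a reference we never owned.
		 */
		rval = XFS_ITEM_LOCKED;
	}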

This was a bug I found a couple of hours before I posted the
patchset, and the bisect pointed at this patch as the cause. It may
be that this change should live in the patch that gets rid of the
xfs_iflush() call and be propagated through from there. However, after 3 days of
continuous bisecting to find bugs as a result of implicit,
undocumented stuff like this over and over again, the novelty was
starting to wear a bit thin.

I'll revisit this when I've regained some patience....

Cheers,

Dave.
Dave Chinner May 23, 2020, 11:23 p.m. UTC | #5
On Sat, May 23, 2020 at 04:31:31AM -0700, Christoph Hellwig wrote:
> On Fri, May 22, 2020 at 01:50:27PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Now that we have all the dirty inodes attached to the cluster
> > buffer, we don't actually have to do radix tree lookups to find
> > them. Sure, the radix tree is efficient, but walking a linked list
> > of just the dirty inodes attached to the buffer is much better.
> > 
> > We are also no longer dependent on having a locked inode passed into
> > the function to determine where to start the lookup. This means we
> > can drop it from the function call and treat all inodes the same.
> > 
> > We also make xfs_iflush_cluster skip inodes marked with
> > XFS_IRECLAIM. This way we avoid races with inodes that reclaim is
> > actively referencing or that are being re-initialised by inode
> > lookup. If they are actually dirty, they'll get written by a future
> > cluster flush....
> > 
> > We also add a shutdown check after obtaining the flush lock so that
> > we catch inodes that are dirty in memory and may have inconsistent
> > state due to the shutdown in progress. We abort these inodes
> > directly, so they remove themselves from the buffer list and the
> > AIL immediately rather than waiting for the buffer to be failed and
> > the error callbacks to be run.
> 
> I suspect we should just kill off xfs_iflush_cluster with this, as it
> is best split between xfs_iflush and xfs_inode_item_push.  POC patch
> below, but as part of that I noticed some really odd error handling,
> which I'll bring up separately.

That's the exact opposite way the follow-on patchset I have goes.
xfs_inode_item_push() goes away entirely because tracking 30+ inodes
in the AIL for a single writeback IO event is amazingly inefficient
and consumes huge amounts of CPU unnecessarily.

IOWs, the original point of pinning the inode cluster buffer
directly at inode dirtying was so we could track dirty inodes in the
AIL via the cluster buffer instead of individually as inodes.  The
AIL drops by a factor of 30 in size, and instead of seeing 10,000
inode item push "success" reports and 300,000 "flushing" reports a
second, we only see the 10,000 success reports.

IOWs, the xfsaild is CPU bound in highly concurrent inode dirtying
workloads because of the massive numbers of inodes we cycle through
it. Tracking them by cluster buffer reduces the CPU overhead of the
xfsaild substantially. The whole "hang on, this solves the OOM kill
problem with async reclaim" was a happy accident - I didn't start
this patch set with the intent of fixing inode reclaim, it started
because inode tracking in the AIL is highly inefficient...

Cheers,

Dave.
Christoph Hellwig May 24, 2020, 5:32 a.m. UTC | #6
On Sun, May 24, 2020 at 09:23:19AM +1000, Dave Chinner wrote:
> That's the exact opposite way the follow-on patchset I have goes.
> xfs_inode_item_push() goes away entirely because tracking 30+ inodes
> in the AIL for a single writeback IO event is amazingly inefficient
> and consumes huge amounts of CPU unnecessarily.

Ok.  It might still make sense to move the checks and locking into
xfs_iflush, as the code flow is much easier to follow that way.

Patch

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index cbf8edf62d102..7db0f97e537e3 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3428,7 +3428,7 @@  xfs_rename(
 	return error;
 }
 
-static int
+int
 xfs_iflush(
 	struct xfs_inode	*ip,
 	struct xfs_buf		*bp)
@@ -3590,117 +3590,94 @@  xfs_iflush(
  */
 int
 xfs_iflush_cluster(
-	struct xfs_inode	*ip,
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_perag	*pag;
-	unsigned long		first_index, mask;
-	int			cilist_size;
-	struct xfs_inode	**cilist;
-	struct xfs_inode	*cip;
-	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
-	int			error = 0;
-	int			nr_found;
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_log_item	*lip, *n;
+	struct xfs_inode	*ip;
+	struct xfs_inode_log_item *iip;
 	int			clcount = 0;
-	int			i;
-
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-
-	cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
-	cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
-	if (!cilist)
-		goto out_put;
-
-	mask = ~(igeo->inodes_per_cluster - 1);
-	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-	rcu_read_lock();
-	/* really need a gang lookup range call here */
-	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
-					first_index, igeo->inodes_per_cluster);
-	if (nr_found == 0)
-		goto out_free;
+	int			error = 0;
 
-	for (i = 0; i < nr_found; i++) {
-		cip = cilist[i];
+	/*
+	 * We must use the safe variant here as on shutdown xfs_iflush_abort()
+	 * can remove itself from the list.
+	 */
+	list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+		iip = (struct xfs_inode_log_item *)lip;
+		ip = iip->ili_inode;
 
 		/*
-		 * because this is an RCU protected lookup, we could find a
-		 * recently freed or even reallocated inode during the lookup.
-		 * We need to check under the i_flags_lock for a valid inode
-		 * here. Skip it if it is not valid or the wrong inode.
+		 * Quick and dirty check to avoid locks if possible.
 		 */
-		spin_lock(&cip->i_flags_lock);
-		if (!cip->i_ino ||
-		    __xfs_iflags_test(cip, XFS_ISTALE)) {
-			spin_unlock(&cip->i_flags_lock);
+		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
+			continue;
+		if (xfs_ipincount(ip))
 			continue;
-		}
 
 		/*
-		 * Once we fall off the end of the cluster, no point checking
-		 * any more inodes in the list because they will also all be
-		 * outside the cluster.
+		 * The inode is still attached to the buffer, which means it is
+		 * dirty but reclaim might try to grab it. Check carefully for
+		 * that, and grab the ilock while still holding the i_flags_lock
+		 * to guarantee reclaim will not be able to reclaim this inode
+		 * once we drop the i_flags_lock.
 		 */
-		if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
-			spin_unlock(&cip->i_flags_lock);
-			break;
+		spin_lock(&ip->i_flags_lock);
+		ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
+		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
+			spin_unlock(&ip->i_flags_lock);
+			continue;
 		}
-		spin_unlock(&cip->i_flags_lock);
 
 		/*
-		 * Do an un-protected check to see if the inode is dirty and
-		 * is a candidate for flushing.  These checks will be repeated
-		 * later after the appropriate locks are acquired.
+		 * ILOCK will pin the inode against reclaim and prevent
+		 * concurrent transactions modifying the inode while we are
+		 * flushing the inode.
 		 */
-		if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
+		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+			spin_unlock(&ip->i_flags_lock);
 			continue;
+		}
+		spin_unlock(&ip->i_flags_lock);
 
 		/*
-		 * Try to get locks.  If any are unavailable or it is pinned,
-		 * then this inode cannot be flushed and is skipped.
+		 * Skip inodes that are already flush locked as they have
+		 * already been written to the buffer.
 		 */
-
-		if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
-			continue;
-		if (!xfs_iflock_nowait(cip)) {
-			xfs_iunlock(cip, XFS_ILOCK_SHARED);
+		if (!xfs_iflock_nowait(ip)) {
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
 			continue;
 		}
-		if (xfs_ipincount(cip)) {
-			xfs_ifunlock(cip);
-			xfs_iunlock(cip, XFS_ILOCK_SHARED);
-			continue;
-		}
-
 
 		/*
-		 * Check the inode number again, just to be certain we are not
-		 * racing with freeing in xfs_reclaim_inode(). See the comments
-		 * in that function for more information as to why the initial
-		 * check is not sufficient.
+		 * If we are shut down, unpin and abort the inode now as there
+		 * is no point in flushing it to the buffer just to get an IO
+		 * completion to abort the buffer and remove it from the AIL.
 		 */
-		if (!cip->i_ino) {
-			xfs_ifunlock(cip);
-			xfs_iunlock(cip, XFS_ILOCK_SHARED);
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			xfs_iunpin_wait(ip);
+			/* xfs_iflush_abort() drops the flush lock */
+			xfs_iflush_abort(ip);
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			error = EIO;
 			continue;
 		}
 
-		/*
-		 * arriving here means that this inode can be flushed.  First
-		 * re-check that it's dirty before flushing.
-		 */
-		if (!xfs_inode_clean(cip)) {
-			error = xfs_iflush(cip, bp);
-			if (error) {
-				xfs_iunlock(cip, XFS_ILOCK_SHARED);
-				goto out_free;
-			}
-			clcount++;
-		} else {
-			xfs_ifunlock(cip);
+		/* don't block waiting on a log force to unpin dirty inodes */
+		if (xfs_ipincount(ip)) {
+			xfs_ifunlock(ip);
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			continue;
 		}
-		xfs_iunlock(cip, XFS_ILOCK_SHARED);
+
+		if (!xfs_inode_clean(ip))
+			error = xfs_iflush(ip, bp);
+		else
+			xfs_ifunlock(ip);
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		if (error)
+			break;
+		clcount++;
 	}
 
 	if (clcount) {
@@ -3708,11 +3685,6 @@  xfs_iflush_cluster(
 		XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
 	}
 
-out_free:
-	rcu_read_unlock();
-	kmem_free(cilist);
-out_put:
-	xfs_perag_put(pag);
 	if (error) {
 		bp->b_flags |= XBF_ASYNC;
 		xfs_buf_ioend_fail(bp);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index d1109eb13ba2e..b93cf9076df8a 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -427,7 +427,7 @@  int		xfs_log_force_inode(struct xfs_inode *ip);
 void		xfs_iunpin_wait(xfs_inode_t *);
 #define xfs_ipincount(ip)	((unsigned int) atomic_read(&ip->i_pincount))
 
-int		xfs_iflush_cluster(struct xfs_inode *, struct xfs_buf *);
+int		xfs_iflush_cluster(struct xfs_buf *);
 void		xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
 				struct xfs_inode *ip1, uint ip1_mode);
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 42b4b5fe1e2a9..af4764f97a339 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -523,11 +523,20 @@  xfs_inode_item_push(
 	}
 	spin_unlock(&lip->li_ailp->ail_lock);
 
-	error = xfs_iflush_cluster(ip, bp);
+	/*
+	 * We need to hold a reference for flushing the cluster buffer as it may
+	 * fail the buffer without IO submission. In which case, we better have
+	 * a reference for that completion as otherwise we don't get a reference
+	 * for IO until we queue it for delwri submission.
+	 */
+	xfs_buf_hold(bp);
+	error = xfs_iflush_cluster(bp);
 	if (!error) {
-		if (!xfs_buf_delwri_queue(bp, buffer_list))
+		if (!xfs_buf_delwri_queue(bp, buffer_list)) {
+			ASSERT(0);
 			rval = XFS_ITEM_FLUSHING;
-		xfs_buf_unlock(bp);
+		}
+		xfs_buf_relse(bp);
 	} else {
 		rval = XFS_ITEM_LOCKED;
 	}