Message ID | 20200604074606.266213-27-david@fromorbit.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | xfs: rework inode flushing to make inode reclaim fully asynchronous | expand |
On Thu, Jun 04, 2020 at 05:46:02PM +1000, Dave Chinner wrote: > From: Dave Chinner <dchinner@redhat.com> > > Now we have a cached buffer on inode log items, we don't need > to do buffer lookups when flushing inodes anymore - all we need > to do is lock the buffer and we are ready to go. > > This largely gets rid of the need for xfs_iflush(), which is > essentially just a mechanism to look up the buffer and flush the > inode to it. Instead, we can just call xfs_iflush_cluster() with a > few modifications to ensure it also flushes the inode we already > hold locked. > > This allows the AIL inode item pushing to be almost entirely > non-blocking in XFS - we won't block unless memory allocation > for the cluster inode lookup blocks or the block device queues are > full. > > Writeback during inode reclaim becomes a little more complex because > we now have to lock the buffer ourselves, but otherwise this change > is largely a functional no-op that removes a whole lot of code. > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> > --- Looks mostly reasonable.. > fs/xfs/xfs_inode.c | 106 ++++++---------------------------------- > fs/xfs/xfs_inode.h | 2 +- > fs/xfs/xfs_inode_item.c | 54 +++++++++----------- > 3 files changed, 37 insertions(+), 125 deletions(-) > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index af65acd24ec4e..61c872e4ee157 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c ... > @@ -3688,6 +3609,7 @@ xfs_iflush_int( > ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || > ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); > ASSERT(iip != NULL && iip->ili_fields != 0); > + ASSERT(iip->ili_item.li_buf == bp); FWIW, the previous assert includes an iip NULL check. > > dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); > ... > diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c > index 697248b7eb2be..326547e89cb6b 100644 > --- a/fs/xfs/xfs_inode_item.c > +++ b/fs/xfs/xfs_inode_item.c > @@ -485,53 +485,42 @@ xfs_inode_item_push( > uint rval = XFS_ITEM_SUCCESS; > int error; > > - if (xfs_ipincount(ip) > 0) > + ASSERT(iip->ili_item.li_buf); > + > + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || > + (ip->i_flags & XFS_ISTALE)) > return XFS_ITEM_PINNED; > > - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) > - return XFS_ITEM_LOCKED; > + /* If the inode is already flush locked, we're already flushing. */ Or we're racing with reclaim (since we don't have the ilock here any longer)? > + if (xfs_isiflocked(ip)) > + return XFS_ITEM_FLUSHING; > > - /* > - * Re-check the pincount now that we stabilized the value by > - * taking the ilock. > - */ > - if (xfs_ipincount(ip) > 0) { > - rval = XFS_ITEM_PINNED; > - goto out_unlock; > - } > + if (!xfs_buf_trylock(bp)) > + return XFS_ITEM_LOCKED; > > - /* > - * Stale inode items should force out the iclog. > - */ > - if (ip->i_flags & XFS_ISTALE) { > - rval = XFS_ITEM_PINNED; > - goto out_unlock; > + if (bp->b_flags & _XBF_DELWRI_Q) { > + xfs_buf_unlock(bp); > + return XFS_ITEM_FLUSHING; Hmm, what's the purpose of this check? I would expect that we'd still be able to flush to a buffer even though it's delwri queued. We drop the buffer lock after queueing it (and then it's reacquired on delwri submit). > } > + spin_unlock(&lip->li_ailp->ail_lock); > > /* > - * Someone else is already flushing the inode. Nothing we can do > - * here but wait for the flush to finish and remove the item from > - * the AIL. > + * We need to hold a reference for flushing the cluster buffer as it may > + * fail the buffer without IO submission. In which case, we better get a > + * reference for that completion because otherwise we don't get a > + * reference for IO until we queue the buffer for delwri submission. > */ > - if (!xfs_iflock_nowait(ip)) { > - rval = XFS_ITEM_FLUSHING; > - goto out_unlock; > - } > - > - ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); > - spin_unlock(&lip->li_ailp->ail_lock); > - > - error = xfs_iflush(ip, &bp); > + xfs_buf_hold(bp); > + error = xfs_iflush_cluster(ip, bp); > if (!error) { > if (!xfs_buf_delwri_queue(bp, buffer_list)) > rval = XFS_ITEM_FLUSHING; > xfs_buf_relse(bp); > - } else if (error == -EAGAIN) > + } else { > rval = XFS_ITEM_LOCKED; > + } > > spin_lock(&lip->li_ailp->ail_lock); > -out_unlock: > - xfs_iunlock(ip, XFS_ILOCK_SHARED); > return rval; > } > > @@ -548,6 +537,7 @@ xfs_inode_item_release( > > ASSERT(ip->i_itemp != NULL); > ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); > + ASSERT(lip->li_buf || !test_bit(XFS_LI_DIRTY, &lip->li_flags)); This is the transaction cancel/abort path, so seems like this should be part of the patch that attaches the ili on logging the inode? Brian > > lock_flags = iip->ili_lock_flags; > iip->ili_lock_flags = 0; > -- > 2.26.2.761.g0e0b3e54be >
On Mon, Jun 08, 2020 at 12:45:51PM -0400, Brian Foster wrote: > On Thu, Jun 04, 2020 at 05:46:02PM +1000, Dave Chinner wrote: > > From: Dave Chinner <dchinner@redhat.com> > > > > Now we have a cached buffer on inode log items, we don't need > > to do buffer lookups when flushing inodes anymore - all we need > > to do is lock the buffer and we are ready to go. > > > > This largely gets rid of the need for xfs_iflush(), which is > > essentially just a mechanism to look up the buffer and flush the > > inode to it. Instead, we can just call xfs_iflush_cluster() with a > > few modifications to ensure it also flushes the inode we already > > hold locked. > > > > This allows the AIL inode item pushing to be almost entirely > > non-blocking in XFS - we won't block unless memory allocation > > for the cluster inode lookup blocks or the block device queues are > > full. > > > > Writeback during inode reclaim becomes a little more complex because > > we now have to lock the buffer ourselves, but otherwise this change > > is largely a functional no-op that removes a whole lot of code. > > > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> > > --- > > Looks mostly reasonable.. > > > fs/xfs/xfs_inode.c | 106 ++++++---------------------------------- > > fs/xfs/xfs_inode.h | 2 +- > > fs/xfs/xfs_inode_item.c | 54 +++++++++----------- > > 3 files changed, 37 insertions(+), 125 deletions(-) > > > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > > index af65acd24ec4e..61c872e4ee157 100644 > > --- a/fs/xfs/xfs_inode.c > > +++ b/fs/xfs/xfs_inode.c > ... > > @@ -3688,6 +3609,7 @@ xfs_iflush_int( > > ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || > > ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); > > ASSERT(iip != NULL && iip->ili_fields != 0); > > + ASSERT(iip->ili_item.li_buf == bp); > > FWIW, the previous assert includes an iip NULL check. If iip is NULL in this function, we are going to oops anyway. We've slowly been weeding these types of "<x> must not be null" asserts out of the code because it doesn't add any value. Especially as the assert doesn't tell you exactly what failed and that's kinda important here. FWIW, this assert is largely redundant, anyway, because it's just an open coded xfs_inode_clean() check, and we check that in all cases before calling xfs_iflush_int(). So I'd be tempted to remove it if it really mattered... > > > > > dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); > > > ... > > diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c > > index 697248b7eb2be..326547e89cb6b 100644 > > --- a/fs/xfs/xfs_inode_item.c > > +++ b/fs/xfs/xfs_inode_item.c > > @@ -485,53 +485,42 @@ xfs_inode_item_push( > > uint rval = XFS_ITEM_SUCCESS; > > int error; > > > > - if (xfs_ipincount(ip) > 0) > > + ASSERT(iip->ili_item.li_buf); > > + > > + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || > > + (ip->i_flags & XFS_ISTALE)) > > return XFS_ITEM_PINNED; > > > > - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) > > - return XFS_ITEM_LOCKED; > > + /* If the inode is already flush locked, we're already flushing. */ > > Or we're racing with reclaim (since we don't have the ilock here any > longer)? Possible, but unlikely. The 99.999% case here is the inode was flush locked by a push of another item on the same cluster buffer, and that happens so often it's worth adding a racy check ahead of the flushing to avoid trying to lock the cluster buffer... > > > > - /* > > - * Re-check the pincount now that we stabilized the value by > > - * taking the ilock. > > - */ > > - if (xfs_ipincount(ip) > 0) { > > - rval = XFS_ITEM_PINNED; > > - goto out_unlock; > > - } > > + if (!xfs_buf_trylock(bp)) > > + return XFS_ITEM_LOCKED; > > > > - /* > > - * Stale inode items should force out the iclog. > > - */ > > - if (ip->i_flags & XFS_ISTALE) { > > - rval = XFS_ITEM_PINNED; > > - goto out_unlock; > > + if (bp->b_flags & _XBF_DELWRI_Q) { > > + xfs_buf_unlock(bp); > > + return XFS_ITEM_FLUSHING; > > Hmm, what's the purpose of this check? I would expect that we'd still be > able to flush to a buffer even though it's delwri queued. We drop the > buffer lock after queueing it (and then it's reacquired on delwri > submit). Yes, we can. I did this when tracking down whacky inode flushing issues and AIL deadlocks this patch exposed to simplify the behaviour, and I'd completely forgotten about it once I'd found the underlying problem a week and a half later. It seems harmless at this point, I'll see what happens if I remove it... > > -out_unlock: > > - xfs_iunlock(ip, XFS_ILOCK_SHARED); > > return rval; > > } > > > > @@ -548,6 +537,7 @@ xfs_inode_item_release( > > > > ASSERT(ip->i_itemp != NULL); > > ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); > > + ASSERT(lip->li_buf || !test_bit(XFS_LI_DIRTY, &lip->li_flags)); > > This is the transaction cancel/abort path, so seems like this should be > part of the patch that attaches the ili on logging the inode? Again, likely left over from debugging some whacky inode flushing/writeback/completion issue this patch exposed. I'll remove it... Cheers, Dave.
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index af65acd24ec4e..61c872e4ee157 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3450,7 +3450,18 @@ xfs_rename( return error; } -STATIC int +/* + * Non-blocking flush of dirty inode metadata into the backing buffer. + * + * The caller must have a reference to the inode and hold the cluster buffer + * locked. The function will walk across all the inodes on the cluster buffer it + * can find and lock without blocking, and flush them to the cluster buffer. + * + * On success, the caller must write out the buffer returned in *bp and + * release it. On failure, the filesystem will be shut down, the buffer will + * have been unlocked and released, and EFSCORRUPTED will be returned. + */ +int xfs_iflush_cluster( struct xfs_inode *ip, struct xfs_buf *bp) @@ -3485,8 +3496,6 @@ xfs_iflush_cluster( for (i = 0; i < nr_found; i++) { cip = cilist[i]; - if (cip == ip) - continue; /* * because this is an RCU protected lookup, we could find a @@ -3577,99 +3586,11 @@ xfs_iflush_cluster( kmem_free(cilist); out_put: xfs_perag_put(pag); - return error; -} - -/* - * Flush dirty inode metadata into the backing buffer. - * - * The caller must have the inode lock and the inode flush lock held. The - * inode lock will still be held upon return to the caller, and the inode - * flush lock will be released after the inode has reached the disk. - * - * The caller must write out the buffer returned in *bpp and release it. - */ -int -xfs_iflush( - struct xfs_inode *ip, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_buf *bp = NULL; - struct xfs_dinode *dip; - int error; - - XFS_STATS_INC(mp, xs_iflush_count); - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || - ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - - *bpp = NULL; - - xfs_iunpin_wait(ip); - - /* - * For stale inodes we cannot rely on the backing buffer remaining - * stale in cache for the remaining life of the stale inode and so - * xfs_imap_to_bp() below may give us a buffer that no longer contains - * inodes below. We have to check this after ensuring the inode is - * unpinned so that it is safe to reclaim the stale inode after the - * flush call. - */ - if (xfs_iflags_test(ip, XFS_ISTALE)) { - xfs_ifunlock(ip); - return 0; - } - - /* - * Get the buffer containing the on-disk inode. We are doing a try-lock - * operation here, so we may get an EAGAIN error. In that case, return - * leaving the inode dirty. - * - * If we get any other error, we effectively have a corruption situation - * and we cannot flush the inode. Abort the flush and shut down. - */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK); - if (error == -EAGAIN) { - xfs_ifunlock(ip); - return error; - } - if (error) - goto abort; - - /* - * If the buffer is pinned then push on the log now so we won't - * get stuck waiting in the write for too long. - */ - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - - /* - * Flush the provided inode then attempt to gather others from the - * cluster into the write. - * - * Note: Once we attempt to flush an inode, we must run buffer - * completion callbacks on any failure. If this fails, simulate an I/O - * failure on the buffer and shut down. - */ - error = xfs_iflush_int(ip, bp); - if (!error) - error = xfs_iflush_cluster(ip, bp); if (error) { bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); - goto shutdown; + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } - - *bpp = bp; - return 0; - -abort: - xfs_iflush_abort(ip); -shutdown: - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } @@ -3688,6 +3609,7 @@ xfs_iflush_int( ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip != NULL && iip->ili_fields != 0); + ASSERT(iip->ili_item.li_buf == bp); dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index dadcf19458960..d1109eb13ba2e 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -427,7 +427,7 @@ int xfs_log_force_inode(struct xfs_inode *ip); void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) -int xfs_iflush(struct xfs_inode *, struct xfs_buf **); +int xfs_iflush_cluster(struct xfs_inode *, struct xfs_buf *); void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 697248b7eb2be..326547e89cb6b 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -485,53 +485,42 @@ xfs_inode_item_push( uint rval = XFS_ITEM_SUCCESS; int error; - if (xfs_ipincount(ip) > 0) + ASSERT(iip->ili_item.li_buf); + + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || + (ip->i_flags & XFS_ISTALE)) return XFS_ITEM_PINNED; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) - return XFS_ITEM_LOCKED; + /* If the inode is already flush locked, we're already flushing. */ + if (xfs_isiflocked(ip)) + return XFS_ITEM_FLUSHING; - /* - * Re-check the pincount now that we stabilized the value by - * taking the ilock. - */ - if (xfs_ipincount(ip) > 0) { - rval = XFS_ITEM_PINNED; - goto out_unlock; - } + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; - /* - * Stale inode items should force out the iclog. - */ - if (ip->i_flags & XFS_ISTALE) { - rval = XFS_ITEM_PINNED; - goto out_unlock; + if (bp->b_flags & _XBF_DELWRI_Q) { + xfs_buf_unlock(bp); + return XFS_ITEM_FLUSHING; } + spin_unlock(&lip->li_ailp->ail_lock); /* - * Someone else is already flushing the inode. Nothing we can do - * here but wait for the flush to finish and remove the item from - * the AIL. + * We need to hold a reference for flushing the cluster buffer as it may + * fail the buffer without IO submission. In which case, we better get a + * reference for that completion because otherwise we don't get a + * reference for IO until we queue the buffer for delwri submission. */ - if (!xfs_iflock_nowait(ip)) { - rval = XFS_ITEM_FLUSHING; - goto out_unlock; - } - - ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); - spin_unlock(&lip->li_ailp->ail_lock); - - error = xfs_iflush(ip, &bp); + xfs_buf_hold(bp); + error = xfs_iflush_cluster(ip, bp); if (!error) { if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } else if (error == -EAGAIN) + } else { rval = XFS_ITEM_LOCKED; + } spin_lock(&lip->li_ailp->ail_lock); -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_SHARED); return rval; } @@ -548,6 +537,7 @@ xfs_inode_item_release( ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(lip->li_buf || !test_bit(XFS_LI_DIRTY, &lip->li_flags)); lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0;