
[28/30] xfs: rework xfs_iflush_cluster() dirty inode iteration

Message ID 20200604074606.266213-29-david@fromorbit.com (mailing list archive)
State Superseded
Series xfs: rework inode flushing to make inode reclaim fully asynchronous

Commit Message

Dave Chinner June 4, 2020, 7:46 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Now that we have all the dirty inodes attached to the cluster
buffer, we don't actually have to do radix tree lookups to find
them. Sure, the radix tree is efficient, but walking a linked list
of just the dirty inodes attached to the buffer is much better.

We are also no longer dependent on having a locked inode passed into
the function to determine where to start the lookup. This means we
can drop it from the function call and treat all inodes the same.

We also make xfs_iflush_cluster skip inodes marked with
XFS_IRECLAIM. This way we avoid races with inodes that reclaim is
actively referencing or that are being re-initialised by inode lookup.
If they are actually dirty, they'll get written by a future cluster
flush....

We also add a shutdown check after obtaining the flush lock so that
we catch inodes that are dirty in memory and may have inconsistent
state due to the shutdown in progress. We abort these inodes
directly, so they remove themselves from the buffer list and the
AIL rather than having to wait for the buffer to be failed and the
completion callbacks to be run.
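
In outline, the reworked iteration looks something like this
(condensed from the patch below; the locked rechecks and error
handling are elided):

        list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
                struct xfs_inode *ip =
                        ((struct xfs_inode_log_item *)lip)->ili_inode;

                /* quick and dirty unlocked checks to avoid locking */
                if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
                        continue;
                if (xfs_ipincount(ip))
                        continue;

                /*
                 * Then recheck under i_flags_lock, take the ILOCK and
                 * the flush lock, abort if shut down, else xfs_iflush().
                 */
        }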

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c      | 148 ++++++++++++++++------------------------
 fs/xfs/xfs_inode.h      |   2 +-
 fs/xfs/xfs_inode_item.c |   2 +-
 3 files changed, 62 insertions(+), 90 deletions(-)

Comments

Brian Foster June 9, 2020, 1:11 p.m. UTC | #1
On Thu, Jun 04, 2020 at 05:46:04PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Now that we have all the dirty inodes attached to the cluster
> buffer, we don't actually have to do radix tree lookups to find
> them. Sure, the radix tree is efficient, but walking a linked list
> of just the dirty inodes attached to the buffer is much better.
> 
> We are also no longer dependent on having a locked inode passed into
> the function to determine where to start the lookup. This means we
> can drop it from the function call and treat all inodes the same.
> 
> We also make xfs_iflush_cluster skip inodes marked with
> XFS_IRECLAIM. This way we avoid races with inodes that reclaim is
> actively referencing or that are being re-initialised by inode lookup.
> If they are actually dirty, they'll get written by a future cluster
> flush....
> 
> We also add a shutdown check after obtaining the flush lock so that
> we catch inodes that are dirty in memory and may have inconsistent
> state due to the shutdown in progress. We abort these inodes
> directly, so they remove themselves from the buffer list and the
> AIL rather than having to wait for the buffer to be failed and the
> completion callbacks to be run.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_inode.c      | 148 ++++++++++++++++------------------------
>  fs/xfs/xfs_inode.h      |   2 +-
>  fs/xfs/xfs_inode_item.c |   2 +-
>  3 files changed, 62 insertions(+), 90 deletions(-)
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 8566bd0f4334d..931a483d5b316 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -3611,117 +3611,94 @@ xfs_iflush(
>   */
>  int
>  xfs_iflush_cluster(
> -	struct xfs_inode	*ip,
>  	struct xfs_buf		*bp)
>  {
...
> +	/*
> +	 * We must use the safe variant here as on shutdown xfs_iflush_abort()
> +	 * can remove itself from the list.
> +	 */
> +	list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
> +		iip = (struct xfs_inode_log_item *)lip;
> +		ip = iip->ili_inode;
>  
>  		/*
> -		 * because this is an RCU protected lookup, we could find a
> -		 * recently freed or even reallocated inode during the lookup.
> -		 * We need to check under the i_flags_lock for a valid inode
> -		 * here. Skip it if it is not valid or the wrong inode.
> +		 * Quick and dirty check to avoid locks if possible.
>  		 */
> -		spin_lock(&cip->i_flags_lock);
> -		if (!cip->i_ino ||
> -		    __xfs_iflags_test(cip, XFS_ISTALE)) {
> -			spin_unlock(&cip->i_flags_lock);
> +		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
> +			continue;
> +		if (xfs_ipincount(ip))
>  			continue;
> -		}
>  
>  		/*
> -		 * Once we fall off the end of the cluster, no point checking
> -		 * any more inodes in the list because they will also all be
> -		 * outside the cluster.
> +		 * The inode is still attached to the buffer, which means it is
> +		 * dirty but reclaim might try to grab it. Check carefully for
> +		 * that, and grab the ilock while still holding the i_flags_lock
> +		 * to guarantee reclaim will not be able to reclaim this inode
> +		 * once we drop the i_flags_lock.
>  		 */
> -		if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
> -			spin_unlock(&cip->i_flags_lock);
> -			break;
> +		spin_lock(&ip->i_flags_lock);
> +		ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));

What prevents a race with staling an inode here? The push open codes an
unlocked (i.e. no memory barrier) check before it acquires the buffer
lock, so afaict it's technically possible to race and grab the buffer
immediately after the cluster was freed. If that could happen, it looks
like we'd also queue the buffer for write.

That also raises the question.. between this patch and the earlier push
rework, why do we queue the buffer for write when nothing might have
been flushed by this call?

> +		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
> +			spin_unlock(&ip->i_flags_lock);
> +			continue;
>  		}
> -		spin_unlock(&cip->i_flags_lock);
>  
>  		/*
> -		 * Do an un-protected check to see if the inode is dirty and
> -		 * is a candidate for flushing.  These checks will be repeated
> -		 * later after the appropriate locks are acquired.
> +		 * ILOCK will pin the inode against reclaim and prevent
> +		 * concurrent transactions modifying the inode while we are
> +		 * flushing the inode.
>  		 */
> -		if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
> +		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
> +			spin_unlock(&ip->i_flags_lock);
>  			continue;
> +		}
> +		spin_unlock(&ip->i_flags_lock);
>  
>  		/*
> -		 * Try to get locks.  If any are unavailable or it is pinned,
> -		 * then this inode cannot be flushed and is skipped.
> +		 * Skip inodes that are already flush locked as they have
> +		 * already been written to the buffer.
>  		 */
> -
> -		if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
> -			continue;
> -		if (!xfs_iflock_nowait(cip)) {
> -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> -			continue;
> -		}
> -		if (xfs_ipincount(cip)) {
> -			xfs_ifunlock(cip);
> -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> +		if (!xfs_iflock_nowait(ip)) {
> +			xfs_iunlock(ip, XFS_ILOCK_SHARED);
>  			continue;
>  		}
>  
> -
>  		/*
> -		 * Check the inode number again, just to be certain we are not
> -		 * racing with freeing in xfs_reclaim_inode(). See the comments
> -		 * in that function for more information as to why the initial
> -		 * check is not sufficient.
> +		 * If we are shut down, unpin and abort the inode now as there
> +		 * is no point in flushing it to the buffer just to get an IO
> +		 * completion to abort the buffer and remove it from the AIL.
>  		 */
> -		if (!cip->i_ino) {
> -			xfs_ifunlock(cip);
> -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> +		if (XFS_FORCED_SHUTDOWN(mp)) {
> +			xfs_iunpin_wait(ip);

Note that we have an unlocked check above that skips pinned inodes.

Brian

> +			/* xfs_iflush_abort() drops the flush lock */
> +			xfs_iflush_abort(ip);
> +			xfs_iunlock(ip, XFS_ILOCK_SHARED);
> +			error = -EIO;
>  			continue;
>  		}
>  
> -		/*
> -		 * arriving here means that this inode can be flushed.  First
> -		 * re-check that it's dirty before flushing.
> -		 */
> -		if (!xfs_inode_clean(cip)) {
> -			error = xfs_iflush(cip, bp);
> -			if (error) {
> -				xfs_iunlock(cip, XFS_ILOCK_SHARED);
> -				goto out_free;
> -			}
> -			clcount++;
> -		} else {
> -			xfs_ifunlock(cip);
> +		/* don't block waiting on a log force to unpin dirty inodes */
> +		if (xfs_ipincount(ip)) {
> +			xfs_ifunlock(ip);
> +			xfs_iunlock(ip, XFS_ILOCK_SHARED);
> +			continue;
>  		}
> -		xfs_iunlock(cip, XFS_ILOCK_SHARED);
> +
> +		if (!xfs_inode_clean(ip))
> +			error = xfs_iflush(ip, bp);
> +		else
> +			xfs_ifunlock(ip);
> +		xfs_iunlock(ip, XFS_ILOCK_SHARED);
> +		if (error)
> +			break;
> +		clcount++;
>  	}
>  
>  	if (clcount) {
> @@ -3729,11 +3706,6 @@ xfs_iflush_cluster(
>  		XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
>  	}
>  
> -out_free:
> -	rcu_read_unlock();
> -	kmem_free(cilist);
> -out_put:
> -	xfs_perag_put(pag);
>  	if (error) {
>  		bp->b_flags |= XBF_ASYNC;
>  		xfs_buf_ioend_fail(bp);
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index d1109eb13ba2e..b93cf9076df8a 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -427,7 +427,7 @@ int		xfs_log_force_inode(struct xfs_inode *ip);
>  void		xfs_iunpin_wait(xfs_inode_t *);
>  #define xfs_ipincount(ip)	((unsigned int) atomic_read(&ip->i_pincount))
>  
> -int		xfs_iflush_cluster(struct xfs_inode *, struct xfs_buf *);
> +int		xfs_iflush_cluster(struct xfs_buf *);
>  void		xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
>  				struct xfs_inode *ip1, uint ip1_mode);
>  
> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> index 326547e89cb6b..dee7385466f83 100644
> --- a/fs/xfs/xfs_inode_item.c
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -511,7 +511,7 @@ xfs_inode_item_push(
>  	 * reference for IO until we queue the buffer for delwri submission.
>  	 */
>  	xfs_buf_hold(bp);
> -	error = xfs_iflush_cluster(ip, bp);
> +	error = xfs_iflush_cluster(bp);
>  	if (!error) {
>  		if (!xfs_buf_delwri_queue(bp, buffer_list))
>  			rval = XFS_ITEM_FLUSHING;
> -- 
> 2.26.2.761.g0e0b3e54be
>
Dave Chinner June 9, 2020, 10:01 p.m. UTC | #2
On Tue, Jun 09, 2020 at 09:11:55AM -0400, Brian Foster wrote:
> On Thu, Jun 04, 2020 at 05:46:04PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Now that we have all the dirty inodes attached to the cluster
> > buffer, we don't actually have to do radix tree lookups to find
> > them. Sure, the radix tree is efficient, but walking a linked list
> > of just the dirty inodes attached to the buffer is much better.
> > 
> > We are also no longer dependent on having a locked inode passed into
> > the function to determine where to start the lookup. This means we
> > can drop it from the function call and treat all inodes the same.
> > 
> > We also make xfs_iflush_cluster skip inodes marked with
> > XFS_IRECLAIM. This way we avoid races with inodes that reclaim is
> > actively referencing or that are being re-initialised by inode lookup.
> > If they are actually dirty, they'll get written by a future cluster
> > flush....
> > 
> > We also add a shutdown check after obtaining the flush lock so that
> > we catch inodes that are dirty in memory and may have inconsistent
> > state due to the shutdown in progress. We abort these inodes
> > directly, so they remove themselves from the buffer list and the
> > AIL rather than having to wait for the buffer to be failed and the
> > completion callbacks to be run.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_inode.c      | 148 ++++++++++++++++------------------------
> >  fs/xfs/xfs_inode.h      |   2 +-
> >  fs/xfs/xfs_inode_item.c |   2 +-
> >  3 files changed, 62 insertions(+), 90 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 8566bd0f4334d..931a483d5b316 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -3611,117 +3611,94 @@ xfs_iflush(
> >   */
> >  int
> >  xfs_iflush_cluster(
> > -	struct xfs_inode	*ip,
> >  	struct xfs_buf		*bp)
> >  {
> ...
> > +	/*
> > +	 * We must use the safe variant here as on shutdown xfs_iflush_abort()
> > +	 * can remove itself from the list.
> > +	 */
> > +	list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
> > +		iip = (struct xfs_inode_log_item *)lip;
> > +		ip = iip->ili_inode;
> >  
> >  		/*
> > -		 * because this is an RCU protected lookup, we could find a
> > -		 * recently freed or even reallocated inode during the lookup.
> > -		 * We need to check under the i_flags_lock for a valid inode
> > -		 * here. Skip it if it is not valid or the wrong inode.
> > +		 * Quick and dirty check to avoid locks if possible.
> >  		 */
> > -		spin_lock(&cip->i_flags_lock);
> > -		if (!cip->i_ino ||
> > -		    __xfs_iflags_test(cip, XFS_ISTALE)) {
> > -			spin_unlock(&cip->i_flags_lock);
> > +		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
> > +			continue;
> > +		if (xfs_ipincount(ip))
> >  			continue;
> > -		}
> >  
> >  		/*
> > -		 * Once we fall off the end of the cluster, no point checking
> > -		 * any more inodes in the list because they will also all be
> > -		 * outside the cluster.
> > +		 * The inode is still attached to the buffer, which means it is
> > +		 * dirty but reclaim might try to grab it. Check carefully for
> > +		 * that, and grab the ilock while still holding the i_flags_lock
> > +		 * to guarantee reclaim will not be able to reclaim this inode
> > +		 * once we drop the i_flags_lock.
> >  		 */
> > -		if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
> > -			spin_unlock(&cip->i_flags_lock);
> > -			break;
> > +		spin_lock(&ip->i_flags_lock);
> > +		ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
> 
> What prevents a race with staling an inode here?

xfs_buf_item_release() does not drop the buffer lock when stale
buffers are committed. Hence the buffer is held locked until it is
committed to the journal, and unpinning the buffer on journal IO
completion runs xfs_iflush_done() -> xfs_iflush_abort() on all the
attached stale inodes. At which point, they are removed from the
buffer and the buffer is unlocked.

Hence we cannot run here with stale inodes attached to the buffer
because the buffer is locked the entire time stale inodes are
attached.
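
In sketch form, the commit-side logic that provides that guarantee
(simplified, not the actual implementation):

        /*
         * At transaction commit, a stale inode cluster buffer keeps its
         * lock. It is only unlocked from the unpin path at journal IO
         * completion, after the attached stale inodes have been aborted
         * and detached from the buffer.
         */
        if (bip->bli_flags & XFS_BLI_STALE)
                return;                         /* keep the buffer locked */
        xfs_buf_relse(bip->bli_buf);            /* normal case: unlock now */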

> The push open codes an
> unlocked (i.e. no memory barrier) check before it acquires the buffer
> lock, so afaict it's technically possible to race and grab the buffer
> immediately after the cluster was freed. If that could happen, it looks
> like we'd also queue the buffer for write.

Not that I can tell, because we'll fail to get the buffer lock under
the AIL lock until the stale buffer is unpinned, but the unpin
occurs under the buffer lock and that removes the buffer from the
AIL. Hence there's no way the AIL pushing can actually push an inode
cluster buffer that has stale inodes attached to it...

> That also raises the question.. between this patch and the earlier push
> rework, why do we queue the buffer for write when nothing might have
> been flushed by this call?

Largely because I never observed a failure to flush at least one
inode so I didn't really consider it worth the additional complexity
in error handling in the push code. I've really been concerned about
the other end of the scale where we try to push the same buffer 30+
times for each IO.

You are right in that it could happen - perhaps just returning EAGAIN
if clcount == 0 at the end of the function is all that is necessary
and translating that to XFS_ITEM_LOCKED in the push function would
work.
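
Roughly this (untested sketch; the error path in xfs_iflush_cluster()
would need to avoid failing the buffer for this case):

        /* in xfs_iflush_cluster(), after the list walk: */
        if (!error && !clcount)
                return -EAGAIN;         /* flushed nothing this pass */

        /* in xfs_inode_item_push(): */
        error = xfs_iflush_cluster(bp);
        if (error == -EAGAIN)
                rval = XFS_ITEM_LOCKED; /* retry the item later */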

> > -		 * check is not sufficient.
> > +		 * If we are shut down, unpin and abort the inode now as there
> > +		 * is no point in flushing it to the buffer just to get an IO
> > +		 * completion to abort the buffer and remove it from the AIL.
> >  		 */
> > -		if (!cip->i_ino) {
> > -			xfs_ifunlock(cip);
> > -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> > +		if (XFS_FORCED_SHUTDOWN(mp)) {
> > +			xfs_iunpin_wait(ip);
> 
> Note that we have an unlocked check above that skips pinned inodes.

Right, but we could be racing with a transaction commit that pinned
the inode and a shutdown. As the comment says: it's a quick and
dirty check to avoid trying to get locks when we know that it is
unlikely we can flush the inode without blocking. We still have to
recheck that state once we have the ILOCK....

Cheers,

Dave.
Brian Foster June 10, 2020, 1:06 p.m. UTC | #3
On Wed, Jun 10, 2020 at 08:01:39AM +1000, Dave Chinner wrote:
> On Tue, Jun 09, 2020 at 09:11:55AM -0400, Brian Foster wrote:
> > On Thu, Jun 04, 2020 at 05:46:04PM +1000, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > Now that we have all the dirty inodes attached to the cluster
> > > buffer, we don't actually have to do radix tree lookups to find
> > > them. Sure, the radix tree is efficient, but walking a linked list
> > > of just the dirty inodes attached to the buffer is much better.
> > > 
> > > We are also no longer dependent on having a locked inode passed into
> > > the function to determine where to start the lookup. This means we
> > > can drop it from the function call and treat all inodes the same.
> > > 
> > > We also make xfs_iflush_cluster skip inodes marked with
> > > XFS_IRECLAIM. This way we avoid races with inodes that reclaim is
> > > actively referencing or that are being re-initialised by inode lookup.
> > > If they are actually dirty, they'll get written by a future cluster
> > > flush....
> > > 
> > > We also add a shutdown check after obtaining the flush lock so that
> > > we catch inodes that are dirty in memory and may have inconsistent
> > > state due to the shutdown in progress. We abort these inodes
> > > directly, so they remove themselves from the buffer list and the
> > > AIL rather than having to wait for the buffer to be failed and the
> > > completion callbacks to be run.
> > > 
> > > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > >  fs/xfs/xfs_inode.c      | 148 ++++++++++++++++------------------------
> > >  fs/xfs/xfs_inode.h      |   2 +-
> > >  fs/xfs/xfs_inode_item.c |   2 +-
> > >  3 files changed, 62 insertions(+), 90 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > index 8566bd0f4334d..931a483d5b316 100644
> > > --- a/fs/xfs/xfs_inode.c
> > > +++ b/fs/xfs/xfs_inode.c
> > > @@ -3611,117 +3611,94 @@ xfs_iflush(
> > >   */
> > >  int
> > >  xfs_iflush_cluster(
> > > -	struct xfs_inode	*ip,
> > >  	struct xfs_buf		*bp)
> > >  {
> > ...
> > > +	/*
> > > +	 * We must use the safe variant here as on shutdown xfs_iflush_abort()
> > > +	 * can remove itself from the list.
> > > +	 */
> > > +	list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
> > > +		iip = (struct xfs_inode_log_item *)lip;
> > > +		ip = iip->ili_inode;
> > >  
> > >  		/*
> > > -		 * because this is an RCU protected lookup, we could find a
> > > -		 * recently freed or even reallocated inode during the lookup.
> > > -		 * We need to check under the i_flags_lock for a valid inode
> > > -		 * here. Skip it if it is not valid or the wrong inode.
> > > +		 * Quick and dirty check to avoid locks if possible.
> > >  		 */
> > > -		spin_lock(&cip->i_flags_lock);
> > > -		if (!cip->i_ino ||
> > > -		    __xfs_iflags_test(cip, XFS_ISTALE)) {
> > > -			spin_unlock(&cip->i_flags_lock);
> > > +		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
> > > +			continue;
> > > +		if (xfs_ipincount(ip))
> > >  			continue;
> > > -		}
> > >  
> > >  		/*
> > > -		 * Once we fall off the end of the cluster, no point checking
> > > -		 * any more inodes in the list because they will also all be
> > > -		 * outside the cluster.
> > > +		 * The inode is still attached to the buffer, which means it is
> > > +		 * dirty but reclaim might try to grab it. Check carefully for
> > > +		 * that, and grab the ilock while still holding the i_flags_lock
> > > +		 * to guarantee reclaim will not be able to reclaim this inode
> > > +		 * once we drop the i_flags_lock.
> > >  		 */
> > > -		if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
> > > -			spin_unlock(&cip->i_flags_lock);
> > > -			break;
> > > +		spin_lock(&ip->i_flags_lock);
> > > +		ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
> > 
> > What prevents a race with staling an inode here?
> 
> xfs_buf_item_release() does not drop the buffer lock when stale
> buffers are committed. Hence the buffer is held locked until it is
> committed to the journal, and unpinning the buffer on journal IO
> completion runs xfs_iflush_done() -> xfs_iflush_abort() on all the
> attached stale inodes. At which point, they are removed from the
> buffer and the buffer is unlocked.
> 
> Hence we cannot run here with stale inodes attached to the buffer
> because the buffer is locked the entire time stale inodes are
> attached.
> 

Ok, so it's actually that the earlier ISTALE check on the inode is more
of an optimization since if we saw the flag set, we'd never be able to
lock the buffer anyways.

> > The push open codes an
> > unlocked (i.e. no memory barrier) check before it acquires the buffer
> > lock, so afaict it's technically possible to race and grab the buffer
> > immediately after the cluster was freed. If that could happen, it looks
> > like we'd also queue the buffer for write.
> 
> Not that I can tell, because we'll fail to get the buffer lock under
> the AIL lock until the stale buffer is unpinned, but the unpin
> occurs under the buffer lock and that removes the buffer from the
> AIL. Hence there's no way the AIL pushing can actually push an inode
> cluster buffer that has stale inodes attached to it...
> 

Makes sense.

> > That also raises the question.. between this patch and the earlier push
> > rework, why do we queue the buffer for write when nothing might have
> > been flushed by this call?
> 
> Largely because I never observed a failure to flush at least one
> inode so I didn't really consider it worth the additional complexity
> in error handling in the push code. I've really been concerned about
> the other end of the scale where we try to push the same buffer 30+
> times for each IO.
> 
> You are right in that it could happen - perhaps just returning EAGAIN
> if clcount == 0 at the end of the function is all that is necessary
> and translating that to XFS_ITEM_LOCKED in the push function would
> work.
> 

That sounds reasonable enough to cover that case.

> > > -		 * check is not sufficient.
> > > +		 * If we are shut down, unpin and abort the inode now as there
> > > +		 * is no point in flushing it to the buffer just to get an IO
> > > +		 * completion to abort the buffer and remove it from the AIL.
> > >  		 */
> > > -		if (!cip->i_ino) {
> > > -			xfs_ifunlock(cip);
> > > -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> > > +		if (XFS_FORCED_SHUTDOWN(mp)) {
> > > +			xfs_iunpin_wait(ip);
> > 
> > Note that we have an unlocked check above that skips pinned inodes.
> 
> Right, but we could be racing with a transaction commit that pinned
> the inode and a shutdown. As the comment says: it's a quick and
> dirty check to avoid trying to get locks when we know that it is
> unlikely we can flush the inode without blocking. We still have to
> recheck that state once we have the ILOCK....
> 

Right, but that means we can just as easily skip the shutdown processing
(which waits for unpin) if a particular inode is pinned. So which is
supposed to happen in the shutdown case?

ISTM that either could happen. As a result this kind of looks like
random logic to me. IIUC we rely on potentially coming back through the
cluster flush path multiple times if inodes are pinned because of the
racy pin/shutdown checks. If that's the case, then why not push the
shutdown check to after the locked pin count check, skip the
xfs_iunpin_wait() call entirely and then proceed to abort the flush at
the same point we'd call xfs_iflush() if the fs weren't shutdown?
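
To be concrete, a rough (untested) reordering of the loop tail along
those lines:

        if (!xfs_iflock_nowait(ip)) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                continue;
        }

        /* don't block waiting on a log force to unpin dirty inodes */
        if (xfs_ipincount(ip)) {
                xfs_ifunlock(ip);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                continue;
        }

        /* abort the flush where we would otherwise write the inode */
        if (XFS_FORCED_SHUTDOWN(mp)) {
                /* xfs_iflush_abort() drops the flush lock */
                xfs_iflush_abort(ip);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                error = -EIO;
                continue;
        }

        if (!xfs_inode_clean(ip))
                error = xfs_iflush(ip, bp);
        else
                xfs_ifunlock(ip);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);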

FWIW, I'm also kind of wondering if we need this shutdown check at all
now that this path doesn't have to accommodate reclaim. We follow up
with failing the buffer in the exit path, but there's no guarantee there
aren't other inodes that are passed over due to the pin checks,
trylocks, etc. Can we just flush the inodes as normal and let xfsaild
fail the buffers on submission without additional shutdown checks?
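
(For reference, buffer submission already handles shutdown roughly
like this - a sketch assumed from earlier patches in this series,
which introduce xfs_buf_ioend_fail():)

        if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
                xfs_buf_ioend_fail(bp); /* fail the IO, run completions */
                return -EIO;
        }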

Brian

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Dave Chinner June 10, 2020, 11:40 p.m. UTC | #4
On Wed, Jun 10, 2020 at 09:06:28AM -0400, Brian Foster wrote:
> On Wed, Jun 10, 2020 at 08:01:39AM +1000, Dave Chinner wrote:
> > On Tue, Jun 09, 2020 at 09:11:55AM -0400, Brian Foster wrote:
> > > On Thu, Jun 04, 2020 at 05:46:04PM +1000, Dave Chinner wrote:
> > > > -		 * check is not sufficient.
> > > > +		 * If we are shut down, unpin and abort the inode now as there
> > > > +		 * is no point in flushing it to the buffer just to get an IO
> > > > +		 * completion to abort the buffer and remove it from the AIL.
> > > >  		 */
> > > > -		if (!cip->i_ino) {
> > > > -			xfs_ifunlock(cip);
> > > > -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> > > > +		if (XFS_FORCED_SHUTDOWN(mp)) {
> > > > +			xfs_iunpin_wait(ip);
> > > 
> > > Note that we have an unlocked check above that skips pinned inodes.
> > 
> > Right, but we could be racing with a transaction commit that pinned
> > the inode and a shutdown. As the comment says: it's a quick and
> > dirty check to avoid trying to get locks when we know that it is
> > unlikely we can flush the inode without blocking. We still have to
> > recheck that state once we have the ILOCK....
> > 
> 
> Right, but that means we can just as easily skip the shutdown processing
> (which waits for unpin) if a particular inode is pinned. So which is
> supposed to happen in the shutdown case?
>
> ISTM that either could happen. As a result this kind of looks like
> random logic to me.

Yes, shutdown is racy, so it could be either. However, I'm not
changing the shutdown logic or handling here. If the shutdown race
could happen before this patchset (and it can), it can still happen
after the patchset, and this patchset does not change the way we
handle the shutdown race at all.

IOWs, while this shutdown logic may appear "random", that's not a
result of this patchset - it's a result of design decisions made in
the last major shutdown rework/cleanup that required checks to be
added to places that could hang waiting for an event that would
never occur because shutdown state prevented it from occurring.

There's already enough complexity in this patchset that adding
shutdown logic changes is just too much to ask for.  If we want to
change how various shutdown logics work, let's do it as a separate
set of changes so all the subtle bugs that result from the changes
bisect to the isolated shutdown logic changes...

Cheers,

Dave.
Brian Foster June 11, 2020, 1:56 p.m. UTC | #5
On Thu, Jun 11, 2020 at 09:40:08AM +1000, Dave Chinner wrote:
> On Wed, Jun 10, 2020 at 09:06:28AM -0400, Brian Foster wrote:
> > On Wed, Jun 10, 2020 at 08:01:39AM +1000, Dave Chinner wrote:
> > > On Tue, Jun 09, 2020 at 09:11:55AM -0400, Brian Foster wrote:
> > > > On Thu, Jun 04, 2020 at 05:46:04PM +1000, Dave Chinner wrote:
> > > > > -		 * check is not sufficient.
> > > > > +		 * If we are shut down, unpin and abort the inode now as there
> > > > > +		 * is no point in flushing it to the buffer just to get an IO
> > > > > +		 * completion to abort the buffer and remove it from the AIL.
> > > > >  		 */
> > > > > -		if (!cip->i_ino) {
> > > > > -			xfs_ifunlock(cip);
> > > > > -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> > > > > +		if (XFS_FORCED_SHUTDOWN(mp)) {
> > > > > +			xfs_iunpin_wait(ip);
> > > > 
> > > > Note that we have an unlocked check above that skips pinned inodes.
> > > 
> > > Right, but we could be racing with a transaction commit that pinned
> > > the inode and a shutdown. As the comment says: it's a quick and
> > > dirty check to avoid trying to get locks when we know that it is
> > > unlikely we can flush the inode without blocking. We still have to
> > > recheck that state once we have the ILOCK....
> > > 
> > 
> > Right, but that means we can just as easily skip the shutdown processing
> > (which waits for unpin) if a particular inode is pinned. So which is
> > supposed to happen in the shutdown case?
> >
> > ISTM that either could happen. As a result this kind of looks like
> > random logic to me.
> 
> Yes, shutdown is racy, so it could be either. However, I'm not
> changing the shutdown logic or handling here. If the shutdown race
> could happen before this patchset (and it can), it can still happen
> after the patchset, and this patchset does not change the way we
> handle the shutdown race at all.
> 
> IOWs, while this shutdown logic may appear "random", that's not a
> result of this patchset - it's a result of design decisions made in
> the last major shutdown rework/cleanup that required checks to be
> added to places that could hang waiting for an event that would
> never occur because shutdown state prevented it from occurring.
> 

It's not so much the shutdown check that I find random as much as how it
intends to handle pinned inodes.

> There's already enough complexity in this patchset that adding
> shutdown logic changes is just too much to ask for.  If we want to
> change how various shutdown logics work, let's do it as a separate
> set of changes so all the subtle bugs that result from the changes
> bisect to the isolated shutdown logic changes...
> 

The fact that shutdown is racy is just background context. My point is
that this patch appears to introduce special shutdown handling for a
condition where it 1.) didn't previously exist and 2.) doesn't appear to
be necessary.

The current push/flush code only incorporates a shutdown check
indirectly via mapping the buffer, which simulates an I/O failure and
causes us to abort the flush (and shutdown if the I/O failure happened
for some other reason). If the shutdown happened sometime after we
acquired the buffer, then there's no real impact on this code path. We
flush the inode(s) and return success. The shutdown will be handled
appropriately when xfsaild attempts to submit the buffer.
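
(The indirect check referred to above is the buffer mapping in the
old xfs_iflush() - roughly, with the signature from memory:)

        error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp,
                               XBF_TRYLOCK, 0);
        if (error)
                return error;   /* -EIO here once the fs is shut down */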

The new code no longer maps the buffer because that is done much
earlier, but for some reason incorporates a new check to abort the flush
if the fs is already shutdown. The problem I have with this is that
these checks tend to be brittle, untested and a maintenance burden. As
such, I don't think we should ever add new shutdown checks for cases
that aren't required for functional correctness. That way we hopefully
move to a state where we have the minimum number of shutdown checks with
broadest coverage to ensure everything unwinds correctly, but don't have
to constantly battle with insufficiently tested logic in obscure
contexts that silently break as surrounding code changes over time and
leads to random fstests hangs and shutdown logic cleanouts every couple
of years.

So my question for any newly added shutdown check is: what problem does
this check solve? If there isn't an explicit functional problem and it's
intended more as convenience/efficiency logic (which is what the comment
implies), then I don't think it's justified. If there is one, then
perhaps it is justified, but should be more clearly documented (and I do
still think the pin check logic should be cleaned up, but that's a very
minor tweak).

Brian

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Dave Chinner June 15, 2020, 1:01 a.m. UTC | #6
On Thu, Jun 11, 2020 at 09:56:18AM -0400, Brian Foster wrote:
> On Thu, Jun 11, 2020 at 09:40:08AM +1000, Dave Chinner wrote:
> > On Wed, Jun 10, 2020 at 09:06:28AM -0400, Brian Foster wrote:
> > > On Wed, Jun 10, 2020 at 08:01:39AM +1000, Dave Chinner wrote:
> > > > On Tue, Jun 09, 2020 at 09:11:55AM -0400, Brian Foster wrote:
> > > > > On Thu, Jun 04, 2020 at 05:46:04PM +1000, Dave Chinner wrote:
> > > > > > -		 * check is not sufficient.
> > > > > > +		 * If we are shut down, unpin and abort the inode now as there
> > > > > > +		 * is no point in flushing it to the buffer just to get an IO
> > > > > > +		 * completion to abort the buffer and remove it from the AIL.
> > > > > >  		 */
> > > > > > -		if (!cip->i_ino) {
> > > > > > -			xfs_ifunlock(cip);
> > > > > > -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> > > > > > +		if (XFS_FORCED_SHUTDOWN(mp)) {
> > > > > > +			xfs_iunpin_wait(ip);
> > > > > 
> > > > > Note that we have an unlocked check above that skips pinned inodes.
> > > > 
> > > > Right, but we could be racing with a transaction commit that pinned
> > > > the inode and a shutdown. As the comment says: it's a quick and
> > > > dirty check to avoid trying to get locks when we know that it is
> > > > unlikely we can flush the inode without blocking. We still have to
> > > > recheck that state once we have the ILOCK....
> > > > 
> > > 
> > > Right, but that means we can just as easily skip the shutdown processing
> > > (which waits for unpin) if a particular inode is pinned. So which is
> > > supposed to happen in the shutdown case?
> > >
> > > ISTM that either could happen. As a result this kind of looks like
> > > random logic to me.
> > 
> > Yes, shutdown is racy, so it could be either. However, I'm not
> > changing the shutdown logic or handling here. If the shutdown race
> > could happen before this patchset (and it can), it can still happen
> > after the patchset, and this patchset does not change the way we
> > handle the shutdown race at all.
> > 
> > IOWs, while this shutdown logic may appear "random", that's not a
> > result of this patchset - it's a result of design decisions made in
> > the last major shutdown rework/cleanup that required checks to be
> > added to places that could hang waiting for an event that would
> > never occur because shutdown state prevented it from occurring.
> 
> It's not so much the shutdown check that I find random as much as how it
> intends to handle pinned inodes.

I can change that, but that's how shutdown was handled by similar
code in the past, so it seemed appropriate here because this code
was hanging on shutdowns during development.

> > There's already enough complexity in this patchset that adding
> > shutdown logic changes is just too much to ask for.  If we want to
> > change how various shutdown logics work, let's do it as a separate
> > set of changes so all the subtle bugs that result from the changes
> > bisect to the isolated shutdown logic changes...
> > 
> 
> The fact that shutdown is racy is just background context. My point is
> that this patch appears to introduce special shutdown handling for a
> condition where it 1.) didn't previously exist and 2.) doesn't appear to
> be necessary.

The random shutdown failures I kept seeing in shutdown tests say
otherwise.

> The current push/flush code only incorporates a shutdown check
> indirectly via mapping the buffer, which simulates an I/O failure and
> causes us to abort the flush (and shutdown if the I/O failure happened
> for some other reason). If the shutdown happened sometime after we
> acquired the buffer, then there's no real impact on this code path. We
> flush the inode(s) and return success. The shutdown will be handled
> appropriately when xfsaild attempts to submit the buffer.

The long-standing rule of thumb is "shutdown == abort in-progress IO
immediately", which is what I followed here when it became apparent
there was some kind of subtle shutdown issue occurring. That made
the shutdown problem go away.

It may be that changes I've been making to other parts of this
writeback code make the shutdown check here unnecessary. My testing
said otherwise, but maybe that's all been cleared up. Hence if the
shutdown check is truly unnecessary, let's clean it up in a future
patchset where that assertion can be bisected down cleanly.  I
needed this to get fstests to pass, and for this patchset which is
entirely unrelated to shutdown architecture, that's all the
justification that should be necessary.

> The new code no longer maps the buffer because that is done much
> earlier, but for some reason incorporates a new check to abort the flush
> if the fs is already shutdown. The problem I have with this is that
> these checks tend to be brittle, untested and a maintenance burden.

Shutdown has *always* been brittle. You've seen it go from being
completely fucking awful to actually being somewhat reliable because
your experience with XFS matches to roughly when we first added
substantial shutdown validation to fstests.  We had a huge mountain
of technical debt around shutdown, but the importance of addressing
it has historically been pretty low because -shutdown is extremely
rare- in production systems.

> As
> such, I don't think we should ever add new shutdown checks for cases
> that aren't required for functional correctness.

I think the burden of proof is the wrong way around for the current
shutdown architecture. If you want to make a rule like this, you
need to first define how the shutdown architecture is going to allow
this sort of rule to be applied without placing unreasonable demands
on the patch author.

> That way we hopefully
> move to a state where we have the minimum number of shutdown checks with
> broadest coverage to ensure everything unwinds correctly, but don't have
> to constantly battle with insufficiently tested logic in obscure
> contexts that silently break as surrounding code changes over time and
> leads to random fstests hangs and shutdown logic cleanouts every couple
> of years.

Yes, I think this is an admirable goal, but I think you've got how
to get there completely backward.  First work out the architecture
and logic that allows us to remove/avoid "randomly" placed checks,
then work towards cleaning up the code. We don't get there by saying
"no new checks!" and then ignoring the underlying problems those
checks are trying to avoid.

If you can come up with a shutdown mechanism that:

	a) prevents all IO the moment a shutdown is triggered, and
	b) avoids shutdown detection ordering hangs between
	   different objects and subsystems,

then you have grounds for saying "nobody should need to add new
shutdown checks". But right now, that's not even close to being the
reality. There needs to be lots more cleanup and rework to the
shutdown code to be done before we get anywhere near this...

> So my question for any newly added shutdown check is: what problem does
> this check solve?

Repeated fstests hangs on various shutdown tests while developing
this code.

Cheers,

Dave.
Brian Foster June 15, 2020, 2:21 p.m. UTC | #7
On Mon, Jun 15, 2020 at 11:01:52AM +1000, Dave Chinner wrote:
> On Thu, Jun 11, 2020 at 09:56:18AM -0400, Brian Foster wrote:
> > On Thu, Jun 11, 2020 at 09:40:08AM +1000, Dave Chinner wrote:
> > > On Wed, Jun 10, 2020 at 09:06:28AM -0400, Brian Foster wrote:
> > > > On Wed, Jun 10, 2020 at 08:01:39AM +1000, Dave Chinner wrote:
> > > > > On Tue, Jun 09, 2020 at 09:11:55AM -0400, Brian Foster wrote:
> > > > > > On Thu, Jun 04, 2020 at 05:46:04PM +1000, Dave Chinner wrote:
> > > > > > > -		 * check is not sufficient.
> > > > > > > +		 * If we are shut down, unpin and abort the inode now as there
> > > > > > > +		 * is no point in flushing it to the buffer just to get an IO
> > > > > > > +		 * completion to abort the buffer and remove it from the AIL.
> > > > > > >  		 */
> > > > > > > -		if (!cip->i_ino) {
> > > > > > > -			xfs_ifunlock(cip);
> > > > > > > -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> > > > > > > +		if (XFS_FORCED_SHUTDOWN(mp)) {
> > > > > > > +			xfs_iunpin_wait(ip);
> > > > > > 
> > > > > > Note that we have an unlocked check above that skips pinned inodes.
> > > > > 
> > > > > Right, but we could be racing with a transaction commit that pinned
> > > > > the inode and a shutdown. As the comment says: it's a quick and
> > > > > dirty check to avoid trying to get locks when we know that it is
> > > > > unlikely we can flush the inode without blocking. We still have to
> > > > > recheck that state once we have the ILOCK....
> > > > > 
> > > > 
> > > > Right, but that means we can just as easily skip the shutdown processing
> > > > (which waits for unpin) if a particular inode is pinned. So which is
> > > > supposed to happen in the shutdown case?
> > > >
> > > > ISTM that either could happen. As a result this kind of looks like
> > > > random logic to me.
> > > 
> > > Yes, shutdown is racy, so it could be either. However, I'm not
> > > changing the shutdown logic or handling here. If the shutdown race
> > > could happen before this patchset (and it can), it can still happen
> > > after the patchset, and this patchset does not change the way we
> > > handle the shutdown race at all.
> > > 
> > > IOWs, while this shutdown logic may appear "random", that's not a
> > > result of this patchset - it's a result of design decisions made in
> > > the last major shutdown rework/cleanup that required checks to be
> > > added to places that could hang waiting for an event that would
> > > never occur because shutdown state prevented it from occurring.
> > 
> > It's not so much the shutdown check that I find random as much as how it
> > intends to handle pinned inodes.
> 
> I can change that, but that's how shutdown was handled by similar
> code in the past, so it seemed appropriate here because this code
> was hanging on shutdowns during development.
> 

Ok.

> > > There's already enough complexity in this patchset that adding
> > > shutdown logic changes is just too much to ask for.  If we want to
> > > change how various shutdown logics work, let's do it as a separate
> > > set of changes so all the subtle bugs that result from the changes
> > > bisect to the isolated shutdown logic changes...
> > > 
> > 
> > The fact that shutdown is racy is just background context. My point is
> > that this patch appears to introduce special shutdown handling for a
> > condition where it 1.) didn't previously exist and 2.) doesn't appear to
> > be necessary.
> 
> The random shutdown failures I kept seeing in shutdown tests say
> otherwise.
> 

Then can we please focus on the issue? My initial feedback was around
the pin state logic and that made me question whether the broader
shutdown logic was spurious. The response to that was that there's too
much complexity in this series to change shutdown logic, etc., and that
suggested I should justify the feedback with this being shutdown code
and all. Now we're debating about whether I want to make some kind of
architectural shutdown rule (I don't, and the reasoning for my question
doesn't invalidate the question itself), while I still have no idea what
issue this code actually fixes.

Can you elaborate on the problem, please? I realize that specifics may
be lost, but what in general about the failure suggested placing this
shutdown check in the cluster flush path? Does it concern you at all
that since shutdown is racy as such, this is possibly suppressing a new
(or old) bug and making it harder to reproduce and diagnose? It's also
quite possible that this was a latent issue exposed by removing the
buffer mapping from this path. In that case it probably does make more
sense to keep the check in this patch, but if that's the case I also
might be willing to spend some time digging into it to try and address
it sooner rather than have it fall by the wayside if you'd be willing to
provide some useful information...

Brian

> > The current push/flush code only incorporates a shutdown check
> > indirectly via mapping the buffer, which simulates an I/O failure and
> > causes us to abort the flush (and shutdown if the I/O failure happened
> > for some other reason). If the shutdown happened sometime after we
> > acquired the buffer, then there's no real impact on this code path. We
> > flush the inode(s) and return success. The shutdown will be handled
> > appropriately when xfsaild attempts to submit the buffer.
> 
> The long-standing rule of thumb is "shutdown == abort in-progress IO
> immediately", which is what I followed here when it became apparent
> there was some kind of subtle shutdown issue occurring. That made
> the shutdown problem go away.
> 
> It may be that changes I've been making to other parts of this
> writeback code make the shutdown check here unnecessary. My testing
> said otherwise, but maybe that's all been cleared up. Hence if the
> shutdown check is truly unnecessary, let's clean it up in a future
> patchset where that assertion can be bisected down cleanly.  I
> needed this to get fstests to pass, and for this patchset which is
> entirely unrelated to shutdown architecture, that's all the
> justification that should be necessary.
> 
> > The new code no longer maps the buffer because that is done much
> > earlier, but for some reason incorporates a new check to abort the flush
> > if the fs is already shutdown. The problem I have with this is that
> > these checks tend to be brittle, untested and a maintenance burden.
> 
> Shutdown has *always* been brittle. You've seen it go from being
> completely fucking awful to actually being somewhat reliable because
> your experience with XFS roughly matches when we first added
> substantial shutdown validation to fstests.  We had a huge mountain
> of technical debt around shutdown, but the importance of addressing
> it has historically been pretty low because -shutdown is extremely
> rare- in production systems.
> 
> > As
> > such, I don't think we should ever add new shutdown checks for cases
> > that aren't required for functional correctness.
> 
> I think the burden of proof is the wrong way around for the current
> shutdown architecture. If you want to make a rule like this, you
> need to first define how the shutdown architecture is going to allow
> this sort of rule to be applied without placing unreasonable demands
> on the patch author.
> 
> > That way we hopefully
> > move to a state where we have the minimum number of shutdown checks with
> > broadest coverage to ensure everything unwinds correctly, but don't have
> > to constantly battle with insufficiently tested logic in obscure
> > contexts that silently break as surrounding code changes over time and
> > leads to random fstests hangs and shutdown logic cleanouts every couple
> > of years.
> 
> Yes, I think this is an admirable goal, but I think you've got how
> to get there completely backward.  First work out the architecture
> and logic that allows us to remove/avoid "randomly" placed checks,
> then work towards cleaning up the code. We don't get there by saying
> "no new checks!" and then ignoring the underlying problems those
> checks are trying to avoid.
> 
> If you can come up with a shutdown mechanism that:
> 
> 	a) prevents all IO the moment a shutdown is triggered, and
> 	b) avoids shutdown detection ordering hangs between
> 	   different objects and subsystems,
> 
> then you have grounds for saying "nobody should need to add new
> shutdown checks". But right now, that's not even close to being the
> reality. There needs to be lots more cleanup and rework to the
> shutdown code to be done before we get anywhere near this...
> 
> > So my question for any newly added shutdown check is: what problem does
> > this check solve?
> 
> Repeated fstests hangs on various shutdown tests while developing
> this code.
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Brian Foster June 16, 2020, 2:41 p.m. UTC | #8
On Mon, Jun 15, 2020 at 10:21:55AM -0400, Brian Foster wrote:
> On Mon, Jun 15, 2020 at 11:01:52AM +1000, Dave Chinner wrote:
> > On Thu, Jun 11, 2020 at 09:56:18AM -0400, Brian Foster wrote:
> > > On Thu, Jun 11, 2020 at 09:40:08AM +1000, Dave Chinner wrote:
> > > > On Wed, Jun 10, 2020 at 09:06:28AM -0400, Brian Foster wrote:
> > > > > On Wed, Jun 10, 2020 at 08:01:39AM +1000, Dave Chinner wrote:
> > > > > > On Tue, Jun 09, 2020 at 09:11:55AM -0400, Brian Foster wrote:
> > > > > > > On Thu, Jun 04, 2020 at 05:46:04PM +1000, Dave Chinner wrote:
> > > > > > > > -		 * check is not sufficient.
> > > > > > > > +		 * If we are shut down, unpin and abort the inode now as there
> > > > > > > > +		 * is no point in flushing it to the buffer just to get an IO
> > > > > > > > +		 * completion to abort the buffer and remove it from the AIL.
> > > > > > > >  		 */
> > > > > > > > -		if (!cip->i_ino) {
> > > > > > > > -			xfs_ifunlock(cip);
> > > > > > > > -			xfs_iunlock(cip, XFS_ILOCK_SHARED);
> > > > > > > > +		if (XFS_FORCED_SHUTDOWN(mp)) {
> > > > > > > > +			xfs_iunpin_wait(ip);
> > > > > > > 
> > > > > > > Note that we have an unlocked check above that skips pinned inodes.
> > > > > > 
> > > > > > Right, but we could be racing with a transaction commit that pinned
> > > > > > the inode and a shutdown. As the comment says: it's a quick and
> > > > > > dirty check to avoid trying to get locks when we know that it is
> > > > > > unlikely we can flush the inode without blocking. We still have to
> > > > > > recheck that state once we have the ILOCK....
> > > > > > 
> > > > > 
> > > > > Right, but that means we can just as easily skip the shutdown processing
> > > > > (which waits for unpin) if a particular inode is pinned. So which is
> > > > > supposed to happen in the shutdown case?
> > > > >
> > > > > ISTM that either could happen. As a result this kind of looks like
> > > > > random logic to me.
> > > > 
> > > > Yes, shutdown is racy, so it could be either. However, I'm not
> > > > changing the shutdown logic or handling here. If the shutdown race
> > > > could happen before this patchset (and it can), it can still happen
> > > > after the patchset, and this patchset does not change the way we
> > > > handle the shutdown race at all.
> > > > 
> > > > IOWs, while this shutdown logic may appear "random", that's not a
> > > > result of this patchset - it's a result of design decisions made in
> > > > the last major shutdown rework/cleanup that required checks to be
> > > > added to places that could hang waiting for an event that would
> > > > never occur because shutdown state prevented it from occurring.
> > > 
> > > It's not so much the shutdown check that I find random as much as how it
> > > intends to handle pinned inodes.
> > 
> > I can change that, but that's how shutdown was handled by similar
> > code in the past, so it seemed appropriate here because this code
> > was hanging on shutdowns during development.
> > 
> 
> Ok.
> 
> > > > There's already enough complexity in this patchset that adding
> > > > shutdown logic changes is just too much to ask for.  If we want to
> > > > change how various shutdown logics work, let's do it as a separate
> > > > set of changes so all the subtle bugs that result from the changes
> > > > bisect to the isolated shutdown logic changes...
> > > > 
> > > 
> > > The fact that shutdown is racy is just background context. My point is
> > > that this patch appears to introduce special shutdown handling for a
> > > condition where it 1.) didn't previously exist and 2.) doesn't appear to
> > > be necessary.
> > 
> > The random shutdown failures I kept seeing in shutdown tests say
> > otherwise.
> > 
> 
> Then can we please focus on the issue? My initial feedback was around
> the pin state logic and that made me question whether the broader
> shutdown logic was spurious. The response to that was that there's too
> much complexity in this series to change shutdown logic, etc., and that
> suggested I should justify the feedback with this being shutdown code
> and all. Now we're debating whether I want to make some kind of
> architectural shutdown rule (I don't, and the reasoning for my question
> doesn't invalidate the question itself), while I still have no idea what
> issue this code actually fixes.
> 
> Can you elaborate on the problem, please? I realize that specifics may
> be lost, but what in general about the failure suggested placing this
> shutdown check in the cluster flush path? Does it concern you at all
> that since shutdown is racy as such, this is possibly suppressing a new
> (or old) bug and making it harder to reproduce and diagnose? It's also
> quite possible that this was a latent issue exposed by removing the
> buffer mapping from this path. In that case it probably does make more
> sense to keep the check in this patch, but if that's the case I also
> might be willing to spend some time digging into it to try and address
> it sooner rather than have it fall by the wayside if you'd be willing to
> provide some useful information...
> 

For reference, I dropped the shutdown check and reproduced a generic/051
explosion (kernel BUG) fairly quickly. Digging into that a bit, it looks
like the problem is we essentially can flush an inode that isn't in the
AIL because in the log I/O error (shutdown) scenario, we unpin the inode
and don't insert it. If the buffer has other inodes in the AIL, then the
cluster flush sees the aforementioned inode is dirty and unpinned and
presumably safe to flush, even though it wasn't inserted because the log
I/O failed. The problem thus manifests as a double remove from the AIL
when the buffer is submitted and failed by xfsaild with a non-AIL
resident flushed inode.
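
In sequence form:

        /*
         * 1. Log IO fails: the inode is unpinned but never inserted
         *    into the AIL.
         * 2. xfs_iflush_cluster() sees a dirty, unpinned inode and
         *    flushes it to the buffer.
         * 3. xfsaild submits the buffer and the submission fails due
         *    to the shutdown.
         * 4. Failing the buffer aborts every attached inode, and the
         *    AIL removal for the never-inserted inode trips the BUG.
         */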

I think that fundamentally this is not a problem outside of shutdown,
because we're in a shutdown state before the item is unpinned, and it
is not necessarily a race introduced by this series when you consider
the current cluster flush implementation. Therefore it's reasonable to
include the shutdown check in the flush path as this patch does. I think
there are several options to clean this up fairly easily (reset dirty
state on abort, abort the flush on buffer completion when shutdown,
etc.), but that can be evaluated separately from this series. Given all
of that, I'm Ok with this patch if you fixed up the slightly erratic pin
checking logic (including the shutdown check) and replace the misleading
comment above the shutdown check with something like:

        /*
         * Abort the flush if we are shut down because the inode may not
         * currently be in the AIL. Log I/O failure unpins the inode
         * without inserting it, leaving a dirty/unpinned inode that
         * otherwise looks like it should be flushed.
         */
 
Brian

> Brian
> 
> > > The current push/flush code only incorporates a shutdown check
> > > indirectly via mapping the buffer, which simulates an I/O failure and
> > > causes us to abort the flush (and shutdown if the I/O failure happened
> > > for some other reason). If the shutdown happened sometime after we
> > > acquired the buffer, then there's no real impact on this code path. We
> > > flush the inode(s) and return success. The shutdown will be handled
> > > appropriately when xfsaild attempts to submit the buffer.
> > 
> > The long-standing rule of thumb is "shutdown == abort in-progress IO
> > immediately", which is what I followed here when it became apparent
> > there was some kind of subtle shutdown issue occurring. That made
> > the shutdown problem go away.
> > 
> > It may be that changes I've been making to other parts of this
> > writeback code make the shutdown check here unnecessary. My testing
> > said otherwise, but maybe that's all been cleared up. Hence if the
> > shutdown check is truly unnecessary, let's clean it up in a future
> > patchset where that assertion can be bisected down cleanly.  I
> > needed this to get fstests to pass, and for this patchset which is
> > entirely unrelated to shutdown architecture, that's all the
> > justification that should be necessary.
> > 
> > > The new code no longer maps the buffer because that is done much
> > > earlier, but for some reason incorporates a new check to abort the flush
> > > if the fs is already shutdown. The problem I have with this is that
> > > these checks tend to be brittle, untested and a maintenance burden.
> > 
> > Shutdown has *always* been brittle. You've seen it go from being
> > completely fucking awful to actually being somewhat reliable because
> > your experience with XFS roughly matches when we first added
> > substantial shutdown validation to fstests.  We had a huge mountain
> > of technical debt around shutdown, but the importance of addressing
> > it has historically been pretty low because -shutdown is extremely
> > rare- in production systems.
> > 
> > > As
> > > such, I don't think we should ever add new shutdown checks for cases
> > > that aren't required for functional correctness.
> > 
> > I think the burden of proof is the wrong way around for the current
> > shutdown architecture. If you want to make a rule like this, you
> > need to first define how the shutdown architecture is going to allow
> > this sort of rule to be applied without placing unreasonable demands
> > on the patch author.
> > 
> > > That way we hopefully
> > > move to a state where we have the minimum number of shutdown checks with
> > > broadest coverage to ensure everything unwinds correctly, but don't have
> > > to constantly battle with insufficiently tested logic in obscure
> > > contexts that silently breaks as surrounding code changes over time and
> > > leads to random fstests hangs and shutdown logic cleanouts every couple
> > > of years.
> > 
> > Yes, I think this is an admirable goal, but I think you've got how
> > to get there completely backward.  First work out the architecture
> > and logic that allows us to remove/avoid "randomly" placed checks,
> > then work towards cleaning up the code. We don't get there by saying
> > "no new checks!" and then ignoring the underlying problems those
> > checks are trying to avoid.
> > 
> > If you can come up with a shutdown mechanism that:
> > 
> > 	a) prevents all IO the moment a shutdown is triggered, and
> > 	b) avoids shutdown detection ordering hangs between
> > 	   different objects and subsystems,
> > 
> > then you have grounds for saying "nobody should need to add new
> > shutdown checks". But right now, that's not even close to being the
> > reality. There's a lot more cleanup and rework of the shutdown code
> > to be done before we get anywhere near this...
> > 
> > > So my question for any newly added shutdown check is: what problem does
> > > this check solve?
> > 
> > Repeated fstests hangs on various shutdown tests while developing
> > this code.
> > 
> > Cheers,
> > 
> > Dave.
> > -- 
> > Dave Chinner
> > david@fromorbit.com
> > 
>
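
The locking handoff at the heart of the reworked loop below is worth
calling out: the i_flags_lock is taken first, the reclaim and
flush-locked states are re-checked under it, and only then is the ILOCK
acquired -- still under the i_flags_lock -- so reclaim cannot win the
race once the check has passed. A minimal sketch of that pattern,
extracted from the hunk that follows (loop body only, surrounding
context elided):

        spin_lock(&ip->i_flags_lock);
        if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
                /* reclaim owns it, or it is already flush locked */
                spin_unlock(&ip->i_flags_lock);
                continue;
        }
        if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
                /* taking the ILOCK would block; skip this inode */
                spin_unlock(&ip->i_flags_lock);
                continue;
        }
        /* the ILOCK now pins the inode against reclaim */
        spin_unlock(&ip->i_flags_lock);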
Patch

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8566bd0f4334d..931a483d5b316 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3611,117 +3611,94 @@  xfs_iflush(
  */
 int
 xfs_iflush_cluster(
-	struct xfs_inode	*ip,
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_perag	*pag;
-	unsigned long		first_index, mask;
-	int			cilist_size;
-	struct xfs_inode	**cilist;
-	struct xfs_inode	*cip;
-	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
-	int			error = 0;
-	int			nr_found;
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_log_item	*lip, *n;
+	struct xfs_inode	*ip;
+	struct xfs_inode_log_item *iip;
 	int			clcount = 0;
-	int			i;
-
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-
-	cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
-	cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
-	if (!cilist)
-		goto out_put;
-
-	mask = ~(igeo->inodes_per_cluster - 1);
-	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-	rcu_read_lock();
-	/* really need a gang lookup range call here */
-	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
-					first_index, igeo->inodes_per_cluster);
-	if (nr_found == 0)
-		goto out_free;
+	int			error = 0;
 
-	for (i = 0; i < nr_found; i++) {
-		cip = cilist[i];
+	/*
+	 * We must use the safe variant here as on shutdown xfs_iflush_abort()
+	 * can remove itself from the list.
+	 */
+	list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+		iip = (struct xfs_inode_log_item *)lip;
+		ip = iip->ili_inode;
 
 		/*
-		 * because this is an RCU protected lookup, we could find a
-		 * recently freed or even reallocated inode during the lookup.
-		 * We need to check under the i_flags_lock for a valid inode
-		 * here. Skip it if it is not valid or the wrong inode.
+		 * Quick and dirty check to avoid locks if possible.
 		 */
-		spin_lock(&cip->i_flags_lock);
-		if (!cip->i_ino ||
-		    __xfs_iflags_test(cip, XFS_ISTALE)) {
-			spin_unlock(&cip->i_flags_lock);
+		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
+			continue;
+		if (xfs_ipincount(ip))
 			continue;
-		}
 
 		/*
-		 * Once we fall off the end of the cluster, no point checking
-		 * any more inodes in the list because they will also all be
-		 * outside the cluster.
+		 * The inode is still attached to the buffer, which means it is
+		 * dirty but reclaim might try to grab it. Check carefully for
+		 * that, and grab the ilock while still holding the i_flags_lock
+		 * to guarantee reclaim will not be able to reclaim this inode
+		 * once we drop the i_flags_lock.
 		 */
-		if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
-			spin_unlock(&cip->i_flags_lock);
-			break;
+		spin_lock(&ip->i_flags_lock);
+		ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
+		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
+			spin_unlock(&ip->i_flags_lock);
+			continue;
 		}
-		spin_unlock(&cip->i_flags_lock);
 
 		/*
-		 * Do an un-protected check to see if the inode is dirty and
-		 * is a candidate for flushing.  These checks will be repeated
-		 * later after the appropriate locks are acquired.
+		 * ILOCK will pin the inode against reclaim and prevent
+		 * concurrent transactions modifying the inode while we are
+		 * flushing the inode.
 		 */
-		if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
+		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+			spin_unlock(&ip->i_flags_lock);
 			continue;
+		}
+		spin_unlock(&ip->i_flags_lock);
 
 		/*
-		 * Try to get locks.  If any are unavailable or it is pinned,
-		 * then this inode cannot be flushed and is skipped.
+		 * Skip inodes that are already flush locked as they have
+		 * already been written to the buffer.
 		 */
-
-		if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
-			continue;
-		if (!xfs_iflock_nowait(cip)) {
-			xfs_iunlock(cip, XFS_ILOCK_SHARED);
-			continue;
-		}
-		if (xfs_ipincount(cip)) {
-			xfs_ifunlock(cip);
-			xfs_iunlock(cip, XFS_ILOCK_SHARED);
+		if (!xfs_iflock_nowait(ip)) {
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
 			continue;
 		}
 
-
 		/*
-		 * Check the inode number again, just to be certain we are not
-		 * racing with freeing in xfs_reclaim_inode(). See the comments
-		 * in that function for more information as to why the initial
-		 * check is not sufficient.
+		 * If we are shut down, unpin and abort the inode now as there
+		 * is no point in flushing it to the buffer just to get an IO
+		 * completion to abort the buffer and remove it from the AIL.
 		 */
-		if (!cip->i_ino) {
-			xfs_ifunlock(cip);
-			xfs_iunlock(cip, XFS_ILOCK_SHARED);
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			xfs_iunpin_wait(ip);
+			/* xfs_iflush_abort() drops the flush lock */
+			xfs_iflush_abort(ip);
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			error = -EIO;
 			continue;
 		}
 
-		/*
-		 * arriving here means that this inode can be flushed.  First
-		 * re-check that it's dirty before flushing.
-		 */
-		if (!xfs_inode_clean(cip)) {
-			error = xfs_iflush(cip, bp);
-			if (error) {
-				xfs_iunlock(cip, XFS_ILOCK_SHARED);
-				goto out_free;
-			}
-			clcount++;
-		} else {
-			xfs_ifunlock(cip);
+		/* don't block waiting on a log force to unpin dirty inodes */
+		if (xfs_ipincount(ip)) {
+			xfs_ifunlock(ip);
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			continue;
 		}
-		xfs_iunlock(cip, XFS_ILOCK_SHARED);
+
+		if (!xfs_inode_clean(ip))
+			error = xfs_iflush(ip, bp);
+		else
+			xfs_ifunlock(ip);
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		if (error)
+			break;
+		clcount++;
 	}
 
 	if (clcount) {
@@ -3729,11 +3706,6 @@  xfs_iflush_cluster(
 		XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
 	}
 
-out_free:
-	rcu_read_unlock();
-	kmem_free(cilist);
-out_put:
-	xfs_perag_put(pag);
 	if (error) {
 		bp->b_flags |= XBF_ASYNC;
 		xfs_buf_ioend_fail(bp);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index d1109eb13ba2e..b93cf9076df8a 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -427,7 +427,7 @@  int		xfs_log_force_inode(struct xfs_inode *ip);
 void		xfs_iunpin_wait(xfs_inode_t *);
 #define xfs_ipincount(ip)	((unsigned int) atomic_read(&ip->i_pincount))
 
-int		xfs_iflush_cluster(struct xfs_inode *, struct xfs_buf *);
+int		xfs_iflush_cluster(struct xfs_buf *);
 void		xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
 				struct xfs_inode *ip1, uint ip1_mode);
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 326547e89cb6b..dee7385466f83 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -511,7 +511,7 @@  xfs_inode_item_push(
 	 * reference for IO until we queue the buffer for delwri submission.
 	 */
 	xfs_buf_hold(bp);
-	error = xfs_iflush_cluster(ip, bp);
+	error = xfs_iflush_cluster(bp);
 	if (!error) {
 		if (!xfs_buf_delwri_queue(bp, buffer_list))
 			rval = XFS_ITEM_FLUSHING;