
[v2,05/13] xfs: ratelimit unmount time per-buffer I/O error message

Message ID 20200422175429.38957-6-bfoster@redhat.com (mailing list archive)
State New, archived
Series xfs: flush related error handling cleanups

Commit Message

Brian Foster April 22, 2020, 5:54 p.m. UTC
At unmount time, XFS emits a warning for every in-core buffer that
might have undergone a write error. In practice this behavior is
probably reasonable given that the filesystem is likely short lived
once I/O errors begin to occur consistently. Under certain test or
otherwise expected error conditions, this can spam the logs and slow
down the unmount.

We already have a ratelimit state defined for buffers failing
writeback. Fold this state into the buftarg and reuse it for the
unmount time errors.

Signed-off-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/xfs_buf.c      | 13 +++++++++++--
 fs/xfs/xfs_buf.h      |  1 +
 fs/xfs/xfs_buf_item.c | 10 +---------
 3 files changed, 13 insertions(+), 11 deletions(-)

Comments

Dave Chinner April 23, 2020, 4:46 a.m. UTC | #1
On Wed, Apr 22, 2020 at 01:54:21PM -0400, Brian Foster wrote:
> At unmount time, XFS emits a warning for every in-core buffer that
> might have undergone a write error. In practice this behavior is
> probably reasonable given that the filesystem is likely short lived
> once I/O errors begin to occur consistently. Under certain test or
> otherwise expected error conditions, this can spam the logs and slow
> down the unmount.
> 
> We already have a ratelimit state defined for buffers failing
> writeback. Fold this state into the buftarg and reuse it for the
> unmount time errors.
> 
> Signed-off-by: Brian Foster <bfoster@redhat.com>

Looks fine, but I suspect we both missed something here:
xfs_buf_ioerror_alert() was made a ratelimited printk in the last
cycle:

void
xfs_buf_ioerror_alert(
        struct xfs_buf          *bp,
        xfs_failaddr_t          func)
{
        xfs_alert_ratelimited(bp->b_mount,
"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
                        func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
                        -bp->b_error);
}

Hence I think all these buffer error alerts can be brought under the
same rate limiting variable. Something like this in xfs_message.c:

void
xfs_buf_alert_ratelimited(
        struct xfs_buf          *bp,
	const char		*rlmsg,
	const char		*fmt,
	...)
{
	struct va_format        vaf;
	va_list                 args;

	if (!___ratelimit(&bp->b_target->bt_ioerror_rl, rlmsg))
		return;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.args = &args;
	__xfs_printk(KERN_ALERT, bp->b_mount, &vaf);
	va_end(args);
}

and:

void
xfs_buf_ioerror_alert(
        struct xfs_buf          *bp,
        xfs_failaddr_t          func)
{
	xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
		"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
		func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, -bp->b_error);
}


> ---
>  fs/xfs/xfs_buf.c      | 13 +++++++++++--
>  fs/xfs/xfs_buf.h      |  1 +
>  fs/xfs/xfs_buf_item.c | 10 +---------
>  3 files changed, 13 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 7a6bc617f0a9..c28a93d2fd8c 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -1684,10 +1684,12 @@ xfs_wait_buftarg(
>  			struct xfs_buf *bp;
>  			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
>  			list_del_init(&bp->b_lru);
> -			if (bp->b_flags & XBF_WRITE_FAIL) {
> +			if (bp->b_flags & XBF_WRITE_FAIL &&
> +			    ___ratelimit(&bp->b_target->bt_ioerror_rl,
> +					 "XFS: Corruption Alert")) {
>  				xfs_alert(btp->bt_mount,
>  "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
> -					(long long)bp->b_bn);
> +					  (long long)bp->b_bn);
>  				xfs_alert(btp->bt_mount,
>  "Please run xfs_repair to determine the extent of the problem.");
>  			}

I think if we are tossing away metadata here, we should probably
shut down the filesystem once the loop has completed. That way we
get all the normal warnings about running xfs_repair and don't have
to open code it here...
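
i.e. a rough, untested sketch of that (SHUTDOWN_META_IO_ERROR being the
existing shutdown reason used for failed metadata writes):

	bool	write_fail = false;

	while (!list_empty(&dispose)) {
		struct xfs_buf	*bp;

		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		/* just note the failure; drop the per-buffer alerts */
		if (bp->b_flags & XBF_WRITE_FAIL)
			write_fail = true;
		xfs_buf_rele(bp);
	}

	/* one shutdown emits the usual corruption/xfs_repair messaging */
	if (write_fail)
		xfs_force_shutdown(btp->bt_mount, SHUTDOWN_META_IO_ERROR);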

> -
>  STATIC uint
>  xfs_buf_item_push(
>  	struct xfs_log_item	*lip,
> @@ -518,7 +510,7 @@ xfs_buf_item_push(
>  
>  	/* has a previous flush failed due to IO errors? */
>  	if ((bp->b_flags & XBF_WRITE_FAIL) &&
> -	    ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
> +	    ___ratelimit(&bp->b_target->bt_ioerror_rl, "XFS: Failing async write")) {
>  		xfs_warn(bp->b_mount,
>  "Failing async write on buffer block 0x%llx. Retrying async write.",
>  			 (long long)bp->b_bn);

This gets simplified to:

	if (bp->b_flags & XBF_WRITE_FAIL) {
		xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
"Failing async write on buffer block 0x%llx. Retrying async write.",
					(long long)bp->b_bn);
	}

Cheers,

Dave.
Brian Foster April 23, 2020, 2:29 p.m. UTC | #2
On Thu, Apr 23, 2020 at 02:46:04PM +1000, Dave Chinner wrote:
> On Wed, Apr 22, 2020 at 01:54:21PM -0400, Brian Foster wrote:
> > At unmount time, XFS emits a warning for every in-core buffer that
> > might have undergone a write error. In practice this behavior is
> > probably reasonable given that the filesystem is likely short lived
> > once I/O errors begin to occur consistently. Under certain test or
> > otherwise expected error conditions, this can spam the logs and slow
> > down the unmount.
> > 
> > We already have a ratelimit state defined for buffers failing
> > writeback. Fold this state into the buftarg and reuse it for the
> > unmount time errors.
> > 
> > Signed-off-by: Brian Foster <bfoster@redhat.com>
> 
> Looks fine, but I suspect we both missed something here:
> xfs_buf_ioerror_alert() was made a ratelimited printk in the last
> cycle:
> 
> void
> xfs_buf_ioerror_alert(
>         struct xfs_buf          *bp,
>         xfs_failaddr_t          func)
> {
>         xfs_alert_ratelimited(bp->b_mount,
> "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
>                         func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
>                         -bp->b_error);
> }
> 

Yeah, I hadn't noticed that.

> Hence I think all these buffer error alerts can be brought under the
> same rate limiting variable. Something like this in xfs_message.c:
> 

One thing to note is that xfs_alert_ratelimited() ultimately uses
the DEFAULT_RATELIMIT_INTERVAL of 5s. The ratelimit we're generalizing
here uses 30s (both use a burst of 10). That seems reasonable enough to
me for I/O errors so I'm good with the changes below.
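
For reference, the two configurations side by side (illustrative only;
the defaults come from include/linux/ratelimit.h):

	/* used by xfs_alert_ratelimited(): 5s interval, burst of 10 */
	#define DEFAULT_RATELIMIT_INTERVAL	(5 * HZ)
	#define DEFAULT_RATELIMIT_BURST		10

	/* the buftarg state this series generalizes: 30s, same burst */
	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
			     DEFAULT_RATELIMIT_BURST);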

FWIW, that also means we could just call xfs_buf_alert_ratelimited()
from xfs_buf_item_push() if we're also Ok with using an "alert" instead
of a "warn." I'm not immediately aware of a reason to use one over the
other (xfs_wait_buftarg() already uses alert) so I'll try that unless I
hear an objection. The xfs_wait_buftarg() ratelimit presumably remains
open coded because it's two separate calls and we probably don't want
them to individually count against the limit.

Brian

> void
> xfs_buf_alert_ratelimited(
>         struct xfs_buf          *bp,
> 	const char		*rlmsg,
> 	const char		*fmt,
> 	...)
> {
> 	struct va_format        vaf;
> 	va_list                 args;
> 
> 	if (!___ratelimit(&bp->b_target->bt_ioerror_rl, rlmsg))
> 		return;
> 
> 	va_start(args, fmt);
> 	vaf.fmt = fmt;
> 	vaf.args = &args;
> 	__xfs_printk(KERN_ALERT, bp->b_mount, &vaf);
> 	va_end(args);
> }
> 
> and:
> 
> void
> xfs_buf_ioerror_alert(
>         struct xfs_buf          *bp,
>         xfs_failaddr_t          func)
> {
> 	xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
> 		"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
> 		func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, -bp->b_error);
> }
> 
> 
> > ---
> >  fs/xfs/xfs_buf.c      | 13 +++++++++++--
> >  fs/xfs/xfs_buf.h      |  1 +
> >  fs/xfs/xfs_buf_item.c | 10 +---------
> >  3 files changed, 13 insertions(+), 11 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> > index 7a6bc617f0a9..c28a93d2fd8c 100644
> > --- a/fs/xfs/xfs_buf.c
> > +++ b/fs/xfs/xfs_buf.c
> > @@ -1684,10 +1684,12 @@ xfs_wait_buftarg(
> >  			struct xfs_buf *bp;
> >  			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
> >  			list_del_init(&bp->b_lru);
> > -			if (bp->b_flags & XBF_WRITE_FAIL) {
> > +			if (bp->b_flags & XBF_WRITE_FAIL &&
> > +			    ___ratelimit(&bp->b_target->bt_ioerror_rl,
> > +					 "XFS: Corruption Alert")) {
> >  				xfs_alert(btp->bt_mount,
> >  "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
> > -					(long long)bp->b_bn);
> > +					  (long long)bp->b_bn);
> >  				xfs_alert(btp->bt_mount,
> >  "Please run xfs_repair to determine the extent of the problem.");
> >  			}
> 
> I think if we are tossing away metadata here, we should probably
> shut down the filesystem once the loop has completed. That way we
> get all the normal warnings about running xfs_repair and don't have
> to open code it here...
> 
> > -
> >  STATIC uint
> >  xfs_buf_item_push(
> >  	struct xfs_log_item	*lip,
> > @@ -518,7 +510,7 @@ xfs_buf_item_push(
> >  
> >  	/* has a previous flush failed due to IO errors? */
> >  	if ((bp->b_flags & XBF_WRITE_FAIL) &&
> > -	    ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
> > +	    ___ratelimit(&bp->b_target->bt_ioerror_rl, "XFS: Failing async write")) {
> >  		xfs_warn(bp->b_mount,
> >  "Failing async write on buffer block 0x%llx. Retrying async write.",
> >  			 (long long)bp->b_bn);
> 
> This gets simplified to:
> 
> 	if (bp->b_flags & XBF_WRITE_FAIL) {
> 		xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
> "Failing async write on buffer block 0x%llx. Retrying async write.",
> 					(long long)bp->b_bn);
> 	}
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Dave Chinner April 23, 2020, 9:14 p.m. UTC | #3
On Thu, Apr 23, 2020 at 10:29:58AM -0400, Brian Foster wrote:
> On Thu, Apr 23, 2020 at 02:46:04PM +1000, Dave Chinner wrote:
> > On Wed, Apr 22, 2020 at 01:54:21PM -0400, Brian Foster wrote:
> > > At unmount time, XFS emits a warning for every in-core buffer that
> > > might have undergone a write error. In practice this behavior is
> > > probably reasonable given that the filesystem is likely short lived
> > > once I/O errors begin to occur consistently. Under certain test or
> > > otherwise expected error conditions, this can spam the logs and slow
> > > down the unmount.
> > > 
> > > We already have a ratelimit state defined for buffers failing
> > > writeback. Fold this state into the buftarg and reuse it for the
> > > unmount time errors.
> > > 
> > > Signed-off-by: Brian Foster <bfoster@redhat.com>
> > 
> > Looks fine, but I suspect we both missed something here:
> > xfs_buf_ioerror_alert() was made a ratelimited printk in the last
> > cycle:
> > 
> > void
> > xfs_buf_ioerror_alert(
> >         struct xfs_buf          *bp,
> >         xfs_failaddr_t          func)
> > {
> >         xfs_alert_ratelimited(bp->b_mount,
> > "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
> >                         func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
> >                         -bp->b_error);
> > }
> > 
> 
> Yeah, I hadn't noticed that.
> 
> > Hence I think all these buffer error alerts can be brought under the
> > same rate limiting variable. Something like this in xfs_message.c:
> > 
> 
> One thing to note is that xfs_alert_ratelimited() ultimately uses
> the DEFAULT_RATELIMIT_INTERVAL of 5s. The ratelimit we're generalizing
> here uses 30s (both use a burst of 10). That seems reasonable enough to
> me for I/O errors so I'm good with the changes below.
> 
> FWIW, that also means we could just call xfs_buf_alert_ratelimited()
> from xfs_buf_item_push() if we're also Ok with using an "alert" instead
> of a "warn." I'm not immediately aware of a reason to use one over the
> other (xfs_wait_buftarg() already uses alert) so I'll try that unless I
> hear an objection.

Sounds fine to me.

> The xfs_wait_buftarg() ratelimit presumably remains
> open coded because it's two separate calls and we probably don't want
> them to individually count against the limit.

That's why I suggested dropping the second "run xfs_repair" message
and triggering a shutdown after the wait loop. That way we don't
issue "run xfs_repair" for every single failed buffer (largely
noise!), and we get a non-rate-limited common "run xfs_repair"
message once we processed all the failed writes.

Cheers,

Dave.
Brian Foster April 24, 2020, 11:12 a.m. UTC | #4
On Fri, Apr 24, 2020 at 07:14:37AM +1000, Dave Chinner wrote:
> On Thu, Apr 23, 2020 at 10:29:58AM -0400, Brian Foster wrote:
> > On Thu, Apr 23, 2020 at 02:46:04PM +1000, Dave Chinner wrote:
> > > On Wed, Apr 22, 2020 at 01:54:21PM -0400, Brian Foster wrote:
> > > > At unmount time, XFS emits a warning for every in-core buffer that
> > > > might have undergone a write error. In practice this behavior is
> > > > probably reasonable given that the filesystem is likely short lived
> > > > once I/O errors begin to occur consistently. Under certain test or
> > > > otherwise expected error conditions, this can spam the logs and slow
> > > > down the unmount.
> > > > 
> > > > We already have a ratelimit state defined for buffers failing
> > > > writeback. Fold this state into the buftarg and reuse it for the
> > > > unmount time errors.
> > > > 
> > > > Signed-off-by: Brian Foster <bfoster@redhat.com>
> > > 
> > > Looks fine, but I suspect we both missed something here:
> > > xfs_buf_ioerror_alert() was made a ratelimited printk in the last
> > > cycle:
> > > 
> > > void
> > > xfs_buf_ioerror_alert(
> > >         struct xfs_buf          *bp,
> > >         xfs_failaddr_t          func)
> > > {
> > >         xfs_alert_ratelimited(bp->b_mount,
> > > "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
> > >                         func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
> > >                         -bp->b_error);
> > > }
> > > 
> > 
> > Yeah, I hadn't noticed that.
> > 
> > > Hence I think all these buffer error alerts can be brought under the
> > > same rate limiting variable. Something like this in xfs_message.c:
> > > 
> > 
> > One thing to note is that xfs_alert_ratelimited() ultimately uses
> > the DEFAULT_RATELIMIT_INTERVAL of 5s. The ratelimit we're generalizing
> > here uses 30s (both use a burst of 10). That seems reasonable enough to
> > me for I/O errors so I'm good with the changes below.
> > 
> > FWIW, that also means we could just call xfs_buf_alert_ratelimited()
> > from xfs_buf_item_push() if we're also Ok with using an "alert" instead
> > of a "warn." I'm not immediately aware of a reason to use one over the
> > other (xfs_wait_buftarg() already uses alert) so I'll try that unless I
> > hear an objection.
> 
> Sounds fine to me.
> 
> > The xfs_wait_buftarg() ratelimit presumably remains
> > open coded because it's two separate calls and we probably don't want
> > them to individually count against the limit.
> 
> That's why I suggested dropping the second "run xfs_repair" message
> and triggering a shutdown after the wait loop. That way we don't
> issue "run xfs_repair" for every single failed buffer (largely
> noise!), and we get a non-rate-limited common "run xfs_repair"
> message once we processed all the failed writes.
> 

Sorry, must have missed that in the last reply. I don't think we want to
shut down here because XBF_WRITE_FAIL only reflects that the internal
async write retry (e.g. the one historically used to mitigate transient
I/O errors) has occurred on the buffer, not necessarily that the
immediately previous I/O has failed. For that reason I've kind of looked
at this particular instance as more of a warning that I/O errors have
occurred in the past and the user might want to verify it didn't result
in unexpected damage. FWIW, I've observed plenty of these messages long
after I've disabled error injection and allowed I/O to proceed and the
fs to unmount cleanly.

Looking at it again, the wording of the message is much stronger than a
warning (i.e. "Corruption Alert", "permanent write failures"), so
perhaps we should revisit what we're actually trying to accomplish with
this message. Note that we do have the buffer push time alert to
indicate that I/O errors and retries are occurring, as well as error
handling logic to shutdown on I/O failure while the fs is unmounting
(xfs_buf_iodone_callback_error()), so both of those cases are
fundamentally covered already.

ISTM that leaves at least a few simple options for the
xfs_wait_buftarg() message:

1.) Remove it.
2.) Massage it into more of a "Warning: buffer I/O errors have occurred
in the past" type of message. This could perhaps retain the existing
XBF_WRITE_FAIL logic, but use a flag to lift it out of the loop and thus
avoid the need to rate limit it at all (rough sketch after this list).
3.) Fix up the logic to only dump (ratelimited) specifics on buffers
that have actually failed I/O. This means we use the existing message
but perhaps check ->b_error instead of XBF_WRITE_FAIL. We still don't
need to shut down (I'd probably add an assert), but we could also lift
the second xfs_repair line of the message out of the loop to reduce the
noise.
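
Something like this for option 2, roughly (untested sketch):

	bool	write_fail = false;

	/* in the dispose loop, instead of the per-buffer alerts: */
	if (bp->b_flags & XBF_WRITE_FAIL)
		write_fail = true;

	/* ... then once after the loop: */
	if (write_fail)
		xfs_warn(btp->bt_mount,
"Buffer I/O errors occurred in the past. Run xfs_repair to check for damage.");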

We could also look into clearing XBF_WRITE_FAIL on successful I/O
completion, FWIW. The existing logic kind of bugs me regardless.
Thoughts?

Brian

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Dave Chinner April 24, 2020, 10:08 p.m. UTC | #5
On Fri, Apr 24, 2020 at 07:12:32AM -0400, Brian Foster wrote:
> On Fri, Apr 24, 2020 at 07:14:37AM +1000, Dave Chinner wrote:
> > On Thu, Apr 23, 2020 at 10:29:58AM -0400, Brian Foster wrote:
> > > On Thu, Apr 23, 2020 at 02:46:04PM +1000, Dave Chinner wrote:
> > > > On Wed, Apr 22, 2020 at 01:54:21PM -0400, Brian Foster wrote:
> > > > > At unmount time, XFS emits a warning for every in-core buffer that
> > > > > might have undergone a write error. In practice this behavior is
> > > > > probably reasonable given that the filesystem is likely short lived
> > > > > once I/O errors begin to occur consistently. Under certain test or
> > > > > otherwise expected error conditions, this can spam the logs and slow
> > > > > down the unmount.
> > > > > 
> > > > > We already have a ratelimit state defined for buffers failing
> > > > > writeback. Fold this state into the buftarg and reuse it for the
> > > > > unmount time errors.
> > > > > 
> > > > > Signed-off-by: Brian Foster <bfoster@redhat.com>
> > > > 
> > > > Looks fine, but I suspect we both missed something here:
> > > > xfs_buf_ioerror_alert() was made a ratelimited printk in the last
> > > > cycle:
> > > > 
> > > > void
> > > > xfs_buf_ioerror_alert(
> > > >         struct xfs_buf          *bp,
> > > >         xfs_failaddr_t          func)
> > > > {
> > > >         xfs_alert_ratelimited(bp->b_mount,
> > > > "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
> > > >                         func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
> > > >                         -bp->b_error);
> > > > }
> > > > 
> > > 
> > > Yeah, I hadn't noticed that.
> > > 
> > > > Hence I think all these buffer error alerts can be brought under the
> > > > same rate limiting variable. Something like this in xfs_message.c:
> > > > 
> > > 
> > > One thing to note is that xfs_alert_ratelimited() ultimately uses
> > > the DEFAULT_RATELIMIT_INTERVAL of 5s. The ratelimit we're generalizing
> > > here uses 30s (both use a burst of 10). That seems reasonable enough to
> > > me for I/O errors so I'm good with the changes below.
> > > 
> > > FWIW, that also means we could just call xfs_buf_alert_ratelimited()
> > > from xfs_buf_item_push() if we're also Ok with using an "alert" instead
> > > of a "warn." I'm not immediately aware of a reason to use one over the
> > > other (xfs_wait_buftarg() already uses alert) so I'll try that unless I
> > > hear an objection.
> > 
> > Sounds fine to me.
> > 
> > > The xfs_wait_buftarg() ratelimit presumably remains
> > > open coded because it's two separate calls and we probably don't want
> > > them to individually count against the limit.
> > 
> > That's why I suggested dropping the second "run xfs_repair" message
> > and triggering a shutdown after the wait loop. That way we don't
> > issue "run xfs_repair" for every single failed buffer (largely
> > noise!), and we get a non-rate-limited common "run xfs_repair"
> > message once we processed all the failed writes.
> > 
> 
> Sorry, must have missed that in the last reply. I don't think we want to
> shut down here because XBF_WRITE_FAIL only reflects that the internal
> async write retry (e.g. the one historically used to mitigate transient
> I/O errors) has occurred on the buffer, not necessarily that the
> immediately previous I/O has failed.

I think this is an incorrect reading of how XBF_WRITE_FAIL
functions. XBF_WRITE_FAIL is used to indicate that the previous
write failed, not that a historic write failed. The flag is cleared
at buffer submission time - see xfs_buf_delwri_submit_buffers() and
xfs_bwrite() - and so it is only set on buffers whose previous IO
failed and hence is still dirty and has not been flushed back to
disk.

If we hit this in xfs_buftarg_wait() after we've pushed the AIL in
xfs_log_quiesce() on unmount, then we've got write failures that
could not be resolved by repeated retries, and the filesystem is, at
this instant in time, inconsistent on disk.

That's a shutdown error condition...

> For that reason I've kind of looked
> at this particular instance as more of a warning that I/O errors have
> occurred in the past and the user might want to verify it didn't result
> in unexpected damage. FWIW, I've observed plenty of these messages long
> after I've disabled error injection and allowed I/O to proceed and the
> fs to unmount cleanly.

Right. That's the whole point of the flag - the buffer has been
dirtied and it hasn't been able to be written back when we are
purging the buffer cache at the end of unmount. i.e. When
xfs_buftarg_wait() is called, all buffers should be clean because we
are about to write an unmount record to mark the log clean once all
the logged metadata is written back.

What we do right now - write an unmount record after failing metadata
writeback - is actually a bug, and that is one of the
reasons why I suggested a shutdown should be done. i.e. we should
not be writing an unmount record to mark the log clean if we failed
to write back metadata. That metadata is still valid in the journal,
and so it should remain valid in the journal to allow it to be
replayed on the next mount. i.e. retry the writes from log recovery
after the hardware failure has been addressed and the IO errors have
gone away.

Tossing the dirty buffers at unmount and then marking the journal
clean is actually making the buffer write failure -worse-. Doing this
guarantees the filesystem is inconsistent on disk (by updating the
journal to indicate those writes actually succeeded) and absolutely
requires xfs_repair to fix as a result.

If we shut down on XBF_WRITE_FAIL buffers in xfs_buftarg_wait(), we
will not write an unmount record and so give the filesystem a chance
to recover on next mount (e.g. after a power cycle to clear whatever
raid hardware bug was being hit) and write that dirty metadata back
without failure.  If recovery fails with IO errors, then the user
really does need to run repair.  However, the situation at this
point is still better than if we write a clean unmount record after
write failures...

Cheers,

Dave.
Brian Foster April 27, 2020, 11:11 a.m. UTC | #6
On Sat, Apr 25, 2020 at 08:08:23AM +1000, Dave Chinner wrote:
> On Fri, Apr 24, 2020 at 07:12:32AM -0400, Brian Foster wrote:
> > On Fri, Apr 24, 2020 at 07:14:37AM +1000, Dave Chinner wrote:
> > > On Thu, Apr 23, 2020 at 10:29:58AM -0400, Brian Foster wrote:
> > > > On Thu, Apr 23, 2020 at 02:46:04PM +1000, Dave Chinner wrote:
> > > > > On Wed, Apr 22, 2020 at 01:54:21PM -0400, Brian Foster wrote:
> > > > > > At unmount time, XFS emits a warning for every in-core buffer that
> > > > > > might have undergone a write error. In practice this behavior is
> > > > > > probably reasonable given that the filesystem is likely short lived
> > > > > > once I/O errors begin to occur consistently. Under certain test or
> > > > > > otherwise expected error conditions, this can spam the logs and slow
> > > > > > down the unmount.
> > > > > > 
> > > > > > We already have a ratelimit state defined for buffers failing
> > > > > > writeback. Fold this state into the buftarg and reuse it for the
> > > > > > unmount time errors.
> > > > > > 
> > > > > > Signed-off-by: Brian Foster <bfoster@redhat.com>
> > > > > 
> > > > > Looks fine, but I suspect we both missed something here:
> > > > > xfs_buf_ioerror_alert() was made a ratelimited printk in the last
> > > > > cycle:
> > > > > 
> > > > > void
> > > > > xfs_buf_ioerror_alert(
> > > > >         struct xfs_buf          *bp,
> > > > >         xfs_failaddr_t          func)
> > > > > {
> > > > >         xfs_alert_ratelimited(bp->b_mount,
> > > > > "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
> > > > >                         func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
> > > > >                         -bp->b_error);
> > > > > }
> > > > > 
> > > > 
> > > > Yeah, I hadn't noticed that.
> > > > 
> > > > > Hence I think all these buffer error alerts can be brought under the
> > > > > same rate limiting variable. Something like this in xfs_message.c:
> > > > > 
> > > > 
> > > > One thing to note is that xfs_alert_ratelimited() ultimately uses
> > > > the DEFAULT_RATELIMIT_INTERVAL of 5s. The ratelimit we're generalizing
> > > > here uses 30s (both use a burst of 10). That seems reasonable enough to
> > > > me for I/O errors so I'm good with the changes below.
> > > > 
> > > > FWIW, that also means we could just call xfs_buf_alert_ratelimited()
> > > > from xfs_buf_item_push() if we're also Ok with using an "alert" instead
> > > > of a "warn." I'm not immediately aware of a reason to use one over the
> > > > other (xfs_wait_buftarg() already uses alert) so I'll try that unless I
> > > > hear an objection.
> > > 
> > > Sounds fine to me.
> > > 
> > > > The xfs_wait_buftarg() ratelimit presumably remains
> > > > open coded because it's two separate calls and we probably don't want
> > > > them to individually count against the limit.
> > > 
> > > That's why I suggested dropping the second "run xfs_repair" message
> > > and triggering a shutdown after the wait loop. That way we don't
> > > issue "run xfs_repair" for every single failed buffer (largely
> > > noise!), and we get a non-rate-limited common "run xfs_repair"
> > > message once we processed all the failed writes.
> > > 
> > 
> > Sorry, must have missed that in the last reply. I don't think we want to
> > shut down here because XBF_WRITE_FAIL only reflects that the internal
> > async write retry (e.g. the one historically used to mitigate transient
> > I/O errors) has occurred on the buffer, not necessarily that the
> > immediately previous I/O has failed.
> 
> I think this is an incorrect reading of how XBF_WRITE_FAIL
> functions. XBF_WRITE_FAIL is used to indicate that the previous
> write failed, not that a historic write failed. The flag is cleared
> at buffer submission time - see xfs_buf_delwri_submit_buffers() and
> xfs_bwrite() - and so it is only set on buffers whose previous IO
> failed and hence is still dirty and has not been flushed back to
> disk.
> 

Right, but the flag is only ever cleared in those particular submission
paths (which doesn't include the resubmit path). The point being
overlooked here is that it is never cleared on successful I/O
completion. That means if an I/O fails, we set the flag and resubmit. If
the resubmitted I/O succeeds, the flag remains set on the buffer until
the next time it is written, but there's no guarantee the buffer is
dirtied again before we unmount. Therefore, it's possible to have
XBF_WRITE_FAIL set on clean/consistent buffers at unmount time without
any underlying inconsistency.

> If we hit this in xfs_buftarg_wait() after we've pushed the AIL in
> xfs_log_quiesce() on unmount, then we've got write failures that
> could not be resolved by repeated retries, and the filesystem is, at
> this instant in time, inconsistent on disk.
> 
> That's a shutdown error condition...
> 

Yes, but XBF_WRITE_FAIL does not reflect that condition with certainty.

> > For that reason I've kind of looked
> > at this particular instance as more of a warning that I/O errors have
> > occurred in the past and the user might want to verify it didn't result
> > in unexpected damage. FWIW, I've observed plenty of these messages long
> > after I've disabled error injection and allowed I/O to proceed and the
> > fs to unmount cleanly.
> 
> Right. That's the whole point of the flag - the buffer has been
> dirtied and it hasn't been able to be written back when we are
> purging the buffer cache at the end of unmount. i.e. When
> xfs_buftarg_wait() is called, all buffers should be clean because we
> are about to write an unmount record to mark the log clean once all
> the logged metadata is written back.
> 
> What we do right now - write an unmount record after failing metadata
> writeback - is actually a bug, and that is one of the
> reasons why I suggested a shutdown should be done. i.e. we should
> not be writing an unmount record to mark the log clean if we failed
> to write back metadata. That metadata is still valid in the journal,
> and so it should remain valid in the journal to allow it to be
> replayed on the next mount. i.e. retry the writes from log recovery
> after the hardware failure has been addressed and the IO errors have
> gone away.
> 
> Tossing the dirty buffers at unmount and then marking the journal
> clean is actually making the buffer write failure -worse-. Doing this
> guarantees the filesystem is inconsistent on disk (by updating the
> journal to indicate those writes actually succeeded) and absolutely
> requires xfs_repair to fix as a result.
> 
> If we shut down on XBF_WRITE_FAIL buffers in xfs_buftarg_wait(), we
> will not write an unmount record and so give the filesystem a chance
> to recover on next mount (e.g. after a power cycle to clear whatever
> raid hardware bug was being hit) and write that dirty metadata back
> without failure.  If recovery fails with IO errors, then the user
> really does need to run repair.  However, the situation at this
> point is still better than if we write a clean unmount record after
> write failures...
> 

I agree with all of this when we are actually tossing dirty buffers. ;)
See above that I'm referring to how this state can persist after error
injection is disabled and I/O errors have ceased. Also note that the
error handling code already shuts down the filesystem in the completion
handling path of any buffer undergoing retries at unmount time (see
XFS_MOUNT_UNMOUNTING). IIRC this was introduced to prevent hanging
unmount on filesystems configured with infinite retries, but it ensures the fs
is shut down if dirty buffers have failed to write back before we get to
xfs_wait_buftarg(). So there isn't a functional bug from what I can see
if XBF_WRITE_FAIL does happen to reflect a failed+dirty buffer in
xfs_wait_buftarg().

It's pretty clear that the implementation of XBF_WRITE_FAIL is
unnecessarily fragile and confusing. Aside from not being able to
establish the meaning of the xfs_wait_buftarg() warning message between
the two of us (good luck, users ;P), I'm guessing it's also not obvious
that there's an internal retry associated with each xfsaild submitted
write, even for buffers that persistently fail. For example, xfsaild
submits a buffer, I/O fails, we set XBF_WRITE_FAIL and resubmit, I/O
fails again, XBF_WRITE_FAIL is already set and retries are configured so
we unlock the buffer, xfsaild comes around again and requeues, delwri
submit clears XBF_WRITE_FAIL and we start over from the beginning.

Perhaps a better solution would be to clear XBF_WRITE_FAIL on successful
I/O completion. That way we only issue the internal retry once since the
last time a buffer was successfully written and xfs_wait_buftarg() can
legitimately warn about tossing dirty buffers and shut down. I do think
it makes sense to shut down from there in that scenario as well,
provided the logic is correct, simply because the dependency on error
configuration shutdown behavior is not obvious. Thoughts?

Brian

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>

Patch

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 7a6bc617f0a9..c28a93d2fd8c 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1684,10 +1684,12 @@  xfs_wait_buftarg(
 			struct xfs_buf *bp;
 			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
 			list_del_init(&bp->b_lru);
-			if (bp->b_flags & XBF_WRITE_FAIL) {
+			if (bp->b_flags & XBF_WRITE_FAIL &&
+			    ___ratelimit(&bp->b_target->bt_ioerror_rl,
+					 "XFS: Corruption Alert")) {
 				xfs_alert(btp->bt_mount,
 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
-					(long long)bp->b_bn);
+					  (long long)bp->b_bn);
 				xfs_alert(btp->bt_mount,
 "Please run xfs_repair to determine the extent of the problem.");
 			}
@@ -1828,6 +1830,13 @@  xfs_alloc_buftarg(
 	btp->bt_bdev = bdev;
 	btp->bt_daxdev = dax_dev;
 
+	/*
+	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
+	 * per 30 seconds so as to not spam logs too much on repeated errors.
+	 */
+	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
+			     DEFAULT_RATELIMIT_BURST);
+
 	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error_free;
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 598b93b17d95..10492f68fd4b 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -91,6 +91,7 @@  typedef struct xfs_buftarg {
 	struct list_lru		bt_lru;
 
 	struct percpu_counter	bt_io_count;
+	struct ratelimit_state	bt_ioerror_rl;
 } xfs_buftarg_t;
 
 struct xfs_buf;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index e34298227f87..23cbfeb82183 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -480,14 +480,6 @@  xfs_buf_item_unpin(
 	}
 }
 
-/*
- * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30
- * seconds so as to not spam logs too much on repeated detection of the same
- * buffer being bad..
- */
-
-static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
-
 STATIC uint
 xfs_buf_item_push(
 	struct xfs_log_item	*lip,
@@ -518,7 +510,7 @@  xfs_buf_item_push(
 
 	/* has a previous flush failed due to IO errors? */
 	if ((bp->b_flags & XBF_WRITE_FAIL) &&
-	    ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
+	    ___ratelimit(&bp->b_target->bt_ioerror_rl, "XFS: Failing async write")) {
 		xfs_warn(bp->b_mount,
 "Failing async write on buffer block 0x%llx. Retrying async write.",
 			 (long long)bp->b_bn);