diff mbox series

[v4,07/17] xfs: ratelimit unmount time per-buffer I/O error alert

Message ID 20200504141154.55887-8-bfoster@redhat.com (mailing list archive)
State Superseded
Headers show
Series xfs: flush related error handling cleanups | expand

Commit Message

Brian Foster May 4, 2020, 2:11 p.m. UTC
At unmount time, XFS emits an alert for every in-core buffer that
might have undergone a write error. In practice this behavior is
probably reasonable given that the filesystem is likely short lived
once I/O errors begin to occur consistently. Under certain test or
otherwise expected error conditions, this can spam the logs and slow
down the unmount.

Now that we have a ratelimit mechanism specifically for buffer
alerts, reuse it for the per-buffer alerts in xfs_wait_buftarg().
Also lift the final repair message out of the loop so it always
prints and assert that the metadata error handling code has shut
down the fs.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

Comments

Allison Henderson May 5, 2020, 9:10 p.m. UTC | #1
On 5/4/20 7:11 AM, Brian Foster wrote:
> At unmount time, XFS emits an alert for every in-core buffer that
> might have undergone a write error. In practice this behavior is
> probably reasonable given that the filesystem is likely short lived
> once I/O errors begin to occur consistently. Under certain test or
> otherwise expected error conditions, this can spam the logs and slow
> down the unmount.
> 
> Now that we have a ratelimit mechanism specifically for buffer
> alerts, reuse it for the per-buffer alerts in xfs_wait_buftarg().
> Also lift the final repair message out of the loop so it always
> prints and assert that the metadata error handling code has shut
> down the fs.
> 
> Signed-off-by: Brian Foster <bfoster@redhat.com>
> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
Looks fine to me:
Reviewed-by: Allison Collins <allison.henderson@oracle.com>

> ---
>   fs/xfs/xfs_buf.c | 15 +++++++++++----
>   1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 594d5e1df6f8..8f0f605de579 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -1657,7 +1657,8 @@ xfs_wait_buftarg(
>   	struct xfs_buftarg	*btp)
>   {
>   	LIST_HEAD(dispose);
> -	int loop = 0;
> +	int			loop = 0;
> +	bool			write_fail = false;
>   
>   	/*
>   	 * First wait on the buftarg I/O count for all in-flight buffers to be
> @@ -1685,17 +1686,23 @@ xfs_wait_buftarg(
>   			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
>   			list_del_init(&bp->b_lru);
>   			if (bp->b_flags & XBF_WRITE_FAIL) {
> -				xfs_alert(btp->bt_mount,
> +				write_fail = true;
> +				xfs_buf_alert_ratelimited(bp,
> +					"XFS: Corruption Alert",
>   "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
>   					(long long)bp->b_bn);
> -				xfs_alert(btp->bt_mount,
> -"Please run xfs_repair to determine the extent of the problem.");
>   			}
>   			xfs_buf_rele(bp);
>   		}
>   		if (loop++ != 0)
>   			delay(100);
>   	}
> +
> +	if (write_fail) {
> +		ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount));
> +		xfs_alert(btp->bt_mount,
> +	      "Please run xfs_repair to determine the extent of the problem.");
> +	}
>   }
>   
>   static enum lru_status
>
diff mbox series

Patch

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 594d5e1df6f8..8f0f605de579 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1657,7 +1657,8 @@  xfs_wait_buftarg(
 	struct xfs_buftarg	*btp)
 {
 	LIST_HEAD(dispose);
-	int loop = 0;
+	int			loop = 0;
+	bool			write_fail = false;
 
 	/*
 	 * First wait on the buftarg I/O count for all in-flight buffers to be
@@ -1685,17 +1686,23 @@  xfs_wait_buftarg(
 			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
 			list_del_init(&bp->b_lru);
 			if (bp->b_flags & XBF_WRITE_FAIL) {
-				xfs_alert(btp->bt_mount,
+				write_fail = true;
+				xfs_buf_alert_ratelimited(bp,
+					"XFS: Corruption Alert",
 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
 					(long long)bp->b_bn);
-				xfs_alert(btp->bt_mount,
-"Please run xfs_repair to determine the extent of the problem.");
 			}
 			xfs_buf_rele(bp);
 		}
 		if (loop++ != 0)
 			delay(100);
 	}
+
+	if (write_fail) {
+		ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount));
+		xfs_alert(btp->bt_mount,
+	      "Please run xfs_repair to determine the extent of the problem.");
+	}
 }
 
 static enum lru_status