diff mbox series

[11/11] xfs_scrub: create a new category for unfixable errors

Message ID 156944743430.300131.9710870375452494499.stgit@magnolia (mailing list archive)
State Superseded
Headers show
Series xfs_scrub: fix IO error reporting | expand

Commit Message

Darrick J. Wong Sept. 25, 2019, 9:37 p.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

There's nothing that xfs_scrub (or XFS) can do about media errors for
data file blocks -- the data are gone.  Create a new category for these
unfixable errors so that we don't advise the user to take further action
that won't fix the problem.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 scrub/common.c    |    9 ++++++++-
 scrub/common.h    |    3 +++
 scrub/phase4.c    |    3 ++-
 scrub/phase5.c    |    2 +-
 scrub/phase6.c    |    2 +-
 scrub/xfs_scrub.c |   15 +++++++++++++--
 scrub/xfs_scrub.h |    1 +
 7 files changed, 29 insertions(+), 6 deletions(-)

Comments

Eric Sandeen Oct. 21, 2019, 7:45 p.m. UTC | #1
On 9/25/19 4:37 PM, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> There's nothing that xfs_scrub (or XFS) can do about media errors for
> data file blocks -- the data are gone.  Create a new category for these
> unfixable errors so that we don't advise the user to take further action
> that won't fix the problem.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>

<bikeshed>
What if unfixable_errors was a subcounter for total errors_found?
Would that simplify all the places that need to check both?

Or would that be confusing?

</bikeshed>
Darrick J. Wong Oct. 21, 2019, 8:29 p.m. UTC | #2
On Mon, Oct 21, 2019 at 02:45:17PM -0500, Eric Sandeen wrote:
> On 9/25/19 4:37 PM, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > There's nothing that xfs_scrub (or XFS) can do about media errors for
> > data file blocks -- the data are gone.  Create a new category for these
> > unfixable errors so that we don't advise the user to take further action
> > that won't fix the problem.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> 
> <bikeshed>
> What if unfixable_errors was a subcounter for total errors_found?
> Would that simplify all the places that need to check both?

Unfixable errors need to be tracked separately because we advise the
user on what xfs tool to run next if @errors_found (aka corruption
errors) or @runtime_errors are greater than zero.  There is no xfs
utility that you can run to fix things like file data loss.

(I mean, you can fix the unfixable by restoring from backups, maybe we
should say that?)

--D

> Or would that be confusing?
> 
> </bikeshed>
>
Eric Sandeen Oct. 21, 2019, 8:38 p.m. UTC | #3
On 10/21/19 3:29 PM, Darrick J. Wong wrote:
> On Mon, Oct 21, 2019 at 02:45:17PM -0500, Eric Sandeen wrote:
>> On 9/25/19 4:37 PM, Darrick J. Wong wrote:
>>> From: Darrick J. Wong <darrick.wong@oracle.com>
>>>
>>> There's nothing that xfs_scrub (or XFS) can do about media errors for
>>> data file blocks -- the data are gone.  Create a new category for these
>>> unfixable errors so that we don't advise the user to take further action
>>> that won't fix the problem.
>>>
>>> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
>>
>> <bikeshed>
>> What if unfixable_errors was a subcounter for total errors_found?
>> Would that simplify all the places that need to check both?
> 
> Unfixable errors need to be tracked separately because we advise the
> user on what xfs tool to run next if @errors_found (aka corruption
> errors) or @runtime_errors are greater than zero.  There is no xfs
> utility that you can run to fix things like file data loss.
> 
> (I mean, you can fix the unfixable by restoring from backups, maybe we
> should say that?)

I understand why it needs to be tracked separately.

What I was suggesting was that every error (fixable or not), bumps
errors_found.  Unfixable errors also bump unfixable_errors as a subset
of that.

That way every time you want to find out "was there a scrub error?" you
don't have to test both ctx->unfixable_errors and ctx->errors_found, and if
you add some other subclass of errors later, you don't have to then check
even more.

When all is done, you can just see if errors_found > unfixable_errors and
if so there's something to be done.

It's a bikeshed conversation.  If you disagree that's fine, just thought I'd
propose it.

-Eric
diff mbox series

Patch

diff --git a/scrub/common.c b/scrub/common.c
index a814f568..79ec9fd6 100644
--- a/scrub/common.c
+++ b/scrub/common.c
@@ -36,7 +36,8 @@  xfs_scrub_excessive_errors(
 	bool			ret;
 
 	pthread_mutex_lock(&ctx->lock);
-	ret = ctx->max_errors > 0 && ctx->errors_found >= ctx->max_errors;
+	ret = ctx->max_errors > 0 &&
+	      (ctx->unfixable_errors + ctx->errors_found) >= ctx->max_errors;
 	pthread_mutex_unlock(&ctx->lock);
 
 	return ret;
@@ -47,6 +48,10 @@  static struct {
 	int loglevel;
 } err_levels[] = {
 	[S_ERROR]  = { .string = "Error",	.loglevel = LOG_ERR },
+	[S_UNFIXABLE] = {
+		.string = "Unfixable error",
+		.loglevel = LOG_ERR
+	},
 	[S_WARN]   = { .string = "Warning",	.loglevel = LOG_WARNING },
 	[S_REPAIR] = { .string = "Repaired",	.loglevel = LOG_WARNING },
 	[S_INFO]   = { .string = "Info",	.loglevel = LOG_INFO },
@@ -108,6 +113,8 @@  __str_out(
 out_record:
 	if (error)      /* A syscall failed */
 		ctx->runtime_errors++;
+	else if (level == S_UNFIXABLE)
+		ctx->unfixable_errors++;
 	else if (level == S_ERROR)
 		ctx->errors_found++;
 	else if (level == S_WARN)
diff --git a/scrub/common.h b/scrub/common.h
index 1b9ad48f..e8485b4c 100644
--- a/scrub/common.h
+++ b/scrub/common.h
@@ -17,6 +17,7 @@  bool xfs_scrub_excessive_errors(struct scrub_ctx *ctx);
 
 enum error_level {
 	S_ERROR	= 0,
+	S_UNFIXABLE,
 	S_WARN,
 	S_REPAIR,
 	S_INFO,
@@ -40,6 +41,8 @@  void __str_out(struct scrub_ctx *ctx, const char *descr, enum error_level level,
 	__str_out(ctx, str, S_REPAIR,	0,	__FILE__, __LINE__, __VA_ARGS__)
 #define record_preen(ctx, str, ...) \
 	__str_out(ctx, str, S_PREEN,	0,	__FILE__, __LINE__, __VA_ARGS__)
+#define str_unfixable_error(ctx, str, ...) \
+	__str_out(ctx, str, S_UNFIXABLE, 0,	__FILE__, __LINE__, __VA_ARGS__)
 
 #define dbg_printf(fmt, ...) \
 	do {if (debug > 1) {printf(fmt, __VA_ARGS__);}} while (0)
diff --git a/scrub/phase4.c b/scrub/phase4.c
index eb30c189..07927036 100644
--- a/scrub/phase4.c
+++ b/scrub/phase4.c
@@ -99,7 +99,8 @@  xfs_process_action_items(
 	workqueue_destroy(&wq);
 
 	pthread_mutex_lock(&ctx->lock);
-	if (moveon && ctx->errors_found == 0 && want_fstrim) {
+	if (moveon && ctx->errors_found == 0 && ctx->unfixable_errors == 0 &&
+	    want_fstrim) {
 		fstrim(ctx);
 		progress_add(1);
 	}
diff --git a/scrub/phase5.c b/scrub/phase5.c
index 997c88d9..30346fc1 100644
--- a/scrub/phase5.c
+++ b/scrub/phase5.c
@@ -336,7 +336,7 @@  xfs_scan_connections(
 	bool			moveon = true;
 	bool			ret;
 
-	if (ctx->errors_found) {
+	if (ctx->errors_found || ctx->unfixable_errors) {
 		str_info(ctx, ctx->mntpoint,
 _("Filesystem has errors, skipping connectivity checks."));
 		return true;
diff --git a/scrub/phase6.c b/scrub/phase6.c
index 378ea0fb..c50fb8fb 100644
--- a/scrub/phase6.c
+++ b/scrub/phase6.c
@@ -140,7 +140,7 @@  report_badfile(
 	bad_length = min(start + length,
 			 br->bmap->bm_physical + br->bmap->bm_length) - start;
 
-	str_error(br->ctx, br->descr,
+	str_unfixable_error(br->ctx, br->descr,
 _("media error at data offset %llu length %llu."),
 			br->bmap->bm_offset + bad_offset, bad_length);
 	return 0;
diff --git a/scrub/xfs_scrub.c b/scrub/xfs_scrub.c
index 147c114c..aa98caaa 100644
--- a/scrub/xfs_scrub.c
+++ b/scrub/xfs_scrub.c
@@ -515,12 +515,16 @@  report_outcome(
 
 	total_errors = ctx->errors_found + ctx->runtime_errors;
 
-	if (total_errors == 0 && ctx->warnings_found == 0) {
+	if (total_errors == 0 &&
+	    ctx->unfixable_errors == 0 &&
+	    ctx->warnings_found == 0) {
 		log_info(ctx, _("No errors found."));
 		return;
 	}
 
-	if (total_errors == 0) {
+	if (total_errors == 0 && ctx->warnings_found == 0) {
+		/* nothing to report */
+	} else if (total_errors == 0) {
 		fprintf(stderr, _("%s: warnings found: %llu\n"), ctx->mntpoint,
 				ctx->warnings_found);
 		log_warn(ctx, _("warnings found: %llu"), ctx->warnings_found);
@@ -536,6 +540,13 @@  report_outcome(
 				total_errors, ctx->warnings_found);
 	}
 
+	if (ctx->unfixable_errors) {
+		fprintf(stderr, _("%s: unfixable errors found: %llu\n"),
+				ctx->mntpoint, ctx->unfixable_errors);
+		log_err(ctx, _("unfixable errors found: %llu"),
+				ctx->unfixable_errors);
+	}
+
 	/*
 	 * Don't advise the user to run repair unless we were successful in
 	 * setting up the scrub and we actually saw corruptions.  Warnings
diff --git a/scrub/xfs_scrub.h b/scrub/xfs_scrub.h
index 37d78f61..54876acb 100644
--- a/scrub/xfs_scrub.h
+++ b/scrub/xfs_scrub.h
@@ -74,6 +74,7 @@  struct scrub_ctx {
 	unsigned long long	max_errors;
 	unsigned long long	runtime_errors;
 	unsigned long long	errors_found;
+	unsigned long long	unfixable_errors;
 	unsigned long long	warnings_found;
 	unsigned long long	inodes_checked;
 	unsigned long long	bytes_checked;