diff mbox series

[01/11] xfs_scrub: fix handling of read-verify pool runtime errors

Message ID 156944729476.298887.15638727982082805193.stgit@magnolia (mailing list archive)
State Superseded
Headers show
Series xfs_scrub: fix IO error detection during media verify | expand

Commit Message

Darrick J. Wong Sept. 25, 2019, 9:34 p.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Fix some bogosity with how we handle runtime errors in the read verify
pool functions.  First of all, memory allocation failures shouldn't be
recorded as disk IO errors, they should just complain and abort the
phase.  Second, we need to collect any other runtime errors in the IO
thread and abort the phase instead of silently ignoring them.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 scrub/read_verify.c |   23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

Comments

Eric Sandeen Oct. 15, 2019, 5:07 p.m. UTC | #1
On 9/25/19 4:34 PM, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Fix some bogosity with how we handle runtime errors in the read verify
> pool functions.  First of all, memory allocation failures shouldn't be
> recorded as disk IO errors, they should just complain and abort the
> phase.  Second, we need to collect any other runtime errors in the IO
> thread and abort the phase instead of silently ignoring them.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  scrub/read_verify.c |   23 +++++++++++++++++++----
>  1 file changed, 19 insertions(+), 4 deletions(-)
> 
> 
> diff --git a/scrub/read_verify.c b/scrub/read_verify.c
> index b890c92f..00627307 100644
> --- a/scrub/read_verify.c
> +++ b/scrub/read_verify.c
> @@ -53,6 +53,7 @@ struct read_verify_pool {
>  	struct disk		*disk;		/* which disk? */
>  	read_verify_ioerr_fn_t	ioerr_fn;	/* io error callback */
>  	size_t			miniosz;	/* minimum io size, bytes */
> +	int			errors_seen;
>  };

I'd like to see a comment that says /* runtime/operational errors */
to differentiate between verification errors.

Or maybe even rename it to runtime_errors or something.

I'm also confused; it's an int, but this patch assigns it with true/false,
the next patch assigns errnos i.e. ECANCELED, .... what's it supposed to be?
Darrick J. Wong Oct. 15, 2019, 5:19 p.m. UTC | #2
On Tue, Oct 15, 2019 at 12:07:11PM -0500, Eric Sandeen wrote:
> On 9/25/19 4:34 PM, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Fix some bogosity with how we handle runtime errors in the read verify
> > pool functions.  First of all, memory allocation failures shouldn't be
> > recorded as disk IO errors, they should just complain and abort the
> > phase.  Second, we need to collect any other runtime errors in the IO
> > thread and abort the phase instead of silently ignoring them.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  scrub/read_verify.c |   23 +++++++++++++++++++----
> >  1 file changed, 19 insertions(+), 4 deletions(-)
> > 
> > 
> > diff --git a/scrub/read_verify.c b/scrub/read_verify.c
> > index b890c92f..00627307 100644
> > --- a/scrub/read_verify.c
> > +++ b/scrub/read_verify.c
> > @@ -53,6 +53,7 @@ struct read_verify_pool {
> >  	struct disk		*disk;		/* which disk? */
> >  	read_verify_ioerr_fn_t	ioerr_fn;	/* io error callback */
> >  	size_t			miniosz;	/* minimum io size, bytes */
> > +	int			errors_seen;
> >  };
> 
> I'd like to see a comment that says /* runtime/operational errors */
> to differentiate between verification errors.
> 
> Or maybe even rename it to runtime_errors or something.
> 
> I'm also confused; it's an int, but this patch assigns it with true/false,
> the next patch assigns errnos i.e. ECANCELED, .... what's it supposed to be?

It's a mess. :(

It ought to be an int since that's where we're going anyway.

Will add a comment, change the name, and make the usage consistent
throughout the series.

--D
diff mbox series

Patch

diff --git a/scrub/read_verify.c b/scrub/read_verify.c
index b890c92f..00627307 100644
--- a/scrub/read_verify.c
+++ b/scrub/read_verify.c
@@ -53,6 +53,7 @@  struct read_verify_pool {
 	struct disk		*disk;		/* which disk? */
 	read_verify_ioerr_fn_t	ioerr_fn;	/* io error callback */
 	size_t			miniosz;	/* minimum io size, bytes */
+	int			errors_seen;
 };
 
 /*
@@ -91,6 +92,7 @@  read_verify_pool_init(
 	rvp->ctx = ctx;
 	rvp->disk = disk;
 	rvp->ioerr_fn = ioerr_fn;
+	rvp->errors_seen = false;
 	error = ptvar_alloc(submitter_threads, sizeof(struct read_verify),
 			&rvp->rvstate);
 	if (error)
@@ -149,6 +151,7 @@  read_verify(
 	unsigned long long		verified = 0;
 	ssize_t				sz;
 	ssize_t				len;
+	int				ret;
 
 	rvp = (struct read_verify_pool *)wq->wq_ctx;
 	while (rv->io_length > 0) {
@@ -173,7 +176,12 @@  read_verify(
 	}
 
 	free(rv);
-	ptcounter_add(rvp->verified_bytes, verified);
+	ret = ptcounter_add(rvp->verified_bytes, verified);
+	if (ret) {
+		str_liberror(rvp->ctx, ret,
+				_("updating bytes verified counter"));
+		rvp->errors_seen = true;
+	}
 }
 
 /* Queue a read verify request. */
@@ -188,18 +196,25 @@  read_verify_queue(
 	dbg_printf("verify fd %d start %"PRIu64" len %"PRIu64"\n",
 			rvp->disk->d_fd, rv->io_start, rv->io_length);
 
+	/* Worker thread saw a runtime error, don't queue more. */
+	if (rvp->errors_seen)
+		return false;
+
+	/* Otherwise clone the request and queue the copy. */
 	tmp = malloc(sizeof(struct read_verify));
 	if (!tmp) {
-		rvp->ioerr_fn(rvp->ctx, rvp->disk, rv->io_start,
-				rv->io_length, errno, rv->io_end_arg);
-		return true;
+		str_errno(rvp->ctx, _("allocating read-verify request"));
+		rvp->errors_seen = true;
+		return false;
 	}
+
 	memcpy(tmp, rv, sizeof(*tmp));
 
 	ret = workqueue_add(&rvp->wq, read_verify, 0, tmp);
 	if (ret) {
 		str_liberror(rvp->ctx, ret, _("queueing read-verify work"));
 		free(tmp);
+		rvp->errors_seen = true;
 		return false;
 	}
 	rv->io_length = 0;