[02/11] xfs_scrub: improve reporting of file data media errors
diff mbox series

Message ID 156944737999.300131.8592981210219230662.stgit@magnolia
State New
Headers show
Series
  • xfs_scrub: fix IO error reporting
Related show

Commit Message

Darrick J. Wong Sept. 25, 2019, 9:36 p.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

When we report media errors, we should tell the administrator the file
offset and length of the bad region, not just the offset of the entire
file extent record that overlaps a bad region.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 libfrog/bitmap.c |   37 +++++++++++++++++++++++++++++++++++++
 libfrog/bitmap.h |    2 ++
 scrub/phase6.c   |   52 +++++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 86 insertions(+), 5 deletions(-)

Comments

Eric Sandeen Oct. 21, 2019, 4:33 p.m. UTC | #1
On 9/25/19 4:36 PM, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> When we report media errors, we should tell the administrator the file
> offset and length of the bad region, not just the offset of the entire
> file extent record that overlaps a bad region.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  libfrog/bitmap.c |   37 +++++++++++++++++++++++++++++++++++++
>  libfrog/bitmap.h |    2 ++
>  scrub/phase6.c   |   52 +++++++++++++++++++++++++++++++++++++++++++++++-----
>  3 files changed, 86 insertions(+), 5 deletions(-)
> 
> 
> diff --git a/libfrog/bitmap.c b/libfrog/bitmap.c
> index a75d085a..6a88ef48 100644
> --- a/libfrog/bitmap.c
> +++ b/libfrog/bitmap.c
> @@ -339,6 +339,43 @@ bitmap_iterate(
>  }
>  #endif
>  
> +/* Iterate the set regions of part of this bitmap. */
> +int
> +bitmap_iterate_range(
> +	struct bitmap		*bmap,
> +	uint64_t		start,
> +	uint64_t		length,
> +	int			(*fn)(uint64_t, uint64_t, void *),
> +	void			*arg)
> +{
> +	struct avl64node	*firstn;
> +	struct avl64node	*lastn;
> +	struct avl64node	*pos;
> +	struct avl64node	*n;
> +	struct avl64node	*l;
> +	struct bitmap_node	*ext;
> +	int			ret = 0;
> +
> +	pthread_mutex_lock(&bmap->bt_lock);
> +
> +	avl64_findranges(bmap->bt_tree, start, start + length, &firstn,
> +			&lastn);
> +
> +	if (firstn == NULL && lastn == NULL)
> +		goto out;
> +
> +	avl_for_each_range_safe(pos, n, l, firstn, lastn) {
> +		ext = container_of(pos, struct bitmap_node, btn_node);
> +		ret = fn(ext->btn_start, ext->btn_length, arg);
> +		if (ret)
> +			break;
> +	}
> +
> +out:
> +	pthread_mutex_unlock(&bmap->bt_lock);
> +	return ret;
> +}
> +
>  /* Do any bitmap extents overlap the given one?  (locked) */
>  static bool
>  __bitmap_test(
> diff --git a/libfrog/bitmap.h b/libfrog/bitmap.h
> index 759386a8..043b77ee 100644
> --- a/libfrog/bitmap.h
> +++ b/libfrog/bitmap.h
> @@ -16,6 +16,8 @@ void bitmap_free(struct bitmap **bmap);
>  int bitmap_set(struct bitmap *bmap, uint64_t start, uint64_t length);
>  int bitmap_iterate(struct bitmap *bmap, int (*fn)(uint64_t, uint64_t, void *),
>  		void *arg);
> +int bitmap_iterate_range(struct bitmap *bmap, uint64_t start, uint64_t length,
> +		int (*fn)(uint64_t, uint64_t, void *), void *arg);
>  bool bitmap_test(struct bitmap *bmap, uint64_t start,
>  		uint64_t len);
>  bool bitmap_empty(struct bitmap *bmap);
> diff --git a/scrub/phase6.c b/scrub/phase6.c
> index 1edd98af..a16ad114 100644
> --- a/scrub/phase6.c
> +++ b/scrub/phase6.c
> @@ -111,6 +111,41 @@ xfs_decode_special_owner(
>  
>  /* Routines to translate bad physical extents into file paths and offsets. */
>  
> +struct badfile_report {
> +	struct scrub_ctx	*ctx;
> +	const char		*descr;
> +	struct xfs_bmap		*bmap;
> +};
> +
> +/* Report on bad extents found during a media scan. */
> +static int
> +report_badfile(
> +	uint64_t		start,
> +	uint64_t		length,
> +	void			*arg)
> +{
> +	struct badfile_report	*br = arg;
> +	unsigned long long	bad_offset;
> +	unsigned long long	bad_length;
> +
> +	/* Clamp the bad region to the file mapping. */
> +	if (start < br->bmap->bm_physical) {
> +		length -= br->bmap->bm_physical - start;
> +		start = br->bmap->bm_physical;
> +	}
> +	length = min(length, br->bmap->bm_length);
> +
> +	/* Figure out how far into the bmap is the bad mapping and report it. */
> +	bad_offset = start - br->bmap->bm_physical;
> +	bad_length = min(start + length,
> +			 br->bmap->bm_physical + br->bmap->bm_length) - start;
> +
> +	str_error(br->ctx, br->descr,
> +_("media error at data offset %llu length %llu."),
> +			br->bmap->bm_offset + bad_offset, bad_length);
> +	return 0;
> +}
> +
>  /* Report if this extent overlaps a bad region. */
>  static bool
>  report_data_loss(
> @@ -122,8 +157,14 @@ report_data_loss(
>  	struct xfs_bmap			*bmap,
>  	void				*arg)
>  {
> +	struct badfile_report		br = {
> +		.ctx			= ctx,
> +		.descr			= descr,
> +		.bmap			= bmap,
> +	};
>  	struct media_verify_state	*vs = arg;
>  	struct bitmap			*bmp;
> +	int				ret;
>  
>  	/* Only report errors for real extents. */
>  	if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC))
> @@ -134,11 +175,12 @@ report_data_loss(
>  	else
>  		bmp = vs->d_bad;
>  
> -	if (!bitmap_test(bmp, bmap->bm_physical, bmap->bm_length))
> -		return true;
> -
> -	str_error(ctx, descr,
> -_("offset %llu failed read verification."), bmap->bm_offset);
> +	ret = bitmap_iterate_range(bmp, bmap->bm_physical, bmap->bm_length,
> +			report_badfile, &br);
> +	if (ret) {
> +		str_liberror(ctx, ret, descr);
> +		return false;

So related to the prior question; why does changing the way we report a
media error in a file change the flow with a "return false" here?

> +	}
>  	return true;
>  }
>  
>
Darrick J. Wong Oct. 21, 2019, 5:56 p.m. UTC | #2
On Mon, Oct 21, 2019 at 11:33:59AM -0500, Eric Sandeen wrote:
> On 9/25/19 4:36 PM, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > When we report media errors, we should tell the administrator the file
> > offset and length of the bad region, not just the offset of the entire
> > file extent record that overlaps a bad region.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  libfrog/bitmap.c |   37 +++++++++++++++++++++++++++++++++++++
> >  libfrog/bitmap.h |    2 ++
> >  scrub/phase6.c   |   52 +++++++++++++++++++++++++++++++++++++++++++++++-----
> >  3 files changed, 86 insertions(+), 5 deletions(-)
> > 
> > 
> > diff --git a/libfrog/bitmap.c b/libfrog/bitmap.c
> > index a75d085a..6a88ef48 100644
> > --- a/libfrog/bitmap.c
> > +++ b/libfrog/bitmap.c
> > @@ -339,6 +339,43 @@ bitmap_iterate(
> >  }
> >  #endif
> >  
> > +/* Iterate the set regions of part of this bitmap. */
> > +int
> > +bitmap_iterate_range(
> > +	struct bitmap		*bmap,
> > +	uint64_t		start,
> > +	uint64_t		length,
> > +	int			(*fn)(uint64_t, uint64_t, void *),
> > +	void			*arg)
> > +{
> > +	struct avl64node	*firstn;
> > +	struct avl64node	*lastn;
> > +	struct avl64node	*pos;
> > +	struct avl64node	*n;
> > +	struct avl64node	*l;
> > +	struct bitmap_node	*ext;
> > +	int			ret = 0;
> > +
> > +	pthread_mutex_lock(&bmap->bt_lock);
> > +
> > +	avl64_findranges(bmap->bt_tree, start, start + length, &firstn,
> > +			&lastn);
> > +
> > +	if (firstn == NULL && lastn == NULL)
> > +		goto out;
> > +
> > +	avl_for_each_range_safe(pos, n, l, firstn, lastn) {
> > +		ext = container_of(pos, struct bitmap_node, btn_node);
> > +		ret = fn(ext->btn_start, ext->btn_length, arg);
> > +		if (ret)
> > +			break;
> > +	}
> > +
> > +out:
> > +	pthread_mutex_unlock(&bmap->bt_lock);
> > +	return ret;
> > +}
> > +
> >  /* Do any bitmap extents overlap the given one?  (locked) */
> >  static bool
> >  __bitmap_test(
> > diff --git a/libfrog/bitmap.h b/libfrog/bitmap.h
> > index 759386a8..043b77ee 100644
> > --- a/libfrog/bitmap.h
> > +++ b/libfrog/bitmap.h
> > @@ -16,6 +16,8 @@ void bitmap_free(struct bitmap **bmap);
> >  int bitmap_set(struct bitmap *bmap, uint64_t start, uint64_t length);
> >  int bitmap_iterate(struct bitmap *bmap, int (*fn)(uint64_t, uint64_t, void *),
> >  		void *arg);
> > +int bitmap_iterate_range(struct bitmap *bmap, uint64_t start, uint64_t length,
> > +		int (*fn)(uint64_t, uint64_t, void *), void *arg);
> >  bool bitmap_test(struct bitmap *bmap, uint64_t start,
> >  		uint64_t len);
> >  bool bitmap_empty(struct bitmap *bmap);
> > diff --git a/scrub/phase6.c b/scrub/phase6.c
> > index 1edd98af..a16ad114 100644
> > --- a/scrub/phase6.c
> > +++ b/scrub/phase6.c
> > @@ -111,6 +111,41 @@ xfs_decode_special_owner(
> >  
> >  /* Routines to translate bad physical extents into file paths and offsets. */
> >  
> > +struct badfile_report {
> > +	struct scrub_ctx	*ctx;
> > +	const char		*descr;
> > +	struct xfs_bmap		*bmap;
> > +};
> > +
> > +/* Report on bad extents found during a media scan. */
> > +static int
> > +report_badfile(
> > +	uint64_t		start,
> > +	uint64_t		length,
> > +	void			*arg)
> > +{
> > +	struct badfile_report	*br = arg;
> > +	unsigned long long	bad_offset;
> > +	unsigned long long	bad_length;
> > +
> > +	/* Clamp the bad region to the file mapping. */
> > +	if (start < br->bmap->bm_physical) {
> > +		length -= br->bmap->bm_physical - start;
> > +		start = br->bmap->bm_physical;
> > +	}
> > +	length = min(length, br->bmap->bm_length);
> > +
> > +	/* Figure out how far into the bmap is the bad mapping and report it. */
> > +	bad_offset = start - br->bmap->bm_physical;
> > +	bad_length = min(start + length,
> > +			 br->bmap->bm_physical + br->bmap->bm_length) - start;
> > +
> > +	str_error(br->ctx, br->descr,
> > +_("media error at data offset %llu length %llu."),
> > +			br->bmap->bm_offset + bad_offset, bad_length);
> > +	return 0;
> > +}
> > +
> >  /* Report if this extent overlaps a bad region. */
> >  static bool
> >  report_data_loss(
> > @@ -122,8 +157,14 @@ report_data_loss(
> >  	struct xfs_bmap			*bmap,
> >  	void				*arg)
> >  {
> > +	struct badfile_report		br = {
> > +		.ctx			= ctx,
> > +		.descr			= descr,
> > +		.bmap			= bmap,
> > +	};
> >  	struct media_verify_state	*vs = arg;
> >  	struct bitmap			*bmp;
> > +	int				ret;
> >  
> >  	/* Only report errors for real extents. */
> >  	if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC))
> > @@ -134,11 +175,12 @@ report_data_loss(
> >  	else
> >  		bmp = vs->d_bad;
> >  
> > -	if (!bitmap_test(bmp, bmap->bm_physical, bmap->bm_length))
> > -		return true;
> > -
> > -	str_error(ctx, descr,
> > -_("offset %llu failed read verification."), bmap->bm_offset);
> > +	ret = bitmap_iterate_range(bmp, bmap->bm_physical, bmap->bm_length,
> > +			report_badfile, &br);
> > +	if (ret) {
> > +		str_liberror(ctx, ret, descr);
> > +		return false;
> 
> So related to the prior question; why does changing the way we report a
> media error in a file change the flow with a "return false" here?

The bitmap iteration itself failed (since report_badfile never returns
false), which means it hit a runtime error and the program should exit.

(Granted, bitmap iteration never fails, so this is merely defensive
programming.)

--D

> > +	}
> >  	return true;
> >  }
> >  
> >

Patch
diff mbox series

diff --git a/libfrog/bitmap.c b/libfrog/bitmap.c
index a75d085a..6a88ef48 100644
--- a/libfrog/bitmap.c
+++ b/libfrog/bitmap.c
@@ -339,6 +339,43 @@  bitmap_iterate(
 }
 #endif
 
+/* Iterate the set regions of part of this bitmap. */
+int
+bitmap_iterate_range(
+	struct bitmap		*bmap,
+	uint64_t		start,
+	uint64_t		length,
+	int			(*fn)(uint64_t, uint64_t, void *),
+	void			*arg)
+{
+	struct avl64node	*firstn;
+	struct avl64node	*lastn;
+	struct avl64node	*pos;
+	struct avl64node	*n;
+	struct avl64node	*l;
+	struct bitmap_node	*ext;
+	int			ret = 0;
+
+	pthread_mutex_lock(&bmap->bt_lock);
+
+	avl64_findranges(bmap->bt_tree, start, start + length, &firstn,
+			&lastn);
+
+	if (firstn == NULL && lastn == NULL)
+		goto out;
+
+	avl_for_each_range_safe(pos, n, l, firstn, lastn) {
+		ext = container_of(pos, struct bitmap_node, btn_node);
+		ret = fn(ext->btn_start, ext->btn_length, arg);
+		if (ret)
+			break;
+	}
+
+out:
+	pthread_mutex_unlock(&bmap->bt_lock);
+	return ret;
+}
+
 /* Do any bitmap extents overlap the given one?  (locked) */
 static bool
 __bitmap_test(
diff --git a/libfrog/bitmap.h b/libfrog/bitmap.h
index 759386a8..043b77ee 100644
--- a/libfrog/bitmap.h
+++ b/libfrog/bitmap.h
@@ -16,6 +16,8 @@  void bitmap_free(struct bitmap **bmap);
 int bitmap_set(struct bitmap *bmap, uint64_t start, uint64_t length);
 int bitmap_iterate(struct bitmap *bmap, int (*fn)(uint64_t, uint64_t, void *),
 		void *arg);
+int bitmap_iterate_range(struct bitmap *bmap, uint64_t start, uint64_t length,
+		int (*fn)(uint64_t, uint64_t, void *), void *arg);
 bool bitmap_test(struct bitmap *bmap, uint64_t start,
 		uint64_t len);
 bool bitmap_empty(struct bitmap *bmap);
diff --git a/scrub/phase6.c b/scrub/phase6.c
index 1edd98af..a16ad114 100644
--- a/scrub/phase6.c
+++ b/scrub/phase6.c
@@ -111,6 +111,41 @@  xfs_decode_special_owner(
 
 /* Routines to translate bad physical extents into file paths and offsets. */
 
+struct badfile_report {
+	struct scrub_ctx	*ctx;
+	const char		*descr;
+	struct xfs_bmap		*bmap;
+};
+
+/* Report on bad extents found during a media scan. */
+static int
+report_badfile(
+	uint64_t		start,
+	uint64_t		length,
+	void			*arg)
+{
+	struct badfile_report	*br = arg;
+	unsigned long long	bad_offset;
+	unsigned long long	bad_length;
+
+	/* Clamp the bad region to the file mapping. */
+	if (start < br->bmap->bm_physical) {
+		length -= br->bmap->bm_physical - start;
+		start = br->bmap->bm_physical;
+	}
+	length = min(length, br->bmap->bm_length);
+
+	/* Figure out how far into the bmap is the bad mapping and report it. */
+	bad_offset = start - br->bmap->bm_physical;
+	bad_length = min(start + length,
+			 br->bmap->bm_physical + br->bmap->bm_length) - start;
+
+	str_error(br->ctx, br->descr,
+_("media error at data offset %llu length %llu."),
+			br->bmap->bm_offset + bad_offset, bad_length);
+	return 0;
+}
+
 /* Report if this extent overlaps a bad region. */
 static bool
 report_data_loss(
@@ -122,8 +157,14 @@  report_data_loss(
 	struct xfs_bmap			*bmap,
 	void				*arg)
 {
+	struct badfile_report		br = {
+		.ctx			= ctx,
+		.descr			= descr,
+		.bmap			= bmap,
+	};
 	struct media_verify_state	*vs = arg;
 	struct bitmap			*bmp;
+	int				ret;
 
 	/* Only report errors for real extents. */
 	if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC))
@@ -134,11 +175,12 @@  report_data_loss(
 	else
 		bmp = vs->d_bad;
 
-	if (!bitmap_test(bmp, bmap->bm_physical, bmap->bm_length))
-		return true;
-
-	str_error(ctx, descr,
-_("offset %llu failed read verification."), bmap->bm_offset);
+	ret = bitmap_iterate_range(bmp, bmap->bm_physical, bmap->bm_length,
+			report_badfile, &br);
+	if (ret) {
+		str_liberror(ctx, ret, descr);
+		return false;
+	}
 	return true;
 }