Message ID | 156944737999.300131.8592981210219230662.stgit@magnolia (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | xfs_scrub: fix IO error reporting | expand |
On 9/25/19 4:36 PM, Darrick J. Wong wrote: > From: Darrick J. Wong <darrick.wong@oracle.com> > > When we report media errors, we should tell the administrator the file > offset and length of the bad region, not just the offset of the entire > file extent record that overlaps a bad region. > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > --- > libfrog/bitmap.c | 37 +++++++++++++++++++++++++++++++++++++ > libfrog/bitmap.h | 2 ++ > scrub/phase6.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++----- > 3 files changed, 86 insertions(+), 5 deletions(-) > > > diff --git a/libfrog/bitmap.c b/libfrog/bitmap.c > index a75d085a..6a88ef48 100644 > --- a/libfrog/bitmap.c > +++ b/libfrog/bitmap.c > @@ -339,6 +339,43 @@ bitmap_iterate( > } > #endif > > +/* Iterate the set regions of part of this bitmap. */ > +int > +bitmap_iterate_range( > + struct bitmap *bmap, > + uint64_t start, > + uint64_t length, > + int (*fn)(uint64_t, uint64_t, void *), > + void *arg) > +{ > + struct avl64node *firstn; > + struct avl64node *lastn; > + struct avl64node *pos; > + struct avl64node *n; > + struct avl64node *l; > + struct bitmap_node *ext; > + int ret = 0; > + > + pthread_mutex_lock(&bmap->bt_lock); > + > + avl64_findranges(bmap->bt_tree, start, start + length, &firstn, > + &lastn); > + > + if (firstn == NULL && lastn == NULL) > + goto out; > + > + avl_for_each_range_safe(pos, n, l, firstn, lastn) { > + ext = container_of(pos, struct bitmap_node, btn_node); > + ret = fn(ext->btn_start, ext->btn_length, arg); > + if (ret) > + break; > + } > + > +out: > + pthread_mutex_unlock(&bmap->bt_lock); > + return ret; > +} > + > /* Do any bitmap extents overlap the given one? (locked) */ > static bool > __bitmap_test( > diff --git a/libfrog/bitmap.h b/libfrog/bitmap.h > index 759386a8..043b77ee 100644 > --- a/libfrog/bitmap.h > +++ b/libfrog/bitmap.h > @@ -16,6 +16,8 @@ void bitmap_free(struct bitmap **bmap); > int bitmap_set(struct bitmap *bmap, uint64_t start, uint64_t length); > int bitmap_iterate(struct bitmap *bmap, int (*fn)(uint64_t, uint64_t, void *), > void *arg); > +int bitmap_iterate_range(struct bitmap *bmap, uint64_t start, uint64_t length, > + int (*fn)(uint64_t, uint64_t, void *), void *arg); > bool bitmap_test(struct bitmap *bmap, uint64_t start, > uint64_t len); > bool bitmap_empty(struct bitmap *bmap); > diff --git a/scrub/phase6.c b/scrub/phase6.c > index 1edd98af..a16ad114 100644 > --- a/scrub/phase6.c > +++ b/scrub/phase6.c > @@ -111,6 +111,41 @@ xfs_decode_special_owner( > > /* Routines to translate bad physical extents into file paths and offsets. */ > > +struct badfile_report { > + struct scrub_ctx *ctx; > + const char *descr; > + struct xfs_bmap *bmap; > +}; > + > +/* Report on bad extents found during a media scan. */ > +static int > +report_badfile( > + uint64_t start, > + uint64_t length, > + void *arg) > +{ > + struct badfile_report *br = arg; > + unsigned long long bad_offset; > + unsigned long long bad_length; > + > + /* Clamp the bad region to the file mapping. */ > + if (start < br->bmap->bm_physical) { > + length -= br->bmap->bm_physical - start; > + start = br->bmap->bm_physical; > + } > + length = min(length, br->bmap->bm_length); > + > + /* Figure out how far into the bmap is the bad mapping and report it. */ > + bad_offset = start - br->bmap->bm_physical; > + bad_length = min(start + length, > + br->bmap->bm_physical + br->bmap->bm_length) - start; > + > + str_error(br->ctx, br->descr, > +_("media error at data offset %llu length %llu."), > + br->bmap->bm_offset + bad_offset, bad_length); > + return 0; > +} > + > /* Report if this extent overlaps a bad region. */ > static bool > report_data_loss( > @@ -122,8 +157,14 @@ report_data_loss( > struct xfs_bmap *bmap, > void *arg) > { > + struct badfile_report br = { > + .ctx = ctx, > + .descr = descr, > + .bmap = bmap, > + }; > struct media_verify_state *vs = arg; > struct bitmap *bmp; > + int ret; > > /* Only report errors for real extents. */ > if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC)) > @@ -134,11 +175,12 @@ report_data_loss( > else > bmp = vs->d_bad; > > - if (!bitmap_test(bmp, bmap->bm_physical, bmap->bm_length)) > - return true; > - > - str_error(ctx, descr, > -_("offset %llu failed read verification."), bmap->bm_offset); > + ret = bitmap_iterate_range(bmp, bmap->bm_physical, bmap->bm_length, > + report_badfile, &br); > + if (ret) { > + str_liberror(ctx, ret, descr); > + return false; So related to the prior question; why does changing the way we report a media error in a file change the flow with a "return false" here? > + } > return true; > } > >
On Mon, Oct 21, 2019 at 11:33:59AM -0500, Eric Sandeen wrote: > On 9/25/19 4:36 PM, Darrick J. Wong wrote: > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > When we report media errors, we should tell the administrator the file > > offset and length of the bad region, not just the offset of the entire > > file extent record that overlaps a bad region. > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > --- > > libfrog/bitmap.c | 37 +++++++++++++++++++++++++++++++++++++ > > libfrog/bitmap.h | 2 ++ > > scrub/phase6.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++----- > > 3 files changed, 86 insertions(+), 5 deletions(-) > > > > > > diff --git a/libfrog/bitmap.c b/libfrog/bitmap.c > > index a75d085a..6a88ef48 100644 > > --- a/libfrog/bitmap.c > > +++ b/libfrog/bitmap.c > > @@ -339,6 +339,43 @@ bitmap_iterate( > > } > > #endif > > > > +/* Iterate the set regions of part of this bitmap. */ > > +int > > +bitmap_iterate_range( > > + struct bitmap *bmap, > > + uint64_t start, > > + uint64_t length, > > + int (*fn)(uint64_t, uint64_t, void *), > > + void *arg) > > +{ > > + struct avl64node *firstn; > > + struct avl64node *lastn; > > + struct avl64node *pos; > > + struct avl64node *n; > > + struct avl64node *l; > > + struct bitmap_node *ext; > > + int ret = 0; > > + > > + pthread_mutex_lock(&bmap->bt_lock); > > + > > + avl64_findranges(bmap->bt_tree, start, start + length, &firstn, > > + &lastn); > > + > > + if (firstn == NULL && lastn == NULL) > > + goto out; > > + > > + avl_for_each_range_safe(pos, n, l, firstn, lastn) { > > + ext = container_of(pos, struct bitmap_node, btn_node); > > + ret = fn(ext->btn_start, ext->btn_length, arg); > > + if (ret) > > + break; > > + } > > + > > +out: > > + pthread_mutex_unlock(&bmap->bt_lock); > > + return ret; > > +} > > + > > /* Do any bitmap extents overlap the given one? (locked) */ > > static bool > > __bitmap_test( > > diff --git a/libfrog/bitmap.h b/libfrog/bitmap.h > > index 759386a8..043b77ee 100644 > > --- a/libfrog/bitmap.h > > +++ b/libfrog/bitmap.h > > @@ -16,6 +16,8 @@ void bitmap_free(struct bitmap **bmap); > > int bitmap_set(struct bitmap *bmap, uint64_t start, uint64_t length); > > int bitmap_iterate(struct bitmap *bmap, int (*fn)(uint64_t, uint64_t, void *), > > void *arg); > > +int bitmap_iterate_range(struct bitmap *bmap, uint64_t start, uint64_t length, > > + int (*fn)(uint64_t, uint64_t, void *), void *arg); > > bool bitmap_test(struct bitmap *bmap, uint64_t start, > > uint64_t len); > > bool bitmap_empty(struct bitmap *bmap); > > diff --git a/scrub/phase6.c b/scrub/phase6.c > > index 1edd98af..a16ad114 100644 > > --- a/scrub/phase6.c > > +++ b/scrub/phase6.c > > @@ -111,6 +111,41 @@ xfs_decode_special_owner( > > > > /* Routines to translate bad physical extents into file paths and offsets. */ > > > > +struct badfile_report { > > + struct scrub_ctx *ctx; > > + const char *descr; > > + struct xfs_bmap *bmap; > > +}; > > + > > +/* Report on bad extents found during a media scan. */ > > +static int > > +report_badfile( > > + uint64_t start, > > + uint64_t length, > > + void *arg) > > +{ > > + struct badfile_report *br = arg; > > + unsigned long long bad_offset; > > + unsigned long long bad_length; > > + > > + /* Clamp the bad region to the file mapping. */ > > + if (start < br->bmap->bm_physical) { > > + length -= br->bmap->bm_physical - start; > > + start = br->bmap->bm_physical; > > + } > > + length = min(length, br->bmap->bm_length); > > + > > + /* Figure out how far into the bmap is the bad mapping and report it. */ > > + bad_offset = start - br->bmap->bm_physical; > > + bad_length = min(start + length, > > + br->bmap->bm_physical + br->bmap->bm_length) - start; > > + > > + str_error(br->ctx, br->descr, > > +_("media error at data offset %llu length %llu."), > > + br->bmap->bm_offset + bad_offset, bad_length); > > + return 0; > > +} > > + > > /* Report if this extent overlaps a bad region. */ > > static bool > > report_data_loss( > > @@ -122,8 +157,14 @@ report_data_loss( > > struct xfs_bmap *bmap, > > void *arg) > > { > > + struct badfile_report br = { > > + .ctx = ctx, > > + .descr = descr, > > + .bmap = bmap, > > + }; > > struct media_verify_state *vs = arg; > > struct bitmap *bmp; > > + int ret; > > > > /* Only report errors for real extents. */ > > if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC)) > > @@ -134,11 +175,12 @@ report_data_loss( > > else > > bmp = vs->d_bad; > > > > - if (!bitmap_test(bmp, bmap->bm_physical, bmap->bm_length)) > > - return true; > > - > > - str_error(ctx, descr, > > -_("offset %llu failed read verification."), bmap->bm_offset); > > + ret = bitmap_iterate_range(bmp, bmap->bm_physical, bmap->bm_length, > > + report_badfile, &br); > > + if (ret) { > > + str_liberror(ctx, ret, descr); > > + return false; > > So related to the prior question; why does changing the way we report a > media error in a file change the flow with a "return false" here? The bitmap iteration itself failed (since report_badfile never returns false), which means it hit a runtime error and the program should exit. (Granted, bitmap iteration never fails, so this is merely defensive programming.) --D > > + } > > return true; > > } > > > >
diff --git a/libfrog/bitmap.c b/libfrog/bitmap.c index a75d085a..6a88ef48 100644 --- a/libfrog/bitmap.c +++ b/libfrog/bitmap.c @@ -339,6 +339,43 @@ bitmap_iterate( } #endif +/* Iterate the set regions of part of this bitmap. */ +int +bitmap_iterate_range( + struct bitmap *bmap, + uint64_t start, + uint64_t length, + int (*fn)(uint64_t, uint64_t, void *), + void *arg) +{ + struct avl64node *firstn; + struct avl64node *lastn; + struct avl64node *pos; + struct avl64node *n; + struct avl64node *l; + struct bitmap_node *ext; + int ret = 0; + + pthread_mutex_lock(&bmap->bt_lock); + + avl64_findranges(bmap->bt_tree, start, start + length, &firstn, + &lastn); + + if (firstn == NULL && lastn == NULL) + goto out; + + avl_for_each_range_safe(pos, n, l, firstn, lastn) { + ext = container_of(pos, struct bitmap_node, btn_node); + ret = fn(ext->btn_start, ext->btn_length, arg); + if (ret) + break; + } + +out: + pthread_mutex_unlock(&bmap->bt_lock); + return ret; +} + /* Do any bitmap extents overlap the given one? (locked) */ static bool __bitmap_test( diff --git a/libfrog/bitmap.h b/libfrog/bitmap.h index 759386a8..043b77ee 100644 --- a/libfrog/bitmap.h +++ b/libfrog/bitmap.h @@ -16,6 +16,8 @@ void bitmap_free(struct bitmap **bmap); int bitmap_set(struct bitmap *bmap, uint64_t start, uint64_t length); int bitmap_iterate(struct bitmap *bmap, int (*fn)(uint64_t, uint64_t, void *), void *arg); +int bitmap_iterate_range(struct bitmap *bmap, uint64_t start, uint64_t length, + int (*fn)(uint64_t, uint64_t, void *), void *arg); bool bitmap_test(struct bitmap *bmap, uint64_t start, uint64_t len); bool bitmap_empty(struct bitmap *bmap); diff --git a/scrub/phase6.c b/scrub/phase6.c index 1edd98af..a16ad114 100644 --- a/scrub/phase6.c +++ b/scrub/phase6.c @@ -111,6 +111,41 @@ xfs_decode_special_owner( /* Routines to translate bad physical extents into file paths and offsets. */ +struct badfile_report { + struct scrub_ctx *ctx; + const char *descr; + struct xfs_bmap *bmap; +}; + +/* Report on bad extents found during a media scan. */ +static int +report_badfile( + uint64_t start, + uint64_t length, + void *arg) +{ + struct badfile_report *br = arg; + unsigned long long bad_offset; + unsigned long long bad_length; + + /* Clamp the bad region to the file mapping. */ + if (start < br->bmap->bm_physical) { + length -= br->bmap->bm_physical - start; + start = br->bmap->bm_physical; + } + length = min(length, br->bmap->bm_length); + + /* Figure out how far into the bmap is the bad mapping and report it. */ + bad_offset = start - br->bmap->bm_physical; + bad_length = min(start + length, + br->bmap->bm_physical + br->bmap->bm_length) - start; + + str_error(br->ctx, br->descr, +_("media error at data offset %llu length %llu."), + br->bmap->bm_offset + bad_offset, bad_length); + return 0; +} + /* Report if this extent overlaps a bad region. */ static bool report_data_loss( @@ -122,8 +157,14 @@ report_data_loss( struct xfs_bmap *bmap, void *arg) { + struct badfile_report br = { + .ctx = ctx, + .descr = descr, + .bmap = bmap, + }; struct media_verify_state *vs = arg; struct bitmap *bmp; + int ret; /* Only report errors for real extents. */ if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC)) @@ -134,11 +175,12 @@ report_data_loss( else bmp = vs->d_bad; - if (!bitmap_test(bmp, bmap->bm_physical, bmap->bm_length)) - return true; - - str_error(ctx, descr, -_("offset %llu failed read verification."), bmap->bm_offset); + ret = bitmap_iterate_range(bmp, bmap->bm_physical, bmap->bm_length, + report_badfile, &br); + if (ret) { + str_liberror(ctx, ret, descr); + return false; + } return true; }