diff mbox series

[2/2] xfs: test xfs_scrub phase 6 media error reporting

Message ID 157604271809.578515.1806500868635425865.stgit@magnolia (mailing list archive)
State New, archived
Headers show
Series xfs: test xfs_scrub media scan | expand

Commit Message

Darrick J. Wong Dec. 11, 2019, 5:38 a.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Add new helpers to dmerror to provide for marking selected ranges
totally bad -- both reads and writes will fail.  Create a new test for
xfs_scrub to check that it reports media errors correctly.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 common/dmerror    |  107 +++++++++++++++++++++++++++++++++++++++++-
 tests/xfs/747     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/747.out |   12 +++++
 tests/xfs/748     |  102 ++++++++++++++++++++++++++++++++++++++++
 tests/xfs/748.out |    5 ++
 tests/xfs/group   |    2 +
 6 files changed, 363 insertions(+), 1 deletion(-)
 create mode 100755 tests/xfs/747
 create mode 100644 tests/xfs/747.out
 create mode 100755 tests/xfs/748
 create mode 100644 tests/xfs/748.out

Comments

Eryu Guan Jan. 6, 2020, 7:58 a.m. UTC | #1
On Tue, Dec 10, 2019 at 09:38:38PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Add new helpers to dmerror to provide for marking selected ranges
> totally bad -- both reads and writes will fail.  Create a new test for
> xfs_scrub to check that it reports media errors correctly.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>

I hit assert failure when testing on v5.5-rc3+ kernel, is that an
expected result? Both test failed in the same way.

[  192.610313] xfs filesystem being mounted at /mnt/scratch supports timestamps until 2038 (0x7fffffff)
[  193.149329] Buffer I/O error on dev dm-11, logical block 128, async page read
[  193.150173] Buffer I/O error on dev dm-11, logical block 129, async page read
[  193.151254] Buffer I/O error on dev dm-11, logical block 130, async page read
[  193.152173] Buffer I/O error on dev dm-11, logical block 131, async page read
[  193.152980] Buffer I/O error on dev dm-11, logical block 132, async page read
[  193.153935] Buffer I/O error on dev dm-11, logical block 133, async page read
[  193.154869] Buffer I/O error on dev dm-11, logical block 134, async page read
[  193.155800] Buffer I/O error on dev dm-11, logical block 135, async page read
[  193.249751] XFS: Assertion failed: !(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR), file: fs/xfs/scrub/repair.h, line: 78
[  193.255979] ------------[ cut here ]------------
[  193.258406] kernel BUG at fs/xfs/xfs_message.c:110!
[  193.260996] invalid opcode: 0000 [#1] SMP PTI
[  193.263323] CPU: 0 PID: 5613 Comm: xfs_scrub Not tainted 5.5.0-rc3+ #44
[  193.266717] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
[  193.273736] RIP: 0010:assfail+0x23/0x28 [xfs]
[  193.276045] Code: 67 fc ff ff 0f 0b c3 0f 1f 44 00 00 41 89 c8 48 89 d1 48 89 f2 48 c7 c6 78 e9 8d c0 e8 82 f9 ff ff 80 3d 9a d7 08 00 00 74 02 <0f> 0b 0f 0b c3 48 8b 03 48 c7 c7 68 ee 8d c0 c6 05 0e 2b 0a 00 01
[  193.284481] RSP: 0018:ffffac9540b7fbe0 EFLAGS: 00010202
[  193.286297] RAX: 0000000000000000 RBX: ffffac9540b7fcc8 RCX: 0000000000000000
[  193.288390] RDX: 00000000ffffffc0 RSI: 000000000000000a RDI: ffffffffc08d144a
[  193.290235] RBP: ffffac9540b7fbf8 R08: 0000000000000000 R09: 0000000000000000
[  193.292083] R10: 000000000000000a R11: f000000000000000 R12: 0000000000000000
[  193.293589] R13: ffff90006701c000 R14: ffff900071746400 R15: ffff900071746558
[  193.295068] FS:  00007f91892cc740(0000) GS:ffff900078c00000(0000) knlGS:0000000000000000
[  193.296899] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  193.297977] CR2: 0000000001ef0078 CR3: 0000000236050002 CR4: 00000000003606f0
[  193.299234] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  193.300555] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  193.301805] Call Trace:
[  193.302296]  xchk_setup_fs+0x35/0x40 [xfs]
[  193.302937]  xfs_scrub_metadata+0x23d/0x480 [xfs]
[  193.303658]  xfs_ioc_scrub_metadata+0x50/0xa0 [xfs]
[  193.304417]  xfs_file_ioctl+0xb23/0xc60 [xfs]
[  193.305075]  ? pagevec_lru_move_fn+0xbd/0xe0
[  193.305719]  ? get_kernel_page+0x60/0x60
[  193.306321]  ? __lru_cache_add+0x62/0x80
[  193.306922]  ? __handle_mm_fault+0xc65/0x1930
[  193.307553]  do_vfs_ioctl+0x448/0x6c0
[  193.308042]  ? handle_mm_fault+0xc4/0x1f0
[  193.308572]  ksys_ioctl+0x5e/0x90
[  193.309006]  __x64_sys_ioctl+0x16/0x20
[  193.309501]  do_syscall_64+0x5b/0x1d0
[  193.309990]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

> ---
>  common/dmerror    |  107 +++++++++++++++++++++++++++++++++++++++++-
>  tests/xfs/747     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  tests/xfs/747.out |   12 +++++
>  tests/xfs/748     |  102 ++++++++++++++++++++++++++++++++++++++++
>  tests/xfs/748.out |    5 ++
>  tests/xfs/group   |    2 +
>  6 files changed, 363 insertions(+), 1 deletion(-)
>  create mode 100755 tests/xfs/747
>  create mode 100644 tests/xfs/747.out
>  create mode 100755 tests/xfs/748
>  create mode 100644 tests/xfs/748.out
> 
> 
> diff --git a/common/dmerror b/common/dmerror
> index ca1c7335..ee3051f1 100644
> --- a/common/dmerror
> +++ b/common/dmerror
> @@ -62,7 +62,7 @@ _dmerror_load_error_table()
>  	$DMSETUP_PROG suspend $suspend_opt error-test
>  	[ $? -ne 0 ] && _fail  "dmsetup suspend failed"
>  
> -	$DMSETUP_PROG load error-test --table "$DMERROR_TABLE"
> +	echo "$DMERROR_TABLE" | $DMSETUP_PROG load error-test
>  	load_res=$?
>  
>  	$DMSETUP_PROG resume error-test
> @@ -94,3 +94,108 @@ _dmerror_load_working_table()
>  	[ $load_res -ne 0 ] && _fail "dmsetup failed to load error table"
>  	[ $resume_res -ne 0 ] && _fail  "dmsetup resume failed"
>  }
> +
> +# Given a list of (start, length) tuples on stdin, combine adjacent tuples into
> +# larger ones and write the new list to stdout.
> +__dmerror_combine_extents()
> +{
> +	awk 'BEGIN{start = 0; len = 0;}{

$AWK_PROG

> +if (start + len == $1) {
> +	len += $2;
> +} else {
> +	if (len > 0)
> +		printf("%d %d\n", start, len);
> +	start = $1;
> +	len = $2;
> +}
> +} END {
> +	if (len > 0)
> +		printf("%d %d\n", start, len);
> +}'
> +}
> +
> +# Given a block device, the name of a preferred dm target, the name of an
> +# implied dm target, and a list of (start, len) tuples on stdin, create a new
> +# dm table which maps each of the tuples to the preferred target and all other
> +# areas to the implied dm target.
> +__dmerror_recreate_map()
> +{
> +	local device="$1"
> +	local preferred_tgt="$2"
> +	local implied_tgt="$3"
> +	local size=$(blockdev --getsz "$device")
> +
> +	awk -v device="$device" -v size=$size -v implied_tgt="$implied_tgt" \

Same here.

> +		-v preferred_tgt="$preferred_tgt" 'BEGIN{implied_start = 0;}{
> +	extent_start = $1;
> +	extent_len = $2;
> +
> +	if (extent_start > size) {
> +		extent_start = size;
> +		extent_len = 0;
> +	} else if (extent_start + extent_len > size) {
> +		extent_len = size - extent_start;
> +	}
> +
> +	if (implied_start < extent_start)
> +		printf("%d %d %s %s %d\n", implied_start,
> +				extent_start - implied_start, implied_tgt,
> +				device, implied_start);
> +	printf("%d %d %s %s %d\n", extent_start, extent_len, preferred_tgt,
> +			device, extent_start);
> +	implied_start = extent_start + extent_len;
> +}END{
> +	if (implied_start < size)
> +		printf("%d %d %s %s %d\n", implied_start, size - implied_start,
> +				implied_tgt, device, implied_start);
> +}'
> +}
> +
> +# Update the dm error table so that the range (start, len) maps to the
> +# preferred dm target, overriding anything that maps to the implied dm target.
> +# This assumes that the only desired targets for this dm device are the
> +# preferred and and implied targets.  The optional fifth argument can be used
> +# to change the underlying device.
> +__dmerror_change()
> +{
> +	local start="$1"
> +	local len="$2"
> +	local preferred_tgt="$3"
> +	local implied_tgt="$4"
> +	local dm_backing_dev="$5"
> +	test -z "$dm_backing_dev" && dm_backing_dev="$SCRATCH_DEV"
> +
> +	DMERROR_TABLE="$( (echo "$DMERROR_TABLE"; echo "$start $len $preferred_tgt") | \
> +		awk -v type="$preferred_tgt" '{if ($3 == type) print $0;}' | \

Same here.

> +		sort -g | \
> +		__dmerror_combine_extents | \
> +		__dmerror_recreate_map "$dm_backing_dev" "$preferred_tgt" \
> +				"$implied_tgt" )"
> +}
> +
> +# Reset the dm error table to everything ok.  The dm device itself must be
> +# remapped by calling _dmerror_load_error_table.
> +_dmerror_reset_table()
> +{
> +	DMERROR_TABLE="$DMLINEAR_TABLE"
> +}
> +
> +# Update the dm error table so that IOs to the given range will return EIO.
> +# The dm device itself must be remapped by calling _dmerror_load_error_table.
> +_dmerror_mark_range_bad()
> +{
> +	local start="$1"
> +	local len="$2"
> +
> +	__dmerror_change "$start" "$len" error linear
> +}
> +
> +# Update the dm error table so that IOs to the given range will succeed.
> +# The dm device itself must be remapped by calling _dmerror_load_error_table.
> +_dmerror_mark_range_good()
> +{
> +	local start="$1"
> +	local len="$2"
> +
> +	__dmerror_change "$start" "$len" linear error
> +}
> diff --git a/tests/xfs/747 b/tests/xfs/747
> new file mode 100755
> index 00000000..f5894411
> --- /dev/null
> +++ b/tests/xfs/747
> @@ -0,0 +1,136 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0-or-newer
> +# Copyright (c) 2019, Oracle and/or its affiliates.  All Rights Reserved.
> +#
> +# FS QA Test No. 747
> +#
> +# Check xfs_scrub's media scan can actually return diagnostic information for
> +# media errors in file data extents.
> +
> +seq=`basename $0`
> +seqres=$RESULT_DIR/$seq
> +echo "QA output created by $seq"
> +
> +here=`pwd`
> +tmp=/tmp/$$
> +status=1    # failure is the default!
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +_cleanup()
> +{
> +	cd /
> +	rm -f $tmp.error

	rm -f $tmp.*

would be find. Otherwise files like $tmp.mkfs are still there.

> +	_dmerror_cleanup
> +}
> +
> +# get standard environment, filters and checks
> +. ./common/rc
> +. ./common/fuzzy
> +. ./common/filter
> +. ./common/dmerror
> +
> +# real QA test starts here
> +_supported_fs xfs
> +_supported_os Linux
> +_require_dm_target error
> +_require_scratch_xfs_crc
> +_require_scrub
> +
> +rm -f $seqres.full
> +
> +filter_scrub_errors() {
> +	_filter_scratch | sed -e "s/offset $((blksz * 2)) /offset 2FSB /g" \
> +		-e "s/length $blksz.*/length 1FSB./g"
> +}
> +
> +_scratch_mkfs > $tmp.mkfs
> +_dmerror_init
> +_dmerror_mount >> $seqres.full 2>&1
> +
> +_supports_xfs_scrub $SCRATCH_MNT $SCRATCH_DEV || _notrun "Scrub not supported"
> +
> +victim=$SCRATCH_MNT/a
> +$XFS_IO_PROG -f -c "pwrite -S 0x58 0 1m" -c "fsync" $victim >> $seqres.full
> +bmap_str="$($XFS_IO_PROG -c "bmap -elpv" $victim | grep "^[[:space:]]*0:")"
> +echo "$bmap_str" >> $seqres.full
> +
> +phys="$(echo "$bmap_str" | awk '{print $3}')"
> +len="$(echo "$bmap_str" | awk '{print $6}')"
> +blksz=$(_get_file_block_size $SCRATCH_MNT)
> +sectors_per_block=$((blksz / 512))
> +
> +# Did we get at least 4 fs blocks worth of extent?
> +min_len_sectors=$(( 4 * sectors_per_block ))
> +test "$len" -lt $min_len_sectors && \
> +	_fail "could not format a long enough extent on an empty fs??"
> +
> +phys_start=$(echo "$phys" | sed -e 's/\.\..*//g')
> +
> +
> +echo ":$phys:$len:$blksz:$phys_start" >> $seqres.full
> +echo "victim file:" >> $seqres.full
> +od -tx1 -Ad -c $victim >> $seqres.full
> +
> +# Reset the dmerror table so that all IO will pass through.
> +_dmerror_reset_table
> +
> +cat >> $seqres.full << ENDL
> +dmerror before:
> +$DMERROR_TABLE
> +<end table>
> +ENDL
> +
> +# Now mark /only/ the middle of the extent bad.
> +_dmerror_mark_range_bad $(( phys_start + (2 * sectors_per_block) + 1 )) 1
> +
> +cat >> $seqres.full << ENDL
> +dmerror after marking bad:
> +$DMERROR_TABLE
> +<end table>
> +ENDL
> +
> +_dmerror_load_error_table
> +
> +# See if the media scan picks it up.
> +echo "Scrub for injected media error (single threaded)"
> +
> +# Once in single-threaded mode
> +_scratch_scrub -b -x >> $seqres.full 2> $tmp.error
> +cat $tmp.error | filter_scrub_errors
> +
> +# Once in parallel mode
> +echo "Scrub for injected media error (multi threaded)"
> +_scratch_scrub -x >> $seqres.full 2> $tmp.error
> +cat $tmp.error | filter_scrub_errors
> +
> +# Remount to flush the page cache and reread to see the IO error
> +_dmerror_unmount
> +_dmerror_mount
> +echo "victim file:" >> $seqres.full
> +od -tx1 -Ad -c $victim >> $seqres.full 2> $tmp.error
> +cat $tmp.error | _filter_scratch
> +
> +# Scrub again to re-confirm the media error across a remount
> +echo "Scrub for injected media error (after remount)"
> +_scratch_scrub -x >> $seqres.full 2> $tmp.error
> +cat $tmp.error | filter_scrub_errors
> +
> +# Now mark the bad range good.
> +_dmerror_mark_range_good $(( phys_start + (2 * sectors_per_block) + 1 )) 1
> +_dmerror_load_error_table
> +
> +cat >> $seqres.full << ENDL
> +dmerror after marking good:
> +$DMERROR_TABLE
> +<end table>
> +ENDL
> +
> +echo "Scrub after removing injected media error"
> +
> +# Scrub one last time to make sure the error's gone.
> +_scratch_scrub -x >> $seqres.full 2> $tmp.error
> +cat $tmp.error | filter_scrub_errors
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/xfs/747.out b/tests/xfs/747.out
> new file mode 100644
> index 00000000..f85f1753
> --- /dev/null
> +++ b/tests/xfs/747.out
> @@ -0,0 +1,12 @@
> +QA output created by 747
> +Scrub for injected media error (single threaded)
> +Unfixable Error: SCRATCH_MNT/a: media error at data offset 2FSB length 1FSB.
> +SCRATCH_MNT: unfixable errors found: 1
> +Scrub for injected media error (multi threaded)
> +Unfixable Error: SCRATCH_MNT/a: media error at data offset 2FSB length 1FSB.
> +SCRATCH_MNT: unfixable errors found: 1
> +od: SCRATCH_MNT/a: read error: Input/output error
> +Scrub for injected media error (after remount)
> +Unfixable Error: SCRATCH_MNT/a: media error at data offset 2FSB length 1FSB.
> +SCRATCH_MNT: unfixable errors found: 1
> +Scrub after removing injected media error
> diff --git a/tests/xfs/748 b/tests/xfs/748
> new file mode 100755
> index 00000000..130cc6f2
> --- /dev/null
> +++ b/tests/xfs/748
> @@ -0,0 +1,102 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0-or-newer
> +# Copyright (c) 2019, Oracle and/or its affiliates.  All Rights Reserved.
> +#
> +# FS QA Test No. 748
> +#
> +# Check xfs_scrub's media scan can actually return diagnostic information for
> +# media errors in filesystem metadata.
> +
> +seq=`basename $0`
> +seqres=$RESULT_DIR/$seq
> +echo "QA output created by $seq"
> +
> +here=`pwd`
> +tmp=/tmp/$$
> +status=1    # failure is the default!
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +_cleanup()
> +{
> +	cd /
> +	rm -f $tmp.error $tmp.fsmap

rm -f $tmp.*

> +	_dmerror_cleanup
> +}
> +
> +# get standard environment, filters and checks
> +. ./common/rc
> +. ./common/fuzzy
> +. ./common/filter
> +. ./common/dmerror
> +
> +# real QA test starts here
> +_supported_fs xfs
> +_supported_os Linux
> +_require_dm_target error
> +_require_xfs_scratch_rmapbt

Add a comment on why rmapbt is needed?

Thanks,
Eryu

> +_require_scrub
> +
> +rm -f $seqres.full
> +
> +filter_scrub_errors() {
> +	_filter_scratch | sed -e "s/disk offset [0-9]*: /disk offset NNN: /g" \
> +		-e "/errors found:/d" -e 's/phase6.c line [0-9]*/!/g' \
> +		-e "/corruptions found:/d" | uniq
> +}
> +
> +_scratch_mkfs > $tmp.mkfs
> +_dmerror_init
> +_dmerror_mount >> $seqres.full 2>&1
> +
> +_supports_xfs_scrub $SCRATCH_MNT $SCRATCH_DEV || _notrun "Scrub not supported"
> +
> +# Create a bunch of metadata so that we can mark them bad in the next step.
> +victim=$SCRATCH_MNT/a
> +$FSSTRESS_PROG -z -n 200 -p 10 \
> +	       -f creat=10 \
> +	       -f resvsp=1 \
> +	       -f truncate=1 \
> +	       -f punch=1 \
> +	       -f chown=5 \
> +	       -f mkdir=5 \
> +	       -f mknod=1 \
> +	       -d $victim >> $seqres.full 2>&1
> +
> +# Mark all the metadata bad
> +_dmerror_reset_table
> +$XFS_IO_PROG -c "fsmap -n100 -vvv" $victim | grep inodes > $tmp.fsmap
> +while read a b c crap; do
> +	phys="$(echo $c | sed -e 's/^.\([0-9]*\)\.\.\([0-9]*\).*$/\1:\2/g')"
> +	target_begin="$(echo "$phys" | cut -d ':' -f 1)"
> +	target_end="$(echo "$phys" | cut -d ':' -f 2)"
> +
> +	_dmerror_mark_range_bad $target_begin $((target_end - target_begin))
> +done < $tmp.fsmap
> +cat $tmp.fsmap >> $seqres.full
> +
> +cat >> $seqres.full << ENDL
> +dmerror after marking bad:
> +$DMERROR_TABLE
> +<end table>
> +ENDL
> +
> +_dmerror_load_error_table
> +
> +# See if the media scan picks it up.
> +echo "Scrub for injected media error"
> +
> +XFS_SCRUB_PHASE=6 _scratch_scrub -x >> $seqres.full 2> $tmp.error
> +cat $tmp.error | filter_scrub_errors
> +
> +# Make the disk work again
> +_dmerror_load_working_table
> +
> +echo "Scrub after removing injected media error"
> +
> +# Scrub one last time to make sure the error's gone.
> +XFS_SCRUB_PHASE=6 _scratch_scrub -x >> $seqres.full 2> $tmp.error
> +cat $tmp.error | filter_scrub_errors
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/xfs/748.out b/tests/xfs/748.out
> new file mode 100644
> index 00000000..49dc2d7a
> --- /dev/null
> +++ b/tests/xfs/748.out
> @@ -0,0 +1,5 @@
> +QA output created by 748
> +Scrub for injected media error
> +Corruption: disk offset NNN: media error in inodes. (!)
> +SCRATCH_MNT: Unmount and run xfs_repair.
> +Scrub after removing injected media error
> diff --git a/tests/xfs/group b/tests/xfs/group
> index 18a593d9..3a58864b 100644
> --- a/tests/xfs/group
> +++ b/tests/xfs/group
> @@ -509,3 +509,5 @@
>  510 auto ioctl quick
>  511 auto quick quota
>  741 auto quick rw
> +747 auto quick scrub
> +748 auto quick scrub
>
Darrick J. Wong Jan. 6, 2020, 6:10 p.m. UTC | #2
On Mon, Jan 06, 2020 at 03:58:30PM +0800, Eryu Guan wrote:
> On Tue, Dec 10, 2019 at 09:38:38PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Add new helpers to dmerror to provide for marking selected ranges
> > totally bad -- both reads and writes will fail.  Create a new test for
> > xfs_scrub to check that it reports media errors correctly.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> 
> I hit assert failure when testing on v5.5-rc3+ kernel, is that an
> expected result? Both test failed in the same way.

Oh, my.  That's a bogus assertion in xrep_calc_ag_resblks when
CONFIG_XFS_ONLINE_REPAIR=n.  That's definitely a kernel bug, though it's
not related to this test.  I'll start working on a fix, thank you for
reporting this!

> 
> [  192.610313] xfs filesystem being mounted at /mnt/scratch supports timestamps until 2038 (0x7fffffff)
> [  193.149329] Buffer I/O error on dev dm-11, logical block 128, async page read
> [  193.150173] Buffer I/O error on dev dm-11, logical block 129, async page read
> [  193.151254] Buffer I/O error on dev dm-11, logical block 130, async page read
> [  193.152173] Buffer I/O error on dev dm-11, logical block 131, async page read
> [  193.152980] Buffer I/O error on dev dm-11, logical block 132, async page read
> [  193.153935] Buffer I/O error on dev dm-11, logical block 133, async page read
> [  193.154869] Buffer I/O error on dev dm-11, logical block 134, async page read
> [  193.155800] Buffer I/O error on dev dm-11, logical block 135, async page read
> [  193.249751] XFS: Assertion failed: !(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR), file: fs/xfs/scrub/repair.h, line: 78
> [  193.255979] ------------[ cut here ]------------
> [  193.258406] kernel BUG at fs/xfs/xfs_message.c:110!
> [  193.260996] invalid opcode: 0000 [#1] SMP PTI
> [  193.263323] CPU: 0 PID: 5613 Comm: xfs_scrub Not tainted 5.5.0-rc3+ #44
> [  193.266717] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
> [  193.273736] RIP: 0010:assfail+0x23/0x28 [xfs]
> [  193.276045] Code: 67 fc ff ff 0f 0b c3 0f 1f 44 00 00 41 89 c8 48 89 d1 48 89 f2 48 c7 c6 78 e9 8d c0 e8 82 f9 ff ff 80 3d 9a d7 08 00 00 74 02 <0f> 0b 0f 0b c3 48 8b 03 48 c7 c7 68 ee 8d c0 c6 05 0e 2b 0a 00 01
> [  193.284481] RSP: 0018:ffffac9540b7fbe0 EFLAGS: 00010202
> [  193.286297] RAX: 0000000000000000 RBX: ffffac9540b7fcc8 RCX: 0000000000000000
> [  193.288390] RDX: 00000000ffffffc0 RSI: 000000000000000a RDI: ffffffffc08d144a
> [  193.290235] RBP: ffffac9540b7fbf8 R08: 0000000000000000 R09: 0000000000000000
> [  193.292083] R10: 000000000000000a R11: f000000000000000 R12: 0000000000000000
> [  193.293589] R13: ffff90006701c000 R14: ffff900071746400 R15: ffff900071746558
> [  193.295068] FS:  00007f91892cc740(0000) GS:ffff900078c00000(0000) knlGS:0000000000000000
> [  193.296899] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  193.297977] CR2: 0000000001ef0078 CR3: 0000000236050002 CR4: 00000000003606f0
> [  193.299234] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  193.300555] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  193.301805] Call Trace:
> [  193.302296]  xchk_setup_fs+0x35/0x40 [xfs]
> [  193.302937]  xfs_scrub_metadata+0x23d/0x480 [xfs]
> [  193.303658]  xfs_ioc_scrub_metadata+0x50/0xa0 [xfs]
> [  193.304417]  xfs_file_ioctl+0xb23/0xc60 [xfs]
> [  193.305075]  ? pagevec_lru_move_fn+0xbd/0xe0
> [  193.305719]  ? get_kernel_page+0x60/0x60
> [  193.306321]  ? __lru_cache_add+0x62/0x80
> [  193.306922]  ? __handle_mm_fault+0xc65/0x1930
> [  193.307553]  do_vfs_ioctl+0x448/0x6c0
> [  193.308042]  ? handle_mm_fault+0xc4/0x1f0
> [  193.308572]  ksys_ioctl+0x5e/0x90
> [  193.309006]  __x64_sys_ioctl+0x16/0x20
> [  193.309501]  do_syscall_64+0x5b/0x1d0
> [  193.309990]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> > ---
> >  common/dmerror    |  107 +++++++++++++++++++++++++++++++++++++++++-
> >  tests/xfs/747     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  tests/xfs/747.out |   12 +++++
> >  tests/xfs/748     |  102 ++++++++++++++++++++++++++++++++++++++++
> >  tests/xfs/748.out |    5 ++
> >  tests/xfs/group   |    2 +
> >  6 files changed, 363 insertions(+), 1 deletion(-)
> >  create mode 100755 tests/xfs/747
> >  create mode 100644 tests/xfs/747.out
> >  create mode 100755 tests/xfs/748
> >  create mode 100644 tests/xfs/748.out
> > 
> > 
> > diff --git a/common/dmerror b/common/dmerror
> > index ca1c7335..ee3051f1 100644
> > --- a/common/dmerror
> > +++ b/common/dmerror
> > @@ -62,7 +62,7 @@ _dmerror_load_error_table()
> >  	$DMSETUP_PROG suspend $suspend_opt error-test
> >  	[ $? -ne 0 ] && _fail  "dmsetup suspend failed"
> >  
> > -	$DMSETUP_PROG load error-test --table "$DMERROR_TABLE"
> > +	echo "$DMERROR_TABLE" | $DMSETUP_PROG load error-test
> >  	load_res=$?
> >  
> >  	$DMSETUP_PROG resume error-test
> > @@ -94,3 +94,108 @@ _dmerror_load_working_table()
> >  	[ $load_res -ne 0 ] && _fail "dmsetup failed to load error table"
> >  	[ $resume_res -ne 0 ] && _fail  "dmsetup resume failed"
> >  }
> > +
> > +# Given a list of (start, length) tuples on stdin, combine adjacent tuples into
> > +# larger ones and write the new list to stdout.
> > +__dmerror_combine_extents()
> > +{
> > +	awk 'BEGIN{start = 0; len = 0;}{
> 
> $AWK_PROG

Fixed all of these.

> 
> > +if (start + len == $1) {
> > +	len += $2;
> > +} else {
> > +	if (len > 0)
> > +		printf("%d %d\n", start, len);
> > +	start = $1;
> > +	len = $2;
> > +}
> > +} END {
> > +	if (len > 0)
> > +		printf("%d %d\n", start, len);
> > +}'
> > +}
> > +
> > +# Given a block device, the name of a preferred dm target, the name of an
> > +# implied dm target, and a list of (start, len) tuples on stdin, create a new
> > +# dm table which maps each of the tuples to the preferred target and all other
> > +# areas to the implied dm target.
> > +__dmerror_recreate_map()
> > +{
> > +	local device="$1"
> > +	local preferred_tgt="$2"
> > +	local implied_tgt="$3"
> > +	local size=$(blockdev --getsz "$device")
> > +
> > +	awk -v device="$device" -v size=$size -v implied_tgt="$implied_tgt" \
> 
> Same here.
> 
> > +		-v preferred_tgt="$preferred_tgt" 'BEGIN{implied_start = 0;}{
> > +	extent_start = $1;
> > +	extent_len = $2;
> > +
> > +	if (extent_start > size) {
> > +		extent_start = size;
> > +		extent_len = 0;
> > +	} else if (extent_start + extent_len > size) {
> > +		extent_len = size - extent_start;
> > +	}
> > +
> > +	if (implied_start < extent_start)
> > +		printf("%d %d %s %s %d\n", implied_start,
> > +				extent_start - implied_start, implied_tgt,
> > +				device, implied_start);
> > +	printf("%d %d %s %s %d\n", extent_start, extent_len, preferred_tgt,
> > +			device, extent_start);
> > +	implied_start = extent_start + extent_len;
> > +}END{
> > +	if (implied_start < size)
> > +		printf("%d %d %s %s %d\n", implied_start, size - implied_start,
> > +				implied_tgt, device, implied_start);
> > +}'
> > +}
> > +
> > +# Update the dm error table so that the range (start, len) maps to the
> > +# preferred dm target, overriding anything that maps to the implied dm target.
> > +# This assumes that the only desired targets for this dm device are the
> > +# preferred and and implied targets.  The optional fifth argument can be used
> > +# to change the underlying device.
> > +__dmerror_change()
> > +{
> > +	local start="$1"
> > +	local len="$2"
> > +	local preferred_tgt="$3"
> > +	local implied_tgt="$4"
> > +	local dm_backing_dev="$5"
> > +	test -z "$dm_backing_dev" && dm_backing_dev="$SCRATCH_DEV"
> > +
> > +	DMERROR_TABLE="$( (echo "$DMERROR_TABLE"; echo "$start $len $preferred_tgt") | \
> > +		awk -v type="$preferred_tgt" '{if ($3 == type) print $0;}' | \
> 
> Same here.
> 
> > +		sort -g | \
> > +		__dmerror_combine_extents | \
> > +		__dmerror_recreate_map "$dm_backing_dev" "$preferred_tgt" \
> > +				"$implied_tgt" )"
> > +}
> > +
> > +# Reset the dm error table to everything ok.  The dm device itself must be
> > +# remapped by calling _dmerror_load_error_table.
> > +_dmerror_reset_table()
> > +{
> > +	DMERROR_TABLE="$DMLINEAR_TABLE"
> > +}
> > +
> > +# Update the dm error table so that IOs to the given range will return EIO.
> > +# The dm device itself must be remapped by calling _dmerror_load_error_table.
> > +_dmerror_mark_range_bad()
> > +{
> > +	local start="$1"
> > +	local len="$2"
> > +
> > +	__dmerror_change "$start" "$len" error linear
> > +}
> > +
> > +# Update the dm error table so that IOs to the given range will succeed.
> > +# The dm device itself must be remapped by calling _dmerror_load_error_table.
> > +_dmerror_mark_range_good()
> > +{
> > +	local start="$1"
> > +	local len="$2"
> > +
> > +	__dmerror_change "$start" "$len" linear error
> > +}
> > diff --git a/tests/xfs/747 b/tests/xfs/747
> > new file mode 100755
> > index 00000000..f5894411
> > --- /dev/null
> > +++ b/tests/xfs/747
> > @@ -0,0 +1,136 @@
> > +#! /bin/bash
> > +# SPDX-License-Identifier: GPL-2.0-or-newer
> > +# Copyright (c) 2019, Oracle and/or its affiliates.  All Rights Reserved.
> > +#
> > +# FS QA Test No. 747
> > +#
> > +# Check xfs_scrub's media scan can actually return diagnostic information for
> > +# media errors in file data extents.
> > +
> > +seq=`basename $0`
> > +seqres=$RESULT_DIR/$seq
> > +echo "QA output created by $seq"
> > +
> > +here=`pwd`
> > +tmp=/tmp/$$
> > +status=1    # failure is the default!
> > +trap "_cleanup; exit \$status" 0 1 2 3 15
> > +
> > +_cleanup()
> > +{
> > +	cd /
> > +	rm -f $tmp.error
> 
> 	rm -f $tmp.*
> 
> would be find. Otherwise files like $tmp.mkfs are still there.

Fixed this too.

> > +	_dmerror_cleanup
> > +}
> > +
> > +# get standard environment, filters and checks
> > +. ./common/rc
> > +. ./common/fuzzy
> > +. ./common/filter
> > +. ./common/dmerror
> > +
> > +# real QA test starts here
> > +_supported_fs xfs
> > +_supported_os Linux
> > +_require_dm_target error
> > +_require_scratch_xfs_crc
> > +_require_scrub
> > +
> > +rm -f $seqres.full
> > +
> > +filter_scrub_errors() {
> > +	_filter_scratch | sed -e "s/offset $((blksz * 2)) /offset 2FSB /g" \
> > +		-e "s/length $blksz.*/length 1FSB./g"
> > +}
> > +
> > +_scratch_mkfs > $tmp.mkfs
> > +_dmerror_init
> > +_dmerror_mount >> $seqres.full 2>&1
> > +
> > +_supports_xfs_scrub $SCRATCH_MNT $SCRATCH_DEV || _notrun "Scrub not supported"
> > +
> > +victim=$SCRATCH_MNT/a
> > +$XFS_IO_PROG -f -c "pwrite -S 0x58 0 1m" -c "fsync" $victim >> $seqres.full
> > +bmap_str="$($XFS_IO_PROG -c "bmap -elpv" $victim | grep "^[[:space:]]*0:")"
> > +echo "$bmap_str" >> $seqres.full
> > +
> > +phys="$(echo "$bmap_str" | awk '{print $3}')"
> > +len="$(echo "$bmap_str" | awk '{print $6}')"
> > +blksz=$(_get_file_block_size $SCRATCH_MNT)
> > +sectors_per_block=$((blksz / 512))
> > +
> > +# Did we get at least 4 fs blocks worth of extent?
> > +min_len_sectors=$(( 4 * sectors_per_block ))
> > +test "$len" -lt $min_len_sectors && \
> > +	_fail "could not format a long enough extent on an empty fs??"
> > +
> > +phys_start=$(echo "$phys" | sed -e 's/\.\..*//g')
> > +
> > +
> > +echo ":$phys:$len:$blksz:$phys_start" >> $seqres.full
> > +echo "victim file:" >> $seqres.full
> > +od -tx1 -Ad -c $victim >> $seqres.full
> > +
> > +# Reset the dmerror table so that all IO will pass through.
> > +_dmerror_reset_table
> > +
> > +cat >> $seqres.full << ENDL
> > +dmerror before:
> > +$DMERROR_TABLE
> > +<end table>
> > +ENDL
> > +
> > +# Now mark /only/ the middle of the extent bad.
> > +_dmerror_mark_range_bad $(( phys_start + (2 * sectors_per_block) + 1 )) 1
> > +
> > +cat >> $seqres.full << ENDL
> > +dmerror after marking bad:
> > +$DMERROR_TABLE
> > +<end table>
> > +ENDL
> > +
> > +_dmerror_load_error_table
> > +
> > +# See if the media scan picks it up.
> > +echo "Scrub for injected media error (single threaded)"
> > +
> > +# Once in single-threaded mode
> > +_scratch_scrub -b -x >> $seqres.full 2> $tmp.error
> > +cat $tmp.error | filter_scrub_errors
> > +
> > +# Once in parallel mode
> > +echo "Scrub for injected media error (multi threaded)"
> > +_scratch_scrub -x >> $seqres.full 2> $tmp.error
> > +cat $tmp.error | filter_scrub_errors
> > +
> > +# Remount to flush the page cache and reread to see the IO error
> > +_dmerror_unmount
> > +_dmerror_mount
> > +echo "victim file:" >> $seqres.full
> > +od -tx1 -Ad -c $victim >> $seqres.full 2> $tmp.error
> > +cat $tmp.error | _filter_scratch
> > +
> > +# Scrub again to re-confirm the media error across a remount
> > +echo "Scrub for injected media error (after remount)"
> > +_scratch_scrub -x >> $seqres.full 2> $tmp.error
> > +cat $tmp.error | filter_scrub_errors
> > +
> > +# Now mark the bad range good.
> > +_dmerror_mark_range_good $(( phys_start + (2 * sectors_per_block) + 1 )) 1
> > +_dmerror_load_error_table
> > +
> > +cat >> $seqres.full << ENDL
> > +dmerror after marking good:
> > +$DMERROR_TABLE
> > +<end table>
> > +ENDL
> > +
> > +echo "Scrub after removing injected media error"
> > +
> > +# Scrub one last time to make sure the error's gone.
> > +_scratch_scrub -x >> $seqres.full 2> $tmp.error
> > +cat $tmp.error | filter_scrub_errors
> > +
> > +# success, all done
> > +status=0
> > +exit
> > diff --git a/tests/xfs/747.out b/tests/xfs/747.out
> > new file mode 100644
> > index 00000000..f85f1753
> > --- /dev/null
> > +++ b/tests/xfs/747.out
> > @@ -0,0 +1,12 @@
> > +QA output created by 747
> > +Scrub for injected media error (single threaded)
> > +Unfixable Error: SCRATCH_MNT/a: media error at data offset 2FSB length 1FSB.
> > +SCRATCH_MNT: unfixable errors found: 1
> > +Scrub for injected media error (multi threaded)
> > +Unfixable Error: SCRATCH_MNT/a: media error at data offset 2FSB length 1FSB.
> > +SCRATCH_MNT: unfixable errors found: 1
> > +od: SCRATCH_MNT/a: read error: Input/output error
> > +Scrub for injected media error (after remount)
> > +Unfixable Error: SCRATCH_MNT/a: media error at data offset 2FSB length 1FSB.
> > +SCRATCH_MNT: unfixable errors found: 1
> > +Scrub after removing injected media error
> > diff --git a/tests/xfs/748 b/tests/xfs/748
> > new file mode 100755
> > index 00000000..130cc6f2
> > --- /dev/null
> > +++ b/tests/xfs/748
> > @@ -0,0 +1,102 @@
> > +#! /bin/bash
> > +# SPDX-License-Identifier: GPL-2.0-or-newer
> > +# Copyright (c) 2019, Oracle and/or its affiliates.  All Rights Reserved.
> > +#
> > +# FS QA Test No. 748
> > +#
> > +# Check xfs_scrub's media scan can actually return diagnostic information for
> > +# media errors in filesystem metadata.
> > +
> > +seq=`basename $0`
> > +seqres=$RESULT_DIR/$seq
> > +echo "QA output created by $seq"
> > +
> > +here=`pwd`
> > +tmp=/tmp/$$
> > +status=1    # failure is the default!
> > +trap "_cleanup; exit \$status" 0 1 2 3 15
> > +
> > +_cleanup()
> > +{
> > +	cd /
> > +	rm -f $tmp.error $tmp.fsmap
> 
> rm -f $tmp.*
> 
> > +	_dmerror_cleanup
> > +}
> > +
> > +# get standard environment, filters and checks
> > +. ./common/rc
> > +. ./common/fuzzy
> > +. ./common/filter
> > +. ./common/dmerror
> > +
> > +# real QA test starts here
> > +_supported_fs xfs
> > +_supported_os Linux
> > +_require_dm_target error
> > +_require_xfs_scratch_rmapbt
> 
> Add a comment on why rmapbt is needed?

"rmapbt is required to enable reporting of what metadata was lost."

--D

> Thanks,
> Eryu
> 
> > +_require_scrub
> > +
> > +rm -f $seqres.full
> > +
> > +filter_scrub_errors() {
> > +	_filter_scratch | sed -e "s/disk offset [0-9]*: /disk offset NNN: /g" \
> > +		-e "/errors found:/d" -e 's/phase6.c line [0-9]*/!/g' \
> > +		-e "/corruptions found:/d" | uniq
> > +}
> > +
> > +_scratch_mkfs > $tmp.mkfs
> > +_dmerror_init
> > +_dmerror_mount >> $seqres.full 2>&1
> > +
> > +_supports_xfs_scrub $SCRATCH_MNT $SCRATCH_DEV || _notrun "Scrub not supported"
> > +
> > +# Create a bunch of metadata so that we can mark them bad in the next step.
> > +victim=$SCRATCH_MNT/a
> > +$FSSTRESS_PROG -z -n 200 -p 10 \
> > +	       -f creat=10 \
> > +	       -f resvsp=1 \
> > +	       -f truncate=1 \
> > +	       -f punch=1 \
> > +	       -f chown=5 \
> > +	       -f mkdir=5 \
> > +	       -f mknod=1 \
> > +	       -d $victim >> $seqres.full 2>&1
> > +
> > +# Mark all the metadata bad
> > +_dmerror_reset_table
> > +$XFS_IO_PROG -c "fsmap -n100 -vvv" $victim | grep inodes > $tmp.fsmap
> > +while read a b c crap; do
> > +	phys="$(echo $c | sed -e 's/^.\([0-9]*\)\.\.\([0-9]*\).*$/\1:\2/g')"
> > +	target_begin="$(echo "$phys" | cut -d ':' -f 1)"
> > +	target_end="$(echo "$phys" | cut -d ':' -f 2)"
> > +
> > +	_dmerror_mark_range_bad $target_begin $((target_end - target_begin))
> > +done < $tmp.fsmap
> > +cat $tmp.fsmap >> $seqres.full
> > +
> > +cat >> $seqres.full << ENDL
> > +dmerror after marking bad:
> > +$DMERROR_TABLE
> > +<end table>
> > +ENDL
> > +
> > +_dmerror_load_error_table
> > +
> > +# See if the media scan picks it up.
> > +echo "Scrub for injected media error"
> > +
> > +XFS_SCRUB_PHASE=6 _scratch_scrub -x >> $seqres.full 2> $tmp.error
> > +cat $tmp.error | filter_scrub_errors
> > +
> > +# Make the disk work again
> > +_dmerror_load_working_table
> > +
> > +echo "Scrub after removing injected media error"
> > +
> > +# Scrub one last time to make sure the error's gone.
> > +XFS_SCRUB_PHASE=6 _scratch_scrub -x >> $seqres.full 2> $tmp.error
> > +cat $tmp.error | filter_scrub_errors
> > +
> > +# success, all done
> > +status=0
> > +exit
> > diff --git a/tests/xfs/748.out b/tests/xfs/748.out
> > new file mode 100644
> > index 00000000..49dc2d7a
> > --- /dev/null
> > +++ b/tests/xfs/748.out
> > @@ -0,0 +1,5 @@
> > +QA output created by 748
> > +Scrub for injected media error
> > +Corruption: disk offset NNN: media error in inodes. (!)
> > +SCRATCH_MNT: Unmount and run xfs_repair.
> > +Scrub after removing injected media error
> > diff --git a/tests/xfs/group b/tests/xfs/group
> > index 18a593d9..3a58864b 100644
> > --- a/tests/xfs/group
> > +++ b/tests/xfs/group
> > @@ -509,3 +509,5 @@
> >  510 auto ioctl quick
> >  511 auto quick quota
> >  741 auto quick rw
> > +747 auto quick scrub
> > +748 auto quick scrub
> >
diff mbox series

Patch

diff --git a/common/dmerror b/common/dmerror
index ca1c7335..ee3051f1 100644
--- a/common/dmerror
+++ b/common/dmerror
@@ -62,7 +62,7 @@  _dmerror_load_error_table()
 	$DMSETUP_PROG suspend $suspend_opt error-test
 	[ $? -ne 0 ] && _fail  "dmsetup suspend failed"
 
-	$DMSETUP_PROG load error-test --table "$DMERROR_TABLE"
+	echo "$DMERROR_TABLE" | $DMSETUP_PROG load error-test
 	load_res=$?
 
 	$DMSETUP_PROG resume error-test
@@ -94,3 +94,108 @@  _dmerror_load_working_table()
 	[ $load_res -ne 0 ] && _fail "dmsetup failed to load error table"
 	[ $resume_res -ne 0 ] && _fail  "dmsetup resume failed"
 }
+
+# Given a list of (start, length) tuples on stdin, combine adjacent tuples into
+# larger ones and write the new list to stdout.
+__dmerror_combine_extents()
+{
+	awk 'BEGIN{start = 0; len = 0;}{
+if (start + len == $1) {
+	len += $2;
+} else {
+	if (len > 0)
+		printf("%d %d\n", start, len);
+	start = $1;
+	len = $2;
+}
+} END {
+	if (len > 0)
+		printf("%d %d\n", start, len);
+}'
+}
+
+# Given a block device, the name of a preferred dm target, the name of an
+# implied dm target, and a list of (start, len) tuples on stdin, create a new
+# dm table which maps each of the tuples to the preferred target and all other
+# areas to the implied dm target.
+__dmerror_recreate_map()
+{
+	local device="$1"
+	local preferred_tgt="$2"
+	local implied_tgt="$3"
+	local size=$(blockdev --getsz "$device")
+
+	awk -v device="$device" -v size=$size -v implied_tgt="$implied_tgt" \
+		-v preferred_tgt="$preferred_tgt" 'BEGIN{implied_start = 0;}{
+	extent_start = $1;
+	extent_len = $2;
+
+	if (extent_start > size) {
+		extent_start = size;
+		extent_len = 0;
+	} else if (extent_start + extent_len > size) {
+		extent_len = size - extent_start;
+	}
+
+	if (implied_start < extent_start)
+		printf("%d %d %s %s %d\n", implied_start,
+				extent_start - implied_start, implied_tgt,
+				device, implied_start);
+	printf("%d %d %s %s %d\n", extent_start, extent_len, preferred_tgt,
+			device, extent_start);
+	implied_start = extent_start + extent_len;
+}END{
+	if (implied_start < size)
+		printf("%d %d %s %s %d\n", implied_start, size - implied_start,
+				implied_tgt, device, implied_start);
+}'
+}
+
+# Update the dm error table so that the range (start, len) maps to the
+# preferred dm target, overriding anything that maps to the implied dm target.
+# This assumes that the only desired targets for this dm device are the
+# preferred and and implied targets.  The optional fifth argument can be used
+# to change the underlying device.
+__dmerror_change()
+{
+	local start="$1"
+	local len="$2"
+	local preferred_tgt="$3"
+	local implied_tgt="$4"
+	local dm_backing_dev="$5"
+	test -z "$dm_backing_dev" && dm_backing_dev="$SCRATCH_DEV"
+
+	DMERROR_TABLE="$( (echo "$DMERROR_TABLE"; echo "$start $len $preferred_tgt") | \
+		awk -v type="$preferred_tgt" '{if ($3 == type) print $0;}' | \
+		sort -g | \
+		__dmerror_combine_extents | \
+		__dmerror_recreate_map "$dm_backing_dev" "$preferred_tgt" \
+				"$implied_tgt" )"
+}
+
+# Reset the dm error table to everything ok.  The dm device itself must be
+# remapped by calling _dmerror_load_error_table.
+_dmerror_reset_table()
+{
+	DMERROR_TABLE="$DMLINEAR_TABLE"
+}
+
+# Update the dm error table so that IOs to the given range will return EIO.
+# The dm device itself must be remapped by calling _dmerror_load_error_table.
+_dmerror_mark_range_bad()
+{
+	local start="$1"
+	local len="$2"
+
+	__dmerror_change "$start" "$len" error linear
+}
+
+# Update the dm error table so that IOs to the given range will succeed.
+# The dm device itself must be remapped by calling _dmerror_load_error_table.
+_dmerror_mark_range_good()
+{
+	local start="$1"
+	local len="$2"
+
+	__dmerror_change "$start" "$len" linear error
+}
diff --git a/tests/xfs/747 b/tests/xfs/747
new file mode 100755
index 00000000..f5894411
--- /dev/null
+++ b/tests/xfs/747
@@ -0,0 +1,136 @@ 
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-newer
+# Copyright (c) 2019, Oracle and/or its affiliates.  All Rights Reserved.
+#
+# FS QA Test No. 747
+#
+# Check xfs_scrub's media scan can actually return diagnostic information for
+# media errors in file data extents.
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1    # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+	cd /
+	rm -f $tmp.error
+	_dmerror_cleanup
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/fuzzy
+. ./common/filter
+. ./common/dmerror
+
+# real QA test starts here
+_supported_fs xfs
+_supported_os Linux
+_require_dm_target error
+_require_scratch_xfs_crc
+_require_scrub
+
+rm -f $seqres.full
+
+filter_scrub_errors() {
+	_filter_scratch | sed -e "s/offset $((blksz * 2)) /offset 2FSB /g" \
+		-e "s/length $blksz.*/length 1FSB./g"
+}
+
+_scratch_mkfs > $tmp.mkfs
+_dmerror_init
+_dmerror_mount >> $seqres.full 2>&1
+
+_supports_xfs_scrub $SCRATCH_MNT $SCRATCH_DEV || _notrun "Scrub not supported"
+
+victim=$SCRATCH_MNT/a
+$XFS_IO_PROG -f -c "pwrite -S 0x58 0 1m" -c "fsync" $victim >> $seqres.full
+bmap_str="$($XFS_IO_PROG -c "bmap -elpv" $victim | grep "^[[:space:]]*0:")"
+echo "$bmap_str" >> $seqres.full
+
+phys="$(echo "$bmap_str" | awk '{print $3}')"
+len="$(echo "$bmap_str" | awk '{print $6}')"
+blksz=$(_get_file_block_size $SCRATCH_MNT)
+sectors_per_block=$((blksz / 512))
+
+# Did we get at least 4 fs blocks worth of extent?
+min_len_sectors=$(( 4 * sectors_per_block ))
+test "$len" -lt $min_len_sectors && \
+	_fail "could not format a long enough extent on an empty fs??"
+
+phys_start=$(echo "$phys" | sed -e 's/\.\..*//g')
+
+
+echo ":$phys:$len:$blksz:$phys_start" >> $seqres.full
+echo "victim file:" >> $seqres.full
+od -tx1 -Ad -c $victim >> $seqres.full
+
+# Reset the dmerror table so that all IO will pass through.
+_dmerror_reset_table
+
+cat >> $seqres.full << ENDL
+dmerror before:
+$DMERROR_TABLE
+<end table>
+ENDL
+
+# Now mark /only/ the middle of the extent bad.
+_dmerror_mark_range_bad $(( phys_start + (2 * sectors_per_block) + 1 )) 1
+
+cat >> $seqres.full << ENDL
+dmerror after marking bad:
+$DMERROR_TABLE
+<end table>
+ENDL
+
+_dmerror_load_error_table
+
+# See if the media scan picks it up.
+echo "Scrub for injected media error (single threaded)"
+
+# Once in single-threaded mode
+_scratch_scrub -b -x >> $seqres.full 2> $tmp.error
+cat $tmp.error | filter_scrub_errors
+
+# Once in parallel mode
+echo "Scrub for injected media error (multi threaded)"
+_scratch_scrub -x >> $seqres.full 2> $tmp.error
+cat $tmp.error | filter_scrub_errors
+
+# Remount to flush the page cache and reread to see the IO error
+_dmerror_unmount
+_dmerror_mount
+echo "victim file:" >> $seqres.full
+od -tx1 -Ad -c $victim >> $seqres.full 2> $tmp.error
+cat $tmp.error | _filter_scratch
+
+# Scrub again to re-confirm the media error across a remount
+echo "Scrub for injected media error (after remount)"
+_scratch_scrub -x >> $seqres.full 2> $tmp.error
+cat $tmp.error | filter_scrub_errors
+
+# Now mark the bad range good.
+_dmerror_mark_range_good $(( phys_start + (2 * sectors_per_block) + 1 )) 1
+_dmerror_load_error_table
+
+cat >> $seqres.full << ENDL
+dmerror after marking good:
+$DMERROR_TABLE
+<end table>
+ENDL
+
+echo "Scrub after removing injected media error"
+
+# Scrub one last time to make sure the error's gone.
+_scratch_scrub -x >> $seqres.full 2> $tmp.error
+cat $tmp.error | filter_scrub_errors
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/747.out b/tests/xfs/747.out
new file mode 100644
index 00000000..f85f1753
--- /dev/null
+++ b/tests/xfs/747.out
@@ -0,0 +1,12 @@ 
+QA output created by 747
+Scrub for injected media error (single threaded)
+Unfixable Error: SCRATCH_MNT/a: media error at data offset 2FSB length 1FSB.
+SCRATCH_MNT: unfixable errors found: 1
+Scrub for injected media error (multi threaded)
+Unfixable Error: SCRATCH_MNT/a: media error at data offset 2FSB length 1FSB.
+SCRATCH_MNT: unfixable errors found: 1
+od: SCRATCH_MNT/a: read error: Input/output error
+Scrub for injected media error (after remount)
+Unfixable Error: SCRATCH_MNT/a: media error at data offset 2FSB length 1FSB.
+SCRATCH_MNT: unfixable errors found: 1
+Scrub after removing injected media error
diff --git a/tests/xfs/748 b/tests/xfs/748
new file mode 100755
index 00000000..130cc6f2
--- /dev/null
+++ b/tests/xfs/748
@@ -0,0 +1,102 @@ 
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-newer
+# Copyright (c) 2019, Oracle and/or its affiliates.  All Rights Reserved.
+#
+# FS QA Test No. 748
+#
+# Check xfs_scrub's media scan can actually return diagnostic information for
+# media errors in filesystem metadata.
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1    # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+	cd /
+	rm -f $tmp.error $tmp.fsmap
+	_dmerror_cleanup
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/fuzzy
+. ./common/filter
+. ./common/dmerror
+
+# real QA test starts here
+_supported_fs xfs
+_supported_os Linux
+_require_dm_target error
+_require_xfs_scratch_rmapbt
+_require_scrub
+
+rm -f $seqres.full
+
+filter_scrub_errors() {
+	_filter_scratch | sed -e "s/disk offset [0-9]*: /disk offset NNN: /g" \
+		-e "/errors found:/d" -e 's/phase6.c line [0-9]*/!/g' \
+		-e "/corruptions found:/d" | uniq
+}
+
+_scratch_mkfs > $tmp.mkfs
+_dmerror_init
+_dmerror_mount >> $seqres.full 2>&1
+
+_supports_xfs_scrub $SCRATCH_MNT $SCRATCH_DEV || _notrun "Scrub not supported"
+
+# Create a bunch of metadata so that we can mark them bad in the next step.
+victim=$SCRATCH_MNT/a
+$FSSTRESS_PROG -z -n 200 -p 10 \
+	       -f creat=10 \
+	       -f resvsp=1 \
+	       -f truncate=1 \
+	       -f punch=1 \
+	       -f chown=5 \
+	       -f mkdir=5 \
+	       -f mknod=1 \
+	       -d $victim >> $seqres.full 2>&1
+
+# Mark all the metadata bad
+_dmerror_reset_table
+$XFS_IO_PROG -c "fsmap -n100 -vvv" $victim | grep inodes > $tmp.fsmap
+while read a b c crap; do
+	phys="$(echo $c | sed -e 's/^.\([0-9]*\)\.\.\([0-9]*\).*$/\1:\2/g')"
+	target_begin="$(echo "$phys" | cut -d ':' -f 1)"
+	target_end="$(echo "$phys" | cut -d ':' -f 2)"
+
+	_dmerror_mark_range_bad $target_begin $((target_end - target_begin))
+done < $tmp.fsmap
+cat $tmp.fsmap >> $seqres.full
+
+cat >> $seqres.full << ENDL
+dmerror after marking bad:
+$DMERROR_TABLE
+<end table>
+ENDL
+
+_dmerror_load_error_table
+
+# See if the media scan picks it up.
+echo "Scrub for injected media error"
+
+XFS_SCRUB_PHASE=6 _scratch_scrub -x >> $seqres.full 2> $tmp.error
+cat $tmp.error | filter_scrub_errors
+
+# Make the disk work again
+_dmerror_load_working_table
+
+echo "Scrub after removing injected media error"
+
+# Scrub one last time to make sure the error's gone.
+XFS_SCRUB_PHASE=6 _scratch_scrub -x >> $seqres.full 2> $tmp.error
+cat $tmp.error | filter_scrub_errors
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/748.out b/tests/xfs/748.out
new file mode 100644
index 00000000..49dc2d7a
--- /dev/null
+++ b/tests/xfs/748.out
@@ -0,0 +1,5 @@ 
+QA output created by 748
+Scrub for injected media error
+Corruption: disk offset NNN: media error in inodes. (!)
+SCRATCH_MNT: Unmount and run xfs_repair.
+Scrub after removing injected media error
diff --git a/tests/xfs/group b/tests/xfs/group
index 18a593d9..3a58864b 100644
--- a/tests/xfs/group
+++ b/tests/xfs/group
@@ -509,3 +509,5 @@ 
 510 auto ioctl quick
 511 auto quick quota
 741 auto quick rw
+747 auto quick scrub
+748 auto quick scrub