diff mbox series

btrfs: add test for missing csums in log when doing async on subpage vol

Message ID 20241008112302.2757404-1-maharmstone@fb.com (mailing list archive)
State New
Headers show
Series btrfs: add test for missing csums in log when doing async on subpage vol | expand

Commit Message

Mark Harmstone Oct. 8, 2024, 11:22 a.m. UTC
Adds a test for a bug we encountered on Linux 6.4 on aarch64, where a
race could mean that csums weren't getting written to the log tree,
leading to corruption when it was replayed.

The patches to detect log this tree corruption are in btrfs-progs 6.11.

Signed-off-by: Mark Harmstone <maharmstone@fb.com>
---
 common/dmlogwrites  | 24 ++++++++++++++++++
 tests/btrfs/192     | 26 +-------------------
 tests/btrfs/333     | 59 +++++++++++++++++++++++++++++++++++++++++++++
 tests/btrfs/333.out |  2 ++
 4 files changed, 86 insertions(+), 25 deletions(-)
 create mode 100755 tests/btrfs/333
 create mode 100644 tests/btrfs/333.out

Comments

Filipe Manana Oct. 8, 2024, 1:35 p.m. UTC | #1
On Tue, Oct 8, 2024 at 12:26 PM Mark Harmstone <maharmstone@fb.com> wrote:
>
> Adds a test for a bug we encountered on Linux 6.4 on aarch64, where a
> race could mean that csums weren't getting written to the log tree,
> leading to corruption when it was replayed.
>
> The patches to detect log this tree corruption are in btrfs-progs 6.11.
>
> Signed-off-by: Mark Harmstone <maharmstone@fb.com>
> ---
>  common/dmlogwrites  | 24 ++++++++++++++++++
>  tests/btrfs/192     | 26 +-------------------
>  tests/btrfs/333     | 59 +++++++++++++++++++++++++++++++++++++++++++++
>  tests/btrfs/333.out |  2 ++
>  4 files changed, 86 insertions(+), 25 deletions(-)
>  create mode 100755 tests/btrfs/333
>  create mode 100644 tests/btrfs/333.out
>
> diff --git a/common/dmlogwrites b/common/dmlogwrites
> index c1c85de9..f7faf244 100644
> --- a/common/dmlogwrites
> +++ b/common/dmlogwrites
> @@ -203,3 +203,27 @@ _log_writes_replay_log_range()
>                 >> $seqres.full 2>&1
>         [ $? -ne 0 ] && _fail "replay failed"
>  }
> +
> +# Replay and check each fua/flush (specified by $2) point.
> +#
> +# Since dm-log-writes records bio sequentially, even just replaying a range
> +# still needs to iterate all records before the end point.
> +# When number of records grows, it will be unacceptably slow, thus we need
> +# to use relay-log itself to trigger fsck, avoid unnecessary seek.
> +_log_writes_fast_replay_check()
> +{
> +       local check_point=$1
> +       local blkdev=$2
> +       local fsck_command=$3
> +       local ret
> +
> +       [ -z "$check_point" -o -z "$blkdev" ] && _fail \
> +       "check_point and blkdev must be specified for log_writes_fast_replay_check"
> +
> +       $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
> +               --replay $blkdev --check $check_point --fsck "$fsck_command" \
> +               &> $tmp.full_fsck
> +       ret=$?
> +       tail -n 150 $tmp.full_fsck >> $seqres.full
> +       [ $ret -ne 0 ] && _fail "fsck failed during replay"
> +}
> diff --git a/tests/btrfs/192 b/tests/btrfs/192
> index f7fb65b8..449f0459 100755
> --- a/tests/btrfs/192
> +++ b/tests/btrfs/192
> @@ -96,30 +96,6 @@ delete_workload()
>         done
>  }
>
> -# Replay and check each fua/flush (specified by $2) point.
> -#
> -# Since dm-log-writes records bio sequentially, even just replaying a range
> -# still needs to iterate all records before the end point.
> -# When number of records grows, it will be unacceptably slow, thus we need
> -# to use relay-log itself to trigger fsck, avoid unnecessary seek.
> -log_writes_fast_replay_check()
> -{
> -       local check_point=$1
> -       local blkdev=$2
> -       local fsck_command="$BTRFS_UTIL_PROG check $blkdev"
> -       local ret
> -
> -       [ -z "$check_point" -o -z "$blkdev" ] && _fail \
> -       "check_point and blkdev must be specified for log_writes_fast_replay_check"
> -
> -       $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
> -               --replay $blkdev --check $check_point --fsck "$fsck_command" \
> -               &> $tmp.full_fsck
> -       ret=$?
> -       tail -n 150 $tmp.full_fsck >> $seqres.full
> -       [ $ret -ne 0 ] && _fail "fsck failed during replay"
> -}
> -
>  xattr_value=$(printf '%0.sX' $(seq 1 3800))
>
>  # Bumping tree height to level 2.
> @@ -145,7 +121,7 @@ wait
>  _log_writes_unmount
>  _log_writes_remove
>
> -log_writes_fast_replay_check fua "$SCRATCH_DEV"
> +_log_writes_fast_replay_check fua "$SCRATCH_DEV" "$BTRFS_UTIL_PROG check $SCRATCH_DEV"
>
>  echo "Silence is golden"
>
> diff --git a/tests/btrfs/333 b/tests/btrfs/333
> new file mode 100755
> index 00000000..13f113ca
> --- /dev/null
> +++ b/tests/btrfs/333
> @@ -0,0 +1,59 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +#
> +# FS QA Test 333
> +#
> +# Test async dio with fsync to test a bug where a race meant that csums weren't
> +# getting written to the log tree, causing corruptions on remount. This can be
> +# seen on subpage FSes on 6.4.
> +#
> +. ./common/preamble
> +_begin_fstest auto quick metadata log volume

Why the volume group? The test isn't doing any volume management stuff.

However it should be in the "recoveryloop" group.

> +
> +_fixed_by_kernel_commit e917ff56c8e7 \
> +       "btrfs: determine synchronous writers from bio or writeback control"
> +
> +fio_config=$tmp.fio
> +
> +. ./common/dmlogwrites
> +
> +_require_scratch
> +_require_log_writes
> +
> +cat >$fio_config <<EOF
> +[global]
> +iodepth=128
> +direct=1
> +ioengine=libaio
> +rw=randwrite
> +runtime=1s
> +[job0]
> +rw=randwrite
> +filename=$SCRATCH_MNT/file
> +size=1g
> +fdatasync=1
> +EOF
> +
> +_require_fio $fio_config
> +
> +cat $fio_config >> $seqres.full
> +
> +_log_writes_init $SCRATCH_DEV
> +_log_writes_mkfs >> $seqres.full 2>&1
> +_log_writes_mark mkfs
> +
> +_log_writes_mount
> +
> +$FIO_PROG $fio_config > /dev/null 2>&1
> +_log_writes_unmount
> +
> +_log_writes_remove
> +_log_writes_replay_log mkfs $SCRATCH_DEV
> +
> +_log_writes_fast_replay_check fua "$SCRATCH_DEV" "$BTRFS_UTIL_PROG check $SCRATCH_DEV"


Why do we need to do the replays twice? Once with
_log_writes_replay_log mkfs and then again with
_log_writes_fast_replay_check fua.

There's also nothing in this test that is btrfs specific, it could be
made a generic test instead.

Thanks.

> +
> +echo "Silence is golden"
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/btrfs/333.out b/tests/btrfs/333.out
> new file mode 100644
> index 00000000..60a15898
> --- /dev/null
> +++ b/tests/btrfs/333.out
> @@ -0,0 +1,2 @@
> +QA output created by 333
> +Silence is golden
> --
> 2.44.2
>
>
Mark Harmstone Oct. 8, 2024, 2:06 p.m. UTC | #2
Thanks Filipe.

On 8/10/24 14:35, Filipe Manana wrote:
>> +_begin_fstest auto quick metadata log volume
> 
> Why the volume group? The test isn't doing any volume management stuff.
> 
> However it should be in the "recoveryloop" group.

No worries, I'll change that.

>> +_log_writes_replay_log mkfs $SCRATCH_DEV
>> +
>> +_log_writes_fast_replay_check fua "$SCRATCH_DEV" "$BTRFS_UTIL_PROG check $SCRATCH_DEV"
> 
> Why do we need to do the replays twice? Once with
> _log_writes_replay_log mkfs and then again with
> _log_writes_fast_replay_check fua.

_log_writes_replay_log mkfs to put the FS back how it was after 
mkfs.btrfs, _log_writes_fast_replay_check to replay it from there. Is 
_log_writes_replay_log redundant here?

> There's also nothing in this test that is btrfs specific, it could be
> made a generic test instead.

Yes there is, it's running btrfs check every time.

Mark
Filipe Manana Oct. 8, 2024, 2:18 p.m. UTC | #3
On Tue, Oct 8, 2024 at 3:06 PM Mark Harmstone <maharmstone@meta.com> wrote:
>
> Thanks Filipe.
>
> On 8/10/24 14:35, Filipe Manana wrote:
> >> +_begin_fstest auto quick metadata log volume
> >
> > Why the volume group? The test isn't doing any volume management stuff.
> >
> > However it should be in the "recoveryloop" group.
>
> No worries, I'll change that.
>
> >> +_log_writes_replay_log mkfs $SCRATCH_DEV
> >> +
> >> +_log_writes_fast_replay_check fua "$SCRATCH_DEV" "$BTRFS_UTIL_PROG check $SCRATCH_DEV"
> >
> > Why do we need to do the replays twice? Once with
> > _log_writes_replay_log mkfs and then again with
> > _log_writes_fast_replay_check fua.
>
> _log_writes_replay_log mkfs to put the FS back how it was after
> mkfs.btrfs, _log_writes_fast_replay_check to replay it from there. Is
> _log_writes_replay_log redundant here?

No, I missed the mkfs mark passed to _log_writes_replay.
Though it still seems redundant because _log_writes_fast_replay_check
is called for each fua mark, and after mkfs we have a fua.

>
> > There's also nothing in this test that is btrfs specific, it could be
> > made a generic test instead.
>
> Yes there is, it's running btrfs check every time.

Right, but instead of calling it explicitly, it could pass
"_check_scratch_fs" as an argument instead, and the test becomes
generic:

_log_writes_fast_replay_check fua "$SCRATCH_DEV" "_check_scratch_fs"

Thanks.

>
> Mark
Mark Harmstone Oct. 8, 2024, 2:35 p.m. UTC | #4
On 8/10/24 15:18, Filipe Manana wrote:
>>> There's also nothing in this test that is btrfs specific, it could be
>>> made a generic test instead.
>>
>> Yes there is, it's running btrfs check every time.
> 
> Right, but instead of calling it explicitly, it could pass
> "_check_scratch_fs" as an argument instead, and the test becomes
> generic:
> 
> _log_writes_fast_replay_check fua "$SCRATCH_DEV" "_check_scratch_fs"

Well, we could, but this is a test for a btrfs-specific race that 
existed between two known commits - this isn't a generic stress test. 
There's no reason to believe a) that other filesystems are vulnerable to 
this, or b) that their check programs would even pick it up.

Mark
Filipe Manana Oct. 8, 2024, 2:44 p.m. UTC | #5
On Tue, Oct 8, 2024 at 3:35 PM Mark Harmstone <maharmstone@meta.com> wrote:
>
> On 8/10/24 15:18, Filipe Manana wrote:
> >>> There's also nothing in this test that is btrfs specific, it could be
> >>> made a generic test instead.
> >>
> >> Yes there is, it's running btrfs check every time.
> >
> > Right, but instead of calling it explicitly, it could pass
> > "_check_scratch_fs" as an argument instead, and the test becomes
> > generic:
> >
> > _log_writes_fast_replay_check fua "$SCRATCH_DEV" "_check_scratch_fs"
>
> Well, we could, but this is a test for a btrfs-specific race that
> existed between two known commits - this isn't a generic stress test.
> There's no reason to believe a) that other filesystems are vulnerable to
> this, or b) that their check programs would even pick it up.

Does it mean other filesystems can never have any similar bugs in the
future? Or never had similar bugs in the past but no one wrote a test
for them.

Does it mean all check programs for all filesystem don't have and will
never have such checks?

How do you know that? No one can.

We write generic test cases when we are not exercising features
specific to a filesystem.
Similarly, we write filesystem specific test cases  when we are
exercising features unique for a specific filesystem.

If you check git history and the mailing list, you'll see tons of
examples where a generic test case was written even if a bug was
detected (so far) only on a particular filesystem - i.e. what the test
does can be run on any filesystem.

>
> Mark
>
diff mbox series

Patch

diff --git a/common/dmlogwrites b/common/dmlogwrites
index c1c85de9..f7faf244 100644
--- a/common/dmlogwrites
+++ b/common/dmlogwrites
@@ -203,3 +203,27 @@  _log_writes_replay_log_range()
 		>> $seqres.full 2>&1
 	[ $? -ne 0 ] && _fail "replay failed"
 }
+
+# Replay and check each fua/flush (specified by $2) point.
+#
+# Since dm-log-writes records bio sequentially, even just replaying a range
+# still needs to iterate all records before the end point.
+# When number of records grows, it will be unacceptably slow, thus we need
+# to use relay-log itself to trigger fsck, avoid unnecessary seek.
+_log_writes_fast_replay_check()
+{
+	local check_point=$1
+	local blkdev=$2
+	local fsck_command=$3
+	local ret
+
+	[ -z "$check_point" -o -z "$blkdev" ] && _fail \
+	"check_point and blkdev must be specified for log_writes_fast_replay_check"
+
+	$here/src/log-writes/replay-log --log $LOGWRITES_DEV \
+		--replay $blkdev --check $check_point --fsck "$fsck_command" \
+		&> $tmp.full_fsck
+	ret=$?
+	tail -n 150 $tmp.full_fsck >> $seqres.full
+	[ $ret -ne 0 ] && _fail "fsck failed during replay"
+}
diff --git a/tests/btrfs/192 b/tests/btrfs/192
index f7fb65b8..449f0459 100755
--- a/tests/btrfs/192
+++ b/tests/btrfs/192
@@ -96,30 +96,6 @@  delete_workload()
 	done
 }
 
-# Replay and check each fua/flush (specified by $2) point.
-#
-# Since dm-log-writes records bio sequentially, even just replaying a range
-# still needs to iterate all records before the end point.
-# When number of records grows, it will be unacceptably slow, thus we need
-# to use relay-log itself to trigger fsck, avoid unnecessary seek.
-log_writes_fast_replay_check()
-{
-	local check_point=$1
-	local blkdev=$2
-	local fsck_command="$BTRFS_UTIL_PROG check $blkdev"
-	local ret
-
-	[ -z "$check_point" -o -z "$blkdev" ] && _fail \
-	"check_point and blkdev must be specified for log_writes_fast_replay_check"
-
-	$here/src/log-writes/replay-log --log $LOGWRITES_DEV \
-		--replay $blkdev --check $check_point --fsck "$fsck_command" \
-		&> $tmp.full_fsck
-	ret=$?
-	tail -n 150 $tmp.full_fsck >> $seqres.full
-	[ $ret -ne 0 ] && _fail "fsck failed during replay"
-}
-
 xattr_value=$(printf '%0.sX' $(seq 1 3800))
 
 # Bumping tree height to level 2.
@@ -145,7 +121,7 @@  wait
 _log_writes_unmount
 _log_writes_remove
 
-log_writes_fast_replay_check fua "$SCRATCH_DEV"
+_log_writes_fast_replay_check fua "$SCRATCH_DEV" "$BTRFS_UTIL_PROG check $SCRATCH_DEV"
 
 echo "Silence is golden"
 
diff --git a/tests/btrfs/333 b/tests/btrfs/333
new file mode 100755
index 00000000..13f113ca
--- /dev/null
+++ b/tests/btrfs/333
@@ -0,0 +1,59 @@ 
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# FS QA Test 333
+#
+# Test async dio with fsync to test a bug where a race meant that csums weren't
+# getting written to the log tree, causing corruptions on remount. This can be
+# seen on subpage FSes on 6.4.
+#
+. ./common/preamble
+_begin_fstest auto quick metadata log volume
+
+_fixed_by_kernel_commit e917ff56c8e7 \
+	"btrfs: determine synchronous writers from bio or writeback control"
+
+fio_config=$tmp.fio
+
+. ./common/dmlogwrites
+
+_require_scratch
+_require_log_writes
+
+cat >$fio_config <<EOF
+[global]
+iodepth=128
+direct=1
+ioengine=libaio
+rw=randwrite
+runtime=1s
+[job0]
+rw=randwrite
+filename=$SCRATCH_MNT/file
+size=1g
+fdatasync=1
+EOF
+
+_require_fio $fio_config
+
+cat $fio_config >> $seqres.full
+
+_log_writes_init $SCRATCH_DEV
+_log_writes_mkfs >> $seqres.full 2>&1
+_log_writes_mark mkfs
+
+_log_writes_mount
+
+$FIO_PROG $fio_config > /dev/null 2>&1
+_log_writes_unmount
+
+_log_writes_remove
+_log_writes_replay_log mkfs $SCRATCH_DEV
+
+_log_writes_fast_replay_check fua "$SCRATCH_DEV" "$BTRFS_UTIL_PROG check $SCRATCH_DEV"
+
+echo "Silence is golden"
+
+# success, all done
+status=0
+exit
diff --git a/tests/btrfs/333.out b/tests/btrfs/333.out
new file mode 100644
index 00000000..60a15898
--- /dev/null
+++ b/tests/btrfs/333.out
@@ -0,0 +1,2 @@ 
+QA output created by 333
+Silence is golden