[02/16] xfs/422: move the fsstress/freeze/scrub racing logic to common/fuzzy

Message ID	167243837327.694541.10370212917252408651.stgit@magnolia (mailing list archive)
State	New, archived
Headers	show
Return-Path: <fstests-owner@vger.kernel.org>
Subject: [PATCH 02/16] xfs/422: move the fsstress/freeze/scrub racing logic to common/fuzzy
From: "Darrick J. Wong" <djwong@kernel.org>
To: zlang@redhat.com, djwong@kernel.org
Cc: linux-xfs@vger.kernel.org, fstests@vger.kernel.org, guan@eryu.me
Date: Fri, 30 Dec 2022 14:12:53 -0800
Message-ID: <167243837327.694541.10370212917252408651.stgit@magnolia>
In-Reply-To: <167243837296.694541.13203497631389630964.stgit@magnolia>
References: <167243837296.694541.13203497631389630964.stgit@magnolia>
User-Agent: StGit/0.19
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 7bit
Precedence: bulk
Series	fstests: refactor online fsck stress tests | expand
[PATCHSET,v24.0,00/16] fstests: refactor online fsck stress tests
[01/16] xfs/422: create a new test group for fsstress/repair racers
[02/16] xfs/422: move the fsstress/freeze/scrub racing logic to common/fuzzy
[03/16] xfs/422: rework feature detection so we only test-format scratch once
[04/16] fuzzy: clean up scrub stress programs quietly
[05/16] fuzzy: rework scrub stress output filtering
[06/16] fuzzy: explicitly check for common/inject in _require_xfs_stress_online_repair
[07/16] fuzzy: give each test local control over what scrub stress tests get run
[08/16] fuzzy: test the scrub stress subcommands before looping
[09/16] fuzzy: make scrub stress loop control more robust
[10/16] fuzzy: abort scrub stress testing if the scratch fs went down
[11/16] fuzzy: clear out the scratch filesystem if it's too full
[12/16] fuzzy: increase operation count for each fsstress invocation
[13/16] fuzzy: clean up frozen fses after scrub stress testing
[14/16] fuzzy: make freezing optional for scrub stress tests
[15/16] fuzzy: allow substitution of AG numbers when configuring scrub stress test
[16/16] fuzzy: delay the start of the scrub loop when stress-testing scrub

diff --git a/common/fuzzy b/common/fuzzy
index 70213af5db..979fa55515 100644
--- a/common/fuzzy
+++ b/common/fuzzy
@@ -316,3 +316,103 @@ _scratch_xfs_fuzz_metadata() {
 		done
 	done
 }
+
+# Functions to race fsstress, fs freeze, and xfs metadata scrubbing against
+# each other to shake out bugs in xfs online repair.
+
+# Filter freeze and thaw loop output so that we don't tarnish the golden output
+# if the kernel temporarily won't let us freeze.
+__stress_freeze_filter_output() {
+	grep -E -v '(Device or resource busy|Invalid argument)'
+}
+
+# Filter scrub output so that we don't tarnish the golden output if the fs is
+# too busy to scrub. Note: Tests should _notrun if the scrub type is not
+# supported.
+__stress_scrub_filter_output() {
+	grep -E -v '(Device or resource busy|Invalid argument)'
+}
+
+# Run fs freeze and thaw in a tight loop.
+__stress_scrub_freeze_loop() {
+	local end="$1"
+
+	while [ "$(date +%s)" -lt $end ]; do
+		$XFS_IO_PROG -x -c 'freeze' -c 'thaw' $SCRATCH_MNT 2>&1 | \
+			__stress_freeze_filter_output
+	done
+}
+
+# Run xfs online fsck commands in a tight loop.
+__stress_scrub_loop() {
+	local end="$1"
+
+	while [ "$(date +%s)" -lt $end ]; do
+		$XFS_IO_PROG -x -c 'repair rmapbt 0' -c 'repair rmapbt 1' $SCRATCH_MNT 2>&1 | \
+			__stress_scrub_filter_output
+	done
+}
+
+# Run fsstress while we're testing online fsck.
+__stress_scrub_fsstress_loop() {
+	local end="$1"
+
+	local args=$(_scale_fsstress_args -p 4 -d $SCRATCH_MNT -n 2000 $FSSTRESS_AVOID)
+
+	while [ "$(date +%s)" -lt $end ]; do
+		$FSSTRESS_PROG $args >> $seqres.full
+	done
+}
+
+# Make sure we have everything we need to run stress and scrub
+_require_xfs_stress_scrub() {
+	_require_xfs_io_command "scrub"
+	_require_command "$KILLALL_PROG" killall
+	_require_freeze
+}
+
+# Make sure we have everything we need to run stress and online repair
+_require_xfs_stress_online_repair() {
+	_require_xfs_stress_scrub
+	_require_xfs_io_command "repair"
+	_require_xfs_io_error_injection "force_repair"
+	_require_freeze
+}
+
+# Clean up after the loops in case they didn't do it themselves.
+_scratch_xfs_stress_scrub_cleanup() {
+	$KILLALL_PROG -TERM xfs_io fsstress >> $seqres.full 2>&1
+	$XFS_IO_PROG -x -c 'thaw' $SCRATCH_MNT >> $seqres.full 2>&1
+}
+
+# Start scrub, freeze, and fsstress in background looping processes, and wait
+# for 30*TIME_FACTOR seconds to see if the filesystem goes down. Callers
+# must call _scratch_xfs_stress_scrub_cleanup from their cleanup functions.
+_scratch_xfs_stress_scrub() {
+	local start="$(date +%s)"
+	local end="$((start + (30 * TIME_FACTOR) ))"
+
+	echo "Loop started at $(date --date="@${start}")," \
+		   "ending at $(date --date="@${end}")" >> $seqres.full
+
+	__stress_scrub_fsstress_loop $end &
+	__stress_scrub_freeze_loop $end &
+	__stress_scrub_loop $end &
+
+	# Wait until 2 seconds after the loops should have finished, then
+	# clean up after ourselves.
+	while [ "$(date +%s)" -lt $((end + 2)) ]; do
+		sleep 1
+	done
+	_scratch_xfs_stress_scrub_cleanup
+
+	echo "Loop finished at $(date)" >> $seqres.full
+}
+
+# Start online repair, freeze, and fsstress in background looping processes,
+# and wait for 30*TIME_FACTOR seconds to see if the filesystem goes down.
+# Same requirements and arguments as _scratch_xfs_stress_scrub.
+_scratch_xfs_stress_online_repair() {
+	$XFS_IO_PROG -x -c 'inject force_repair' $SCRATCH_MNT
+	_scratch_xfs_stress_scrub "$@"
+}
diff --git a/tests/xfs/422 b/tests/xfs/422
index 9ed944ed63..0bf08572f3 100755
--- a/tests/xfs/422
+++ b/tests/xfs/422
@@ -4,40 +4,19 @@
 #
 # FS QA Test No. 422
 #
-# Race freeze and rmapbt repair for a while to see if we crash or livelock.
+# Race fsstress and rmapbt repair for a while to see if we crash or livelock.
 # rmapbt repair requires us to freeze the filesystem to stop all filesystem
 # activity, so we can't have userspace wandering in and thawing it.
 #
 . ./common/preamble
 _begin_fstest online_repair dangerous_fsstress_repair freeze
 
-_register_cleanup "_cleanup" BUS
-
-# First kill and wait the freeze loop so it won't try to freeze fs again
-# Then make sure fs is not frozen
-# Then kill and wait for the rest of the workers
-# Because if fs is frozen a killed writer will never exit
-kill_loops() {
-	local sig=$1
-
-	[ -n "$freeze_pid" ] && kill $sig $freeze_pid
-	wait $freeze_pid
-	unset freeze_pid
-	$XFS_IO_PROG -x -c 'thaw' $SCRATCH_MNT
-	[ -n "$stress_pid" ] && kill $sig $stress_pid
-	[ -n "$repair_pid" ] && kill $sig $repair_pid
-	wait
-	unset stress_pid
-	unset repair_pid
-}
-
-# Override the default cleanup function.
-_cleanup()
-{
-	kill_loops -9 > /dev/null 2>&1
+_cleanup() {
+	_scratch_xfs_stress_scrub_cleanup &> /dev/null
 	cd /
-	rm -rf $tmp.*
+	rm -r -f $tmp.*
 }
+_register_cleanup "_cleanup" BUS
 
 # Import common functions.
 . ./common/filter
@@ -47,80 +26,13 @@ _cleanup()
 # real QA test starts here
 _supported_fs xfs
 _require_xfs_scratch_rmapbt
-_require_xfs_io_command "scrub"
-_require_xfs_io_error_injection "force_repair"
-_require_command "$KILLALL_PROG" killall
-_require_freeze
+_require_xfs_stress_online_repair
 
-echo "Format and populate"
 _scratch_mkfs > "$seqres.full" 2>&1
 _scratch_mount
-
-STRESS_DIR="$SCRATCH_MNT/testdir"
-mkdir -p $STRESS_DIR
-
-for i in $(seq 0 9); do
-	mkdir -p $STRESS_DIR/$i
-	for j in $(seq 0 9); do
-		mkdir -p $STRESS_DIR/$i/$j
-		for k in $(seq 0 9); do
-			echo x > $STRESS_DIR/$i/$j/$k
-		done
-	done
-done
-
-cpus=$(( $($here/src/feature -o) * 4 * LOAD_FACTOR))
-
-echo "Concurrent repair"
-filter_output() {
-	grep -E -v '(Device or resource busy|Invalid argument)'
-}
-freeze_loop() {
-	end="$1"
-
-	while [ "$(date +%s)" -lt $end ]; do
-		$XFS_IO_PROG -x -c 'freeze' -c 'thaw' $SCRATCH_MNT 2>&1 | filter_output
-	done
-}
-repair_loop() {
-	end="$1"
-
-	while [ "$(date +%s)" -lt $end ]; do
-		$XFS_IO_PROG -x -c 'repair rmapbt 0' -c 'repair rmapbt 1' $SCRATCH_MNT 2>&1 | filter_output
-	done
-}
-stress_loop() {
-	end="$1"
-
-	FSSTRESS_ARGS=$(_scale_fsstress_args -p 4 -d $SCRATCH_MNT -n 2000 $FSSTRESS_AVOID)
-	while [ "$(date +%s)" -lt $end ]; do
-		$FSSTRESS_PROG $FSSTRESS_ARGS >> $seqres.full
-	done
-}
-$XFS_IO_PROG -x -c 'inject force_repair' $SCRATCH_MNT
-
-start=$(date +%s)
-end=$((start + (30 * TIME_FACTOR) ))
-
-echo "Loop started at $(date --date="@${start}"), ending at $(date --date="@${end}")" >> $seqres.full
-stress_loop $end &
-stress_pid=$!
-freeze_loop $end &
-freeze_pid=$!
-repair_loop $end &
-repair_pid=$!
-
-# Wait until 2 seconds after the loops should have finished...
-while [ "$(date +%s)" -lt $((end + 2)) ]; do
-	sleep 1
-done
-
-# ...and clean up after the loops in case they didn't do it themselves.
-kill_loops >> $seqres.full 2>&1
-
-echo "Loop finished at $(date)" >> $seqres.full
-echo "Test done"
+_scratch_xfs_stress_online_repair
 
 # success, all done
+echo Silence is golden
 status=0
 exit
diff --git a/tests/xfs/422.out b/tests/xfs/422.out
index 3818c48fa8..f70693fde6 100644
--- a/tests/xfs/422.out
+++ b/tests/xfs/422.out
@@ -1,4 +1,2 @@
 QA output created by 422
-Format and populate
-Concurrent repair
-Test done
+Silence is golden

[02/16] xfs/422: move the fsstress/freeze/scrub racing logic to common/fuzzy

Commit Message

Patch