diff mbox series

[2/4] fstests: make sure to unfreeze test and scratch mounts

Message ID 20220619134657.1846292-3-amir73il@gmail.com (mailing list archive)
State New, archived
Headers show
Series aborted fstests may leave frozen fs behind | expand

Commit Message

Amir Goldstein June 19, 2022, 1:46 p.m. UTC
Almost all of the tests that _require_freeze() fail to unfreeze
scratch mount in case the test is interrupted while fs is frozen.

Move the handling of unfreeze to generic check code.
For now, tests only freeze scratch fs, but to be more robust, unfreeze
both test and scratch fs following a call to _require_freeze().

Tests could still hang if thier private _cleanup() routine tries
to modify the frozen fs or wait for a blocked process. Fix the
_cleanup() routine of xfs/011 to avoid that.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 check             | 14 ++++++++------
 common/rc         |  5 +++--
 tests/generic/390 |  2 --
 tests/xfs/011     |  2 --
 tests/xfs/517     |  1 -
 5 files changed, 11 insertions(+), 13 deletions(-)

Comments

Zorro Lang June 20, 2022, 11:17 a.m. UTC | #1
On Sun, Jun 19, 2022 at 04:46:55PM +0300, Amir Goldstein wrote:
> Almost all of the tests that _require_freeze() fail to unfreeze
> scratch mount in case the test is interrupted while fs is frozen.
> 
> Move the handling of unfreeze to generic check code.
> For now, tests only freeze scratch fs, but to be more robust, unfreeze
> both test and scratch fs following a call to _require_freeze().
> 
> Tests could still hang if thier private _cleanup() routine tries
> to modify the frozen fs or wait for a blocked process. Fix the
> _cleanup() routine of xfs/011 to avoid that.
> 
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> ---
>  check             | 14 ++++++++------
>  common/rc         |  5 +++--
>  tests/generic/390 |  2 --
>  tests/xfs/011     |  2 --
>  tests/xfs/517     |  1 -
>  5 files changed, 11 insertions(+), 13 deletions(-)
> 
> diff --git a/check b/check
> index de11b37e..d6ee71aa 100755
> --- a/check
> +++ b/check
> @@ -527,17 +527,21 @@ _check_filesystems()
>  {
>  	local ret=0
>  
> +	# Make sure both test and scratch are unfrozen post _require_freeze()
> +	if [ -f ${RESULT_DIR}/require_freeze ]; then
> +		xfs_freeze -u "$TEST_DIR" >/dev/null 2>&1
> +		xfs_freeze -u "$SCRATCH_MNT" >/dev/null 2>&1
> +	fi

Hi Amir,

I'm wondering why not let each freeze related test cases take care of "unfreeze"
by itself? If a case freeze a filesystem which isn't on TEST_DEV or SCRATCH_DEV
(e.g. a loop device base on a file in $TEST_DIR), or other situations, then the
case still can call _require_freeze, but "unfreeze TEST_DIR or SCRATCH_MNT" can't
help.

Thanks,
Zorro

>  	if [ -f ${RESULT_DIR}/require_test ]; then
>  		_check_test_fs || ret=1
> -		rm -f ${RESULT_DIR}/require_test*
>  	else
>  		_test_unmount 2> /dev/null
>  	fi
>  	if [ -f ${RESULT_DIR}/require_scratch ]; then
>  		_check_scratch_fs || ret=1
> -		rm -f ${RESULT_DIR}/require_scratch*
>  	fi
>  	_scratch_unmount 2> /dev/null
> +	rm -f ${RESULT_DIR}/require_*
>  	return $ret
>  }
>  
> @@ -783,8 +787,7 @@ function run_section()
>  		seqres="$REPORT_DIR/$seqnum"
>  
>  		mkdir -p $RESULT_DIR
> -		rm -f ${RESULT_DIR}/require_scratch*
> -		rm -f ${RESULT_DIR}/require_test*
> +		rm -f ${RESULT_DIR}/require_*
>  		echo -n "$seqnum"
>  
>  		if $showme; then
> @@ -882,8 +885,7 @@ function run_section()
>  			_dump_err_cont "[failed, exit status $sts]"
>  			_test_unmount 2> /dev/null
>  			_scratch_unmount 2> /dev/null
> -			rm -f ${RESULT_DIR}/require_test*
> -			rm -f ${RESULT_DIR}/require_scratch*
> +			rm -f ${RESULT_DIR}/require_*
>  			err=true
>  		else
>  			# The test apparently passed, so check for corruption
> diff --git a/common/rc b/common/rc
> index 3c072c16..b87dfe05 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -1540,8 +1540,7 @@ _notrun()
>  {
>      echo "$*" > $seqres.notrun
>      echo "$seq not run: $*"
> -    rm -f ${RESULT_DIR}/require_test*
> -    rm -f ${RESULT_DIR}/require_scratch*
> +    rm -f ${RESULT_DIR}/require_*
>  
>      status=0
>      exit
> @@ -3648,6 +3647,8 @@ _require_freeze()
>  	local result=$?
>  	xfs_freeze -u "$TEST_DIR" >/dev/null 2>&1
>  	[ $result -eq 0 ] || _notrun "$FSTYP does not support freezing"
> +	# Make sure both test and scratch are unfrozen at the end of the test
> +	touch ${RESULT_DIR}/require_freeze
>  }
>  
>  # Does NFS export work on this fs?
> diff --git a/tests/generic/390 b/tests/generic/390
> index 20c66e22..0f2b86fa 100755
> --- a/tests/generic/390
> +++ b/tests/generic/390
> @@ -14,8 +14,6 @@ _begin_fstest auto freeze stress
>  _cleanup()
>  {
>  	cd /
> -	# Make sure $SCRATCH_MNT is unfreezed
> -	xfs_freeze -u $SCRATCH_MNT 2>/dev/null
>  	rm -f $tmp.*
>  }
>  
> diff --git a/tests/xfs/011 b/tests/xfs/011
> index d6e9099e..351a574e 100755
> --- a/tests/xfs/011
> +++ b/tests/xfs/011
> @@ -17,9 +17,7 @@ _begin_fstest auto freeze log metadata quick
>  _cleanup()
>  {
>  	$KILLALL_PROG -9 fsstress 2>/dev/null
> -	wait
>  	cd /
> -	_scratch_unmount 2>/dev/null
>  	rm -f $tmp.*
>  }
>  
> diff --git a/tests/xfs/517 b/tests/xfs/517
> index f7f9a8a2..961668e3 100755
> --- a/tests/xfs/517
> +++ b/tests/xfs/517
> @@ -15,7 +15,6 @@ _register_cleanup "_cleanup" BUS
>  _cleanup()
>  {
>  	cd /
> -	$XFS_IO_PROG -x -c 'thaw' $SCRATCH_MNT > /dev/null 2>&1
>  	rm -rf $tmp.*
>  }
>  
> -- 
> 2.25.1
>
Amir Goldstein June 20, 2022, 12:13 p.m. UTC | #2
On Mon, Jun 20, 2022 at 2:17 PM Zorro Lang <zlang@redhat.com> wrote:
>
> On Sun, Jun 19, 2022 at 04:46:55PM +0300, Amir Goldstein wrote:
> > Almost all of the tests that _require_freeze() fail to unfreeze
> > scratch mount in case the test is interrupted while fs is frozen.
> >
> > Move the handling of unfreeze to generic check code.
> > For now, tests only freeze scratch fs, but to be more robust, unfreeze
> > both test and scratch fs following a call to _require_freeze().
> >
> > Tests could still hang if thier private _cleanup() routine tries
> > to modify the frozen fs or wait for a blocked process. Fix the
> > _cleanup() routine of xfs/011 to avoid that.
> >
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > ---
> >  check             | 14 ++++++++------
> >  common/rc         |  5 +++--
> >  tests/generic/390 |  2 --
> >  tests/xfs/011     |  2 --
> >  tests/xfs/517     |  1 -
> >  5 files changed, 11 insertions(+), 13 deletions(-)
> >
> > diff --git a/check b/check
> > index de11b37e..d6ee71aa 100755
> > --- a/check
> > +++ b/check
> > @@ -527,17 +527,21 @@ _check_filesystems()
> >  {
> >       local ret=0
> >
> > +     # Make sure both test and scratch are unfrozen post _require_freeze()
> > +     if [ -f ${RESULT_DIR}/require_freeze ]; then
> > +             xfs_freeze -u "$TEST_DIR" >/dev/null 2>&1
> > +             xfs_freeze -u "$SCRATCH_MNT" >/dev/null 2>&1
> > +     fi
>
> Hi Amir,
>
> I'm wondering why not let each freeze related test cases take care of "unfreeze"
> by itself? If a case freeze a filesystem which isn't on TEST_DEV or SCRATCH_DEV
> (e.g. a loop device base on a file in $TEST_DIR), or other situations, then the
> case still can call _require_freeze, but "unfreeze TEST_DIR or SCRATCH_MNT" can't
> help.

That's also an option, but:
1. It is less robust, leaving room for human mistakes. Why is that better?
2. Leftover frozen fs is quite harmful to subsequent test runs, so it
is important
    to avoid it
3. Tests that freeze loop/dm (generic/459, xfs/428, [*]) can and should cleanup
    themselves. I will add that too, but in any case...
4. unfreeze of tests and scratch fs is harmless even when it is not needed -
    we may even want to do that at the start of tests run(?)

[*] I noticed that generic/085 which _require_freeze() does not even
    use xfs_freeze (intentionally) - it uses dmsetup suspend/resume,
    so I guess _require_freeze() should be removed from that test.

Anyway, if you and others insist against this auto-unfreeze approach,
I will post the patch to unfreeze fs in individual tests, but I won't be
happy about it.

Thanks,
Amir.
Zorro Lang June 20, 2022, 2:21 p.m. UTC | #3
On Mon, Jun 20, 2022 at 03:13:33PM +0300, Amir Goldstein wrote:
> On Mon, Jun 20, 2022 at 2:17 PM Zorro Lang <zlang@redhat.com> wrote:
> >
> > On Sun, Jun 19, 2022 at 04:46:55PM +0300, Amir Goldstein wrote:
> > > Almost all of the tests that _require_freeze() fail to unfreeze
> > > scratch mount in case the test is interrupted while fs is frozen.
> > >
> > > Move the handling of unfreeze to generic check code.
> > > For now, tests only freeze scratch fs, but to be more robust, unfreeze
> > > both test and scratch fs following a call to _require_freeze().
> > >
> > > Tests could still hang if thier private _cleanup() routine tries
> > > to modify the frozen fs or wait for a blocked process. Fix the
> > > _cleanup() routine of xfs/011 to avoid that.
> > >
> > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > > ---
> > >  check             | 14 ++++++++------
> > >  common/rc         |  5 +++--
> > >  tests/generic/390 |  2 --
> > >  tests/xfs/011     |  2 --
> > >  tests/xfs/517     |  1 -
> > >  5 files changed, 11 insertions(+), 13 deletions(-)
> > >
> > > diff --git a/check b/check
> > > index de11b37e..d6ee71aa 100755
> > > --- a/check
> > > +++ b/check
> > > @@ -527,17 +527,21 @@ _check_filesystems()
> > >  {
> > >       local ret=0
> > >
> > > +     # Make sure both test and scratch are unfrozen post _require_freeze()
> > > +     if [ -f ${RESULT_DIR}/require_freeze ]; then
> > > +             xfs_freeze -u "$TEST_DIR" >/dev/null 2>&1
> > > +             xfs_freeze -u "$SCRATCH_MNT" >/dev/null 2>&1
> > > +     fi
> >
> > Hi Amir,
> >
> > I'm wondering why not let each freeze related test cases take care of "unfreeze"
> > by itself? If a case freeze a filesystem which isn't on TEST_DEV or SCRATCH_DEV
> > (e.g. a loop device base on a file in $TEST_DIR), or other situations, then the
> > case still can call _require_freeze, but "unfreeze TEST_DIR or SCRATCH_MNT" can't
> > help.
> 
> That's also an option, but:
> 1. It is less robust, leaving room for human mistakes. Why is that better?
> 2. Leftover frozen fs is quite harmful to subsequent test runs, so it
> is important
>     to avoid it
> 3. Tests that freeze loop/dm (generic/459, xfs/428, [*]) can and should cleanup
>     themselves. I will add that too, but in any case...
> 4. unfreeze of tests and scratch fs is harmless even when it is not needed -
>     we may even want to do that at the start of tests run(?)

I think Dave doesn't like to add common steps to thousands of cases, if without
a critical reason. So I hope to get more review points for this kind of changes.

> 
> [*] I noticed that generic/085 which _require_freeze() does not even
>     use xfs_freeze (intentionally) - it uses dmsetup suspend/resume,
>     so I guess _require_freeze() should be removed from that test.

Agree, I think dm suspend through different userspace and kernel interface with
fsfreeze, so that require doesn't make sense.

> 
> Anyway, if you and others insist against this auto-unfreeze approach,
> I will post the patch to unfreeze fs in individual tests, but I won't be
> happy about it.

From the functional side, I think this change makes sense. But if think about
the performance, let's get more opinions at first. If there's not objection,
we can have it.

Thanks,
Zorro

> 
> Thanks,
> Amir.
>
Amir Goldstein June 20, 2022, 3:08 p.m. UTC | #4
On Mon, Jun 20, 2022 at 5:21 PM Zorro Lang <zlang@redhat.com> wrote:
>
> On Mon, Jun 20, 2022 at 03:13:33PM +0300, Amir Goldstein wrote:
> > On Mon, Jun 20, 2022 at 2:17 PM Zorro Lang <zlang@redhat.com> wrote:
> > >
> > > On Sun, Jun 19, 2022 at 04:46:55PM +0300, Amir Goldstein wrote:
> > > > Almost all of the tests that _require_freeze() fail to unfreeze
> > > > scratch mount in case the test is interrupted while fs is frozen.
> > > >
> > > > Move the handling of unfreeze to generic check code.
> > > > For now, tests only freeze scratch fs, but to be more robust, unfreeze
> > > > both test and scratch fs following a call to _require_freeze().
> > > >
> > > > Tests could still hang if thier private _cleanup() routine tries
> > > > to modify the frozen fs or wait for a blocked process. Fix the
> > > > _cleanup() routine of xfs/011 to avoid that.
> > > >
> > > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > > > ---
> > > >  check             | 14 ++++++++------
> > > >  common/rc         |  5 +++--
> > > >  tests/generic/390 |  2 --
> > > >  tests/xfs/011     |  2 --
> > > >  tests/xfs/517     |  1 -
> > > >  5 files changed, 11 insertions(+), 13 deletions(-)
> > > >
> > > > diff --git a/check b/check
> > > > index de11b37e..d6ee71aa 100755
> > > > --- a/check
> > > > +++ b/check
> > > > @@ -527,17 +527,21 @@ _check_filesystems()
> > > >  {
> > > >       local ret=0
> > > >
> > > > +     # Make sure both test and scratch are unfrozen post _require_freeze()
> > > > +     if [ -f ${RESULT_DIR}/require_freeze ]; then
> > > > +             xfs_freeze -u "$TEST_DIR" >/dev/null 2>&1
> > > > +             xfs_freeze -u "$SCRATCH_MNT" >/dev/null 2>&1
> > > > +     fi
> > >
> > > Hi Amir,
> > >
> > > I'm wondering why not let each freeze related test cases take care of "unfreeze"
> > > by itself? If a case freeze a filesystem which isn't on TEST_DEV or SCRATCH_DEV
> > > (e.g. a loop device base on a file in $TEST_DIR), or other situations, then the
> > > case still can call _require_freeze, but "unfreeze TEST_DIR or SCRATCH_MNT" can't
> > > help.
> >
> > That's also an option, but:
> > 1. It is less robust, leaving room for human mistakes. Why is that better?
> > 2. Leftover frozen fs is quite harmful to subsequent test runs, so it
> > is important
> >     to avoid it
> > 3. Tests that freeze loop/dm (generic/459, xfs/428, [*]) can and should cleanup
> >     themselves. I will add that too, but in any case...
> > 4. unfreeze of tests and scratch fs is harmless even when it is not needed -
> >     we may even want to do that at the start of tests run(?)
>
> I think Dave doesn't like to add common steps to thousands of cases, if without
> a critical reason. So I hope to get more review points for this kind of changes.
>
> >
> > [*] I noticed that generic/085 which _require_freeze() does not even
> >     use xfs_freeze (intentionally) - it uses dmsetup suspend/resume,
> >     so I guess _require_freeze() should be removed from that test.
>
> Agree, I think dm suspend through different userspace and kernel interface with
> fsfreeze, so that require doesn't make sense.
>
> >
> > Anyway, if you and others insist against this auto-unfreeze approach,
> > I will post the patch to unfreeze fs in individual tests, but I won't be
> > happy about it.
>
> From the functional side, I think this change makes sense. But if think about
> the performance, let's get more opinions at first. If there's not objection,
> we can have it.
>

Maybe the change was not clear.

Only the tests that declare _require_freeze() in the beginning of the test
(14 tests) are going to be affected by this change.
The rest of the tests have no impact at all.

Thanks,
Amir.
Dave Chinner June 20, 2022, 10:06 p.m. UTC | #5
On Sun, Jun 19, 2022 at 04:46:55PM +0300, Amir Goldstein wrote:
> Almost all of the tests that _require_freeze() fail to unfreeze
> scratch mount in case the test is interrupted while fs is frozen.
> 
> Move the handling of unfreeze to generic check code.
> For now, tests only freeze scratch fs, but to be more robust, unfreeze
> both test and scratch fs following a call to _require_freeze().
> 
> Tests could still hang if thier private _cleanup() routine tries
> to modify the frozen fs or wait for a blocked process. Fix the
> _cleanup() routine of xfs/011 to avoid that.
> 
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> ---
>  check             | 14 ++++++++------
>  common/rc         |  5 +++--
>  tests/generic/390 |  2 --
>  tests/xfs/011     |  2 --
>  tests/xfs/517     |  1 -
>  5 files changed, 11 insertions(+), 13 deletions(-)
> 
> diff --git a/check b/check
> index de11b37e..d6ee71aa 100755
> --- a/check
> +++ b/check
> @@ -527,17 +527,21 @@ _check_filesystems()
>  {
>  	local ret=0
>  
> +	# Make sure both test and scratch are unfrozen post _require_freeze()
> +	if [ -f ${RESULT_DIR}/require_freeze ]; then
> +		xfs_freeze -u "$TEST_DIR" >/dev/null 2>&1
> +		xfs_freeze -u "$SCRATCH_MNT" >/dev/null 2>&1
> +	fi

A test leaving a filesystem frozen on exit is a test bug. There can
still be background test processes sitting blocked on a frozen
filesystem when the test exits with a frozen filesystem, and that
has the potential to cause problems in the next few operations
because of "busy filesystem" errors trying to unmount the fs...

IOWs, think this is the wrong way to address this problem. tests
that freeze filesystems need to ensure that everything is cleaned up
properly in the test _cleanup() function where the right thing can
be done and blocked processes can be waited on once the fs has been
thawed.

> diff --git a/tests/generic/390 b/tests/generic/390
> index 20c66e22..0f2b86fa 100755
> --- a/tests/generic/390
> +++ b/tests/generic/390
> @@ -14,8 +14,6 @@ _begin_fstest auto freeze stress
>  _cleanup()
>  {
>  	cd /
> -	# Make sure $SCRATCH_MNT is unfreezed
> -	xfs_freeze -u $SCRATCH_MNT 2>/dev/null
>  	rm -f $tmp.*
>  }

This test is already doing the right thing.

> diff --git a/tests/xfs/011 b/tests/xfs/011
> index d6e9099e..351a574e 100755
> --- a/tests/xfs/011
> +++ b/tests/xfs/011
> @@ -17,9 +17,7 @@ _begin_fstest auto freeze log metadata quick
>  _cleanup()
>  {
>  	$KILLALL_PROG -9 fsstress 2>/dev/null
> -	wait
>  	cd /
> -	_scratch_unmount 2>/dev/null
>  	rm -f $tmp.*
>  }

This is wrong. We have to wait for background fsstress processes to
exit, otherwise unmount can fail randomly. What it is missing is the
thaw before killing the fsstress processes and waiting for them to
complete.

> diff --git a/tests/xfs/517 b/tests/xfs/517
> index f7f9a8a2..961668e3 100755
> --- a/tests/xfs/517
> +++ b/tests/xfs/517
> @@ -15,7 +15,6 @@ _register_cleanup "_cleanup" BUS
>  _cleanup()
>  {
>  	cd /
> -	$XFS_IO_PROG -x -c 'thaw' $SCRATCH_MNT > /dev/null 2>&1
>  	rm -rf $tmp.*
>  }

This is doing the right thing, too.

Cheers,

Dave.
Dave Chinner June 20, 2022, 10:34 p.m. UTC | #6
On Mon, Jun 20, 2022 at 10:21:39PM +0800, Zorro Lang wrote:
> On Mon, Jun 20, 2022 at 03:13:33PM +0300, Amir Goldstein wrote:
> > On Mon, Jun 20, 2022 at 2:17 PM Zorro Lang <zlang@redhat.com> wrote:
> > >
> > > On Sun, Jun 19, 2022 at 04:46:55PM +0300, Amir Goldstein wrote:
> > > > Almost all of the tests that _require_freeze() fail to unfreeze
> > > > scratch mount in case the test is interrupted while fs is frozen.
> > > >
> > > > Move the handling of unfreeze to generic check code.
> > > > For now, tests only freeze scratch fs, but to be more robust, unfreeze
> > > > both test and scratch fs following a call to _require_freeze().
> > > >
> > > > Tests could still hang if thier private _cleanup() routine tries
> > > > to modify the frozen fs or wait for a blocked process. Fix the
> > > > _cleanup() routine of xfs/011 to avoid that.
> > > >
> > > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > > > ---
> > > >  check             | 14 ++++++++------
> > > >  common/rc         |  5 +++--
> > > >  tests/generic/390 |  2 --
> > > >  tests/xfs/011     |  2 --
> > > >  tests/xfs/517     |  1 -
> > > >  5 files changed, 11 insertions(+), 13 deletions(-)
> > > >
> > > > diff --git a/check b/check
> > > > index de11b37e..d6ee71aa 100755
> > > > --- a/check
> > > > +++ b/check
> > > > @@ -527,17 +527,21 @@ _check_filesystems()
> > > >  {
> > > >       local ret=0
> > > >
> > > > +     # Make sure both test and scratch are unfrozen post _require_freeze()
> > > > +     if [ -f ${RESULT_DIR}/require_freeze ]; then
> > > > +             xfs_freeze -u "$TEST_DIR" >/dev/null 2>&1
> > > > +             xfs_freeze -u "$SCRATCH_MNT" >/dev/null 2>&1
> > > > +     fi
> > >
> > > Hi Amir,
> > >
> > > I'm wondering why not let each freeze related test cases take care of "unfreeze"
> > > by itself? If a case freeze a filesystem which isn't on TEST_DEV or SCRATCH_DEV
> > > (e.g. a loop device base on a file in $TEST_DIR), or other situations, then the
> > > case still can call _require_freeze, but "unfreeze TEST_DIR or SCRATCH_MNT" can't
> > > help.
> > 
> > That's also an option, but:
> > 1. It is less robust, leaving room for human mistakes. Why is that better?

It's not, but I'm addressing exactly this problem with the common
_cleanup() infrastructure that I'm working on. Several of the issues
that you are trying to address here are already dealt with in that
series, and it doesn't add extra overhead to the test
infrastructure for tests that don't use freeze/thaw.

> > 2. Leftover frozen fs is quite harmful to subsequent test runs, so it
> > is important
> >     to avoid it

Well, yes. It is a test bug, and the test shouldn't have bugs.

> > 3. Tests that freeze loop/dm (generic/459, xfs/428, [*]) can and should cleanup
> >     themselves. I will add that too, but in any case...

Except now we have two methods of thawing the filesystem depending
on where it is mounted and what device is in use. That's not an
improvement.

> > 4. unfreeze of tests and scratch fs is harmless even when it is not needed -
> >     we may even want to do that at the start of tests run(?)
> 
> I think Dave doesn't like to add common steps to thousands of cases, if without
> a critical reason. So I hope to get more review points for this kind of changes.

Doing work that in every test execution that is only actually
necessary if a test is interrupted to work around a potential bug in
1 or 2 tests is really poor design and implementation. Just fix the
test bugs, don't burden everything else with the additional overhead
of checking whether the test might have a bug in it or not...

> > [*] I noticed that generic/085 which _require_freeze() does not even
> >     use xfs_freeze (intentionally) - it uses dmsetup suspend/resume,
> >     so I guess _require_freeze() should be removed from that test.
> 
> Agree, I think dm suspend through different userspace and kernel interface with
> fsfreeze, so that require doesn't make sense.

dmsuspend uses the internal kernel freeze API to freeze the
filesytsems. Failures in those tests can leave filesytsems frozen,
too, but these days we cannot thaw them with fs level freeze....

> > Anyway, if you and others insist against this auto-unfreeze approach,
> > I will post the patch to unfreeze fs in individual tests, but I won't be
> > happy about it.
> 
> From the functional side, I think this change makes sense. But if think about
> the performance, let's get more opinions at first. If there's not objection,
> we can have it.

I think it's wrong from a functional side, too. The test harness is
not responsible for individual test state cleanup - the tests
themselves are responsible for that. The whole point of the
_cleanup() consolidation I've started doing is that it will
centralise all this common cleanup functionality and allow us to end
up connecting it to _requires rules that indicate what functionality
the test uses.  i.e. the end goal is that most tests do not need any
cleanup - everything that needs cleaning up is defined by the
_requires rules the test runs first and nothing else is needed.

Only when tests do custom stuff will test specific cleanup functions
be needed, just like they are required to do now to clean up after
themselves. And that means, right now, that tests that freeze the
filesystem still need to do the right thing in the _cleanup()
functions up until the consolidation of cleanup means they don't
need it anymore....

AFAIC, you can go move all this stuff to check if you really
want, but I'm just going to rip it all back out in short order
because it just doesn't fit the cleanup model I'm trying to move the
infrastructure towards.

Cheers,

Dave.
Amir Goldstein June 21, 2022, 2:53 a.m. UTC | #7
On Tue, Jun 21, 2022 at 1:34 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Mon, Jun 20, 2022 at 10:21:39PM +0800, Zorro Lang wrote:
> > On Mon, Jun 20, 2022 at 03:13:33PM +0300, Amir Goldstein wrote:
> > > On Mon, Jun 20, 2022 at 2:17 PM Zorro Lang <zlang@redhat.com> wrote:
> > > >
> > > > On Sun, Jun 19, 2022 at 04:46:55PM +0300, Amir Goldstein wrote:
> > > > > Almost all of the tests that _require_freeze() fail to unfreeze
> > > > > scratch mount in case the test is interrupted while fs is frozen.
> > > > >
> > > > > Move the handling of unfreeze to generic check code.
> > > > > For now, tests only freeze scratch fs, but to be more robust, unfreeze
> > > > > both test and scratch fs following a call to _require_freeze().
> > > > >
> > > > > Tests could still hang if thier private _cleanup() routine tries
> > > > > to modify the frozen fs or wait for a blocked process. Fix the
> > > > > _cleanup() routine of xfs/011 to avoid that.
> > > > >
> > > > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > > > > ---
> > > > >  check             | 14 ++++++++------
> > > > >  common/rc         |  5 +++--
> > > > >  tests/generic/390 |  2 --
> > > > >  tests/xfs/011     |  2 --
> > > > >  tests/xfs/517     |  1 -
> > > > >  5 files changed, 11 insertions(+), 13 deletions(-)
> > > > >
> > > > > diff --git a/check b/check
> > > > > index de11b37e..d6ee71aa 100755
> > > > > --- a/check
> > > > > +++ b/check
> > > > > @@ -527,17 +527,21 @@ _check_filesystems()
> > > > >  {
> > > > >       local ret=0
> > > > >
> > > > > +     # Make sure both test and scratch are unfrozen post _require_freeze()
> > > > > +     if [ -f ${RESULT_DIR}/require_freeze ]; then
> > > > > +             xfs_freeze -u "$TEST_DIR" >/dev/null 2>&1
> > > > > +             xfs_freeze -u "$SCRATCH_MNT" >/dev/null 2>&1
> > > > > +     fi
> > > >
> > > > Hi Amir,
> > > >
> > > > I'm wondering why not let each freeze related test cases take care of "unfreeze"
> > > > by itself? If a case freeze a filesystem which isn't on TEST_DEV or SCRATCH_DEV
> > > > (e.g. a loop device base on a file in $TEST_DIR), or other situations, then the
> > > > case still can call _require_freeze, but "unfreeze TEST_DIR or SCRATCH_MNT" can't
> > > > help.
> > >
> > > That's also an option, but:
> > > 1. It is less robust, leaving room for human mistakes. Why is that better?
>
> It's not, but I'm addressing exactly this problem with the common
> _cleanup() infrastructure that I'm working on. Several of the issues
> that you are trying to address here are already dealt with in that
> series, and it doesn't add extra overhead to the test
> infrastructure for tests that don't use freeze/thaw.
>
> > > 2. Leftover frozen fs is quite harmful to subsequent test runs, so it
> > > is important
> > >     to avoid it
>
> Well, yes. It is a test bug, and the test shouldn't have bugs.
>
> > > 3. Tests that freeze loop/dm (generic/459, xfs/428, [*]) can and should cleanup
> > >     themselves. I will add that too, but in any case...
>
> Except now we have two methods of thawing the filesystem depending
> on where it is mounted and what device is in use. That's not an
> improvement.
>
> > > 4. unfreeze of tests and scratch fs is harmless even when it is not needed -
> > >     we may even want to do that at the start of tests run(?)
> >
> > I think Dave doesn't like to add common steps to thousands of cases, if without
> > a critical reason. So I hope to get more review points for this kind of changes.
>
> Doing work that in every test execution that is only actually
> necessary if a test is interrupted to work around a potential bug in
> 1 or 2 tests is really poor design and implementation. Just fix the
> test bugs, don't burden everything else with the additional overhead
> of checking whether the test might have a bug in it or not...
>
> > > [*] I noticed that generic/085 which _require_freeze() does not even
> > >     use xfs_freeze (intentionally) - it uses dmsetup suspend/resume,
> > >     so I guess _require_freeze() should be removed from that test.
> >
> > Agree, I think dm suspend through different userspace and kernel interface with
> > fsfreeze, so that require doesn't make sense.
>
> dmsuspend uses the internal kernel freeze API to freeze the
> filesytsems. Failures in those tests can leave filesytsems frozen,
> too, but these days we cannot thaw them with fs level freeze....
>
> > > Anyway, if you and others insist against this auto-unfreeze approach,
> > > I will post the patch to unfreeze fs in individual tests, but I won't be
> > > happy about it.
> >
> > From the functional side, I think this change makes sense. But if think about
> > the performance, let's get more opinions at first. If there's not objection,
> > we can have it.
>
> I think it's wrong from a functional side, too. The test harness is
> not responsible for individual test state cleanup - the tests
> themselves are responsible for that. The whole point of the
> _cleanup() consolidation I've started doing is that it will
> centralise all this common cleanup functionality and allow us to end
> up connecting it to _requires rules that indicate what functionality
> the test uses.  i.e. the end goal is that most tests do not need any
> cleanup - everything that needs cleaning up is defined by the
> _requires rules the test runs first and nothing else is needed.
>
> Only when tests do custom stuff will test specific cleanup functions
> be needed, just like they are required to do now to clean up after
> themselves. And that means, right now, that tests that freeze the
> filesystem still need to do the right thing in the _cleanup()
> functions up until the consolidation of cleanup means they don't
> need it anymore....
>
> AFAIC, you can go move all this stuff to check if you really
> want, but I'm just going to rip it all back out in short order
> because it just doesn't fit the cleanup model I'm trying to move the
> infrastructure towards.
>

As long as your cleanup is going to consolidate all this I will
just go ahead and fix the buggy tests now.

Thanks!
Amir.
Amir Goldstein June 21, 2022, 2:57 a.m. UTC | #8
On Tue, Jun 21, 2022 at 1:06 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Sun, Jun 19, 2022 at 04:46:55PM +0300, Amir Goldstein wrote:
> > Almost all of the tests that _require_freeze() fail to unfreeze
> > scratch mount in case the test is interrupted while fs is frozen.
> >
> > Move the handling of unfreeze to generic check code.
> > For now, tests only freeze scratch fs, but to be more robust, unfreeze
> > both test and scratch fs following a call to _require_freeze().
> >
> > Tests could still hang if thier private _cleanup() routine tries
> > to modify the frozen fs or wait for a blocked process. Fix the
> > _cleanup() routine of xfs/011 to avoid that.
> >
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > ---
> >  check             | 14 ++++++++------
> >  common/rc         |  5 +++--
> >  tests/generic/390 |  2 --
> >  tests/xfs/011     |  2 --
> >  tests/xfs/517     |  1 -
> >  5 files changed, 11 insertions(+), 13 deletions(-)
> >
> > diff --git a/check b/check
> > index de11b37e..d6ee71aa 100755
> > --- a/check
> > +++ b/check
> > @@ -527,17 +527,21 @@ _check_filesystems()
> >  {
> >       local ret=0
> >
> > +     # Make sure both test and scratch are unfrozen post _require_freeze()
> > +     if [ -f ${RESULT_DIR}/require_freeze ]; then
> > +             xfs_freeze -u "$TEST_DIR" >/dev/null 2>&1
> > +             xfs_freeze -u "$SCRATCH_MNT" >/dev/null 2>&1
> > +     fi
>
> A test leaving a filesystem frozen on exit is a test bug. There can
> still be background test processes sitting blocked on a frozen
> filesystem when the test exits with a frozen filesystem, and that
> has the potential to cause problems in the next few operations
> because of "busy filesystem" errors trying to unmount the fs...
>
> IOWs, think this is the wrong way to address this problem. tests
> that freeze filesystems need to ensure that everything is cleaned up
> properly in the test _cleanup() function where the right thing can
> be done and blocked processes can be waited on once the fs has been
> thawed.
>

ok.

> > diff --git a/tests/generic/390 b/tests/generic/390
> > index 20c66e22..0f2b86fa 100755
> > --- a/tests/generic/390
> > +++ b/tests/generic/390
> > @@ -14,8 +14,6 @@ _begin_fstest auto freeze stress
> >  _cleanup()
> >  {
> >       cd /
> > -     # Make sure $SCRATCH_MNT is unfreezed
> > -     xfs_freeze -u $SCRATCH_MNT 2>/dev/null
> >       rm -f $tmp.*
> >  }
>
> This test is already doing the right thing.
>
> > diff --git a/tests/xfs/011 b/tests/xfs/011
> > index d6e9099e..351a574e 100755
> > --- a/tests/xfs/011
> > +++ b/tests/xfs/011
> > @@ -17,9 +17,7 @@ _begin_fstest auto freeze log metadata quick
> >  _cleanup()
> >  {
> >       $KILLALL_PROG -9 fsstress 2>/dev/null
> > -     wait
> >       cd /
> > -     _scratch_unmount 2>/dev/null
> >       rm -f $tmp.*
> >  }
>
> This is wrong. We have to wait for background fsstress processes to
> exit, otherwise unmount can fail randomly. What it is missing is the
> thaw before killing the fsstress processes and waiting for them to
> complete.
>

I didn't remember if your position was that _cleanup()
should wait or not . I guess from your answer that you think
that it should and it makes sense to me, so  I will also fix xfs/517
to wait.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/check b/check
index de11b37e..d6ee71aa 100755
--- a/check
+++ b/check
@@ -527,17 +527,21 @@  _check_filesystems()
 {
 	local ret=0
 
+	# Make sure both test and scratch are unfrozen post _require_freeze()
+	if [ -f ${RESULT_DIR}/require_freeze ]; then
+		xfs_freeze -u "$TEST_DIR" >/dev/null 2>&1
+		xfs_freeze -u "$SCRATCH_MNT" >/dev/null 2>&1
+	fi
 	if [ -f ${RESULT_DIR}/require_test ]; then
 		_check_test_fs || ret=1
-		rm -f ${RESULT_DIR}/require_test*
 	else
 		_test_unmount 2> /dev/null
 	fi
 	if [ -f ${RESULT_DIR}/require_scratch ]; then
 		_check_scratch_fs || ret=1
-		rm -f ${RESULT_DIR}/require_scratch*
 	fi
 	_scratch_unmount 2> /dev/null
+	rm -f ${RESULT_DIR}/require_*
 	return $ret
 }
 
@@ -783,8 +787,7 @@  function run_section()
 		seqres="$REPORT_DIR/$seqnum"
 
 		mkdir -p $RESULT_DIR
-		rm -f ${RESULT_DIR}/require_scratch*
-		rm -f ${RESULT_DIR}/require_test*
+		rm -f ${RESULT_DIR}/require_*
 		echo -n "$seqnum"
 
 		if $showme; then
@@ -882,8 +885,7 @@  function run_section()
 			_dump_err_cont "[failed, exit status $sts]"
 			_test_unmount 2> /dev/null
 			_scratch_unmount 2> /dev/null
-			rm -f ${RESULT_DIR}/require_test*
-			rm -f ${RESULT_DIR}/require_scratch*
+			rm -f ${RESULT_DIR}/require_*
 			err=true
 		else
 			# The test apparently passed, so check for corruption
diff --git a/common/rc b/common/rc
index 3c072c16..b87dfe05 100644
--- a/common/rc
+++ b/common/rc
@@ -1540,8 +1540,7 @@  _notrun()
 {
     echo "$*" > $seqres.notrun
     echo "$seq not run: $*"
-    rm -f ${RESULT_DIR}/require_test*
-    rm -f ${RESULT_DIR}/require_scratch*
+    rm -f ${RESULT_DIR}/require_*
 
     status=0
     exit
@@ -3648,6 +3647,8 @@  _require_freeze()
 	local result=$?
 	xfs_freeze -u "$TEST_DIR" >/dev/null 2>&1
 	[ $result -eq 0 ] || _notrun "$FSTYP does not support freezing"
+	# Make sure both test and scratch are unfrozen at the end of the test
+	touch ${RESULT_DIR}/require_freeze
 }
 
 # Does NFS export work on this fs?
diff --git a/tests/generic/390 b/tests/generic/390
index 20c66e22..0f2b86fa 100755
--- a/tests/generic/390
+++ b/tests/generic/390
@@ -14,8 +14,6 @@  _begin_fstest auto freeze stress
 _cleanup()
 {
 	cd /
-	# Make sure $SCRATCH_MNT is unfreezed
-	xfs_freeze -u $SCRATCH_MNT 2>/dev/null
 	rm -f $tmp.*
 }
 
diff --git a/tests/xfs/011 b/tests/xfs/011
index d6e9099e..351a574e 100755
--- a/tests/xfs/011
+++ b/tests/xfs/011
@@ -17,9 +17,7 @@  _begin_fstest auto freeze log metadata quick
 _cleanup()
 {
 	$KILLALL_PROG -9 fsstress 2>/dev/null
-	wait
 	cd /
-	_scratch_unmount 2>/dev/null
 	rm -f $tmp.*
 }
 
diff --git a/tests/xfs/517 b/tests/xfs/517
index f7f9a8a2..961668e3 100755
--- a/tests/xfs/517
+++ b/tests/xfs/517
@@ -15,7 +15,6 @@  _register_cleanup "_cleanup" BUS
 _cleanup()
 {
 	cd /
-	$XFS_IO_PROG -x -c 'thaw' $SCRATCH_MNT > /dev/null 2>&1
 	rm -rf $tmp.*
 }