diff mbox series

[3/5] fstests: add fsstress + compaction test

Message ID 20240611030203.1719072-4-mcgrof@kernel.org (mailing list archive)
State Superseded, archived
Headers show
Series fstests: add some new LBS inspired tests | expand

Commit Message

Luis Chamberlain June 11, 2024, 3:02 a.m. UTC
Running compaction while we run fsstress can crash older kernels as per
korg#218227 [0]. The fix for that [0] has been posted [1] and was merged
in v6.9-rc6 as commit d99e3140a4d3 ("mm: turn folio_test_hugetlb into a
PageType"). However, even on v6.10-rc2, where this kernel commit is
already merged, we can still deadlock when running fsstress while
triggering compaction at the same time. This is a new issue, being
reported now through this patch, and this patch also serves as a
high-confidence reproducer: it always deadlocks. If you enable
CONFIG_PROVE_LOCKING with the defaults you will end up with a complaint
about increasing MAX_LOCKDEP_CHAIN_HLOCKS [1]; if you adjust that, you
then end up with a few soft lockup complaints and some possible deadlock
candidates to evaluate [2].

Provide a simple reproducer and pave the way so we keep on testing this.

Without lockdep enabled we silently deadlock on the first run of the
test without the fix applied. With lockdep enabled you get a splat about
the possible deadlock on the first run of the test.

[0] https://bugzilla.kernel.org/show_bug.cgi?id=218227
[1] https://gist.github.com/mcgrof/824913b645892214effeb1631df75072
[2] https://gist.github.com/mcgrof/926e183d21c5c4c55d74ec90197bd77a

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 common/rc             |  7 +++++
 tests/generic/750     | 62 +++++++++++++++++++++++++++++++++++++++++++
 tests/generic/750.out |  2 ++
 3 files changed, 71 insertions(+)
 create mode 100755 tests/generic/750
 create mode 100644 tests/generic/750.out

Comments

Darrick J. Wong June 11, 2024, 2:48 p.m. UTC | #1
On Mon, Jun 10, 2024 at 08:02:00PM -0700, Luis Chamberlain wrote:
> Running compaction while we run fsstress can crash older kernels as per
> korg#218227 [0], the fix for that [0] has been posted [1] that patch
> was merged on v6.9-rc6 fixed by commit d99e3140a4d3 ("mm: turn
> folio_test_hugetlb into a PageType"). However even on v6.10-rc2 where
> this kernel commit is already merged we can still deadlock when running
> fsstress and at the same time triggering compaction, this is a new
> issue being reported now this through patch, but this patch also
> serves as a reproducer with a high confidence. It always deadlocks.
> If you enable CONFIG_PROVE_LOCKING with the defaults you will end up
> with a complaint about increasing MAX_LOCKDEP_CHAIN_HLOCKS [1], if
> you adjust that you then end up with a few soft lockup complaints and
> some possible deadlock candidates to evaluate [2].
> 
> Provide a simple reproducer and pave the way so we keep on testing this.
> 
> Without lockdep enabled we silently deadlock on the first run of the
> test without the fix applied. With lockdep enabled you get a splat about
> the possible deadlock on the first run of the test.
> 
> [0] https://bugzilla.kernel.org/show_bug.cgi?id=218227
> [1] https://gist.github.com/mcgrof/824913b645892214effeb1631df75072
> [2] https://gist.github.com/mcgrof/926e183d21c5c4c55d74ec90197bd77a
> 
> Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
> ---
>  common/rc             |  7 +++++
>  tests/generic/750     | 62 +++++++++++++++++++++++++++++++++++++++++++
>  tests/generic/750.out |  2 ++
>  3 files changed, 71 insertions(+)
>  create mode 100755 tests/generic/750
>  create mode 100644 tests/generic/750.out
> 
> diff --git a/common/rc b/common/rc
> index e812a2f7cc67..18ad25662d5c 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -151,6 +151,13 @@ _require_hugepages()
>  		_notrun "Kernel does not report huge page size"
>  }
>  
> +# Requires CONFIG_COMPACTION
> +_require_vm_compaction()
> +{
> +	if [ ! -f /proc/sys/vm/compact_memory ]; then
> +	    _notrun "Need compaction enabled CONFIG_COMPACTION=y"
> +	fi
> +}
>  # Get hugepagesize in bytes
>  _get_hugepagesize()
>  {
> diff --git a/tests/generic/750 b/tests/generic/750
> new file mode 100755
> index 000000000000..334ab011dfa0
> --- /dev/null
> +++ b/tests/generic/750
> @@ -0,0 +1,62 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2024 Luis Chamberlain.  All Rights Reserved.
> +#
> +# FS QA Test 750
> +#
> +# fsstress + memory compaction test
> +#
> +. ./common/preamble
> +_begin_fstest auto rw long_rw stress soak smoketest
> +
> +_cleanup()
> +{
> +	cd /
> +	rm -f $runfile
> +	rm -f $tmp.*
> +	kill -9 $trigger_compaction_pid > /dev/null 2>&1
> +	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> +
> +	wait > /dev/null 2>&1
> +}
> +
> +# Import common functions.
> +
> +# real QA test starts here
> +
> +_supported_fs generic
> +
> +_require_scratch
> +_require_vm_compaction
> +_require_command "$KILLALL_PROG" "killall"
> +
> +# We still deadlock with this test on v6.10-rc2, we need more work.
> +# but the below makes things better.
> +_fixed_by_git_commit kernel d99e3140a4d3 \
> +	"mm: turn folio_test_hugetlb into a PageType"
> +
> +echo "Silence is golden"
> +
> +_scratch_mkfs > $seqres.full 2>&1
> +_scratch_mount >> $seqres.full 2>&1
> +
> +nr_cpus=$((LOAD_FACTOR * 4))
> +nr_ops=$((25000 * nr_cpus * TIME_FACTOR))
> +fsstress_args=(-w -d $SCRATCH_MNT -n $nr_ops -p $nr_cpus)
> +
> +# start a background trigger for memory compaction
> +runfile="$tmp.compaction"
> +touch $runfile
> +while [ -e $runfile ]; do
> +	echo 1 > /proc/sys/vm/compact_memory
> +	sleep 5
> +done &
> +trigger_compaction_pid=$!
> +
> +test -n "$SOAK_DURATION" && fsstress_args+=(--duration="$SOAK_DURATION")

Maybe put this with the other fsstress_args definition above, but
otherwise this looks reasonable.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> +
> +$FSSTRESS_PROG $FSSTRESS_AVOID "${fsstress_args[@]}" >> $seqres.full
> +wait > /dev/null 2>&1
> +
> +status=0
> +exit
> diff --git a/tests/generic/750.out b/tests/generic/750.out
> new file mode 100644
> index 000000000000..bd79507b632e
> --- /dev/null
> +++ b/tests/generic/750.out
> @@ -0,0 +1,2 @@
> +QA output created by 750
> +Silence is golden
> -- 
> 2.43.0
> 
>
Zorro Lang June 12, 2024, 8 a.m. UTC | #2
On Mon, Jun 10, 2024 at 08:02:00PM -0700, Luis Chamberlain wrote:
> Running compaction while we run fsstress can crash older kernels as per
> korg#218227 [0], the fix for that [0] has been posted [1] that patch
> was merged on v6.9-rc6 fixed by commit d99e3140a4d3 ("mm: turn
> folio_test_hugetlb into a PageType"). However even on v6.10-rc2 where
> this kernel commit is already merged we can still deadlock when running
> fsstress and at the same time triggering compaction, this is a new
> issue being reported now this through patch, but this patch also
> serves as a reproducer with a high confidence. It always deadlocks.
> If you enable CONFIG_PROVE_LOCKING with the defaults you will end up
> with a complaint about increasing MAX_LOCKDEP_CHAIN_HLOCKS [1], if
> you adjust that you then end up with a few soft lockup complaints and
> some possible deadlock candidates to evaluate [2].
> 
> Provide a simple reproducer and pave the way so we keep on testing this.
> 
> Without lockdep enabled we silently deadlock on the first run of the
> test without the fix applied. With lockdep enabled you get a splat about
> the possible deadlock on the first run of the test.
> 
> [0] https://bugzilla.kernel.org/show_bug.cgi?id=218227
> [1] https://gist.github.com/mcgrof/824913b645892214effeb1631df75072
> [2] https://gist.github.com/mcgrof/926e183d21c5c4c55d74ec90197bd77a
> 
> Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
> ---
>  common/rc             |  7 +++++
>  tests/generic/750     | 62 +++++++++++++++++++++++++++++++++++++++++++
>  tests/generic/750.out |  2 ++
>  3 files changed, 71 insertions(+)
>  create mode 100755 tests/generic/750
>  create mode 100644 tests/generic/750.out
> 
> diff --git a/common/rc b/common/rc
> index e812a2f7cc67..18ad25662d5c 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -151,6 +151,13 @@ _require_hugepages()
>  		_notrun "Kernel does not report huge page size"
>  }
>  
> +# Requires CONFIG_COMPACTION
> +_require_vm_compaction()
> +{
> +	if [ ! -f /proc/sys/vm/compact_memory ]; then
> +	    _notrun "Need compaction enabled CONFIG_COMPACTION=y"
> +	fi
> +}
>  # Get hugepagesize in bytes
>  _get_hugepagesize()
>  {
> diff --git a/tests/generic/750 b/tests/generic/750
> new file mode 100755
> index 000000000000..334ab011dfa0
> --- /dev/null
> +++ b/tests/generic/750
> @@ -0,0 +1,62 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2024 Luis Chamberlain.  All Rights Reserved.
> +#
> +# FS QA Test 750
> +#
> +# fsstress + memory compaction test
> +#
> +. ./common/preamble
> +_begin_fstest auto rw long_rw stress soak smoketest
> +
> +_cleanup()
> +{
> +	cd /
> +	rm -f $runfile
> +	rm -f $tmp.*
> +	kill -9 $trigger_compaction_pid > /dev/null 2>&1
> +	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> +
> +	wait > /dev/null 2>&1
> +}
> +
> +# Import common functions.
> +
> +# real QA test starts here
> +
> +_supported_fs generic
> +
> +_require_scratch
> +_require_vm_compaction
> +_require_command "$KILLALL_PROG" "killall"
> +
> +# We still deadlock with this test on v6.10-rc2, we need more work.
> +# but the below makes things better.
> +_fixed_by_git_commit kernel d99e3140a4d3 \
> +	"mm: turn folio_test_hugetlb into a PageType"
> +
> +echo "Silence is golden"
> +
> +_scratch_mkfs > $seqres.full 2>&1
> +_scratch_mount >> $seqres.full 2>&1
> +
> +nr_cpus=$((LOAD_FACTOR * 4))
> +nr_ops=$((25000 * nr_cpus * TIME_FACTOR))
> +fsstress_args=(-w -d $SCRATCH_MNT -n $nr_ops -p $nr_cpus)
> +
> +# start a background trigger for memory compaction
> +runfile="$tmp.compaction"
> +touch $runfile
> +while [ -e $runfile ]; do
> +	echo 1 > /proc/sys/vm/compact_memory
> +	sleep 5
> +done &
> +trigger_compaction_pid=$!
> +
> +test -n "$SOAK_DURATION" && fsstress_args+=(--duration="$SOAK_DURATION")
> +
> +$FSSTRESS_PROG $FSSTRESS_AVOID "${fsstress_args[@]}" >> $seqres.full
> +wait > /dev/null 2>&1

Won't this "wait" wait forever (barring a ctrl+C), since no one removes
the $runfile?

Thanks,
Zorro

> +
> +status=0
> +exit
> diff --git a/tests/generic/750.out b/tests/generic/750.out
> new file mode 100644
> index 000000000000..bd79507b632e
> --- /dev/null
> +++ b/tests/generic/750.out
> @@ -0,0 +1,2 @@
> +QA output created by 750
> +Silence is golden
> -- 
> 2.43.0
> 
>
Luis Chamberlain June 13, 2024, 9:10 p.m. UTC | #3
On Wed, Jun 12, 2024 at 04:00:48PM +0800, Zorro Lang wrote:
> On Mon, Jun 10, 2024 at 08:02:00PM -0700, Luis Chamberlain wrote:
> > Running compaction while we run fsstress can crash older kernels as per
> > korg#218227 [0], the fix for that [0] has been posted [1] that patch
> > was merged on v6.9-rc6 fixed by commit d99e3140a4d3 ("mm: turn
> > folio_test_hugetlb into a PageType"). However even on v6.10-rc2 where
> > this kernel commit is already merged we can still deadlock when running
> > fsstress and at the same time triggering compaction, this is a new
> > issue being reported now this through patch, but this patch also
> > serves as a reproducer with a high confidence. It always deadlocks.
> > If you enable CONFIG_PROVE_LOCKING with the defaults you will end up
> > with a complaint about increasing MAX_LOCKDEP_CHAIN_HLOCKS [1], if
> > you adjust that you then end up with a few soft lockup complaints and
> > some possible deadlock candidates to evaluate [2].
> > 
> > Provide a simple reproducer and pave the way so we keep on testing this.
> > 
> > Without lockdep enabled we silently deadlock on the first run of the
> > test without the fix applied. With lockdep enabled you get a splat about
> > the possible deadlock on the first run of the test.
> > 
> > [0] https://bugzilla.kernel.org/show_bug.cgi?id=218227
> > [1] https://gist.github.com/mcgrof/824913b645892214effeb1631df75072
> > [2] https://gist.github.com/mcgrof/926e183d21c5c4c55d74ec90197bd77a
> > 
> > Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
> > ---
> >  common/rc             |  7 +++++
> >  tests/generic/750     | 62 +++++++++++++++++++++++++++++++++++++++++++
> >  tests/generic/750.out |  2 ++
> >  3 files changed, 71 insertions(+)
> >  create mode 100755 tests/generic/750
> >  create mode 100644 tests/generic/750.out
> > 
> > diff --git a/common/rc b/common/rc
> > index e812a2f7cc67..18ad25662d5c 100644
> > --- a/common/rc
> > +++ b/common/rc
> > @@ -151,6 +151,13 @@ _require_hugepages()
> >  		_notrun "Kernel does not report huge page size"
> >  }
> >  
> > +# Requires CONFIG_COMPACTION
> > +_require_vm_compaction()
> > +{
> > +	if [ ! -f /proc/sys/vm/compact_memory ]; then
> > +	    _notrun "Need compaction enabled CONFIG_COMPACTION=y"
> > +	fi
> > +}
> >  # Get hugepagesize in bytes
> >  _get_hugepagesize()
> >  {
> > diff --git a/tests/generic/750 b/tests/generic/750
> > new file mode 100755
> > index 000000000000..334ab011dfa0
> > --- /dev/null
> > +++ b/tests/generic/750
> > @@ -0,0 +1,62 @@
> > +#! /bin/bash
> > +# SPDX-License-Identifier: GPL-2.0
> > +# Copyright (c) 2024 Luis Chamberlain.  All Rights Reserved.
> > +#
> > +# FS QA Test 750
> > +#
> > +# fsstress + memory compaction test
> > +#
> > +. ./common/preamble
> > +_begin_fstest auto rw long_rw stress soak smoketest
> > +
> > +_cleanup()
> > +{
> > +	cd /
> > +	rm -f $runfile
> > +	rm -f $tmp.*
> > +	kill -9 $trigger_compaction_pid > /dev/null 2>&1
> > +	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > +
> > +	wait > /dev/null 2>&1
> > +}
> > +
> > +# Import common functions.
> > +
> > +# real QA test starts here
> > +
> > +_supported_fs generic
> > +
> > +_require_scratch
> > +_require_vm_compaction
> > +_require_command "$KILLALL_PROG" "killall"
> > +
> > +# We still deadlock with this test on v6.10-rc2, we need more work.
> > +# but the below makes things better.
> > +_fixed_by_git_commit kernel d99e3140a4d3 \
> > +	"mm: turn folio_test_hugetlb into a PageType"
> > +
> > +echo "Silence is golden"
> > +
> > +_scratch_mkfs > $seqres.full 2>&1
> > +_scratch_mount >> $seqres.full 2>&1
> > +
> > +nr_cpus=$((LOAD_FACTOR * 4))
> > +nr_ops=$((25000 * nr_cpus * TIME_FACTOR))
> > +fsstress_args=(-w -d $SCRATCH_MNT -n $nr_ops -p $nr_cpus)
> > +
> > +# start a background trigger for memory compaction
> > +runfile="$tmp.compaction"
> > +touch $runfile
> > +while [ -e $runfile ]; do
> > +	echo 1 > /proc/sys/vm/compact_memory
> > +	sleep 5
> > +done &
> > +trigger_compaction_pid=$!
> > +
> > +test -n "$SOAK_DURATION" && fsstress_args+=(--duration="$SOAK_DURATION")
> > +
> > +$FSSTRESS_PROG $FSSTRESS_AVOID "${fsstress_args[@]}" >> $seqres.full
> > +wait > /dev/null 2>&1
> 
> Won't this "wait" wait forever (except a ctrl+C), due to no one removes
> the $runfile?

Odd, pretty sure I tested it and it didn't wait forever, but I'll add
the rm after the FSSTRESS call.

  Luis
diff mbox series

Patch

diff --git a/common/rc b/common/rc
index e812a2f7cc67..18ad25662d5c 100644
--- a/common/rc
+++ b/common/rc
@@ -151,6 +151,13 @@  _require_hugepages()
 		_notrun "Kernel does not report huge page size"
 }
 
+# _notrun unless /proc/sys/vm/compact_memory exists (CONFIG_COMPACTION=y)
+_require_vm_compaction()
+{
+	[ -f /proc/sys/vm/compact_memory ] || \
+		_notrun "Need compaction enabled CONFIG_COMPACTION=y"
+}
+
 # Get hugepagesize in bytes
 _get_hugepagesize()
 {
diff --git a/tests/generic/750 b/tests/generic/750
new file mode 100755
index 000000000000..334ab011dfa0
--- /dev/null
+++ b/tests/generic/750
@@ -0,0 +1,62 @@ 
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024 Luis Chamberlain.  All Rights Reserved.
+#
+# FS QA Test 750
+#
+# fsstress + memory compaction test
+#
+. ./common/preamble
+_begin_fstest auto rw long_rw stress soak smoketest
+
+# Drop temp files, kill the compaction trigger and fsstress, then reap them
+_cleanup()
+{
+	cd /
+	rm -f $runfile $tmp.*
+	kill -9 $trigger_compaction_pid &> /dev/null
+	$KILLALL_PROG -9 fsstress &> /dev/null
+
+	wait &> /dev/null
+}
+
+# real QA test starts here
+
+_supported_fs generic
+
+# killall is needed by _cleanup to stop any leftover fsstress; the
+# compaction check makes sure /proc/sys/vm/compact_memory is there.
+_require_scratch
+_require_vm_compaction
+_require_command "$KILLALL_PROG" "killall"
+
+# This commit fixed the crash seen on older kernels. The test can still
+# deadlock on v6.10-rc2, which already has it, so more work is needed.
+_fixed_by_git_commit kernel d99e3140a4d3 \
+	"mm: turn folio_test_hugetlb into a PageType"
+
+echo "Silence is golden"
+
+_scratch_mkfs > $seqres.full 2>&1
+_scratch_mount >> $seqres.full 2>&1
+
+nr_cpus=$((LOAD_FACTOR * 4))
+nr_ops=$((25000 * nr_cpus * TIME_FACTOR))
+fsstress_args=(-w -d $SCRATCH_MNT -n $nr_ops -p $nr_cpus)
+test -n "$SOAK_DURATION" && fsstress_args+=(--duration="$SOAK_DURATION")
+
+# Background trigger: compact memory every 5s for as long as $runfile exists
+runfile="$tmp.compaction"
+touch $runfile
+while [ -e $runfile ]; do
+	echo 1 > /proc/sys/vm/compact_memory
+	sleep 5
+done &
+trigger_compaction_pid=$!
+
+$FSSTRESS_PROG $FSSTRESS_AVOID "${fsstress_args[@]}" >> $seqres.full
+rm -f $runfile	# ends the trigger loop; without this wait never returns
+wait > /dev/null 2>&1
+
+status=0
+exit
diff --git a/tests/generic/750.out b/tests/generic/750.out
new file mode 100644
index 000000000000..bd79507b632e
--- /dev/null
+++ b/tests/generic/750.out
@@ -0,0 +1,2 @@ 
+QA output created by 750
+Silence is golden