diff mbox series

[25/40] fstests: scale some tests for high CPU count sanity

Message ID 20241127045403.3665299-26-david@fromorbit.com (mailing list archive)
State New
Headers show
Series fstests: concurrent test execution | expand

Commit Message

Dave Chinner Nov. 27, 2024, 4:51 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Several tests use lots of processes to stress the filesystem. Many
of them haven't really considered what this means for running the
test on high CPU count machines (e.g. >32p) and the potential
contention and performance issues this might trigger.

Some of these tests simply need to increase the size of the journal.
Some need to run on filesystems with high inherent concurrency (e.g.
larger AG count). Some need more efficient/faster file creation. And
so on.

This commit is a collection of those sorts of changes to improve
runtimes on high CPU count machines.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 src/aio-dio-regress/aio-last-ref-held-by-io.c | 5 ++++-
 tests/generic/251                             | 5 ++++-
 tests/generic/323                             | 7 +++++--
 tests/generic/530                             | 2 +-
 tests/generic/531                             | 8 +++++++-
 tests/xfs/013                                 | 4 ++--
 tests/xfs/076                                 | 6 +++---
 tests/xfs/176                                 | 6 +++---
 tests/xfs/297                                 | 4 +++-
 tests/xfs/501                                 | 2 +-
 tests/xfs/502                                 | 2 +-
 11 files changed, 34 insertions(+), 17 deletions(-)

Comments

Zorro Lang Nov. 29, 2024, 3:34 a.m. UTC | #1
On Wed, Nov 27, 2024 at 03:51:55PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Several tests use lots of processes to stress the filesystem. many
> of them haven't really considered what this means for running the
> test on high CPU machines (e.g. >32p) and the potential contention
> and performance issues this might trigger.
> 
> Some of these tests simply need to increase the size of the journal.
> Some need to run on filesystems with high inherent concurrency (e.g.
> larger AG count). Some need more efficient/faster file creation. And
> so on.
> 
> This commit is a collection of those sorts of changes to improve
> runtimes on high CPU count machines.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  src/aio-dio-regress/aio-last-ref-held-by-io.c | 5 ++++-
>  tests/generic/251                             | 5 ++++-
>  tests/generic/323                             | 7 +++++--
>  tests/generic/530                             | 2 +-
>  tests/generic/531                             | 8 +++++++-
>  tests/xfs/013                                 | 4 ++--
>  tests/xfs/076                                 | 6 +++---
>  tests/xfs/176                                 | 6 +++---
>  tests/xfs/297                                 | 4 +++-
>  tests/xfs/501                                 | 2 +-
>  tests/xfs/502                                 | 2 +-
>  11 files changed, 34 insertions(+), 17 deletions(-)
> 
> diff --git a/src/aio-dio-regress/aio-last-ref-held-by-io.c b/src/aio-dio-regress/aio-last-ref-held-by-io.c
> index a70f2a9b7..7106e30a9 100644
> --- a/src/aio-dio-regress/aio-last-ref-held-by-io.c
> +++ b/src/aio-dio-regress/aio-last-ref-held-by-io.c
> @@ -85,11 +85,14 @@ aio_test_thread(void *data)
>  	/*
>  	 * Problems have been easier to trigger when spreading the
>  	 * workload over the available CPUs.
> +	 *
> +	 * If CPU hotplug is active, this can randomly fail so dump the error
> +	 * to stderror so it can be filtered out easily by the caller.
>  	 */
>  	CPU_ZERO(&cpuset);
>  	CPU_SET(mycpu, &cpuset);
>  	if (sched_setaffinity(mytid, sizeof(cpuset), &cpuset)) {
> -		printf("FAILED to set thread %d to run on cpu %ld\n",
> +		fprintf(stderr, "FAILED to set thread %d to run on cpu %ld\n",
>  		       mytid, mycpu);
>  	}
>  
> diff --git a/tests/generic/251 b/tests/generic/251
> index b432fb119..98986469e 100755
> --- a/tests/generic/251
> +++ b/tests/generic/251
> @@ -175,9 +175,12 @@ nproc=20
>  # Copy $here to the scratch fs and make coipes of the replica.  The fstests
>  # output (and hence $seqres.full) could be in $here, so we need to snapshot
>  # $here before computing file checksums.
> +#
> +# $here/* as the files to copy so we avoid any .git directory that might be
> +# much, much larger than the rest of the fstests source tree we are copying.
>  content=$SCRATCH_MNT/orig
>  mkdir -p $content
> -cp -axT $here/ $content/
> +cp -ax $here/* $content/

Hi Dave,

Darrick sent a patch that changes this same line:
https://lore.kernel.org/fstests/173258395238.4031902.16373799205312238046.stgit@frogsfrogsfrogs/T/#u

Please help review it. If you no longer need this change, I'll resolve the
conflict (by dropping your change to g/251 above) when I merge this patch.

Thanks,
Zorro

>  
>  mkdir -p $tmp
>  
> diff --git a/tests/generic/323 b/tests/generic/323
> index 457253fee..2dde04d06 100755
> --- a/tests/generic/323
> +++ b/tests/generic/323
> @@ -23,12 +23,15 @@ _require_aiodio aio-last-ref-held-by-io
>  testfile=$TEST_DIR/aio-testfile
>  $XFS_IO_PROG -ftc "pwrite 0 10m" $testfile | _filter_xfs_io
>  
> -$AIO_TEST 0 100 $testfile
> +# This can emit cpu affinity setting failures that aren't considered test
> +# failures but cause golden image failures. Redirect the test output to
> +# $seqres.full so that it is captured but doesn't directly cause test failures.
> +$AIO_TEST 0 100 $testfile 2>> $seqres.full
>  if [ $? -ne 0 ]; then
>  	exit $status
>  fi
>  
> -$AIO_TEST 1 100 $testfile
> +$AIO_TEST 1 100 $testfile 2>> $seqres.full
>  if [ $? -ne 0 ]; then
>  	exit $status
>  fi
> diff --git a/tests/generic/530 b/tests/generic/530
> index 2e47c3e0c..18256b870 100755
> --- a/tests/generic/530
> +++ b/tests/generic/530
> @@ -22,7 +22,7 @@ _require_scratch_shutdown
>  _require_metadata_journaling
>  _require_test_program "t_open_tmpfiles"
>  
> -_scratch_mkfs >> $seqres.full 2>&1
> +_scratch_mkfs "-l size=256m" >> $seqres.full 2>&1
>  _scratch_mount
>  
>  # Set ULIMIT_NOFILE to min(file-max / 2, 50000 files per LOAD_FACTOR)
> diff --git a/tests/generic/531 b/tests/generic/531
> index 0e3564fd4..ed6c3f911 100755
> --- a/tests/generic/531
> +++ b/tests/generic/531
> @@ -21,7 +21,13 @@ _require_scratch
>  _require_xfs_io_command "-T"
>  _require_test_program "t_open_tmpfiles"
>  
> -_scratch_mkfs >> $seqres.full 2>&1
> +# On high CPU count machines, this runs a -lot- of create and unlink
> +# concurrency. Set the filesytsem up to handle this.
> +if [ $FSTYP = "xfs" ]; then
> +	_scratch_mkfs "-d agcount=32" >> $seqres.full 2>&1
> +else
> +	_scratch_mkfs >> $seqres.full 2>&1
> +fi
>  _scratch_mount
>  
>  # Try to load up all the CPUs, two threads per CPU.
> diff --git a/tests/xfs/013 b/tests/xfs/013
> index fd3d8c64c..5a92ef084 100755
> --- a/tests/xfs/013
> +++ b/tests/xfs/013
> @@ -28,7 +28,7 @@ _create()
>  	mkdir -p $dir
>  	for i in $(seq 0 $count)
>  	do
> -		touch $dir/$i 2>&1 | filter_enospc
> +		echo -n > $dir/$i 2>&1 | filter_enospc
>  	done
>  }
>  
> @@ -42,7 +42,7 @@ _rand_replace()
>  	do
>  		file=$((RANDOM % count))
>  		rm -f $dir/$file
> -		touch $dir/$file 2>&1 | filter_enospc
> +		echo -n > $dir/$file 2>&1 | filter_enospc
>  	done
>  }
>  
> diff --git a/tests/xfs/076 b/tests/xfs/076
> index 840617ccb..e315a067c 100755
> --- a/tests/xfs/076
> +++ b/tests/xfs/076
> @@ -47,10 +47,10 @@ _alloc_inodes()
>  	dir=$1
>  
>  	i=0
> -	while [ true ]; do
> -		touch $dir/$i 2>> $seqres.full || break
> +	( while [ true ]; do
> +		echo -n > $dir/$i || break
>  		i=$((i + 1))
> -	done
> +	done ) >> $seqres.full 2>&1
>  }
>  
>  
> diff --git a/tests/xfs/176 b/tests/xfs/176
> index 8e5951ec1..1aa8cde38 100755
> --- a/tests/xfs/176
> +++ b/tests/xfs/176
> @@ -68,10 +68,10 @@ _alloc_inodes()
>  	dir=$1
>  
>  	i=0
> -	while [ true ]; do
> -		echo -n > $dir/$i >> $seqres.full 2>&1 || break
> +	( while [ true ]; do
> +		echo -n > $dir/$i || break
>  		i=$((i + 1))
> -	done
> +	done ) >> $seqres.full 2>&1
>  }
>  
>  # Find a sparse inode cluster after logend_agno/logend_agino.
> diff --git a/tests/xfs/297 b/tests/xfs/297
> index f9cd2ff12..af6af601a 100755
> --- a/tests/xfs/297
> +++ b/tests/xfs/297
> @@ -34,7 +34,9 @@ _scratch_mount
>  STRESS_DIR="$SCRATCH_MNT/testdir"
>  mkdir -p $STRESS_DIR
>  
> -_run_fsstress_bg -d $STRESS_DIR -n 1000 -p 1000 $FSSTRESS_AVOID
> +# turn off sync as this can lead to near deadlock conditions due to every
> +# fsstress process lockstepping against freeze on large CPU count machines
> +_run_fsstress_bg -d $STRESS_DIR -f sync=0 -n 1000 -p 1000 $FSSTRESS_AVOID
>  
>  # Freeze/unfreeze file system randomly
>  echo "Start freeze/unfreeze randomly" | tee -a $seqres.full
> diff --git a/tests/xfs/501 b/tests/xfs/501
> index 1da4cbf92..678c51b52 100755
> --- a/tests/xfs/501
> +++ b/tests/xfs/501
> @@ -33,7 +33,7 @@ _require_xfs_sysfs debug/log_recovery_delay
>  _require_scratch
>  _require_test_program "t_open_tmpfiles"
>  
> -_scratch_mkfs >> $seqres.full 2>&1
> +_scratch_mkfs "-l size=256m" >> $seqres.full 2>&1
>  _scratch_mount
>  
>  # Set ULIMIT_NOFILE to min(file-max / 2, 30000 files per LOAD_FACTOR)
> diff --git a/tests/xfs/502 b/tests/xfs/502
> index 52b8e95a2..10b0017f6 100755
> --- a/tests/xfs/502
> +++ b/tests/xfs/502
> @@ -23,7 +23,7 @@ _require_xfs_io_error_injection "iunlink_fallback"
>  _require_scratch
>  _require_test_program "t_open_tmpfiles"
>  
> -_scratch_mkfs | _filter_mkfs 2> $tmp.mkfs > /dev/null
> +_scratch_mkfs "-l size=256m" | _filter_mkfs 2> $tmp.mkfs > /dev/null
>  cat $tmp.mkfs >> $seqres.full
>  . $tmp.mkfs
>  
> -- 
> 2.45.2
> 
>
diff mbox series

Patch

diff --git a/src/aio-dio-regress/aio-last-ref-held-by-io.c b/src/aio-dio-regress/aio-last-ref-held-by-io.c
index a70f2a9b7..7106e30a9 100644
--- a/src/aio-dio-regress/aio-last-ref-held-by-io.c
+++ b/src/aio-dio-regress/aio-last-ref-held-by-io.c
@@ -85,11 +85,14 @@  aio_test_thread(void *data)
 	/*
 	 * Problems have been easier to trigger when spreading the
 	 * workload over the available CPUs.
+	 *
+	 * If CPU hotplug is active, this can randomly fail so dump the error
+	 * to stderr so it can be filtered out easily by the caller.
 	 */
 	CPU_ZERO(&cpuset);
 	CPU_SET(mycpu, &cpuset);
 	if (sched_setaffinity(mytid, sizeof(cpuset), &cpuset)) {
-		printf("FAILED to set thread %d to run on cpu %ld\n",
+		fprintf(stderr, "FAILED to set thread %d to run on cpu %ld\n",
 		       mytid, mycpu);
 	}
 
diff --git a/tests/generic/251 b/tests/generic/251
index b432fb119..98986469e 100755
--- a/tests/generic/251
+++ b/tests/generic/251
@@ -175,9 +175,12 @@  nproc=20
 # Copy $here to the scratch fs and make coipes of the replica.  The fstests
 # output (and hence $seqres.full) could be in $here, so we need to snapshot
 # $here before computing file checksums.
+#
+# $here/* as the files to copy so we avoid any .git directory that might be
+# much, much larger than the rest of the fstests source tree we are copying.
 content=$SCRATCH_MNT/orig
 mkdir -p $content
-cp -axT $here/ $content/
+cp -ax $here/* $content/
 
 mkdir -p $tmp
 
diff --git a/tests/generic/323 b/tests/generic/323
index 457253fee..2dde04d06 100755
--- a/tests/generic/323
+++ b/tests/generic/323
@@ -23,12 +23,15 @@  _require_aiodio aio-last-ref-held-by-io
 testfile=$TEST_DIR/aio-testfile
 $XFS_IO_PROG -ftc "pwrite 0 10m" $testfile | _filter_xfs_io
 
-$AIO_TEST 0 100 $testfile
+# This can emit CPU affinity setting failures that aren't considered test
+# failures but cause golden image failures. Redirect stderr to
+# $seqres.full so that it is captured but doesn't directly cause test failures.
+$AIO_TEST 0 100 $testfile 2>> $seqres.full
 if [ $? -ne 0 ]; then
 	exit $status
 fi
 
-$AIO_TEST 1 100 $testfile
+$AIO_TEST 1 100 $testfile 2>> $seqres.full
 if [ $? -ne 0 ]; then
 	exit $status
 fi
diff --git a/tests/generic/530 b/tests/generic/530
index 2e47c3e0c..18256b870 100755
--- a/tests/generic/530
+++ b/tests/generic/530
@@ -22,7 +22,7 @@  _require_scratch_shutdown
 _require_metadata_journaling
 _require_test_program "t_open_tmpfiles"
 
-_scratch_mkfs >> $seqres.full 2>&1
+_scratch_mkfs "-l size=256m" >> $seqres.full 2>&1
 _scratch_mount
 
 # Set ULIMIT_NOFILE to min(file-max / 2, 50000 files per LOAD_FACTOR)
diff --git a/tests/generic/531 b/tests/generic/531
index 0e3564fd4..ed6c3f911 100755
--- a/tests/generic/531
+++ b/tests/generic/531
@@ -21,7 +21,13 @@  _require_scratch
 _require_xfs_io_command "-T"
 _require_test_program "t_open_tmpfiles"
 
-_scratch_mkfs >> $seqres.full 2>&1
+# On high CPU count machines, this runs a -lot- of create and unlink
+# concurrency. Set the filesystem up to handle this.
+if [ $FSTYP = "xfs" ]; then
+	_scratch_mkfs "-d agcount=32" >> $seqres.full 2>&1
+else
+	_scratch_mkfs >> $seqres.full 2>&1
+fi
 _scratch_mount
 
 # Try to load up all the CPUs, two threads per CPU.
diff --git a/tests/xfs/013 b/tests/xfs/013
index fd3d8c64c..5a92ef084 100755
--- a/tests/xfs/013
+++ b/tests/xfs/013
@@ -28,7 +28,7 @@  _create()
 	mkdir -p $dir
 	for i in $(seq 0 $count)
 	do
-		touch $dir/$i 2>&1 | filter_enospc
+		echo -n > $dir/$i 2>&1 | filter_enospc
 	done
 }
 
@@ -42,7 +42,7 @@  _rand_replace()
 	do
 		file=$((RANDOM % count))
 		rm -f $dir/$file
-		touch $dir/$file 2>&1 | filter_enospc
+		echo -n > $dir/$file 2>&1 | filter_enospc
 	done
 }
 
diff --git a/tests/xfs/076 b/tests/xfs/076
index 840617ccb..e315a067c 100755
--- a/tests/xfs/076
+++ b/tests/xfs/076
@@ -47,10 +47,10 @@  _alloc_inodes()
 	dir=$1
 
 	i=0
-	while [ true ]; do
-		touch $dir/$i 2>> $seqres.full || break
+	( while [ true ]; do
+		echo -n > $dir/$i || break
 		i=$((i + 1))
-	done
+	done ) >> $seqres.full 2>&1
 }
 
 
diff --git a/tests/xfs/176 b/tests/xfs/176
index 8e5951ec1..1aa8cde38 100755
--- a/tests/xfs/176
+++ b/tests/xfs/176
@@ -68,10 +68,10 @@  _alloc_inodes()
 	dir=$1
 
 	i=0
-	while [ true ]; do
-		echo -n > $dir/$i >> $seqres.full 2>&1 || break
+	( while [ true ]; do
+		echo -n > $dir/$i || break
 		i=$((i + 1))
-	done
+	done ) >> $seqres.full 2>&1
 }
 
 # Find a sparse inode cluster after logend_agno/logend_agino.
diff --git a/tests/xfs/297 b/tests/xfs/297
index f9cd2ff12..af6af601a 100755
--- a/tests/xfs/297
+++ b/tests/xfs/297
@@ -34,7 +34,9 @@  _scratch_mount
 STRESS_DIR="$SCRATCH_MNT/testdir"
 mkdir -p $STRESS_DIR
 
-_run_fsstress_bg -d $STRESS_DIR -n 1000 -p 1000 $FSSTRESS_AVOID
+# turn off sync as this can lead to near deadlock conditions due to every
+# fsstress process lockstepping against freeze on large CPU count machines
+_run_fsstress_bg -d $STRESS_DIR -f sync=0 -n 1000 -p 1000 $FSSTRESS_AVOID
 
 # Freeze/unfreeze file system randomly
 echo "Start freeze/unfreeze randomly" | tee -a $seqres.full
diff --git a/tests/xfs/501 b/tests/xfs/501
index 1da4cbf92..678c51b52 100755
--- a/tests/xfs/501
+++ b/tests/xfs/501
@@ -33,7 +33,7 @@  _require_xfs_sysfs debug/log_recovery_delay
 _require_scratch
 _require_test_program "t_open_tmpfiles"
 
-_scratch_mkfs >> $seqres.full 2>&1
+_scratch_mkfs "-l size=256m" >> $seqres.full 2>&1
 _scratch_mount
 
 # Set ULIMIT_NOFILE to min(file-max / 2, 30000 files per LOAD_FACTOR)
diff --git a/tests/xfs/502 b/tests/xfs/502
index 52b8e95a2..10b0017f6 100755
--- a/tests/xfs/502
+++ b/tests/xfs/502
@@ -23,7 +23,7 @@  _require_xfs_io_error_injection "iunlink_fallback"
 _require_scratch
 _require_test_program "t_open_tmpfiles"
 
-_scratch_mkfs | _filter_mkfs 2> $tmp.mkfs > /dev/null
+_scratch_mkfs "-l size=256m" | _filter_mkfs 2> $tmp.mkfs > /dev/null
 cat $tmp.mkfs >> $seqres.full
 . $tmp.mkfs