diff mbox series

[v4,11/11] KVM: selftests: Enable tunning of err_margin_us in arch timer test

Message ID 0343a9e4bfa8011fbb6bca0286cee7eab1f17d5d.1702371136.git.haibo1.xu@intel.com (mailing list archive)
State Superseded
Headers show
Series RISCV: Add kvm Sstc timer selftests | expand

Checks

Context Check Description
conchuod/vmtest-for-next-PR fail PR summary
conchuod/patch-11-test-1 success .github/scripts/patches/build_rv32_defconfig.sh
conchuod/patch-11-test-2 success .github/scripts/patches/build_rv64_clang_allmodconfig.sh
conchuod/patch-11-test-3 success .github/scripts/patches/build_rv64_gcc_allmodconfig.sh
conchuod/patch-11-test-4 success .github/scripts/patches/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-11-test-5 success .github/scripts/patches/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-11-test-6 fail .github/scripts/patches/checkpatch.sh
conchuod/patch-11-test-7 success .github/scripts/patches/dtb_warn_rv64.sh
conchuod/patch-11-test-8 success .github/scripts/patches/header_inline.sh
conchuod/patch-11-test-9 success .github/scripts/patches/kdoc.sh
conchuod/patch-11-test-10 success .github/scripts/patches/module_param.sh
conchuod/patch-11-test-11 success .github/scripts/patches/verify_fixes.sh
conchuod/patch-11-test-12 success .github/scripts/patches/verify_signedoff.sh

Commit Message

Xu, Haibo1 Dec. 12, 2023, 9:31 a.m. UTC
There are intermittent failures occured when stressing the
arch-timer test in a Qemu VM:

 Guest assert failed,  vcpu 0; stage; 4; iter: 3
 ==== Test Assertion Failure ====
   aarch64/arch_timer.c:196: config_iter + 1 == irq_iter
   pid=4048 tid=4049 errno=4 - Interrupted system call
      1  0x000000000040253b: test_vcpu_run at arch_timer.c:248
      2  0x0000ffffb60dd5c7: ?? ??:0
      3  0x0000ffffb6145d1b: ?? ??:0
   0x3 != 0x2 (config_iter + 1 != irq_iter)e

Further test and debug show that the timeout for an interrupt
to arrive do have random high fluctuation, espectially when
testing in an virtual environment.

To alleviate this issue, just expose the timeout value as user
configurable and print some hint message to increase the value
when hitting the failure..

Signed-off-by: Haibo Xu <haibo1.xu@intel.com>
---
 .../selftests/kvm/aarch64/arch_timer.c        |  8 +++++--
 tools/testing/selftests/kvm/arch_timer.c      | 22 +++++++++++++------
 .../selftests/kvm/include/timer_test.h        |  1 +
 .../testing/selftests/kvm/riscv/arch_timer.c  |  8 +++++--
 4 files changed, 28 insertions(+), 11 deletions(-)

Comments

Andrew Jones Dec. 13, 2023, 2:27 p.m. UTC | #1
On Tue, Dec 12, 2023 at 05:31:20PM +0800, Haibo Xu wrote:
> There are intermittent failures occured when stressing the
> arch-timer test in a Qemu VM:
> 
>  Guest assert failed,  vcpu 0; stage; 4; iter: 3
>  ==== Test Assertion Failure ====
>    aarch64/arch_timer.c:196: config_iter + 1 == irq_iter
>    pid=4048 tid=4049 errno=4 - Interrupted system call
>       1  0x000000000040253b: test_vcpu_run at arch_timer.c:248
>       2  0x0000ffffb60dd5c7: ?? ??:0
>       3  0x0000ffffb6145d1b: ?? ??:0
>    0x3 != 0x2 (config_iter + 1 != irq_iter)e
> 
> Further test and debug show that the timeout for an interrupt
> to arrive do have random high fluctuation, espectially when
> testing in an virtual environment.
> 
> To alleviate this issue, just expose the timeout value as user
> configurable and print some hint message to increase the value
> when hitting the failure..
> 
> Signed-off-by: Haibo Xu <haibo1.xu@intel.com>
> ---
>  .../selftests/kvm/aarch64/arch_timer.c        |  8 +++++--
>  tools/testing/selftests/kvm/arch_timer.c      | 22 +++++++++++++------
>  .../selftests/kvm/include/timer_test.h        |  1 +
>  .../testing/selftests/kvm/riscv/arch_timer.c  |  8 +++++--
>  4 files changed, 28 insertions(+), 11 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c
> index 4b421d421c3f..139eecbf77e7 100644
> --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c
> +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c
> @@ -131,10 +131,14 @@ static void guest_run_stage(struct test_vcpu_shared_data *shared_data,
>  
>  		/* Setup a timeout for the interrupt to arrive */
>  		udelay(msecs_to_usecs(test_args.timer_period_ms) +
> -			TIMER_TEST_ERR_MARGIN_US);
> +			test_args.timer_err_margin_us);
>  
>  		irq_iter = READ_ONCE(shared_data->nr_iter);
> -		GUEST_ASSERT_EQ(config_iter + 1, irq_iter);
> +		__GUEST_ASSERT(config_iter + 1 == irq_iter,
> +			"config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n"
> +			"  Guest timer interrupt was not trigged within the specified\n"
> +			"  interval, try to increase the error margin by [-e] option.\n",
> +			config_iter + 1, irq_iter);
>  	}
>  }
>  
> diff --git a/tools/testing/selftests/kvm/arch_timer.c b/tools/testing/selftests/kvm/arch_timer.c
> index 60963fce16f2..5050022fd345 100644
> --- a/tools/testing/selftests/kvm/arch_timer.c
> +++ b/tools/testing/selftests/kvm/arch_timer.c
> @@ -5,16 +5,17 @@
>   * The guest's main thread configures the timer interrupt and waits
>   * for it to fire, with a timeout equal to the timer period.
>   * It asserts that the timeout doesn't exceed the timer period plus
> - * an error margin of 100us.
> + * an user configurable error margin(default to 100us).
>   *
>   * On the other hand, upon receipt of an interrupt, the guest's interrupt
>   * handler validates the interrupt by checking if the architectural state
>   * is in compliance with the specifications.
>   *
>   * The test provides command-line options to configure the timer's
> - * period (-p), number of vCPUs (-n), and iterations per stage (-i).
> - * To stress-test the timer stack even more, an option to migrate the
> - * vCPUs across pCPUs (-m), at a particular rate, is also provided.
> + * period (-p), number of vCPUs (-n), iterations per stage (-i), and timer
> + * interrupt arrival error margin (-e). To stress-test the timer stack even
> + * more, an option to migrate the vCPUs across pCPUs (-m), at a particular
> + * rate, is also provided.
>   *
>   * Copyright (c) 2021, Google LLC.
>   */
> @@ -34,6 +35,7 @@ struct test_args test_args = {
>  	.nr_iter = NR_TEST_ITERS_DEF,
>  	.timer_period_ms = TIMER_TEST_PERIOD_MS_DEF,
>  	.migration_freq_ms = TIMER_TEST_MIGRATION_FREQ_MS,
> +	.timer_err_margin_us = TIMER_TEST_ERR_MARGIN_US,
>  	.reserved = 1,
>  };
>  
> @@ -179,8 +181,9 @@ static void test_run(struct kvm_vm *vm)
>  
>  static void test_print_help(char *name)
>  {
> -	pr_info("Usage: %s [-h] [-n nr_vcpus] [-i iterations] [-p timer_period_ms]\n",
> -		name);
> +	pr_info("Usage: %s [-h] [-n nr_vcpus] [-i iterations] [-p timer_period_ms]\n"
> +	        "\t\t    [-m migration_freq_ms] [-o counter_offset]\n"
> +	        "\t\t    [-e timer_err_margin_us]\n", name);
>  	pr_info("\t-n: Number of vCPUs to configure (default: %u; max: %u)\n",
>  		NR_VCPUS_DEF, KVM_MAX_VCPUS);
>  	pr_info("\t-i: Number of iterations per stage (default: %u)\n",
> @@ -190,6 +193,8 @@ static void test_print_help(char *name)
>  	pr_info("\t-m: Frequency (in ms) of vCPUs to migrate to different pCPU. 0 to turn off (default: %u)\n",
>  		TIMER_TEST_MIGRATION_FREQ_MS);
>  	pr_info("\t-o: Counter offset (in counter cycles, default: 0) [aarch64-only]\n");
> +	pr_info("\t-e: Interrupt arrival error margin(in us) of the guest timer (default: %u)\n",
                                                     ^ need space here

> +		TIMER_TEST_ERR_MARGIN_US);
>  	pr_info("\t-h: print this help screen\n");
>  }
>  
> @@ -197,7 +202,7 @@ static bool parse_args(int argc, char *argv[])
>  {
>  	int opt;
>  
> -	while ((opt = getopt(argc, argv, "hn:i:p:m:o:")) != -1) {
> +	while ((opt = getopt(argc, argv, "hn:i:p:m:o:e:")) != -1) {
>  		switch (opt) {
>  		case 'n':
>  			test_args.nr_vcpus = atoi_positive("Number of vCPUs", optarg);
> @@ -216,6 +221,9 @@ static bool parse_args(int argc, char *argv[])
>  		case 'm':
>  			test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg);
>  			break;
> +		case 'e':
> +			test_args.timer_err_margin_us = atoi_non_negative("Error Margin", optarg);
> +			break;
>  		case 'o':
>  			test_args.counter_offset = strtol(optarg, NULL, 0);
>  			test_args.reserved = 0;
> diff --git a/tools/testing/selftests/kvm/include/timer_test.h b/tools/testing/selftests/kvm/include/timer_test.h
> index 968257b893a7..b1d405e7157d 100644
> --- a/tools/testing/selftests/kvm/include/timer_test.h
> +++ b/tools/testing/selftests/kvm/include/timer_test.h
> @@ -22,6 +22,7 @@ struct test_args {
>  	int nr_iter;
>  	int timer_period_ms;
>  	int migration_freq_ms;
> +	int timer_err_margin_us;
>  	/* Members of struct kvm_arm_counter_offset */
>  	uint64_t counter_offset;
>  	uint64_t reserved;
> diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c
> index 13bf184d1ff5..45a139dc7ce3 100644
> --- a/tools/testing/selftests/kvm/riscv/arch_timer.c
> +++ b/tools/testing/selftests/kvm/riscv/arch_timer.c
> @@ -55,10 +55,14 @@ static void guest_run(struct test_vcpu_shared_data *shared_data)
>  
>  		/* Setup a timeout for the interrupt to arrive */
>  		udelay(msecs_to_usecs(test_args.timer_period_ms) +
> -			TIMER_TEST_ERR_MARGIN_US);
> +			test_args.timer_err_margin_us);
>  
>  		irq_iter = READ_ONCE(shared_data->nr_iter);
> -		GUEST_ASSERT_EQ(config_iter + 1, irq_iter);
> +		__GUEST_ASSERT(config_iter + 1 == irq_iter,
> +			"config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n"
> +			"  Guest timer interrupt was not trigged within the specified\n"
> +			"  interval, try to increase the error margin by [-e] option.\n",
> +			config_iter + 1, irq_iter);
>  	}
>  }
>  
> -- 
> 2.34.1
>

I probably would have started the series with this patch since you said
you could reproduce the problem on aarch64. Starting the series with
this patch would allow just this patch to get picked up to fix aarch64,
if that's necessary, and also avoids touching a couple places twice
with the code split for riscv.

But anyway, other than the missing space in the help text,

Reviewed-by: Andrew Jones <ajones@ventanamicro.com>

Thanks,
drew
Anup Patel Dec. 14, 2023, 6:44 a.m. UTC | #2
Hi Marc, Hi Oliver,

On Tue, Dec 12, 2023 at 2:52 PM Haibo Xu <haibo1.xu@intel.com> wrote:
>
> There are intermittent failures occured when stressing the
> arch-timer test in a Qemu VM:
>
>  Guest assert failed,  vcpu 0; stage; 4; iter: 3
>  ==== Test Assertion Failure ====
>    aarch64/arch_timer.c:196: config_iter + 1 == irq_iter
>    pid=4048 tid=4049 errno=4 - Interrupted system call
>       1  0x000000000040253b: test_vcpu_run at arch_timer.c:248
>       2  0x0000ffffb60dd5c7: ?? ??:0
>       3  0x0000ffffb6145d1b: ?? ??:0
>    0x3 != 0x2 (config_iter + 1 != irq_iter)e
>
> Further test and debug show that the timeout for an interrupt
> to arrive do have random high fluctuation, espectially when
> testing in an virtual environment.
>
> To alleviate this issue, just expose the timeout value as user
> configurable and print some hint message to increase the value
> when hitting the failure..
>
> Signed-off-by: Haibo Xu <haibo1.xu@intel.com>

Can you please review this patch ?

We want to take this entire series through the KVM RISC-V tree.

Regards,
Anup

> ---
>  .../selftests/kvm/aarch64/arch_timer.c        |  8 +++++--
>  tools/testing/selftests/kvm/arch_timer.c      | 22 +++++++++++++------
>  .../selftests/kvm/include/timer_test.h        |  1 +
>  .../testing/selftests/kvm/riscv/arch_timer.c  |  8 +++++--
>  4 files changed, 28 insertions(+), 11 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c
> index 4b421d421c3f..139eecbf77e7 100644
> --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c
> +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c
> @@ -131,10 +131,14 @@ static void guest_run_stage(struct test_vcpu_shared_data *shared_data,
>
>                 /* Setup a timeout for the interrupt to arrive */
>                 udelay(msecs_to_usecs(test_args.timer_period_ms) +
> -                       TIMER_TEST_ERR_MARGIN_US);
> +                       test_args.timer_err_margin_us);
>
>                 irq_iter = READ_ONCE(shared_data->nr_iter);
> -               GUEST_ASSERT_EQ(config_iter + 1, irq_iter);
> +               __GUEST_ASSERT(config_iter + 1 == irq_iter,
> +                       "config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n"
> +                       "  Guest timer interrupt was not trigged within the specified\n"
> +                       "  interval, try to increase the error margin by [-e] option.\n",
> +                       config_iter + 1, irq_iter);
>         }
>  }
>
> diff --git a/tools/testing/selftests/kvm/arch_timer.c b/tools/testing/selftests/kvm/arch_timer.c
> index 60963fce16f2..5050022fd345 100644
> --- a/tools/testing/selftests/kvm/arch_timer.c
> +++ b/tools/testing/selftests/kvm/arch_timer.c
> @@ -5,16 +5,17 @@
>   * The guest's main thread configures the timer interrupt and waits
>   * for it to fire, with a timeout equal to the timer period.
>   * It asserts that the timeout doesn't exceed the timer period plus
> - * an error margin of 100us.
> + * an user configurable error margin(default to 100us).
>   *
>   * On the other hand, upon receipt of an interrupt, the guest's interrupt
>   * handler validates the interrupt by checking if the architectural state
>   * is in compliance with the specifications.
>   *
>   * The test provides command-line options to configure the timer's
> - * period (-p), number of vCPUs (-n), and iterations per stage (-i).
> - * To stress-test the timer stack even more, an option to migrate the
> - * vCPUs across pCPUs (-m), at a particular rate, is also provided.
> + * period (-p), number of vCPUs (-n), iterations per stage (-i), and timer
> + * interrupt arrival error margin (-e). To stress-test the timer stack even
> + * more, an option to migrate the vCPUs across pCPUs (-m), at a particular
> + * rate, is also provided.
>   *
>   * Copyright (c) 2021, Google LLC.
>   */
> @@ -34,6 +35,7 @@ struct test_args test_args = {
>         .nr_iter = NR_TEST_ITERS_DEF,
>         .timer_period_ms = TIMER_TEST_PERIOD_MS_DEF,
>         .migration_freq_ms = TIMER_TEST_MIGRATION_FREQ_MS,
> +       .timer_err_margin_us = TIMER_TEST_ERR_MARGIN_US,
>         .reserved = 1,
>  };
>
> @@ -179,8 +181,9 @@ static void test_run(struct kvm_vm *vm)
>
>  static void test_print_help(char *name)
>  {
> -       pr_info("Usage: %s [-h] [-n nr_vcpus] [-i iterations] [-p timer_period_ms]\n",
> -               name);
> +       pr_info("Usage: %s [-h] [-n nr_vcpus] [-i iterations] [-p timer_period_ms]\n"
> +               "\t\t    [-m migration_freq_ms] [-o counter_offset]\n"
> +               "\t\t    [-e timer_err_margin_us]\n", name);
>         pr_info("\t-n: Number of vCPUs to configure (default: %u; max: %u)\n",
>                 NR_VCPUS_DEF, KVM_MAX_VCPUS);
>         pr_info("\t-i: Number of iterations per stage (default: %u)\n",
> @@ -190,6 +193,8 @@ static void test_print_help(char *name)
>         pr_info("\t-m: Frequency (in ms) of vCPUs to migrate to different pCPU. 0 to turn off (default: %u)\n",
>                 TIMER_TEST_MIGRATION_FREQ_MS);
>         pr_info("\t-o: Counter offset (in counter cycles, default: 0) [aarch64-only]\n");
> +       pr_info("\t-e: Interrupt arrival error margin(in us) of the guest timer (default: %u)\n",
> +               TIMER_TEST_ERR_MARGIN_US);
>         pr_info("\t-h: print this help screen\n");
>  }
>
> @@ -197,7 +202,7 @@ static bool parse_args(int argc, char *argv[])
>  {
>         int opt;
>
> -       while ((opt = getopt(argc, argv, "hn:i:p:m:o:")) != -1) {
> +       while ((opt = getopt(argc, argv, "hn:i:p:m:o:e:")) != -1) {
>                 switch (opt) {
>                 case 'n':
>                         test_args.nr_vcpus = atoi_positive("Number of vCPUs", optarg);
> @@ -216,6 +221,9 @@ static bool parse_args(int argc, char *argv[])
>                 case 'm':
>                         test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg);
>                         break;
> +               case 'e':
> +                       test_args.timer_err_margin_us = atoi_non_negative("Error Margin", optarg);
> +                       break;
>                 case 'o':
>                         test_args.counter_offset = strtol(optarg, NULL, 0);
>                         test_args.reserved = 0;
> diff --git a/tools/testing/selftests/kvm/include/timer_test.h b/tools/testing/selftests/kvm/include/timer_test.h
> index 968257b893a7..b1d405e7157d 100644
> --- a/tools/testing/selftests/kvm/include/timer_test.h
> +++ b/tools/testing/selftests/kvm/include/timer_test.h
> @@ -22,6 +22,7 @@ struct test_args {
>         int nr_iter;
>         int timer_period_ms;
>         int migration_freq_ms;
> +       int timer_err_margin_us;
>         /* Members of struct kvm_arm_counter_offset */
>         uint64_t counter_offset;
>         uint64_t reserved;
> diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c
> index 13bf184d1ff5..45a139dc7ce3 100644
> --- a/tools/testing/selftests/kvm/riscv/arch_timer.c
> +++ b/tools/testing/selftests/kvm/riscv/arch_timer.c
> @@ -55,10 +55,14 @@ static void guest_run(struct test_vcpu_shared_data *shared_data)
>
>                 /* Setup a timeout for the interrupt to arrive */
>                 udelay(msecs_to_usecs(test_args.timer_period_ms) +
> -                       TIMER_TEST_ERR_MARGIN_US);
> +                       test_args.timer_err_margin_us);
>
>                 irq_iter = READ_ONCE(shared_data->nr_iter);
> -               GUEST_ASSERT_EQ(config_iter + 1, irq_iter);
> +               __GUEST_ASSERT(config_iter + 1 == irq_iter,
> +                       "config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n"
> +                       "  Guest timer interrupt was not trigged within the specified\n"
> +                       "  interval, try to increase the error margin by [-e] option.\n",
> +                       config_iter + 1, irq_iter);
>         }
>  }
>
> --
> 2.34.1
>
Marc Zyngier Dec. 19, 2023, 6:22 p.m. UTC | #3
On Tue, 12 Dec 2023 09:31:20 +0000,
Haibo Xu <haibo1.xu@intel.com> wrote:
> > @@ -216,6 +221,9 @@ static bool parse_args(int argc, char *argv[])
>  		case 'm':
>  			test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg);
>  			break;
> +		case 'e':
> +			test_args.timer_err_margin_us = atoi_non_negative("Error Margin", optarg);
> +			break;

So your error margin is always unsigned...

>  		case 'o':
>  			test_args.counter_offset = strtol(optarg, NULL, 0);
>  			test_args.reserved = 0;
> diff --git a/tools/testing/selftests/kvm/include/timer_test.h b/tools/testing/selftests/kvm/include/timer_test.h
> index 968257b893a7..b1d405e7157d 100644
> --- a/tools/testing/selftests/kvm/include/timer_test.h
> +++ b/tools/testing/selftests/kvm/include/timer_test.h
> @@ -22,6 +22,7 @@ struct test_args {
>  	int nr_iter;
>  	int timer_period_ms;
>  	int migration_freq_ms;
> +	int timer_err_margin_us;

... except that you are storing it as a signed value. Some consistency
wouldn't hurt, really, and would avoid issues when passing large
values.

	M.
Haibo Xu Dec. 20, 2023, 6:50 a.m. UTC | #4
On Wed, Dec 20, 2023 at 2:22 AM Marc Zyngier <maz@kernel.org> wrote:
>
> On Tue, 12 Dec 2023 09:31:20 +0000,
> Haibo Xu <haibo1.xu@intel.com> wrote:
> > > @@ -216,6 +221,9 @@ static bool parse_args(int argc, char *argv[])
> >               case 'm':
> >                       test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg);
> >                       break;
> > +             case 'e':
> > +                     test_args.timer_err_margin_us = atoi_non_negative("Error Margin", optarg);
> > +                     break;
>
> So your error margin is always unsigned...
>

The error margin was supposed to be a non-negative [0, INT_MAX].
(May be need to define a Max for the input, instead of INT_MAX)

> >               case 'o':
> >                       test_args.counter_offset = strtol(optarg, NULL, 0);
> >                       test_args.reserved = 0;
> > diff --git a/tools/testing/selftests/kvm/include/timer_test.h b/tools/testing/selftests/kvm/include/timer_test.h
> > index 968257b893a7..b1d405e7157d 100644
> > --- a/tools/testing/selftests/kvm/include/timer_test.h
> > +++ b/tools/testing/selftests/kvm/include/timer_test.h
> > @@ -22,6 +22,7 @@ struct test_args {
> >       int nr_iter;
> >       int timer_period_ms;
> >       int migration_freq_ms;
> > +     int timer_err_margin_us;
>
> ... except that you are storing it as a signed value. Some consistency
> wouldn't hurt, really, and would avoid issues when passing large
> values.
>

Yes, it's more proper to use an unsigned int for the non-negative error margin.
Storing as signed here is just to keep the type consistent with that
of timer_period_ms
since there will be '+' operation in other places.

        tools/testing/selftests/kvm/aarch64/arch_timer.c
        /* Setup a timeout for the interrupt to arrive */
         udelay(msecs_to_usecs(test_args.timer_period_ms) +
             test_args.timer_err_margin_us);

Thanks,
Haibo

>         M.
>
> --
> Without deviation from the norm, progress is not possible.
Marc Zyngier Dec. 20, 2023, 9 a.m. UTC | #5
On 2023-12-20 06:50, Haibo Xu wrote:
> On Wed, Dec 20, 2023 at 2:22 AM Marc Zyngier <maz@kernel.org> wrote:
>> 
>> On Tue, 12 Dec 2023 09:31:20 +0000,
>> Haibo Xu <haibo1.xu@intel.com> wrote:
>> > > @@ -216,6 +221,9 @@ static bool parse_args(int argc, char *argv[])
>> >               case 'm':
>> >                       test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg);
>> >                       break;
>> > +             case 'e':
>> > +                     test_args.timer_err_margin_us = atoi_non_negative("Error Margin", optarg);
>> > +                     break;
>> 
>> So your error margin is always unsigned...
>> 
> 
> The error margin was supposed to be a non-negative [0, INT_MAX].
> (May be need to define a Max for the input, instead of INT_MAX)
> 
>> >               case 'o':
>> >                       test_args.counter_offset = strtol(optarg, NULL, 0);
>> >                       test_args.reserved = 0;
>> > diff --git a/tools/testing/selftests/kvm/include/timer_test.h b/tools/testing/selftests/kvm/include/timer_test.h
>> > index 968257b893a7..b1d405e7157d 100644
>> > --- a/tools/testing/selftests/kvm/include/timer_test.h
>> > +++ b/tools/testing/selftests/kvm/include/timer_test.h
>> > @@ -22,6 +22,7 @@ struct test_args {
>> >       int nr_iter;
>> >       int timer_period_ms;
>> >       int migration_freq_ms;
>> > +     int timer_err_margin_us;
>> 
>> ... except that you are storing it as a signed value. Some consistency
>> wouldn't hurt, really, and would avoid issues when passing large
>> values.
>> 
> 
> Yes, it's more proper to use an unsigned int for the non-negative error 
> margin.
> Storing as signed here is just to keep the type consistent with that
> of timer_period_ms
> since there will be '+' operation in other places.
> 
>         tools/testing/selftests/kvm/aarch64/arch_timer.c
>         /* Setup a timeout for the interrupt to arrive */
>          udelay(msecs_to_usecs(test_args.timer_period_ms) +
>              test_args.timer_err_margin_us);

But that's exactly why using a signed quantity is wrong.
What does it mean to have a huge *negative* margin?

I don't see how you can justify this.

         M.
Haibo Xu Dec. 20, 2023, 1:51 p.m. UTC | #6
On Wed, Dec 20, 2023 at 5:00 PM Marc Zyngier <maz@kernel.org> wrote:
>
> On 2023-12-20 06:50, Haibo Xu wrote:
> > On Wed, Dec 20, 2023 at 2:22 AM Marc Zyngier <maz@kernel.org> wrote:
> >>
> >> On Tue, 12 Dec 2023 09:31:20 +0000,
> >> Haibo Xu <haibo1.xu@intel.com> wrote:
> >> > > @@ -216,6 +221,9 @@ static bool parse_args(int argc, char *argv[])
> >> >               case 'm':
> >> >                       test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg);
> >> >                       break;
> >> > +             case 'e':
> >> > +                     test_args.timer_err_margin_us = atoi_non_negative("Error Margin", optarg);
> >> > +                     break;
> >>
> >> So your error margin is always unsigned...
> >>
> >
> > The error margin was supposed to be a non-negative [0, INT_MAX].
> > (May be need to define a Max for the input, instead of INT_MAX)
> >
> >> >               case 'o':
> >> >                       test_args.counter_offset = strtol(optarg, NULL, 0);
> >> >                       test_args.reserved = 0;
> >> > diff --git a/tools/testing/selftests/kvm/include/timer_test.h b/tools/testing/selftests/kvm/include/timer_test.h
> >> > index 968257b893a7..b1d405e7157d 100644
> >> > --- a/tools/testing/selftests/kvm/include/timer_test.h
> >> > +++ b/tools/testing/selftests/kvm/include/timer_test.h
> >> > @@ -22,6 +22,7 @@ struct test_args {
> >> >       int nr_iter;
> >> >       int timer_period_ms;
> >> >       int migration_freq_ms;
> >> > +     int timer_err_margin_us;
> >>
> >> ... except that you are storing it as a signed value. Some consistency
> >> wouldn't hurt, really, and would avoid issues when passing large
> >> values.
> >>
> >
> > Yes, it's more proper to use an unsigned int for the non-negative error
> > margin.
> > Storing as signed here is just to keep the type consistent with that
> > of timer_period_ms
> > since there will be '+' operation in other places.
> >
> >         tools/testing/selftests/kvm/aarch64/arch_timer.c
> >         /* Setup a timeout for the interrupt to arrive */
> >          udelay(msecs_to_usecs(test_args.timer_period_ms) +
> >              test_args.timer_err_margin_us);
>
> But that's exactly why using a signed quantity is wrong.
> What does it mean to have a huge *negative* margin?
>

Hi Marc,

I agree that negative values are meaningless for the margin.
If I understand correctly, the negative margin should be filtered by
assertion in atoi_non_negative().

Thanks,
Haibo

> I don't see how you can justify this.
>
>          M.
> --
> Jazz is not dead. It just smells funny...
Marc Zyngier Dec. 20, 2023, 1:58 p.m. UTC | #7
On Wed, 20 Dec 2023 13:51:24 +0000,
Haibo Xu <xiaobo55x@gmail.com> wrote:
> 
> On Wed, Dec 20, 2023 at 5:00 PM Marc Zyngier <maz@kernel.org> wrote:
> >
> > On 2023-12-20 06:50, Haibo Xu wrote:
> > > On Wed, Dec 20, 2023 at 2:22 AM Marc Zyngier <maz@kernel.org> wrote:
> > >>
> > >> On Tue, 12 Dec 2023 09:31:20 +0000,
> > >> Haibo Xu <haibo1.xu@intel.com> wrote:
> > >> > diff --git a/tools/testing/selftests/kvm/include/timer_test.h b/tools/testing/selftests/kvm/include/timer_test.h
> > >> > index 968257b893a7..b1d405e7157d 100644
> > >> > --- a/tools/testing/selftests/kvm/include/timer_test.h
> > >> > +++ b/tools/testing/selftests/kvm/include/timer_test.h
> > >> > @@ -22,6 +22,7 @@ struct test_args {
> > >> >       int nr_iter;
> > >> >       int timer_period_ms;
> > >> >       int migration_freq_ms;
> > >> > +     int timer_err_margin_us;
> > >>
> > >> ... except that you are storing it as a signed value. Some consistency
> > >> wouldn't hurt, really, and would avoid issues when passing large
> > >> values.
> > >>
> > >
> > > Yes, it's more proper to use an unsigned int for the non-negative error
> > > margin.
> > > Storing as signed here is just to keep the type consistent with that
> > > of timer_period_ms
> > > since there will be '+' operation in other places.
> > >
> > >         tools/testing/selftests/kvm/aarch64/arch_timer.c
> > >         /* Setup a timeout for the interrupt to arrive */
> > >          udelay(msecs_to_usecs(test_args.timer_period_ms) +
> > >              test_args.timer_err_margin_us);
> >
> > But that's exactly why using a signed quantity is wrong.
> > What does it mean to have a huge *negative* margin?
> >
> 
> Hi Marc,
> 
> I agree that negative values are meaningless for the margin.
> If I understand correctly, the negative margin should be filtered by
> assertion in atoi_non_negative().

No. Please.

atoi_non_negative() returns a uint32_t, which is what it should do.
The bug is squarely in the use of an 'int' to store such value, and it
is the *storage* that turns a positive value into a negative one.

	M.
Haibo Xu Dec. 21, 2023, 2:58 a.m. UTC | #8
On Wed, Dec 20, 2023 at 9:58 PM Marc Zyngier <maz@kernel.org> wrote:
>
> On Wed, 20 Dec 2023 13:51:24 +0000,
> Haibo Xu <xiaobo55x@gmail.com> wrote:
> >
> > On Wed, Dec 20, 2023 at 5:00 PM Marc Zyngier <maz@kernel.org> wrote:
> > >
> > > On 2023-12-20 06:50, Haibo Xu wrote:
> > > > On Wed, Dec 20, 2023 at 2:22 AM Marc Zyngier <maz@kernel.org> wrote:
> > > >>
> > > >> On Tue, 12 Dec 2023 09:31:20 +0000,
> > > >> Haibo Xu <haibo1.xu@intel.com> wrote:
> > > >> > diff --git a/tools/testing/selftests/kvm/include/timer_test.h b/tools/testing/selftests/kvm/include/timer_test.h
> > > >> > index 968257b893a7..b1d405e7157d 100644
> > > >> > --- a/tools/testing/selftests/kvm/include/timer_test.h
> > > >> > +++ b/tools/testing/selftests/kvm/include/timer_test.h
> > > >> > @@ -22,6 +22,7 @@ struct test_args {
> > > >> >       int nr_iter;
> > > >> >       int timer_period_ms;
> > > >> >       int migration_freq_ms;
> > > >> > +     int timer_err_margin_us;
> > > >>
> > > >> ... except that you are storing it as a signed value. Some consistency
> > > >> wouldn't hurt, really, and would avoid issues when passing large
> > > >> values.
> > > >>
> > > >
> > > > Yes, it's more proper to use an unsigned int for the non-negative error
> > > > margin.
> > > > Storing as signed here is just to keep the type consistent with that
> > > > of timer_period_ms
> > > > since there will be '+' operation in other places.
> > > >
> > > >         tools/testing/selftests/kvm/aarch64/arch_timer.c
> > > >         /* Setup a timeout for the interrupt to arrive */
> > > >          udelay(msecs_to_usecs(test_args.timer_period_ms) +
> > > >              test_args.timer_err_margin_us);
> > >
> > > But that's exactly why using a signed quantity is wrong.
> > > What does it mean to have a huge *negative* margin?
> > >
> >
> > Hi Marc,
> >
> > I agree that negative values are meaningless for the margin.
> > If I understand correctly, the negative margin should be filtered by
> > assertion in atoi_non_negative().
>
> No. Please.
>
> atoi_non_negative() returns a uint32_t, which is what it should do.
> The bug is squarely in the use of an 'int' to store such value, and it
> is the *storage* that turns a positive value into a negative one.
>

Thanks for the detailed info!

May I understand that your concern is mainly for a platform with 64bit int type,
which may trigger the positive to negative convert?

If so, I think we may need to do a clean up for the test code since
several other
places have the same issue.

Regards,
Haibo

>         M.
>
> --
> Without deviation from the norm, progress is not possible.
Marc Zyngier Dec. 21, 2023, 9:25 a.m. UTC | #9
On Thu, 21 Dec 2023 02:58:40 +0000,
Haibo Xu <xiaobo55x@gmail.com> wrote:
> 
> On Wed, Dec 20, 2023 at 9:58 PM Marc Zyngier <maz@kernel.org> wrote:
> >
> > On Wed, 20 Dec 2023 13:51:24 +0000,
> > Haibo Xu <xiaobo55x@gmail.com> wrote:
> > >
> > > On Wed, Dec 20, 2023 at 5:00 PM Marc Zyngier <maz@kernel.org> wrote:
> > > >
> > > > On 2023-12-20 06:50, Haibo Xu wrote:
> > > > > On Wed, Dec 20, 2023 at 2:22 AM Marc Zyngier <maz@kernel.org> wrote:
> > > > >>
> > > > >> On Tue, 12 Dec 2023 09:31:20 +0000,
> > > > >> Haibo Xu <haibo1.xu@intel.com> wrote:
> > > > >> > diff --git a/tools/testing/selftests/kvm/include/timer_test.h b/tools/testing/selftests/kvm/include/timer_test.h
> > > > >> > index 968257b893a7..b1d405e7157d 100644
> > > > >> > --- a/tools/testing/selftests/kvm/include/timer_test.h
> > > > >> > +++ b/tools/testing/selftests/kvm/include/timer_test.h
> > > > >> > @@ -22,6 +22,7 @@ struct test_args {
> > > > >> >       int nr_iter;
> > > > >> >       int timer_period_ms;
> > > > >> >       int migration_freq_ms;
> > > > >> > +     int timer_err_margin_us;
> > > > >>
> > > > >> ... except that you are storing it as a signed value. Some consistency
> > > > >> wouldn't hurt, really, and would avoid issues when passing large
> > > > >> values.
> > > > >>
> > > > >
> > > > > Yes, it's more proper to use an unsigned int for the non-negative error
> > > > > margin.
> > > > > Storing as signed here is just to keep the type consistent with that
> > > > > of timer_period_ms
> > > > > since there will be '+' operation in other places.
> > > > >
> > > > >         tools/testing/selftests/kvm/aarch64/arch_timer.c
> > > > >         /* Setup a timeout for the interrupt to arrive */
> > > > >          udelay(msecs_to_usecs(test_args.timer_period_ms) +
> > > > >              test_args.timer_err_margin_us);
> > > >
> > > > But that's exactly why using a signed quantity is wrong.
> > > > What does it mean to have a huge *negative* margin?
> > > >
> > >
> > > Hi Marc,
> > >
> > > I agree that negative values are meaningless for the margin.
> > > If I understand correctly, the negative margin should be filtered by
> > > assertion in atoi_non_negative().
> >
> > No. Please.
> >
> > atoi_non_negative() returns a uint32_t, which is what it should do.
> > The bug is squarely in the use of an 'int' to store such value, and it
> > is the *storage* that turns a positive value into a negative one.
> >
> 
> Thanks for the detailed info!
> 
> May I understand that your concern is mainly for a platform with
> 64bit int type, which may trigger the positive to negative convert?

No. It specifically applies to architectures with a 32bit int type,
which is... *EVERYTHING*. Here's a basic example:

<quote>
#include <stdio.h>

int main(int argc, char *argv[])
{
	int x = 1U << 31;

	printf("%d (%d)\n", x, sizeof(x));
	return 0;
}
</quote>

which returns "-2147483648 (4)" on any platform.

This really is basic C, and I am very worried that you don't see the
issue. I strongly suggest that you go and read about the C type system
before touching this code.

	M.
Andrew Jones Jan. 4, 2024, 11:09 a.m. UTC | #10
On Thu, Dec 21, 2023 at 10:58:40AM +0800, Haibo Xu wrote:
> On Wed, Dec 20, 2023 at 9:58 PM Marc Zyngier <maz@kernel.org> wrote:
> >
> > On Wed, 20 Dec 2023 13:51:24 +0000,
> > Haibo Xu <xiaobo55x@gmail.com> wrote:
> > >
> > > On Wed, Dec 20, 2023 at 5:00 PM Marc Zyngier <maz@kernel.org> wrote:
> > > >
> > > > On 2023-12-20 06:50, Haibo Xu wrote:
> > > > > On Wed, Dec 20, 2023 at 2:22 AM Marc Zyngier <maz@kernel.org> wrote:
> > > > >>
> > > > >> On Tue, 12 Dec 2023 09:31:20 +0000,
> > > > >> Haibo Xu <haibo1.xu@intel.com> wrote:
> > > > >> > diff --git a/tools/testing/selftests/kvm/include/timer_test.h b/tools/testing/selftests/kvm/include/timer_test.h
> > > > >> > index 968257b893a7..b1d405e7157d 100644
> > > > >> > --- a/tools/testing/selftests/kvm/include/timer_test.h
> > > > >> > +++ b/tools/testing/selftests/kvm/include/timer_test.h
> > > > >> > @@ -22,6 +22,7 @@ struct test_args {
> > > > >> >       int nr_iter;
> > > > >> >       int timer_period_ms;
> > > > >> >       int migration_freq_ms;
> > > > >> > +     int timer_err_margin_us;
> > > > >>
> > > > >> ... except that you are storing it as a signed value. Some consistency
> > > > >> wouldn't hurt, really, and would avoid issues when passing large
> > > > >> values.
> > > > >>
> > > > >
> > > > > Yes, it's more proper to use an unsigned int for the non-negative error
> > > > > margin.
> > > > > Storing as signed here is just to keep the type consistent with that
> > > > > of timer_period_ms
> > > > > since there will be '+' operation in other places.
> > > > >
> > > > >         tools/testing/selftests/kvm/aarch64/arch_timer.c
> > > > >         /* Setup a timeout for the interrupt to arrive */
> > > > >          udelay(msecs_to_usecs(test_args.timer_period_ms) +
> > > > >              test_args.timer_err_margin_us);
> > > >
> > > > But that's exactly why using a signed quantity is wrong.
> > > > What does it mean to have a huge *negative* margin?
> > > >
> > >
> > > Hi Marc,
> > >
> > > I agree that negative values are meaningless for the margin.
> > > If I understand correctly, the negative margin should be filtered by
> > > assertion in atoi_non_negative().
> >
> > No. Please.
> >
> > atoi_non_negative() returns a uint32_t, which is what it should do.
> > The bug is squarely in the use of an 'int' to store such value, and it
> > is the *storage* that turns a positive value into a negative one.
> >
> 
> Thanks for the detailed info!
> 
> May I understand that your concern is mainly for a platform with 64bit int type,
> which may trigger the positive to negative convert?
> 
> If so, I think we may need to do a clean up for the test code since
> several other
> places have the same issue.

Yes, I think we should do that cleanup. While there are probably several
offenders scattered throughout kvm selftests, we can keep the scope of
this series focused on arch_timer.c. Let's audit all uses of signed types
and convert them to unsigned as necessary with some separate patch(es)
before splitting the test, so both aarch64 and riscv get the cleanups.

Thanks,
drew
diff mbox series

Patch

diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c
index 4b421d421c3f..139eecbf77e7 100644
--- a/tools/testing/selftests/kvm/aarch64/arch_timer.c
+++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c
@@ -131,10 +131,14 @@  static void guest_run_stage(struct test_vcpu_shared_data *shared_data,
 
 		/* Setup a timeout for the interrupt to arrive */
 		udelay(msecs_to_usecs(test_args.timer_period_ms) +
-			TIMER_TEST_ERR_MARGIN_US);
+			test_args.timer_err_margin_us);
 
 		irq_iter = READ_ONCE(shared_data->nr_iter);
-		GUEST_ASSERT_EQ(config_iter + 1, irq_iter);
+		__GUEST_ASSERT(config_iter + 1 == irq_iter,
+			"config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n"
+			"  Guest timer interrupt was not trigged within the specified\n"
+			"  interval, try to increase the error margin by [-e] option.\n",
+			config_iter + 1, irq_iter);
 	}
 }
 
diff --git a/tools/testing/selftests/kvm/arch_timer.c b/tools/testing/selftests/kvm/arch_timer.c
index 60963fce16f2..5050022fd345 100644
--- a/tools/testing/selftests/kvm/arch_timer.c
+++ b/tools/testing/selftests/kvm/arch_timer.c
@@ -5,16 +5,17 @@ 
  * The guest's main thread configures the timer interrupt and waits
  * for it to fire, with a timeout equal to the timer period.
  * It asserts that the timeout doesn't exceed the timer period plus
- * an error margin of 100us.
+ * an user configurable error margin(default to 100us).
  *
  * On the other hand, upon receipt of an interrupt, the guest's interrupt
  * handler validates the interrupt by checking if the architectural state
  * is in compliance with the specifications.
  *
  * The test provides command-line options to configure the timer's
- * period (-p), number of vCPUs (-n), and iterations per stage (-i).
- * To stress-test the timer stack even more, an option to migrate the
- * vCPUs across pCPUs (-m), at a particular rate, is also provided.
+ * period (-p), number of vCPUs (-n), iterations per stage (-i), and timer
+ * interrupt arrival error margin (-e). To stress-test the timer stack even
+ * more, an option to migrate the vCPUs across pCPUs (-m), at a particular
+ * rate, is also provided.
  *
  * Copyright (c) 2021, Google LLC.
  */
@@ -34,6 +35,7 @@  struct test_args test_args = {
 	.nr_iter = NR_TEST_ITERS_DEF,
 	.timer_period_ms = TIMER_TEST_PERIOD_MS_DEF,
 	.migration_freq_ms = TIMER_TEST_MIGRATION_FREQ_MS,
+	.timer_err_margin_us = TIMER_TEST_ERR_MARGIN_US,
 	.reserved = 1,
 };
 
@@ -179,8 +181,9 @@  static void test_run(struct kvm_vm *vm)
 
 static void test_print_help(char *name)
 {
-	pr_info("Usage: %s [-h] [-n nr_vcpus] [-i iterations] [-p timer_period_ms]\n",
-		name);
+	pr_info("Usage: %s [-h] [-n nr_vcpus] [-i iterations] [-p timer_period_ms]\n"
+	        "\t\t    [-m migration_freq_ms] [-o counter_offset]\n"
+	        "\t\t    [-e timer_err_margin_us]\n", name);
 	pr_info("\t-n: Number of vCPUs to configure (default: %u; max: %u)\n",
 		NR_VCPUS_DEF, KVM_MAX_VCPUS);
 	pr_info("\t-i: Number of iterations per stage (default: %u)\n",
@@ -190,6 +193,8 @@  static void test_print_help(char *name)
 	pr_info("\t-m: Frequency (in ms) of vCPUs to migrate to different pCPU. 0 to turn off (default: %u)\n",
 		TIMER_TEST_MIGRATION_FREQ_MS);
 	pr_info("\t-o: Counter offset (in counter cycles, default: 0) [aarch64-only]\n");
+	pr_info("\t-e: Interrupt arrival error margin(in us) of the guest timer (default: %u)\n",
+		TIMER_TEST_ERR_MARGIN_US);
 	pr_info("\t-h: print this help screen\n");
 }
 
@@ -197,7 +202,7 @@  static bool parse_args(int argc, char *argv[])
 {
 	int opt;
 
-	while ((opt = getopt(argc, argv, "hn:i:p:m:o:")) != -1) {
+	while ((opt = getopt(argc, argv, "hn:i:p:m:o:e:")) != -1) {
 		switch (opt) {
 		case 'n':
 			test_args.nr_vcpus = atoi_positive("Number of vCPUs", optarg);
@@ -216,6 +221,9 @@  static bool parse_args(int argc, char *argv[])
 		case 'm':
 			test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg);
 			break;
+		case 'e':
+			test_args.timer_err_margin_us = atoi_non_negative("Error Margin", optarg);
+			break;
 		case 'o':
 			test_args.counter_offset = strtol(optarg, NULL, 0);
 			test_args.reserved = 0;
diff --git a/tools/testing/selftests/kvm/include/timer_test.h b/tools/testing/selftests/kvm/include/timer_test.h
index 968257b893a7..b1d405e7157d 100644
--- a/tools/testing/selftests/kvm/include/timer_test.h
+++ b/tools/testing/selftests/kvm/include/timer_test.h
@@ -22,6 +22,7 @@  struct test_args {
 	int nr_iter;
 	int timer_period_ms;
 	int migration_freq_ms;
+	int timer_err_margin_us;
 	/* Members of struct kvm_arm_counter_offset */
 	uint64_t counter_offset;
 	uint64_t reserved;
diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c
index 13bf184d1ff5..45a139dc7ce3 100644
--- a/tools/testing/selftests/kvm/riscv/arch_timer.c
+++ b/tools/testing/selftests/kvm/riscv/arch_timer.c
@@ -55,10 +55,14 @@  static void guest_run(struct test_vcpu_shared_data *shared_data)
 
 		/* Setup a timeout for the interrupt to arrive */
 		udelay(msecs_to_usecs(test_args.timer_period_ms) +
-			TIMER_TEST_ERR_MARGIN_US);
+			test_args.timer_err_margin_us);
 
 		irq_iter = READ_ONCE(shared_data->nr_iter);
-		GUEST_ASSERT_EQ(config_iter + 1, irq_iter);
+		__GUEST_ASSERT(config_iter + 1 == irq_iter,
+			"config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n"
+			"  Guest timer interrupt was not trigged within the specified\n"
+			"  interval, try to increase the error margin by [-e] option.\n",
+			config_iter + 1, irq_iter);
 	}
 }