diff mbox series

[8/8] KVM: selftests: x86: Add vmx_nested_tsc_scaling_test

Message ID 20210506103228.67864-9-ilstam@mailbox.org (mailing list archive)
State New, archived
Headers show
Series KVM: VMX: Implement nested TSC scaling | expand

Commit Message

ilstam@mailbox.org May 6, 2021, 10:32 a.m. UTC
From: Ilias Stamatis <ilstam@amazon.com>

Test that nested TSC scaling works as expected with both L1 and L2
scaled.

Signed-off-by: Ilias Stamatis <ilstam@amazon.com>
---
 tools/testing/selftests/kvm/.gitignore        |   1 +
 tools/testing/selftests/kvm/Makefile          |   1 +
 .../kvm/x86_64/vmx_nested_tsc_scaling_test.c  | 209 ++++++++++++++++++
 3 files changed, 211 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c

Comments

Maxim Levitsky May 10, 2021, 1:59 p.m. UTC | #1
On Thu, 2021-05-06 at 10:32 +0000, ilstam@mailbox.org wrote:
> From: Ilias Stamatis <ilstam@amazon.com>
> 
> Test that nested TSC scaling works as expected with both L1 and L2
> scaled.
> 
> Signed-off-by: Ilias Stamatis <ilstam@amazon.com>
> ---
>  tools/testing/selftests/kvm/.gitignore        |   1 +
>  tools/testing/selftests/kvm/Makefile          |   1 +
>  .../kvm/x86_64/vmx_nested_tsc_scaling_test.c  | 209 ++++++++++++++++++
>  3 files changed, 211 insertions(+)
>  create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
> 
> diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
> index bd83158e0e0b..cc02022f9951 100644
> --- a/tools/testing/selftests/kvm/.gitignore
> +++ b/tools/testing/selftests/kvm/.gitignore
> @@ -29,6 +29,7 @@
>  /x86_64/vmx_preemption_timer_test
>  /x86_64/vmx_set_nested_state_test
>  /x86_64/vmx_tsc_adjust_test
> +/x86_64/vmx_nested_tsc_scaling_test
>  /x86_64/xapic_ipi_test
>  /x86_64/xen_shinfo_test
>  /x86_64/xen_vmcall_test
> diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> index e439d027939d..1078240b1313 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -60,6 +60,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
>  TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
>  TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
>  TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
> +TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
>  TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test
>  TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
>  TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
> diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
> new file mode 100644
> index 000000000000..b05f5151ecbe
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
> @@ -0,0 +1,209 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * vmx_nested_tsc_scaling_test
> + *
> + * Copyright (C) 2021 Amazon.com, Inc. or its affiliates.
> + *
> + * This test case verifies that nested TSC scaling behaves as expected when
> + * both L1 and L2 are scaled using different ratios. For this test we scale
> + * L1 down and scale L2 up.
> + */
> +
> +
> +#include "kvm_util.h"
> +#include "vmx.h"
> +#include "kselftest.h"
> +
> +
> +#define VCPU_ID 0
> +
> +/* L1 is scaled down by this factor */
> +#define L1_SCALE_FACTOR 2ULL
> +/* L2 is scaled up (from L1's perspective) by this factor */
> +#define L2_SCALE_FACTOR 4ULL

For fun, I might have randomized these factors as well.

> +
> +#define TSC_OFFSET_L2 (1UL << 32)
> +#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48)
It would be fun to use a negative offset here (also randomally).

> +
> +#define L2_GUEST_STACK_SIZE 64
> +
> +enum { USLEEP, UCHECK_L1, UCHECK_L2 };
> +#define GUEST_SLEEP(sec)         ucall(UCALL_SYNC, 2, USLEEP, sec)
> +#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq)
> +
> +
> +/*
> + * This function checks whether the "actual" TSC frequency of a guest matches
> + * its expected frequency. In order to account for delays in taking the TSC
> + * measurements, a difference of 1% between the actual and the expected value
> + * is tolerated.
> + */
> +static void compare_tsc_freq(uint64_t actual, uint64_t expected)
> +{
> +	uint64_t tolerance, thresh_low, thresh_high;
> +
> +	tolerance = expected / 100;
> +	thresh_low = expected - tolerance;
> +	thresh_high = expected + tolerance;
> +
> +	TEST_ASSERT(thresh_low < actual,
> +		"TSC freq is expected to be between %"PRIu64" and %"PRIu64
> +		" but it actually is %"PRIu64,
> +		thresh_low, thresh_high, actual);
> +	TEST_ASSERT(thresh_high > actual,
> +		"TSC freq is expected to be between %"PRIu64" and %"PRIu64
> +		" but it actually is %"PRIu64,
> +		thresh_low, thresh_high, actual);
> +}
> +
> +static void check_tsc_freq(int level)
> +{
> +	uint64_t tsc_start, tsc_end, tsc_freq;
> +
> +	/*
> +	 * Reading the TSC twice with about a second's difference should give
> +	 * us an approximation of the TSC frequency from the guest's
> +	 * perspective. Now, this won't be completely accurate, but it should
> +	 * be good enough for the purposes of this test.
> +	 */

It would be nice to know if the host has stable TSC (you can obtain this via 
KVM_GET_CLOCK, the KVM_CLOCK_TSC_STABLE flag).

And if not stable skip the test, to avoid false positives.
(Yes I have a laptop I just bought that has an unstable TSC....)


> +	tsc_start = rdmsr(MSR_IA32_TSC);
> +	GUEST_SLEEP(1);
> +	tsc_end = rdmsr(MSR_IA32_TSC);
> +
> +	tsc_freq = tsc_end - tsc_start;
> +
> +	GUEST_CHECK(level, tsc_freq);
> +}
> +
> +static void l2_guest_code(void)
> +{
> +	check_tsc_freq(UCHECK_L2);
> +
> +	/* exit to L1 */
> +	__asm__ __volatile__("vmcall");
> +}
> +
> +static void l1_guest_code(struct vmx_pages *vmx_pages)
> +{
> +	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
> +	uint32_t control;
> +
> +	/* check that L1's frequency looks alright before launching L2 */
> +	check_tsc_freq(UCHECK_L1);
> +
> +	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
> +	GUEST_ASSERT(load_vmcs(vmx_pages));
> +
> +	/* prepare the VMCS for L2 execution */
> +	prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
> +
> +	/* enable TSC offsetting and TSC scaling for L2 */
> +	control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
> +	control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING;
> +	vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
> +
> +	control = vmreadz(SECONDARY_VM_EXEC_CONTROL);
> +	control |= SECONDARY_EXEC_TSC_SCALING;
> +	vmwrite(SECONDARY_VM_EXEC_CONTROL, control);
> +
> +	vmwrite(TSC_OFFSET, TSC_OFFSET_L2);
> +	vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2);
> +	vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32);
> +
> +	/* launch L2 */
> +	GUEST_ASSERT(!vmlaunch());
> +	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
> +
> +	/* check that L1's frequency still looks good */
> +	check_tsc_freq(UCHECK_L1);
> +
> +	GUEST_DONE();
> +}
> +
> +static void tsc_scaling_check_supported(void)
> +{
> +	if (!kvm_check_cap(KVM_CAP_TSC_CONTROL)) {
> +		print_skip("TSC scaling not supported by the HW");
> +		exit(KSFT_SKIP);
> +	}
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +	struct kvm_vm *vm;
> +	vm_vaddr_t vmx_pages_gva;
> +
> +	uint64_t tsc_start, tsc_end;
> +	uint64_t tsc_khz;
> +	uint64_t l0_tsc_freq = 0;
> +	uint64_t l1_tsc_freq = 0;
> +	uint64_t l2_tsc_freq = 0;
> +
> +	nested_vmx_check_supported();
> +	tsc_scaling_check_supported();
> +
> +	tsc_start = rdtsc();
> +	sleep(1);
> +	tsc_end = rdtsc();
> +
> +	l0_tsc_freq = tsc_end - tsc_start;
> +	printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq);
> +
> +	vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
> +	vcpu_alloc_vmx(vm, &vmx_pages_gva);
> +	vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
> +
> +	tsc_khz = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_TSC_KHZ, NULL);
> +	TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed");
> +
> +	/* scale down L1's TSC frequency */
> +	vcpu_ioctl(vm, VCPU_ID, KVM_SET_TSC_KHZ,
> +		  (void *) (tsc_khz / L1_SCALE_FACTOR));
> +
> +	for (;;) {
> +		volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
> +		struct ucall uc;
> +
> +		vcpu_run(vm, VCPU_ID);
> +		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
> +			    "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
> +			    run->exit_reason,
> +			    exit_reason_str(run->exit_reason));
> +
> +		switch (get_ucall(vm, VCPU_ID, &uc)) {
> +		case UCALL_ABORT:
> +			TEST_FAIL("%s", (const char *) uc.args[0]);
> +		case UCALL_SYNC:
> +			switch (uc.args[0]) {
> +			case USLEEP:
> +				sleep(uc.args[1]);
> +				break;
> +			case UCHECK_L1:
> +				l1_tsc_freq = uc.args[1];
> +				printf("L1's TSC frequency is around: %"PRIu64
> +				       "\n", l1_tsc_freq);
> +
> +				compare_tsc_freq(l1_tsc_freq,
> +						 l0_tsc_freq / L1_SCALE_FACTOR);
> +				break;
> +			case UCHECK_L2:
> +				l2_tsc_freq = uc.args[1];
> +				printf("L2's TSC frequency is around: %"PRIu64
> +				       "\n", l2_tsc_freq);
> +
> +				compare_tsc_freq(l2_tsc_freq,
> +						 l1_tsc_freq * L2_SCALE_FACTOR);
> +				break;
> +			}
> +			break;
> +		case UCALL_DONE:
> +			goto done;
> +		default:
> +			TEST_FAIL("Unknown ucall %lu", uc.cmd);
> +		}
> +	}
> +
> +done:
> +	kvm_vm_free(vm);
> +	return 0;
> +}
Overall looks OK to me.

I can't test it, since the most recent Intel laptop I have (i7-7600U)
still lacks TSC scaling (or did Intel cripple this feature on clients like what
they did with APICv ?)

Best regards,
	Maxim Levitsky
Stamatis, Ilias May 11, 2021, 11:16 a.m. UTC | #2
On Mon, 2021-05-10 at 16:59 +0300, Maxim Levitsky wrote:
> On Thu, 2021-05-06 at 10:32 +0000, ilstam@mailbox.org wrote:
> > From: Ilias Stamatis <ilstam@amazon.com>
> > 
> > Test that nested TSC scaling works as expected with both L1 and L2
> > scaled.
> > 
> > Signed-off-by: Ilias Stamatis <ilstam@amazon.com>
> > ---
> >  tools/testing/selftests/kvm/.gitignore        |   1 +
> >  tools/testing/selftests/kvm/Makefile          |   1 +
> >  .../kvm/x86_64/vmx_nested_tsc_scaling_test.c  | 209 ++++++++++++++++++
> >  3 files changed, 211 insertions(+)
> >  create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
> > 
> > diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
> > index bd83158e0e0b..cc02022f9951 100644
> > --- a/tools/testing/selftests/kvm/.gitignore
> > +++ b/tools/testing/selftests/kvm/.gitignore
> > @@ -29,6 +29,7 @@
> >  /x86_64/vmx_preemption_timer_test
> >  /x86_64/vmx_set_nested_state_test
> >  /x86_64/vmx_tsc_adjust_test
> > +/x86_64/vmx_nested_tsc_scaling_test
> >  /x86_64/xapic_ipi_test
> >  /x86_64/xen_shinfo_test
> >  /x86_64/xen_vmcall_test
> > diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> > index e439d027939d..1078240b1313 100644
> > --- a/tools/testing/selftests/kvm/Makefile
> > +++ b/tools/testing/selftests/kvm/Makefile
> > @@ -60,6 +60,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
> >  TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
> >  TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
> >  TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
> > +TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
> >  TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test
> >  TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
> >  TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
> > diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
> > new file mode 100644
> > index 000000000000..b05f5151ecbe
> > --- /dev/null
> > +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
> > @@ -0,0 +1,209 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * vmx_nested_tsc_scaling_test
> > + *
> > + * Copyright (C) 2021 Amazon.com, Inc. or its affiliates.
> > + *
> > + * This test case verifies that nested TSC scaling behaves as expected when
> > + * both L1 and L2 are scaled using different ratios. For this test we scale
> > + * L1 down and scale L2 up.
> > + */
> > +
> > +
> > +#include "kvm_util.h"
> > +#include "vmx.h"
> > +#include "kselftest.h"
> > +
> > +
> > +#define VCPU_ID 0
> > +
> > +/* L1 is scaled down by this factor */
> > +#define L1_SCALE_FACTOR 2ULL
> > +/* L2 is scaled up (from L1's perspective) by this factor */
> > +#define L2_SCALE_FACTOR 4ULL
> 
> For fun, I might have randomized these factors as well.

So L2_SCALE_FACTOR (or rather TSC_MULTIPLIER_L2 that depends on it) is
referenced from within l1_guest_code(). If we change this to a static variable
we won't be able to access it from there. How could this be done?

For the L1 factor it's easy as we only use it in main().

> 
> > +
> > +#define TSC_OFFSET_L2 (1UL << 32)
> > +#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48)
> 
> It would be fun to use a negative offset here (also randomally).

Do you mean a random offset that is always negative or a random offset that
sometimes is positive and sometimes is negative?

> 
> > +
> > +#define L2_GUEST_STACK_SIZE 64
> > +
> > +enum { USLEEP, UCHECK_L1, UCHECK_L2 };
> > +#define GUEST_SLEEP(sec)         ucall(UCALL_SYNC, 2, USLEEP, sec)
> > +#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq)
> > +
> > +
> > +/*
> > + * This function checks whether the "actual" TSC frequency of a guest matches
> > + * its expected frequency. In order to account for delays in taking the TSC
> > + * measurements, a difference of 1% between the actual and the expected value
> > + * is tolerated.
> > + */
> > +static void compare_tsc_freq(uint64_t actual, uint64_t expected)
> > +{
> > +     uint64_t tolerance, thresh_low, thresh_high;
> > +
> > +     tolerance = expected / 100;
> > +     thresh_low = expected - tolerance;
> > +     thresh_high = expected + tolerance;
> > +
> > +     TEST_ASSERT(thresh_low < actual,
> > +             "TSC freq is expected to be between %"PRIu64" and %"PRIu64
> > +             " but it actually is %"PRIu64,
> > +             thresh_low, thresh_high, actual);
> > +     TEST_ASSERT(thresh_high > actual,
> > +             "TSC freq is expected to be between %"PRIu64" and %"PRIu64
> > +             " but it actually is %"PRIu64,
> > +             thresh_low, thresh_high, actual);
> > +}
> > +
> > +static void check_tsc_freq(int level)
> > +{
> > +     uint64_t tsc_start, tsc_end, tsc_freq;
> > +
> > +     /*
> > +      * Reading the TSC twice with about a second's difference should give
> > +      * us an approximation of the TSC frequency from the guest's
> > +      * perspective. Now, this won't be completely accurate, but it should
> > +      * be good enough for the purposes of this test.
> > +      */
> 
> It would be nice to know if the host has stable TSC (you can obtain this via
> KVM_GET_CLOCK, the KVM_CLOCK_TSC_STABLE flag).
> 
> And if not stable skip the test, to avoid false positives.
> (Yes I have a laptop I just bought that has an unstable TSC....)
> 

Hmm, this is a vm ioctl but I noticed that one of its vcpus needs to have been
run at least once otherwise it won't return KVM_CLOCK_TSC_STABLE in the flags.

So...

> 
> > +     tsc_start = rdmsr(MSR_IA32_TSC);
> > +     GUEST_SLEEP(1);
> > +     tsc_end = rdmsr(MSR_IA32_TSC);
> > +
> > +     tsc_freq = tsc_end - tsc_start;
> > +
> > +     GUEST_CHECK(level, tsc_freq);
> > +}
> > +
> > +static void l2_guest_code(void)
> > +{
> > +     check_tsc_freq(UCHECK_L2);
> > +
> > +     /* exit to L1 */
> > +     __asm__ __volatile__("vmcall");
> > +}
> > +
> > +static void l1_guest_code(struct vmx_pages *vmx_pages)
> > +{
> > +     unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
> > +     uint32_t control;
> > +
> > +     /* check that L1's frequency looks alright before launching L2 */
> > +     check_tsc_freq(UCHECK_L1);
> > +
> > +     GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
> > +     GUEST_ASSERT(load_vmcs(vmx_pages));
> > +
> > +     /* prepare the VMCS for L2 execution */
> > +     prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
> > +
> > +     /* enable TSC offsetting and TSC scaling for L2 */
> > +     control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
> > +     control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING;
> > +     vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
> > +
> > +     control = vmreadz(SECONDARY_VM_EXEC_CONTROL);
> > +     control |= SECONDARY_EXEC_TSC_SCALING;
> > +     vmwrite(SECONDARY_VM_EXEC_CONTROL, control);
> > +
> > +     vmwrite(TSC_OFFSET, TSC_OFFSET_L2);
> > +     vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2);
> > +     vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32);
> > +
> > +     /* launch L2 */
> > +     GUEST_ASSERT(!vmlaunch());
> > +     GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
> > +
> > +     /* check that L1's frequency still looks good */
> > +     check_tsc_freq(UCHECK_L1);
> > +
> > +     GUEST_DONE();
> > +}
> > +
> > +static void tsc_scaling_check_supported(void)
> > +{
> > +     if (!kvm_check_cap(KVM_CAP_TSC_CONTROL)) {
> > +             print_skip("TSC scaling not supported by the HW");
> > +             exit(KSFT_SKIP);
> > +     }
> > +}
> > +
> > +int main(int argc, char *argv[])
> > +{
> > +     struct kvm_vm *vm;
> > +     vm_vaddr_t vmx_pages_gva;
> > +
> > +     uint64_t tsc_start, tsc_end;
> > +     uint64_t tsc_khz;
> > +     uint64_t l0_tsc_freq = 0;
> > +     uint64_t l1_tsc_freq = 0;
> > +     uint64_t l2_tsc_freq = 0;
> > +
> > +     nested_vmx_check_supported();
> > +     tsc_scaling_check_supported();

I can't add the check here

> > +
> > +     tsc_start = rdtsc();
> > +     sleep(1);
> > +     tsc_end = rdtsc();
> > +
> > +     l0_tsc_freq = tsc_end - tsc_start;
> > +     printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq);
> > +
> > +     vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
> > +     vcpu_alloc_vmx(vm, &vmx_pages_gva);
> > +     vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);

nor here

> > +
> > +     tsc_khz = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_TSC_KHZ, NULL);
> > +     TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed");
> > +
> > +     /* scale down L1's TSC frequency */
> > +     vcpu_ioctl(vm, VCPU_ID, KVM_SET_TSC_KHZ,
> > +               (void *) (tsc_khz / L1_SCALE_FACTOR));
> > +
> > +     for (;;) {
> > +             volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
> > +             struct ucall uc;
> > +
> > +             vcpu_run(vm, VCPU_ID);
> > +             TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
> > +                         "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
> > +                         run->exit_reason,
> > +                         exit_reason_str(run->exit_reason));

should I add it here?

> > +
> > +             switch (get_ucall(vm, VCPU_ID, &uc)) {
> > +             case UCALL_ABORT:
> > +                     TEST_FAIL("%s", (const char *) uc.args[0]);
> > +             case UCALL_SYNC:
> > +                     switch (uc.args[0]) {
> > +                     case USLEEP:
> > +                             sleep(uc.args[1]);
> > +                             break;
> > +                     case UCHECK_L1:
> > +                             l1_tsc_freq = uc.args[1];
> > +                             printf("L1's TSC frequency is around: %"PRIu64
> > +                                    "\n", l1_tsc_freq);
> > +
> > +                             compare_tsc_freq(l1_tsc_freq,
> > +                                              l0_tsc_freq / L1_SCALE_FACTOR);
> > +                             break;
> > +                     case UCHECK_L2:
> > +                             l2_tsc_freq = uc.args[1];
> > +                             printf("L2's TSC frequency is around: %"PRIu64
> > +                                    "\n", l2_tsc_freq);
> > +
> > +                             compare_tsc_freq(l2_tsc_freq,
> > +                                              l1_tsc_freq * L2_SCALE_FACTOR);
> > +                             break;
> > +                     }
> > +                     break;
> > +             case UCALL_DONE:
> > +                     goto done;
> > +             default:
> > +                     TEST_FAIL("Unknown ucall %lu", uc.cmd);
> > +             }
> > +     }
> > +
> > +done:
> > +     kvm_vm_free(vm);
> > +     return 0;
> > +}
> 
> Overall looks OK to me.
> 
> I can't test it, since the most recent Intel laptop I have (i7-7600U)
> still lacks TSC scaling (or did Intel cripple this feature on clients like what
> they did with APICv ?)
> 
> Best regards,
>         Maxim Levitsky
> 
> 
>
Maxim Levitsky May 11, 2021, 12:47 p.m. UTC | #3
On Tue, 2021-05-11 at 11:16 +0000, Stamatis, Ilias wrote:
> On Mon, 2021-05-10 at 16:59 +0300, Maxim Levitsky wrote:
> > On Thu, 2021-05-06 at 10:32 +0000, ilstam@mailbox.org wrote:
> > > From: Ilias Stamatis <ilstam@amazon.com>
> > > 
> > > Test that nested TSC scaling works as expected with both L1 and L2
> > > scaled.
> > > 
> > > Signed-off-by: Ilias Stamatis <ilstam@amazon.com>
> > > ---
> > >  tools/testing/selftests/kvm/.gitignore        |   1 +
> > >  tools/testing/selftests/kvm/Makefile          |   1 +
> > >  .../kvm/x86_64/vmx_nested_tsc_scaling_test.c  | 209 ++++++++++++++++++
> > >  3 files changed, 211 insertions(+)
> > >  create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
> > > 
> > > diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
> > > index bd83158e0e0b..cc02022f9951 100644
> > > --- a/tools/testing/selftests/kvm/.gitignore
> > > +++ b/tools/testing/selftests/kvm/.gitignore
> > > @@ -29,6 +29,7 @@
> > >  /x86_64/vmx_preemption_timer_test
> > >  /x86_64/vmx_set_nested_state_test
> > >  /x86_64/vmx_tsc_adjust_test
> > > +/x86_64/vmx_nested_tsc_scaling_test
> > >  /x86_64/xapic_ipi_test
> > >  /x86_64/xen_shinfo_test
> > >  /x86_64/xen_vmcall_test
> > > diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> > > index e439d027939d..1078240b1313 100644
> > > --- a/tools/testing/selftests/kvm/Makefile
> > > +++ b/tools/testing/selftests/kvm/Makefile
> > > @@ -60,6 +60,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
> > >  TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
> > >  TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
> > >  TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
> > > +TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
> > >  TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test
> > >  TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
> > >  TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
> > > diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
> > > new file mode 100644
> > > index 000000000000..b05f5151ecbe
> > > --- /dev/null
> > > +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
> > > @@ -0,0 +1,209 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/*
> > > + * vmx_nested_tsc_scaling_test
> > > + *
> > > + * Copyright (C) 2021 Amazon.com, Inc. or its affiliates.
> > > + *
> > > + * This test case verifies that nested TSC scaling behaves as expected when
> > > + * both L1 and L2 are scaled using different ratios. For this test we scale
> > > + * L1 down and scale L2 up.
> > > + */
> > > +
> > > +
> > > +#include "kvm_util.h"
> > > +#include "vmx.h"
> > > +#include "kselftest.h"
> > > +
> > > +
> > > +#define VCPU_ID 0
> > > +
> > > +/* L1 is scaled down by this factor */
> > > +#define L1_SCALE_FACTOR 2ULL
> > > +/* L2 is scaled up (from L1's perspective) by this factor */
> > > +#define L2_SCALE_FACTOR 4ULL
> > 
> > For fun, I might have randomized these factors as well.
> 
> So L2_SCALE_FACTOR (or rather TSC_MULTIPLIER_L2 that depends on it) is
> referenced from within l1_guest_code(). If we change this to a static variable
> we won't be able to access it from there. How could this be done?
I also had this thought after I wrote the reply. I don't have much experience
yet with KVM selftests so this might indeed be not possible.

> 
> For the L1 factor it's easy as we only use it in main().
> 
> > > +
> > > +#define TSC_OFFSET_L2 (1UL << 32)
> > > +#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48)
> > 
> > It would be fun to use a negative offset here (also randomally).
> 
> Do you mean a random offset that is always negative or a random offset that
> sometimes is positive and sometimes is negative?
Yep, to test the special case for negative numbers.
> 
> > > +
> > > +#define L2_GUEST_STACK_SIZE 64
> > > +
> > > +enum { USLEEP, UCHECK_L1, UCHECK_L2 };
> > > +#define GUEST_SLEEP(sec)         ucall(UCALL_SYNC, 2, USLEEP, sec)
> > > +#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq)
> > > +
> > > +
> > > +/*
> > > + * This function checks whether the "actual" TSC frequency of a guest matches
> > > + * its expected frequency. In order to account for delays in taking the TSC
> > > + * measurements, a difference of 1% between the actual and the expected value
> > > + * is tolerated.
> > > + */
> > > +static void compare_tsc_freq(uint64_t actual, uint64_t expected)
> > > +{
> > > +     uint64_t tolerance, thresh_low, thresh_high;
> > > +
> > > +     tolerance = expected / 100;
> > > +     thresh_low = expected - tolerance;
> > > +     thresh_high = expected + tolerance;
> > > +
> > > +     TEST_ASSERT(thresh_low < actual,
> > > +             "TSC freq is expected to be between %"PRIu64" and %"PRIu64
> > > +             " but it actually is %"PRIu64,
> > > +             thresh_low, thresh_high, actual);
> > > +     TEST_ASSERT(thresh_high > actual,
> > > +             "TSC freq is expected to be between %"PRIu64" and %"PRIu64
> > > +             " but it actually is %"PRIu64,
> > > +             thresh_low, thresh_high, actual);
> > > +}
> > > +
> > > +static void check_tsc_freq(int level)
> > > +{
> > > +     uint64_t tsc_start, tsc_end, tsc_freq;
> > > +
> > > +     /*
> > > +      * Reading the TSC twice with about a second's difference should give
> > > +      * us an approximation of the TSC frequency from the guest's
> > > +      * perspective. Now, this won't be completely accurate, but it should
> > > +      * be good enough for the purposes of this test.
> > > +      */
> > 
> > It would be nice to know if the host has stable TSC (you can obtain this via
> > KVM_GET_CLOCK, the KVM_CLOCK_TSC_STABLE flag).
> > 
> > And if not stable skip the test, to avoid false positives.
> > (Yes I have a laptop I just bought that has an unstable TSC....)
> > 
> 
> Hmm, this is a vm ioctl but I noticed that one of its vcpus needs to have been
> run at least once otherwise it won't return KVM_CLOCK_TSC_STABLE in the flags.
> 
> So...

Yes, now I remember that this thing relies on the TSC sync logic,
master clock thing, etc... Oh well...

To be honest we really need the kernel to export the information
it knows about the TSC because it is useful to many users and
not limited to virtualization.

Currently other than KVM's KVM_GET_TSC_KHZ there is no clean way
to know even the TSC frequency, let alone if kernel considers 
the TSC to be stable AFAIK.

Other more or less reliable (but hacky) way to know if TSC is stable is to see
if the kernel is using tsc via
(/sys/devices/system/clocksource/clocksource0/current_clocksource = tsc)

Oh well...

Best regards,
	Maxim Levitsky

> 
> > > +     tsc_start = rdmsr(MSR_IA32_TSC);
> > > +     GUEST_SLEEP(1);
> > > +     tsc_end = rdmsr(MSR_IA32_TSC);
> > > +
> > > +     tsc_freq = tsc_end - tsc_start;
> > > +
> > > +     GUEST_CHECK(level, tsc_freq);
> > > +}
> > > +
> > > +static void l2_guest_code(void)
> > > +{
> > > +     check_tsc_freq(UCHECK_L2);
> > > +
> > > +     /* exit to L1 */
> > > +     __asm__ __volatile__("vmcall");
> > > +}
> > > +
> > > +static void l1_guest_code(struct vmx_pages *vmx_pages)
> > > +{
> > > +     unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
> > > +     uint32_t control;
> > > +
> > > +     /* check that L1's frequency looks alright before launching L2 */
> > > +     check_tsc_freq(UCHECK_L1);
> > > +
> > > +     GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
> > > +     GUEST_ASSERT(load_vmcs(vmx_pages));
> > > +
> > > +     /* prepare the VMCS for L2 execution */
> > > +     prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
> > > +
> > > +     /* enable TSC offsetting and TSC scaling for L2 */
> > > +     control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
> > > +     control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING;
> > > +     vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
> > > +
> > > +     control = vmreadz(SECONDARY_VM_EXEC_CONTROL);
> > > +     control |= SECONDARY_EXEC_TSC_SCALING;
> > > +     vmwrite(SECONDARY_VM_EXEC_CONTROL, control);
> > > +
> > > +     vmwrite(TSC_OFFSET, TSC_OFFSET_L2);
> > > +     vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2);
> > > +     vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32);
> > > +
> > > +     /* launch L2 */
> > > +     GUEST_ASSERT(!vmlaunch());
> > > +     GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
> > > +
> > > +     /* check that L1's frequency still looks good */
> > > +     check_tsc_freq(UCHECK_L1);
> > > +
> > > +     GUEST_DONE();
> > > +}
> > > +
> > > +static void tsc_scaling_check_supported(void)
> > > +{
> > > +     if (!kvm_check_cap(KVM_CAP_TSC_CONTROL)) {
> > > +             print_skip("TSC scaling not supported by the HW");
> > > +             exit(KSFT_SKIP);
> > > +     }
> > > +}
> > > +
> > > +int main(int argc, char *argv[])
> > > +{
> > > +     struct kvm_vm *vm;
> > > +     vm_vaddr_t vmx_pages_gva;
> > > +
> > > +     uint64_t tsc_start, tsc_end;
> > > +     uint64_t tsc_khz;
> > > +     uint64_t l0_tsc_freq = 0;
> > > +     uint64_t l1_tsc_freq = 0;
> > > +     uint64_t l2_tsc_freq = 0;
> > > +
> > > +     nested_vmx_check_supported();
> > > +     tsc_scaling_check_supported();
> 
> I can't add the check here
> 
> > > +
> > > +     tsc_start = rdtsc();
> > > +     sleep(1);
> > > +     tsc_end = rdtsc();
> > > +
> > > +     l0_tsc_freq = tsc_end - tsc_start;
> > > +     printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq);
> > > +
> > > +     vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
> > > +     vcpu_alloc_vmx(vm, &vmx_pages_gva);
> > > +     vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
> 
> nor here
> 
> > > +
> > > +     tsc_khz = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_TSC_KHZ, NULL);
> > > +     TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed");
> > > +
> > > +     /* scale down L1's TSC frequency */
> > > +     vcpu_ioctl(vm, VCPU_ID, KVM_SET_TSC_KHZ,
> > > +               (void *) (tsc_khz / L1_SCALE_FACTOR));
> > > +
> > > +     for (;;) {
> > > +             volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
> > > +             struct ucall uc;
> > > +
> > > +             vcpu_run(vm, VCPU_ID);
> > > +             TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
> > > +                         "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
> > > +                         run->exit_reason,
> > > +                         exit_reason_str(run->exit_reason));
> 
> should I add it here?
> 
> > > +
> > > +             switch (get_ucall(vm, VCPU_ID, &uc)) {
> > > +             case UCALL_ABORT:
> > > +                     TEST_FAIL("%s", (const char *) uc.args[0]);
> > > +             case UCALL_SYNC:
> > > +                     switch (uc.args[0]) {
> > > +                     case USLEEP:
> > > +                             sleep(uc.args[1]);
> > > +                             break;
> > > +                     case UCHECK_L1:
> > > +                             l1_tsc_freq = uc.args[1];
> > > +                             printf("L1's TSC frequency is around: %"PRIu64
> > > +                                    "\n", l1_tsc_freq);
> > > +
> > > +                             compare_tsc_freq(l1_tsc_freq,
> > > +                                              l0_tsc_freq / L1_SCALE_FACTOR);
> > > +                             break;
> > > +                     case UCHECK_L2:
> > > +                             l2_tsc_freq = uc.args[1];
> > > +                             printf("L2's TSC frequency is around: %"PRIu64
> > > +                                    "\n", l2_tsc_freq);
> > > +
> > > +                             compare_tsc_freq(l2_tsc_freq,
> > > +                                              l1_tsc_freq * L2_SCALE_FACTOR);
> > > +                             break;
> > > +                     }
> > > +                     break;
> > > +             case UCALL_DONE:
> > > +                     goto done;
> > > +             default:
> > > +                     TEST_FAIL("Unknown ucall %lu", uc.cmd);
> > > +             }
> > > +     }
> > > +
> > > +done:
> > > +     kvm_vm_free(vm);
> > > +     return 0;
> > > +}
> > 
> > Overall looks OK to me.
> > 
> > I can't test it, since the most recent Intel laptop I have (i7-7600U)
> > still lacks TSC scaling (or did Intel cripple this feature on clients like what
> > they did with APICv ?)
> > 
> > Best regards,
> >         Maxim Levitsky
> > 
> > 
> >
Stamatis, Ilias May 11, 2021, 2:02 p.m. UTC | #4
On Tue, 2021-05-11 at 15:47 +0300, Maxim Levitsky wrote:
> On Tue, 2021-05-11 at 11:16 +0000, Stamatis, Ilias wrote:
> > On Mon, 2021-05-10 at 16:59 +0300, Maxim Levitsky wrote:
> > > On Thu, 2021-05-06 at 10:32 +0000, ilstam@mailbox.org wrote:
> > > > From: Ilias Stamatis <ilstam@amazon.com>
> > > > 
> > > > Test that nested TSC scaling works as expected with both L1 and L2
> > > > scaled.
> > > > 
> > > > Signed-off-by: Ilias Stamatis <ilstam@amazon.com>
> > > > ---
> > > >  tools/testing/selftests/kvm/.gitignore        |   1 +
> > > >  tools/testing/selftests/kvm/Makefile          |   1 +
> > > >  .../kvm/x86_64/vmx_nested_tsc_scaling_test.c  | 209 ++++++++++++++++++
> > > >  3 files changed, 211 insertions(+)
> > > >  create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
> > > > 
> > > > diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
> > > > index bd83158e0e0b..cc02022f9951 100644
> > > > --- a/tools/testing/selftests/kvm/.gitignore
> > > > +++ b/tools/testing/selftests/kvm/.gitignore
> > > > @@ -29,6 +29,7 @@
> > > >  /x86_64/vmx_preemption_timer_test
> > > >  /x86_64/vmx_set_nested_state_test
> > > >  /x86_64/vmx_tsc_adjust_test
> > > > +/x86_64/vmx_nested_tsc_scaling_test
> > > >  /x86_64/xapic_ipi_test
> > > >  /x86_64/xen_shinfo_test
> > > >  /x86_64/xen_vmcall_test
> > > > diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> > > > index e439d027939d..1078240b1313 100644
> > > > --- a/tools/testing/selftests/kvm/Makefile
> > > > +++ b/tools/testing/selftests/kvm/Makefile
> > > > @@ -60,6 +60,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
> > > >  TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
> > > >  TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
> > > >  TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
> > > > +TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
> > > >  TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test
> > > >  TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
> > > >  TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
> > > > diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
> > > > new file mode 100644
> > > > index 000000000000..b05f5151ecbe
> > > > --- /dev/null
> > > > +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
> > > > @@ -0,0 +1,209 @@
> > > > +// SPDX-License-Identifier: GPL-2.0-only
> > > > +/*
> > > > + * vmx_nested_tsc_scaling_test
> > > > + *
> > > > + * Copyright (C) 2021 Amazon.com, Inc. or its affiliates.
> > > > + *
> > > > + * This test case verifies that nested TSC scaling behaves as expected when
> > > > + * both L1 and L2 are scaled using different ratios. For this test we scale
> > > > + * L1 down and scale L2 up.
> > > > + */
> > > > +
> > > > +
> > > > +#include "kvm_util.h"
> > > > +#include "vmx.h"
> > > > +#include "kselftest.h"
> > > > +
> > > > +
> > > > +#define VCPU_ID 0
> > > > +
> > > > +/* L1 is scaled down by this factor */
> > > > +#define L1_SCALE_FACTOR 2ULL
> > > > +/* L2 is scaled up (from L1's perspective) by this factor */
> > > > +#define L2_SCALE_FACTOR 4ULL
> > > 
> > > For fun, I might have randomized these factors as well.
> > 
> > So L2_SCALE_FACTOR (or rather TSC_MULTIPLIER_L2 that depends on it) is
> > referenced from within l1_guest_code(). If we change this to a static variable
> > we won't be able to access it from there. How could this be done?
> 
> I also had this thought after I wrote the reply. I don't have much experience
> yet with KVM selftests so this might indeed be not possible.
> 

OK, I can make the L1 scale factor random, but the L2 fixed for now.

Does a range of 2 to 10 sound reasonable for L1?

> > 
> > For the L1 factor it's easy as we only use it in main().
> > 
> > > > +
> > > > +#define TSC_OFFSET_L2 (1UL << 32)
> > > > +#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48)
> > > 
> > > It would be fun to use a negative offset here (also randomally).
> > 
> > Do you mean a random offset that is always negative or a random offset that
> > sometimes is positive and sometimes is negative?
> 
> Yep, to test the special case for negative numbers.

OK, I will use a negative offset then but it won't be random for the same
reason as above.

> > 
> > > > +
> > > > +#define L2_GUEST_STACK_SIZE 64
> > > > +
> > > > +enum { USLEEP, UCHECK_L1, UCHECK_L2 };
> > > > +#define GUEST_SLEEP(sec)         ucall(UCALL_SYNC, 2, USLEEP, sec)
> > > > +#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq)
> > > > +
> > > > +
> > > > +/*
> > > > + * This function checks whether the "actual" TSC frequency of a guest matches
> > > > + * its expected frequency. In order to account for delays in taking the TSC
> > > > + * measurements, a difference of 1% between the actual and the expected value
> > > > + * is tolerated.
> > > > + */
> > > > +static void compare_tsc_freq(uint64_t actual, uint64_t expected)
> > > > +{
> > > > +     uint64_t tolerance, thresh_low, thresh_high;
> > > > +
> > > > +     tolerance = expected / 100;
> > > > +     thresh_low = expected - tolerance;
> > > > +     thresh_high = expected + tolerance;
> > > > +
> > > > +     TEST_ASSERT(thresh_low < actual,
> > > > +             "TSC freq is expected to be between %"PRIu64" and %"PRIu64
> > > > +             " but it actually is %"PRIu64,
> > > > +             thresh_low, thresh_high, actual);
> > > > +     TEST_ASSERT(thresh_high > actual,
> > > > +             "TSC freq is expected to be between %"PRIu64" and %"PRIu64
> > > > +             " but it actually is %"PRIu64,
> > > > +             thresh_low, thresh_high, actual);
> > > > +}
> > > > +
> > > > +static void check_tsc_freq(int level)
> > > > +{
> > > > +     uint64_t tsc_start, tsc_end, tsc_freq;
> > > > +
> > > > +     /*
> > > > +      * Reading the TSC twice with about a second's difference should give
> > > > +      * us an approximation of the TSC frequency from the guest's
> > > > +      * perspective. Now, this won't be completely accurate, but it should
> > > > +      * be good enough for the purposes of this test.
> > > > +      */
> > > 
> > > It would be nice to know if the host has stable TSC (you can obtain this via
> > > KVM_GET_CLOCK, the KVM_CLOCK_TSC_STABLE flag).
> > > 
> > > And if not stable skip the test, to avoid false positives.
> > > (Yes I have a laptop I just bought that has an unstable TSC....)
> > > 
> > 
> > Hmm, this is a vm ioctl but I noticed that one of its vcpus needs to have been
> > run at least once otherwise it won't return KVM_CLOCK_TSC_STABLE in the flags.
> > 
> > So...
> 
> Yes, now I remember that this thing relies on the TSC sync logic,
> master clock thing, etc... Oh well...
> 
> To be honest we really need the kernel to export the information
> it knows about the TSC because it is useful to many users and
> not limited to virtualization.
> 
> Currently other than KVM's KVM_GET_TSC_KHZ there is no clean way
> to know even the TSC frequency, let alone if kernel considers
> the TSC to be stable AFAIK.
> 
> Other more or less reliable (but hacky) way to know if TSC is stable is to see
> if the kernel is using tsc via
> (/sys/devices/system/clocksource/clocksource0/current_clocksource = tsc)
> 
> Oh well...

So do you suggest checking the content of this file over doing a KVM_GET_CLOCK
ioctl after the vcpu has run once?

> 
> Best regards,
>         Maxim Levitsky
> 
> > 
> > > > +     tsc_start = rdmsr(MSR_IA32_TSC);
> > > > +     GUEST_SLEEP(1);
> > > > +     tsc_end = rdmsr(MSR_IA32_TSC);
> > > > +
> > > > +     tsc_freq = tsc_end - tsc_start;
> > > > +
> > > > +     GUEST_CHECK(level, tsc_freq);
> > > > +}
> > > > +
> > > > +static void l2_guest_code(void)
> > > > +{
> > > > +     check_tsc_freq(UCHECK_L2);
> > > > +
> > > > +     /* exit to L1 */
> > > > +     __asm__ __volatile__("vmcall");
> > > > +}
> > > > +
> > > > +static void l1_guest_code(struct vmx_pages *vmx_pages)
> > > > +{
> > > > +     unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
> > > > +     uint32_t control;
> > > > +
> > > > +     /* check that L1's frequency looks alright before launching L2 */
> > > > +     check_tsc_freq(UCHECK_L1);
> > > > +
> > > > +     GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
> > > > +     GUEST_ASSERT(load_vmcs(vmx_pages));
> > > > +
> > > > +     /* prepare the VMCS for L2 execution */
> > > > +     prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
> > > > +
> > > > +     /* enable TSC offsetting and TSC scaling for L2 */
> > > > +     control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
> > > > +     control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING;
> > > > +     vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
> > > > +
> > > > +     control = vmreadz(SECONDARY_VM_EXEC_CONTROL);
> > > > +     control |= SECONDARY_EXEC_TSC_SCALING;
> > > > +     vmwrite(SECONDARY_VM_EXEC_CONTROL, control);
> > > > +
> > > > +     vmwrite(TSC_OFFSET, TSC_OFFSET_L2);
> > > > +     vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2);
> > > > +     vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32);
> > > > +
> > > > +     /* launch L2 */
> > > > +     GUEST_ASSERT(!vmlaunch());
> > > > +     GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
> > > > +
> > > > +     /* check that L1's frequency still looks good */
> > > > +     check_tsc_freq(UCHECK_L1);
> > > > +
> > > > +     GUEST_DONE();
> > > > +}
> > > > +
> > > > +static void tsc_scaling_check_supported(void)
> > > > +{
> > > > +     if (!kvm_check_cap(KVM_CAP_TSC_CONTROL)) {
> > > > +             print_skip("TSC scaling not supported by the HW");
> > > > +             exit(KSFT_SKIP);
> > > > +     }
> > > > +}
> > > > +
> > > > +int main(int argc, char *argv[])
> > > > +{
> > > > +     struct kvm_vm *vm;
> > > > +     vm_vaddr_t vmx_pages_gva;
> > > > +
> > > > +     uint64_t tsc_start, tsc_end;
> > > > +     uint64_t tsc_khz;
> > > > +     uint64_t l0_tsc_freq = 0;
> > > > +     uint64_t l1_tsc_freq = 0;
> > > > +     uint64_t l2_tsc_freq = 0;
> > > > +
> > > > +     nested_vmx_check_supported();
> > > > +     tsc_scaling_check_supported();
> > 
> > I can't add the check here
> > 
> > > > +
> > > > +     tsc_start = rdtsc();
> > > > +     sleep(1);
> > > > +     tsc_end = rdtsc();
> > > > +
> > > > +     l0_tsc_freq = tsc_end - tsc_start;
> > > > +     printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq);
> > > > +
> > > > +     vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
> > > > +     vcpu_alloc_vmx(vm, &vmx_pages_gva);
> > > > +     vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
> > 
> > nor here
> > 
> > > > +
> > > > +     tsc_khz = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_TSC_KHZ, NULL);
> > > > +     TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed");
> > > > +
> > > > +     /* scale down L1's TSC frequency */
> > > > +     vcpu_ioctl(vm, VCPU_ID, KVM_SET_TSC_KHZ,
> > > > +               (void *) (tsc_khz / L1_SCALE_FACTOR));
> > > > +
> > > > +     for (;;) {
> > > > +             volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
> > > > +             struct ucall uc;
> > > > +
> > > > +             vcpu_run(vm, VCPU_ID);
> > > > +             TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
> > > > +                         "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
> > > > +                         run->exit_reason,
> > > > +                         exit_reason_str(run->exit_reason));
> > 
> > should I add it here?
> > 
> > > > +
> > > > +             switch (get_ucall(vm, VCPU_ID, &uc)) {
> > > > +             case UCALL_ABORT:
> > > > +                     TEST_FAIL("%s", (const char *) uc.args[0]);
> > > > +             case UCALL_SYNC:
> > > > +                     switch (uc.args[0]) {
> > > > +                     case USLEEP:
> > > > +                             sleep(uc.args[1]);
> > > > +                             break;
> > > > +                     case UCHECK_L1:
> > > > +                             l1_tsc_freq = uc.args[1];
> > > > +                             printf("L1's TSC frequency is around: %"PRIu64
> > > > +                                    "\n", l1_tsc_freq);
> > > > +
> > > > +                             compare_tsc_freq(l1_tsc_freq,
> > > > +                                              l0_tsc_freq / L1_SCALE_FACTOR);
> > > > +                             break;
> > > > +                     case UCHECK_L2:
> > > > +                             l2_tsc_freq = uc.args[1];
> > > > +                             printf("L2's TSC frequency is around: %"PRIu64
> > > > +                                    "\n", l2_tsc_freq);
> > > > +
> > > > +                             compare_tsc_freq(l2_tsc_freq,
> > > > +                                              l1_tsc_freq * L2_SCALE_FACTOR);
> > > > +                             break;
> > > > +                     }
> > > > +                     break;
> > > > +             case UCALL_DONE:
> > > > +                     goto done;
> > > > +             default:
> > > > +                     TEST_FAIL("Unknown ucall %lu", uc.cmd);
> > > > +             }
> > > > +     }
> > > > +
> > > > +done:
> > > > +     kvm_vm_free(vm);
> > > > +     return 0;
> > > > +}
> > > 
> > > Overall looks OK to me.
> > > 
> > > I can't test it, since the most recent Intel laptop I have (i7-7600U)
> > > still lacks TSC scaling (or did Intel cripple this feature on clients like what
> > > they did with APICv ?)
> > > 
> > > Best regards,
> > >         Maxim Levitsky
> > > 
> > > 
> > > 
> 
>
diff mbox series

Patch

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index bd83158e0e0b..cc02022f9951 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -29,6 +29,7 @@ 
 /x86_64/vmx_preemption_timer_test
 /x86_64/vmx_set_nested_state_test
 /x86_64/vmx_tsc_adjust_test
+/x86_64/vmx_nested_tsc_scaling_test
 /x86_64/xapic_ipi_test
 /x86_64/xen_shinfo_test
 /x86_64/xen_vmcall_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index e439d027939d..1078240b1313 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -60,6 +60,7 @@  TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
 TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test
 TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
 TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
new file mode 100644
index 000000000000..b05f5151ecbe
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
@@ -0,0 +1,209 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmx_nested_tsc_scaling_test
+ *
+ * Copyright (C) 2021 Amazon.com, Inc. or its affiliates.
+ *
+ * This test case verifies that nested TSC scaling behaves as expected when
+ * both L1 and L2 are scaled using different ratios. For this test we scale
+ * L1 down and scale L2 up.
+ */
+
+
+#include "kvm_util.h"
+#include "vmx.h"
+#include "kselftest.h"
+
+
+#define VCPU_ID 0
+
+/* L1 is scaled down by this factor */
+#define L1_SCALE_FACTOR 2ULL
+/* L2 is scaled up (from L1's perspective) by this factor */
+#define L2_SCALE_FACTOR 4ULL
+
+#define TSC_OFFSET_L2 (1UL << 32)
+#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48)
+
+#define L2_GUEST_STACK_SIZE 64
+
+enum { USLEEP, UCHECK_L1, UCHECK_L2 };
+#define GUEST_SLEEP(sec)         ucall(UCALL_SYNC, 2, USLEEP, sec)
+#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq)
+
+
+/*
+ * This function checks whether the "actual" TSC frequency of a guest matches
+ * its expected frequency. In order to account for delays in taking the TSC
+ * measurements, a difference of 1% between the actual and the expected value
+ * is tolerated.
+ */
+static void compare_tsc_freq(uint64_t actual, uint64_t expected)
+{
+	uint64_t tolerance, thresh_low, thresh_high;
+
+	tolerance = expected / 100;
+	thresh_low = expected - tolerance;
+	thresh_high = expected + tolerance;
+
+	TEST_ASSERT(thresh_low < actual,
+		"TSC freq is expected to be between %"PRIu64" and %"PRIu64
+		" but it actually is %"PRIu64,
+		thresh_low, thresh_high, actual);
+	TEST_ASSERT(thresh_high > actual,
+		"TSC freq is expected to be between %"PRIu64" and %"PRIu64
+		" but it actually is %"PRIu64,
+		thresh_low, thresh_high, actual);
+}
+
+static void check_tsc_freq(int level)
+{
+	uint64_t tsc_start, tsc_end, tsc_freq;
+
+	/*
+	 * Reading the TSC twice with about a second's difference should give
+	 * us an approximation of the TSC frequency from the guest's
+	 * perspective. Now, this won't be completely accurate, but it should
+	 * be good enough for the purposes of this test.
+	 */
+	tsc_start = rdmsr(MSR_IA32_TSC);
+	GUEST_SLEEP(1);
+	tsc_end = rdmsr(MSR_IA32_TSC);
+
+	tsc_freq = tsc_end - tsc_start;
+
+	GUEST_CHECK(level, tsc_freq);
+}
+
+static void l2_guest_code(void)
+{
+	check_tsc_freq(UCHECK_L2);
+
+	/* exit to L1 */
+	__asm__ __volatile__("vmcall");
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+	uint32_t control;
+
+	/* check that L1's frequency looks alright before launching L2 */
+	check_tsc_freq(UCHECK_L1);
+
+	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+	GUEST_ASSERT(load_vmcs(vmx_pages));
+
+	/* prepare the VMCS for L2 execution */
+	prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+	/* enable TSC offsetting and TSC scaling for L2 */
+	control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+	control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING;
+	vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+
+	control = vmreadz(SECONDARY_VM_EXEC_CONTROL);
+	control |= SECONDARY_EXEC_TSC_SCALING;
+	vmwrite(SECONDARY_VM_EXEC_CONTROL, control);
+
+	vmwrite(TSC_OFFSET, TSC_OFFSET_L2);
+	vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2);
+	vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32);
+
+	/* launch L2 */
+	GUEST_ASSERT(!vmlaunch());
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+	/* check that L1's frequency still looks good */
+	check_tsc_freq(UCHECK_L1);
+
+	GUEST_DONE();
+}
+
+static void tsc_scaling_check_supported(void)
+{
+	if (!kvm_check_cap(KVM_CAP_TSC_CONTROL)) {
+		print_skip("TSC scaling not supported by the HW");
+		exit(KSFT_SKIP);
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	struct kvm_vm *vm;
+	vm_vaddr_t vmx_pages_gva;
+
+	uint64_t tsc_start, tsc_end;
+	uint64_t tsc_khz;
+	uint64_t l0_tsc_freq = 0;
+	uint64_t l1_tsc_freq = 0;
+	uint64_t l2_tsc_freq = 0;
+
+	nested_vmx_check_supported();
+	tsc_scaling_check_supported();
+
+	tsc_start = rdtsc();
+	sleep(1);
+	tsc_end = rdtsc();
+
+	l0_tsc_freq = tsc_end - tsc_start;
+	printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq);
+
+	vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
+	vcpu_alloc_vmx(vm, &vmx_pages_gva);
+	vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+
+	tsc_khz = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_TSC_KHZ, NULL);
+	TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed");
+
+	/* scale down L1's TSC frequency */
+	vcpu_ioctl(vm, VCPU_ID, KVM_SET_TSC_KHZ,
+		  (void *) (tsc_khz / L1_SCALE_FACTOR));
+
+	for (;;) {
+		volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+		struct ucall uc;
+
+		vcpu_run(vm, VCPU_ID);
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+			    "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+			    run->exit_reason,
+			    exit_reason_str(run->exit_reason));
+
+		switch (get_ucall(vm, VCPU_ID, &uc)) {
+		case UCALL_ABORT:
+			TEST_FAIL("%s", (const char *) uc.args[0]);
+		case UCALL_SYNC:
+			switch (uc.args[0]) {
+			case USLEEP:
+				sleep(uc.args[1]);
+				break;
+			case UCHECK_L1:
+				l1_tsc_freq = uc.args[1];
+				printf("L1's TSC frequency is around: %"PRIu64
+				       "\n", l1_tsc_freq);
+
+				compare_tsc_freq(l1_tsc_freq,
+						 l0_tsc_freq / L1_SCALE_FACTOR);
+				break;
+			case UCHECK_L2:
+				l2_tsc_freq = uc.args[1];
+				printf("L2's TSC frequency is around: %"PRIu64
+				       "\n", l2_tsc_freq);
+
+				compare_tsc_freq(l2_tsc_freq,
+						 l1_tsc_freq * L2_SCALE_FACTOR);
+				break;
+			}
+			break;
+		case UCALL_DONE:
+			goto done;
+		default:
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+	}
+
+done:
+	kvm_vm_free(vm);
+	return 0;
+}