diff mbox series

[2/2] KVM: selftests: Add KVM/PV clock selftest to prove timer drift correction

Message ID 20240408220705.7637-3-jalliste@amazon.com (mailing list archive)
State New, archived
Headers show
Series Add API to correct KVM/PV clock drift | expand

Commit Message

Jack Allister April 8, 2024, 10:07 p.m. UTC
This test proves that there is an inherent KVM/PV clock drift away from the
guest TSC when KVM decides to update the PV time information structure due
to a KVM_REQ_MASTERCLOCK_UPDATE. This drift is exacerbated when a guest is
using TSC scaling and running at a different frequency to the host TSC [1].
It also proves that KVM_[GS]ET_CLOCK_GUEST API is working to mitigate the
drift from TSC to within ±1ns.

The test simply records the PVTI (PV time information) at time of guest
creation, after KVM has updated its mapped PVTI structure and once the
correction has taken place.

A singular point in time is then recorded via the guest TSC and is used to
calculate a PV clock value using each of the 3 PVTI structures.

As seen below a drift of ~3500ns is observed if no correction has taken
place after KVM has updated the PVTI via master clock update. However,
after the correction a delta of at most 1ns can be seen.

* selftests: kvm: pvclock_test
* scaling tsc from 2999999KHz to 1499999KHz
* before=5038374946 uncorrected=5038371437 corrected=5038374945
* delta_uncorrected=3509 delta_corrected=1

Clocksource check code has been borrowed from [2].

[1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=451a707813ae
[2]: https://lore.kernel.org/kvm/20240106083346.29180-1-dongli.zhang@oracle.com/

Signed-off-by: Jack Allister <jalliste@amazon.com>
CC: David Woodhouse <dwmw2@infradead.org>
CC: Paul Durrant <paul@xen.org>
---
 tools/testing/selftests/kvm/Makefile          |   1 +
 .../selftests/kvm/x86_64/pvclock_test.c       | 223 ++++++++++++++++++
 2 files changed, 224 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/pvclock_test.c

Comments

Dongli Zhang April 9, 2024, 12:43 a.m. UTC | #1
Hi Jack,

On 4/8/24 15:07, Jack Allister wrote:
> This test proves that there is an inherent KVM/PV clock drift away from the
> guest TSC when KVM decides to update the PV time information structure due
> to a KVM_REQ_MASTERCLOCK_UPDATE. This drift is exascerbated when a guest is

Typo: exacerbated

> using TSC scaling and running at a different frequency to the host TSC [1].
> It also proves that KVM_[GS]ET_CLOCK_GUEST API is working to mitigate the
> drift from TSC to within ±1ns.
> 
> The test simply records the PVTI (PV time information) at time of guest
> creation, after KVM has updated it's mapped PVTI structure and once the
> correction has taken place.
> 
> A singular point in time is then recorded via the guest TSC and is used to
> calculate the a PV clock value using each of the 3 PVTI structures.

Typo: "the a"

> 
> As seen below a drift of ~3500ns is observed if no correction has taken
> place after KVM has updated the PVTI via master clock update. However,
> after the correction a delta of at most 1ns can be seen.
> 
> * selftests: kvm: pvclock_test
> * scaling tsc from 2999999KHz to 1499999KHz
> * before=5038374946 uncorrected=5038371437 corrected=5038374945
> * delta_uncorrected=3509 delta_corrected=1
> 
> Clocksource check code has been borrowed from [2].
> 
> [1]: https://urldefense.com/v3/__https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=451a707813ae__;!!ACWV5N9M2RV99hQ!JWCFSSEPaZUer553HE44W0OhktMh3-Iyz4aZNE4pcjc94q_bs4QBIrVS8ciYEzj1NigrVCamkHgpvwqP1XrV$ 
> [2]: https://urldefense.com/v3/__https://lore.kernel.org/kvm/20240106083346.29180-1-dongli.zhang@oracle.com/__;!!ACWV5N9M2RV99hQ!JWCFSSEPaZUer553HE44W0OhktMh3-Iyz4aZNE4pcjc94q_bs4QBIrVS8ciYEzj1NigrVCamkHgpv7jfLpy7$ 
> 
> Signed-off-by: Jack Allister <jalliste@amazon.com>
> CC: David Woodhouse <dwmw2@infradead.org>
> CC: Paul Durrant <paul@xen.org>
> ---
>  tools/testing/selftests/kvm/Makefile          |   1 +
>  .../selftests/kvm/x86_64/pvclock_test.c       | 223 ++++++++++++++++++
>  2 files changed, 224 insertions(+)
>  create mode 100644 tools/testing/selftests/kvm/x86_64/pvclock_test.c
> 
> diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> index 741c7dc16afc..02ee1205bbed 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -87,6 +87,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/pmu_counters_test
>  TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
>  TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test
>  TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test
> +TEST_GEN_PROGS_x86_64 += x86_64/pvclock_test
>  TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
>  TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
>  TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test
> diff --git a/tools/testing/selftests/kvm/x86_64/pvclock_test.c b/tools/testing/selftests/kvm/x86_64/pvclock_test.c
> new file mode 100644
> index 000000000000..172ef4d19c60
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/x86_64/pvclock_test.c
> @@ -0,0 +1,223 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright © 2024, Amazon.com, Inc. or its affiliates.
> + *
> + * Tests for pvclock API
> + * KVM_SET_CLOCK_GUEST/KVM_GET_CLOCK_GUEST
> + */
> +#include <asm/pvclock.h>
> +#include <asm/pvclock-abi.h>
> +#include <sys/stat.h>
> +#include <stdint.h>
> +#include <stdio.h>
> +
> +#include "test_util.h"
> +#include "kvm_util.h"
> +#include "processor.h"
> +
> +enum {
> +	STAGE_FIRST_BOOT,
> +	STAGE_UNCORRECTED,
> +	STAGE_CORRECTED,
> +	NUM_STAGES
> +};
> +
> +#define KVMCLOCK_GPA 0xc0000000ull
> +#define KVMCLOCK_SIZE sizeof(struct pvclock_vcpu_time_info)
> +
> +static void trigger_pvti_update(vm_paddr_t pvti_pa)
> +{
> +	/*
> +	 * We need a way to trigger KVM to update the fields
> +	 * in the PV time info. The easiest way to do this is
> +	 * to temporarily switch to the old KVM system time
> +	 * method and then switch back to the new one.
> +	 */
> +	wrmsr(MSR_KVM_SYSTEM_TIME, pvti_pa | KVM_MSR_ENABLED);
> +	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED);
> +}
> +
> +static void guest_code(vm_paddr_t pvti_pa)
> +{
> +	struct pvclock_vcpu_time_info *pvti_va =
> +		(struct pvclock_vcpu_time_info *)pvti_pa;
> +
> +	struct pvclock_vcpu_time_info pvti_boot;
> +	struct pvclock_vcpu_time_info pvti_uncorrected;
> +	struct pvclock_vcpu_time_info pvti_corrected;
> +	uint64_t cycles_boot;
> +	uint64_t cycles_uncorrected;
> +	uint64_t cycles_corrected;
> +	uint64_t tsc_guest;
> +
> +	/*
> +	 * Setup the KVMCLOCK in the guest & store the original
> +	 * PV time structure that is used.
> +	 */
> +	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED);
> +	pvti_boot = *pvti_va;
> +	GUEST_SYNC(STAGE_FIRST_BOOT);
> +
> +	/*
> +	 * Trigger an update of the PVTI, if we calculate
> +	 * the KVM clock using this structure we'll see
> +	 * a drift from the TSC.
> +	 */
> +	trigger_pvti_update(pvti_pa);
> +	pvti_uncorrected = *pvti_va;
> +	GUEST_SYNC(STAGE_UNCORRECTED);
> +
> +	/*
> +	 * The test should have triggered the correction by this
> +	 * point in time. We have a copy of each of the PVTI structs
> +	 * at each stage now.
> +	 *
> +	 * Let's sample the timestamp at a SINGLE point in time and
> +	 * then calculate what the KVM clock would be using the PVTI
> +	 * from each stage.
> +	 *
> +	 * Then return each of these values to the tester.
> +	 */
> +	pvti_corrected = *pvti_va;
> +	tsc_guest = rdtsc();
> +
> +	cycles_boot = __pvclock_read_cycles(&pvti_boot, tsc_guest);
> +	cycles_uncorrected = __pvclock_read_cycles(&pvti_uncorrected, tsc_guest);
> +	cycles_corrected = __pvclock_read_cycles(&pvti_corrected, tsc_guest);
> +
> +	GUEST_SYNC_ARGS(STAGE_CORRECTED, cycles_boot, cycles_uncorrected,
> +			cycles_corrected, 0);
> +}
> +
> +static void run_test(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
> +{
> +	struct ucall uc;
> +	uint64_t ucall_reason;
> +	struct pvclock_vcpu_time_info pvti_before;
> +	uint64_t before, uncorrected, corrected;
> +	int64_t delta_uncorrected, delta_corrected;
> +
> +	/* Loop through each stage of the test. */
> +	while (true) {
> +
> +		/* Start/restart the running vCPU code. */
> +		vcpu_run(vcpu);
> +		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> +
> +		/* Retrieve and verify our stage. */
> +		ucall_reason = get_ucall(vcpu, &uc);
> +		TEST_ASSERT(ucall_reason == UCALL_SYNC,
> +			    "Unhandled ucall reason=%lu",
> +			    ucall_reason);
> +
> +		/* Run host specific code relating to stage. */
> +		switch (uc.args[1]) {
> +		case STAGE_FIRST_BOOT:
> +			/* Store the KVM clock values before an update. */
> +			vm_ioctl(vm, KVM_GET_CLOCK_GUEST, &pvti_before);
> +
> +			/* Sleep for a set amount of time to induce drift. */
> +			sleep(5);
> +			break;
> +
> +		case STAGE_UNCORRECTED:
> +			/* Restore the KVM clock values. */
> +			vm_ioctl(vm, KVM_SET_CLOCK_GUEST, &pvti_before);
> +			break;
> +
> +		case STAGE_CORRECTED:
> +			/* Query the clock information and verify delta. */
> +			before = uc.args[2];
> +			uncorrected = uc.args[3];
> +			corrected = uc.args[4];
> +
> +			delta_uncorrected = before - uncorrected;
> +			delta_corrected = before - corrected;
> +
> +			pr_info("before=%lu uncorrected=%lu corrected=%lu\n",
> +				before, uncorrected, corrected);
> +
> +			pr_info("delta_uncorrected=%ld delta_corrected=%ld\n",
> +				delta_uncorrected, delta_corrected);
> +
> +			TEST_ASSERT((delta_corrected <= 1) && (delta_corrected >= -1),
> +				    "larger than expected delta detected = %ld", delta_corrected);
> +			return;
> +		}
> +	}
> +}
> +
> +#define CLOCKSOURCE_PATH "/sys/devices/system/clocksource/clocksource0/current_clocksource"
> +
> +static void check_clocksource(void)
> +{

AFAIR, I copied check_clocksource() from existing code during that time.

The commit e440c5f2e ("KVM: selftests: Generalize check_clocksource() from
kvm_clock_test") has introduced sys_clocksource_is_tsc(). Later it is renamed to
sys_clocksource_is_based_on_tsc().

Any chance to re-use sys_clocksource_is_based_on_tsc()?

> +	char *clk_name;
> +	struct stat st;
> +	FILE *fp;
> +
> +	fp = fopen(CLOCKSOURCE_PATH, "r");
> +	if (!fp) {
> +		pr_info("failed to open clocksource file: %d; assuming TSC.\n",
> +			errno);
> +		return;
> +	}
> +
> +	if (fstat(fileno(fp), &st)) {
> +		pr_info("failed to stat clocksource file: %d; assuming TSC.\n",
> +			errno);
> +		goto out;
> +	}
> +
> +	clk_name = malloc(st.st_size);
> +	TEST_ASSERT(clk_name, "failed to allocate buffer to read file\n");
> +
> +	if (!fgets(clk_name, st.st_size, fp)) {
> +		pr_info("failed to read clocksource file: %d; assuming TSC.\n",
> +			ferror(fp));
> +		goto out;
> +	}
> +
> +	TEST_ASSERT(!strncmp(clk_name, "tsc\n", st.st_size),
> +		    "clocksource not supported: %s", clk_name);
> +out:
> +	fclose(fp);
> +}
> +
> +static void configure_pvclock(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
> +{
> +	unsigned int gpages;
> +
> +	gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, KVMCLOCK_SIZE);
> +	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
> +				    KVMCLOCK_GPA, 1, gpages, 0);
> +	virt_map(vm, KVMCLOCK_GPA, KVMCLOCK_GPA, gpages);
> +
> +	vcpu_args_set(vcpu, 1, KVMCLOCK_GPA);
> +}
> +
> +static void configure_scaled_tsc(struct kvm_vcpu *vcpu)
> +{
> +	uint64_t tsc_khz;
> +
> +	tsc_khz =  __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL);
> +	pr_info("scaling tsc from %ldKHz to %ldKHz\n", tsc_khz, tsc_khz / 2);
> +	tsc_khz /= 2;
> +	vcpu_ioctl(vcpu, KVM_SET_TSC_KHZ, (void *)tsc_khz);
> +}

Is configure_scaled_tsc() necessary? Or how about making it an option/arg?
Then I will be able to test it on a VM/server without TSC scaling.

Thank you very much!

Dongli Zhang

> +
> +int main(void)
> +{
> +	struct kvm_vm *vm;
> +	struct kvm_vcpu *vcpu;
> +
> +	check_clocksource();
> +
> +	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
> +
> +	configure_pvclock(vm, vcpu);
> +	configure_scaled_tsc(vcpu);
> +
> +	run_test(vm, vcpu);
> +
> +	return 0;
> +}
David Woodhouse April 9, 2024, 4:23 a.m. UTC | #2
On Mon, 2024-04-08 at 17:43 -0700, Dongli Zhang wrote:
> Hi Jack,
> 
> On 4/8/24 15:07, Jack Allister wrote:
> > This test proves that there is an inherent KVM/PV clock drift away from the
> > guest TSC when KVM decides to update the PV time information structure due
> > to a KVM_REQ_MASTERCLOCK_UPDATE. This drift is exascerbated when a guest is
> 
> Typo: exacerbated
> 
> > using TSC scaling and running at a different frequency to the host TSC [1].
> > It also proves that KVM_[GS]ET_CLOCK_GUEST API is working to mitigate the
> > drift from TSC to within ±1ns.

No, the KVM_[GS]ET_CLOCK_GUEST API is not about mitigating kernel bugs.

We *fix* kernel bugs, we don't make userspace work around them.

The KVM_[GS]ET_CLOCK_GUEST API allows userspace to perform accurate
live update and live migration without disrupting the relationship
between guest TSC and KVM clock.

Since a bombing run on KVM_REQ_MASTERCLOCK_UPDATE users is on my TODO
list, it's worth noting the *reason* that switching to the obsolete
MSR_KVM_SYSTEM_TIME forces ka->use_master_clock mode off. 

It's not documented at all as far as I can tell, but in commit
54750f2cf042 (“KVM: x86: workaround SuSE's 2.6.16 pvclock vs
masterclock issue“) in 2015, it was done to work around a guest bug
where the guest *expected* the reference point to keep being updated
and never be too far in the past.


> Is configure_scaled_tsc() anecessary? Or how about to make it an
> option/arg?
> Then I will be able to test it on a VM/server without TSC scaling.

As discussed, TSC scaling shouldn't be needed. It *should* be part of
the unit test if possible, because there is a class of bugs it'll
trigger, but it should be optional.

In particular, the existing KVM_GET_CLOCK will return extra-wrong
results if TSC scaling is in force. But that isn't being tested here
yet because we haven't *fixed* it yet :)

For reference, here's my TODO list which Jack is working from...

 • Add KVM unit test to validate that KVM clock does not change when
   provoked (including by simulated live update).
   It’s OK for the reference point at { tsc_timestamp, system_time } in
   the pvclock structure to change, but only such that it gives the
   same results for a given guest TSC — that is, if system_time
   changes, then tsc_timestamp must change by a delta which precisely
   corresponds in terms of the advertised guest TSC frequency. Perhaps
    allow a slop of 1ns for rounding, but no more.

 • Audit and fix (i.e. remove) KVM_REQ_MASTERCLOCK_UPDATE usage,
   starting with kvm_xen_shared_info_init(). And work out whether it
   should be sent to all vCPUs, as some call sites do, or just one?

 • Add KVM_VCPU_TSC_SCALE attribute to allow userspace to know the
   precise host→guest TSC scaling.
   (cf. https://lore.kernel.org/all/13f256ad95de186e3b6bcfcc1f88da5d0ad0cb71.camel@infradead.org/)

 • Expose guest’s view of KVM clock to userspace via KVM_GET_CLOCK_GUEST
   ioctl. Perhaps also a memory-mapped version, as the gfn_to_pfn_cache
   allows writing to userspace HVAs. With this, userspace has fast and
   accurate way to calculate the KVM clock at any given moment in time.
   (Currently, userspace calls the KVM_GET_CLOCK ioctl which is slow
   and returns inaccurate results). Then userspace can base other
   things like PIT and HPET emulation on the KVM clock and simplify
   timekeeping over migration for those too.

 • Add a KVM_SET_CLOCK_GUEST ioctl which consumes the pvclock
   information back again. This should not only set the kvmclock_offset
   field, but also set the reference point { master_cycle_now,
   master_kernel_ns } as follows:
     • Sample the kernel’s CLOCK_MONOTONIC_RAW to create a new
       master_kernel_ns and master_cycle_now.
     • Convert the new master_cycle_now to a guest TSC.
     • Calculate the intended KVM clock with that guest TSC from the
       provided pvclock information.
     • Calculate the current KVM clock with that guest TSC using the
       new master_cycle_now and master_kernel_ns and kvmclock_offset as
       usual.
     • Adjust kvmclock_offset to correct for the delta between current
       and intended values.
     • Raise KVM_REQ_CLOCK_UPDATE on all vCPUs.

 • Fix the broken __get_kvmclock() function to scale via the guest’s
   TSC frequency as it should. There isn’t necessarily a vCPU to use
   for this, so it’s OK for this to work only when the frequency has
   been set of the whole VM rather than only for individual vCPUs.
   Likewise kvm_get_wall_clock_epoch() which has the same bug.

 • Fix all other cases where KVM reads the time in two places
   separately and then treats them as simultaneous.

 • Fix the discontinuities in KVM_REQ_MASTERCLOCK_UPDATE by allowing
   kvmclock_offset to vary while the VM is running in master clock
   mode. Perhaps every call to pvclock_update_vm_gtod_copy() which
   starts in master clock mode should follow the same process as the
   proposed KVM_SET_CLOCK_GUEST to adjust the kvmclock_offset value
   which corresponds with the new reference point. As long as we don’t
   break in the case where something weird (host hibernation, etc.)
   happened to the TSC, and we actually want to trust kvmclock_offset.
   Maybe we should have a periodic work queue which keeps
   kvmclock_offset in sync with the KVM clock while the VM is in master
   clock mode? 

 • Correct the KVM documentation for TSC migration to take TSC scaling
   into account. Something like...

   (SOURCE)
    • Sample both TAI and the (source) host TSC at an arbitrary time we
      shall call Tsrc:
      • Use adjtimex() to obtain tai_offset.
      • Use KVM_GET_CLOCK to read UTC time and host TSC (ignoring the
        actual kvm clock). These represent time Tsrc.
      • Use adjtimex() to obtain tai_offset again, looping back to the
        beginning if it changes.
      • Convert the UTC time to TAI by adding the tai_offset.

    • ∀ vCPU:
      • Read the scaling information with the KVM_CPU_TSC_SCALE
        attribute.
      • Read the offset with the KVM_CPU_TSC_OFFSET attribute.
      • Calculate this vCPU’s TSC at time Tsrc, from the host TSC
        value.

    • Use KVM_GET_CLOCK_GUEST to read the KVM clock (on vCPU0).

   (DESTINATION)
    • Sample both TAI and the (destination) host TSC at a time we shall
      call Tdst:
      • Use adjtimex() to obtain tai_offset.
      • Use KVM_GET_CLOCK to read UTC time and host TSC.
      • Use adjtimex() to obtain tai_offset again, looping back to the
        beginning if it changes.
      • Convert the UTC time to TAI by adding the tai_offset.

    • Calculate the time (in the TAI clock) elapsed between Tsrc and
      Tdst. Call this ΔT.

    • ∀ vCPU:
      • Calculate this vCPU’s intended TSC value at time Tdst:
        • Given this vCPU’s TSC frequency, calculate the number of TSC
          ticks corresponding to ΔT.
        • Add this to the vCPU TSC value calculated on the source
        • Read the scaling information on the current host with the
          KVM_CPU_TSC_SCALE attribute
        • Calculate this vCPU’s scaled TSC value corresponding to the
          host TSC at time Tdst without taking offsetting into account.
        • Set KVM_CPU_TSC_OFFSET to the delta between that and the
          intended TSC value.

    • Use KVM_SET_CLOCK_GUEST to set the KVM clock (on vCPU0).
Allister, Jack April 10, 2024, 10:15 a.m. UTC | #3
> AFAIR, I copied check_clocksource() from existing code during that
> time.

> The commit e440c5f2e ("KVM: selftests: Generalize check_clocksource()
> from kvm_clock_test") has introduced sys_clocksource_is_tsc(). Later
> it is renamed to sys_clocksource_is_based_on_tsc().
> Any chance to re-use sys_clocksource_is_based_on_tsc()?

Yes I'm more than happy to change it to that. I was using your original
mail as a reference and did not realise there was a utility present for
this.

> Is configure_scaled_tsc() anecessary? Or how about to make it an
> option/arg?
> Then I will be able to test it on a VM/server without TSC scaling.

So if TSC scaling from 3GHz (host) -> 1.5GHz (guest) I do see a skew of
~3500ns after the update. Where as without scaling a delta can be seen
but is roughly ~180ns.

In V2 I've adjusted the test so that now by default scaling won't take
place, however if someone wants to test with it enabled they can pass
"-s/--scale-tsc" to induce the greater delta.


Thank you for the feedback,
Jack Allister
David Woodhouse April 11, 2024, 1:28 p.m. UTC | #4
On Wed, 2024-04-10 at 10:15 +0000, Allister, Jack wrote:
> > AFAIR, I copied check_clocksource() from existing code during that
> > time.
> 
> > The commit e440c5f2e ("KVM: selftests: Generalize check_clocksource()
> > from kvm_clock_test") has introduced sys_clocksource_is_tsc(). Later
> > it is renamed to sys_clocksource_is_based_on_tsc().
> > Any chance to re-use sys_clocksource_is_based_on_tsc()?
> 
> Yes I'm more than happy to change it to that. I was using your original
> mail as a reference and did not realise there was a utility present for
> this.
> 
> > Is configure_scaled_tsc() anecessary? Or how about to make it an
> > option/arg?
> > Then I will be able to test it on a VM/server without TSC scaling.
> 
> So if TSC scaling from 3GHz (host) -> 1.5GHz (guest) I do see a skew of
> ~3500ns after the update. Where as without scaling a delta can be seen
> but is roughly ~180ns.

I don't think it's as simple as "TSC scaling makes the drift larger".
I suspect that's just the way the arithmetic precision works out for
those frequencies. With other frequencies of host and guest you might
find that it works out closer *with* the scaling.

Consider a graph of "time" in the Y axis, against the host TSC as the X
axis. As an example, let's assume the host has a TSC frequency of 3GHz.

Each of the three definitions of the KVM clock (A based on
CLOCK_MONOTONIC_RAW, B based on the guest TSC, C based directly on the
host TSC) will have a gradient of *roughly* 1 ns per three ticks.

Due to arithmetic precision, the gradient of each is going to vary
slightly. We hope that CLOCK_MONOTONIC_RAW is going to do the best, as 
the other two are limited by the precision of the pvclock ABI that's
exposed to the guest. You can use http://david.woodhou.se/tsdrift.c to
see where the latter two land, for different TSC frequencies.

 $ ./tsdrift 2500000000 3000000000 | tail -1
TSC 259200000000000, guest TSC 215999999979883, guest ns 86399999971836 host ns 86399999979883 (delta -8047)
 $ ./tsdrift 2700000000 3000000000 | tail -1
TSC 259200000000000, guest TSC 233279999975860, guest ns 86399999983012 host ns 86399999979883 (delta 3129)

So after a day, let's assume CLOCK_MONOTONIC_RAW will have advanced by
86400 seconds. The KVM clock based on the host TSC will be 20µs slow,
while a KVM clock based on a guest TSC frequency of 2.5GHz would be an
*additional* 8µs slower. But a guest TSC frequency of 2.7GHz would
actually run *faster* than the host-based one, and would only be 17µs
behind reality.

Your test is measuring how *much* the host CLOCK_MONOTONIC_RAW (my
definition A) drifts from definition B which is derived from the guest
TSC.

It demonstrates the discontinuity that KVM_REQ_MASTERCLOCK_UPDATE
introduces, by clamping the KVM clock back to the 'definition A' line.

Fixing that is in the TODO list I shared. Basically it involves
realising that in use_master_clock mode, the delta between the KVM
clock and CLOCK_MONOTONIC_RAW (ka->kvmclock_offset) is *varying* over
time. So instead of just blindly using kvmclock_offset, we should
*recalculate* it in precisely the way that your KVM_SET_CLOCK_GUEST
does.

Having said all that... scaling from 3GHz to 1.5GHz *doesn't* lose any
precision; it shouldn't make any difference. But I guess your host TSC
isn't *really* 3GHz, it's measured against the PIT or something awful,
and comes out at a shade above or below 3GHz, leading to a more
interesting scaling factor?

> In V2 I've adjusted the test so that now by default scaling won't take
> place, however if someone wants to test with it enabled they can pass
> "-s/--scale-tsc" to induce the greater delta.

Please do it automatically based on the availability of the feature.
Chen, Zide April 19, 2024, 5:13 p.m. UTC | #5
On 4/8/2024 3:07 PM, Jack Allister wrote:
> This test proves that there is an inherent KVM/PV clock drift away from the
> guest TSC when KVM decides to update the PV time information structure due
> to a KVM_REQ_MASTERCLOCK_UPDATE. This drift is exascerbated when a guest is
> using TSC scaling and running at a different frequency to the host TSC [1].
> It also proves that KVM_[GS]ET_CLOCK_GUEST API is working to mitigate the
> drift from TSC to within ±1ns.
> 
> The test simply records the PVTI (PV time information) at time of guest
> creation, after KVM has updated it's mapped PVTI structure and once the
> correction has taken place.
> 
> A singular point in time is then recorded via the guest TSC and is used to
> calculate the a PV clock value using each of the 3 PVTI structures.
> 
> As seen below a drift of ~3500ns is observed if no correction has taken
> place after KVM has updated the PVTI via master clock update. However,
> after the correction a delta of at most 1ns can be seen.
> 
> * selftests: kvm: pvclock_test
> * scaling tsc from 2999999KHz to 1499999KHz
> * before=5038374946 uncorrected=5038371437 corrected=5038374945
> * delta_uncorrected=3509 delta_corrected=1
> 
> Clocksource check code has been borrowed from [2].
> 
> [1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=451a707813ae
> [2]: https://lore.kernel.org/kvm/20240106083346.29180-1-dongli.zhang@oracle.com/
> 
> Signed-off-by: Jack Allister <jalliste@amazon.com>
> CC: David Woodhouse <dwmw2@infradead.org>
> CC: Paul Durrant <paul@xen.org>
> ---
>  tools/testing/selftests/kvm/Makefile          |   1 +
>  .../selftests/kvm/x86_64/pvclock_test.c       | 223 ++++++++++++++++++
>  2 files changed, 224 insertions(+)
>  create mode 100644 tools/testing/selftests/kvm/x86_64/pvclock_test.c
> 
> diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> index 741c7dc16afc..02ee1205bbed 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -87,6 +87,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/pmu_counters_test
>  TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
>  TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test
>  TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test
> +TEST_GEN_PROGS_x86_64 += x86_64/pvclock_test
>  TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
>  TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
>  TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test
> diff --git a/tools/testing/selftests/kvm/x86_64/pvclock_test.c b/tools/testing/selftests/kvm/x86_64/pvclock_test.c
> new file mode 100644
> index 000000000000..172ef4d19c60
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/x86_64/pvclock_test.c
> @@ -0,0 +1,223 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright © 2024, Amazon.com, Inc. or its affiliates.
> + *
> + * Tests for pvclock API
> + * KVM_SET_CLOCK_GUEST/KVM_GET_CLOCK_GUEST
> + */
> +#include <asm/pvclock.h>
> +#include <asm/pvclock-abi.h>
> +#include <sys/stat.h>
> +#include <stdint.h>
> +#include <stdio.h>
> +
> +#include "test_util.h"
> +#include "kvm_util.h"
> +#include "processor.h"
> +
> +enum {
> +	STAGE_FIRST_BOOT,
> +	STAGE_UNCORRECTED,
> +	STAGE_CORRECTED,
> +	NUM_STAGES
> +};
> +
> +#define KVMCLOCK_GPA 0xc0000000ull
> +#define KVMCLOCK_SIZE sizeof(struct pvclock_vcpu_time_info)
> +
> +static void trigger_pvti_update(vm_paddr_t pvti_pa)
> +{
> +	/*
> +	 * We need a way to trigger KVM to update the fields
> +	 * in the PV time info. The easiest way to do this is
> +	 * to temporarily switch to the old KVM system time
> +	 * method and then switch back to the new one.
> +	 */
> +	wrmsr(MSR_KVM_SYSTEM_TIME, pvti_pa | KVM_MSR_ENABLED);
> +	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED);
> +}
> +
> +static void guest_code(vm_paddr_t pvti_pa)
> +{
> +	struct pvclock_vcpu_time_info *pvti_va =
> +		(struct pvclock_vcpu_time_info *)pvti_pa;
> +
> +	struct pvclock_vcpu_time_info pvti_boot;
> +	struct pvclock_vcpu_time_info pvti_uncorrected;
> +	struct pvclock_vcpu_time_info pvti_corrected;
> +	uint64_t cycles_boot;
> +	uint64_t cycles_uncorrected;
> +	uint64_t cycles_corrected;
> +	uint64_t tsc_guest;
> +
> +	/*
> +	 * Setup the KVMCLOCK in the guest & store the original
> +	 * PV time structure that is used.
> +	 */
> +	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED);
> +	pvti_boot = *pvti_va;
> +	GUEST_SYNC(STAGE_FIRST_BOOT);
> +
> +	/*
> +	 * Trigger an update of the PVTI, if we calculate
> +	 * the KVM clock using this structure we'll see
> +	 * a drift from the TSC.
> +	 */
> +	trigger_pvti_update(pvti_pa);
> +	pvti_uncorrected = *pvti_va;
> +	GUEST_SYNC(STAGE_UNCORRECTED);
> +
> +	/*
> +	 * The test should have triggered the correction by this
> +	 * point in time. We have a copy of each of the PVTI structs
> +	 * at each stage now.
> +	 *
> +	 * Let's sample the timestamp at a SINGLE point in time and
> +	 * then calculate what the KVM clock would be using the PVTI
> +	 * from each stage.
> +	 *
> +	 * Then return each of these values to the tester.
> +	 */
> +	pvti_corrected = *pvti_va;
> +	tsc_guest = rdtsc();
> +
> +	cycles_boot = __pvclock_read_cycles(&pvti_boot, tsc_guest);
> +	cycles_uncorrected = __pvclock_read_cycles(&pvti_uncorrected, tsc_guest);
> +	cycles_corrected = __pvclock_read_cycles(&pvti_corrected, tsc_guest);
> +
> +	GUEST_SYNC_ARGS(STAGE_CORRECTED, cycles_boot, cycles_uncorrected,
> +			cycles_corrected, 0);
> +}
> +
> +static void run_test(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
> +{
> +	struct ucall uc;
> +	uint64_t ucall_reason;
> +	struct pvclock_vcpu_time_info pvti_before;
> +	uint64_t before, uncorrected, corrected;
> +	int64_t delta_uncorrected, delta_corrected;
> +
> +	/* Loop through each stage of the test. */
> +	while (true) {
> +
> +		/* Start/restart the running vCPU code. */
> +		vcpu_run(vcpu);
> +		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> +
> +		/* Retrieve and verify our stage. */
> +		ucall_reason = get_ucall(vcpu, &uc);
> +		TEST_ASSERT(ucall_reason == UCALL_SYNC,
> +			    "Unhandled ucall reason=%lu",
> +			    ucall_reason);
> +
> +		/* Run host specific code relating to stage. */
> +		switch (uc.args[1]) {
> +		case STAGE_FIRST_BOOT:
> +			/* Store the KVM clock values before an update. */
> +			vm_ioctl(vm, KVM_GET_CLOCK_GUEST, &pvti_before);
> +
> +			/* Sleep for a set amount of time to induce drift. */
> +			sleep(5);
> +			break;
> +
> +		case STAGE_UNCORRECTED:
> +			/* Restore the KVM clock values. */
> +			vm_ioctl(vm, KVM_SET_CLOCK_GUEST, &pvti_before);
> +			break;
> +
> +		case STAGE_CORRECTED:
> +			/* Query the clock information and verify delta. */
> +			before = uc.args[2];
> +			uncorrected = uc.args[3];
> +			corrected = uc.args[4];
> +
> +			delta_uncorrected = before - uncorrected;
> +			delta_corrected = before - corrected;
> +
> +			pr_info("before=%lu uncorrected=%lu corrected=%lu\n",
> +				before, uncorrected, corrected);
> +
> +			pr_info("delta_uncorrected=%ld delta_corrected=%ld\n",
> +				delta_uncorrected, delta_corrected);
> +
> +			TEST_ASSERT((delta_corrected <= 1) && (delta_corrected >= -1),
> +				    "larger than expected delta detected = %ld", delta_corrected);

I'm wondering what's the underlying theory that we definitely can achieve
±1ns accuracy? I tested it on a Sapphire Rapids @2100MHz TSC frequency,
and I can see delta_corrected=2 in ~2% cases.
David Woodhouse April 19, 2024, 6:43 p.m. UTC | #6
On 19 April 2024 19:40:06 BST, David Woodhouse <dwmw2@infradead.org> wrote:
>On 19 April 2024 18:13:16 BST, "Chen, Zide" <zide.chen@intel.com> wrote:
>I'm wondering what's the underlying theory that we definitely can achieve
>>±1ns accuracy? I tested it on a Sapphire Rapids @2100MHz TSC frequency,
>>and I can see delta_corrected=2 in ~2% cases.
>
>Hm. Thanks for testing!
>
>So the KVM clock is based on the guest TSC. Given a delta between the guest TSC T and some reference point in time R, the KVM clock is expressed as a(T-R)+r, where little r is the value of the KVM clock when the guest TSC was R, and (a) is the rate of the guest TSC.
>
>When we set the clock with KVM_SET_CLOCK_GUEST, we are changing the values of R and r to a new point in time. Call the new ones Q and q respectively.
>
>But we calculate precisely (within 1ns at least) what the KVM clock would have been with the *old* formula, and adjust our new offset (q) so that at our new reference TSC value Q, the formulae give exactly the same result.
>
>And because the *rates* are the same, they should continue to give the same results, ±1ns.
>
>Or such *was* my theory, at least. 
>
>Would be interesting to see it disproven with actual numbers for the old+new pvclock structs, so I can understand where the logic goes wrong.
>
>Were you using frequency scaling?
>

Oh, also please could you test the updated version I posted yesterday, from https://git.infradead.org/?p=users/dwmw2/linux.git;a=shortlog;h=refs/heads/clocks
David Woodhouse April 19, 2024, 7:34 p.m. UTC | #7
On 19 April 2024 18:13:16 BST, "Chen, Zide" <zide.chen@intel.com> wrote:
>I'm wondering what's the underlying theory that we definitely can achieve
>±1ns accuracy? I tested it on a Sapphire Rapids @2100MHz TSC frequency,
>and I can see delta_corrected=2 in ~2% cases.

Hm. Thanks for testing!

So the KVM clock is based on the guest TSC. Given a delta between the guest TSC T and some reference point in time R, the KVM clock is expressed as a(T-R)+r, where little r is the value of the KVM clock when the guest TSC was R, and (a) is the rate of the guest TSC.

When we set the clock with KVM_SET_CLOCK_GUEST, we are changing the values of R and r to a new point in time. Call the new ones Q and q respectively.

But we calculate precisely (within 1ns at least) what the KVM clock would have been with the *old* formula, and adjust our new offset (q) so that at our new reference TSC value Q, the formulae give exactly the same result.

And because the *rates* are the same, they should continue to give the same results, ±1ns.

Or such *was* my theory, at least. 

Would be interesting to see it disproven with actual numbers for the old+new pvclock structs, so I can understand where the logic goes wrong.

Were you using frequency scaling?
Chen, Zide April 19, 2024, 11:53 p.m. UTC | #8
On 4/19/2024 12:34 PM, David Woodhouse wrote:
> On 19 April 2024 18:13:16 BST, "Chen, Zide" <zide.chen@intel.com> wrote:
>> I'm wondering what's the underlying theory that we definitely can achieve
>> ±1ns accuracy? I tested it on a Sapphire Rapids @2100MHz TSC frequency,
>> and I can see delta_corrected=2 in ~2% cases.
> 
> Hm. Thanks for testing!
> 
> So the KVM clock is based on the guest TSC. Given a delta between the guest TSC T and some reference point in time R, the KVM clock is expressed as a(T-R)+r, where little r is the value of the KVM clock when the guest TSC was R, and (a) is the rate of the guest TSC.
> 
> When we set the clock with KVM_SET_CLOCK_GUEST, we are changing the values of R and r to a new point in time. Call the new ones Q and q respectively.
> 
> But we calculate precisely (within 1ns at least) what the KVM clock would have been with the *old* formula, and adjust our new offset (q) so that at our new reference TSC value Q, the formulae give exactly the same result.
> 
> And because the *rates* are the same, they should continue to give the same results, ±1ns.
> 
> Or such *was* my theory, at least. 

Thanks for the explanation.

> 
> Would be interesting to see it disproven with actual numbers for the old+new pvclock structs, so I can understand where the logic goes wrong.
> 
> Were you using frequency scaling?

I can see similar ~2% failure ratio w/ or w/o commenting out
configure_scaled_tsc().
Chen, Zide April 19, 2024, 11:54 p.m. UTC | #9
On 4/19/2024 11:43 AM, David Woodhouse wrote:
> On 19 April 2024 19:40:06 BST, David Woodhouse <dwmw2@infradead.org> wrote:
>> On 19 April 2024 18:13:16 BST, "Chen, Zide" <zide.chen@intel.com> wrote:
>>> I'm wondering what's the underlying theory that we definitely can achieve
>>> ±1ns accuracy? I tested it on a Sapphire Rapids @2100MHz TSC frequency,
>>> and I can see delta_corrected=2 in ~2% cases.
>>
>> Hm. Thanks for testing!
>>
>> So the KVM clock is based on the guest TSC. Given a delta between the guest TSC T and some reference point in time R, the KVM clock is expressed as a(T-R)+r, where little r is the value of the KVM clock when the guest TSC was R, and (a) is the rate of the guest TSC.
>>
>> When we set the clock with KVM_SET_CLOCK_GUEST, we are changing the values of R and r to a new point in time. Call the new ones Q and q respectively.
>>
>> But we calculate precisely (within 1ns at least) what the KVM clock would have been with the *old* formula, and adjust our new offset (q) so that at our new reference TSC value Q, the formulae give exactly the same result.
>>
>> And because the *rates* are the same, they should continue to give the same results, ±1ns.
>>
>> Or such *was* my theory, at least. 
>>
>> Would be interesting to see it disproven with actual numbers for the old+new pvclock structs, so I can understand where the logic goes wrong.
>>
>> Were you using frequency scaling?
>>
> 
> Oh, also please could you test the updated version I posted yesterday, from https://git.infradead.org/?p=users/dwmw2/linux.git;a=shortlog;h=refs/heads/clocks

I failed to check out your branch, instead I downloaded the patch series
from:
https://lore.kernel.org/linux-kselftest/FABCFBD0-4B76-4662-9F7B-7E1A856BBBB6@infradead.org/T/#t

However, the selftest hangs:

[Apr19 16:15] kselftest: Running tests in kvm
[Apr19 16:16] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
[  +0.000628] rcu:      78-...0: (1 GPs behind)
idle=3c8c/1/0x4000000000000000 softirq=5908/5913 fqs=14025
[  +0.000468] rcu:      (detected by 104, t=60003 jiffies, g=60073,
q=3100 ncpus=128)
[  +0.000389] Sending NMI from CPU 104 to CPUs 78:
[  +0.000360] NMI backtrace for cpu 78
[  +0.000004] CPU: 78 PID: 33515 Comm: pvclock_test Tainted: G
O       6.9.0-rc1zide-l0+ #194
[  +0.000003] Hardware name: Inspur NF5280M7/NF5280M7, BIOS 05.08.01
08/18/2023
[  +0.000002] RIP: 0010:pvclock_update_vm_gtod_copy+0xb5/0x200 [kvm]
[  +0.000079] Code: ea 83 e1 40 48 0f 45 c2 31 d2 48 3d 00 94 35 77 76
0e 48 d1 e8 83 ea 01 48 3d 00 94 35 77 77 f2 48 3d 00 ca 9a 3b 89 c1 77
0d <01> c9 83 c2 01 81 f9 00 ca 9a 3b 76 f3 88 93 8c 95 00 00 31 c0 ba
[  +0.000002] RSP: 0018:ff368a58cfe07e30 EFLAGS: 00000087
[  +0.000002] RAX: 0000000000000000 RBX: ff368a58e0ccd000 RCX:
0000000000000000
[  +0.000001] RDX: 000000005ca49a49 RSI: 00000000000029aa RDI:
0000019ee77a1c00
[  +0.000002] RBP: ff368a58cfe07e50 R08: 0000000000000001 R09:
0000000000000000
[  +0.000000] R10: ff26383d853ab400 R11: 0000000000000002 R12:
0000000000000000
[  +0.000001] R13: ff368a58e0cd6400 R14: 0000000000000293 R15:
ff368a58e0cd69f0
[  +0.000001] FS:  00007f6946473740(0000) GS:ff26384c7fb80000(0000)
knlGS:0000000000000000
[  +0.000001] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  +0.000001] CR2: 00007f69463bd445 CR3: 000000016f466006 CR4:
0000000000f71ef0
[  +0.000001] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[  +0.000000] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7:
0000000000000400
[  +0.000001] PKRU: 55555554
[  +0.000001] Call Trace:
[  +0.000004]  <NMI>
[  +0.000003]  ? nmi_cpu_backtrace+0x87/0xf0
[  +0.000008]  ? nmi_cpu_backtrace_handler+0x11/0x20
[  +0.000005]  ? nmi_handle+0x5f/0x170
[  +0.000005]  ? pvclock_update_vm_gtod_copy+0xb5/0x200 [kvm]
[  +0.000045]  ? default_do_nmi+0x79/0x1a0
[  +0.000004]  ? exc_nmi+0xf0/0x130
[  +0.000001]  ? end_repeat_nmi+0xf/0x53
[  +0.000006]  ? pvclock_update_vm_gtod_copy+0xb5/0x200 [kvm]
[  +0.000041]  ? pvclock_update_vm_gtod_copy+0xb5/0x200 [kvm]
[  +0.000040]  ? pvclock_update_vm_gtod_copy+0xb5/0x200 [kvm]
[  +0.000039]  </NMI>
[  +0.000000]  <TASK>
[  +0.000001]  ? preempt_count_add+0x73/0xa0
[  +0.000004]  kvm_arch_init_vm+0xf1/0x1e0 [kvm]
[  +0.000049]  kvm_create_vm+0x370/0x650 [kvm]
[  +0.000036]  kvm_dev_ioctl+0x88/0x180 [kvm]
[  +0.000034]  __x64_sys_ioctl+0x8e/0xd0
[  +0.000007]  do_syscall_64+0x5b/0x120
[  +0.000003]  entry_SYSCALL_64_after_hwframe+0x6c/0x74
[  +0.000003] RIP: 0033:0x7f694631a94f
[  +0.000002] Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10
00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f
05 <41> 89 c0 3d 00 f0 ff ff 77 1f 48 8b 44 24 18 64 48 2b 04 25 28 00
[  +0.000001] RSP: 002b:00007ffca91b2e50 EFLAGS: 00000246 ORIG_RAX:
0000000000000010
[  +0.000002] RAX: ffffffffffffffda RBX: 0000000000434480 RCX:
00007f694631a94f
[  +0.000001] RDX: 0000000000000000 RSI: 000000000000ae01 RDI:
0000000000000005
[  +0.000000] RBP: 0000000000000009 R08: 000000000041b198 R09:
000000000041bfbf
[  +0.000001] R10: 00007f69463d8882 R11: 0000000000000246 R12:
0000000000434480
[  +0.000000] R13: 000000000041e0f0 R14: 0000000000001000 R15:
0000000000000207
[  +0.000002]  </TASK>
David Woodhouse April 20, 2024, 10:32 a.m. UTC | #10
On 20 April 2024 00:54:05 BST, "Chen, Zide" <zide.chen@intel.com> wrote:
>
>
>On 4/19/2024 11:43 AM, David Woodhouse wrote:
>> On 19 April 2024 19:40:06 BST, David Woodhouse <dwmw2@infradead.org> wrote:
>>> On 19 April 2024 18:13:16 BST, "Chen, Zide" <zide.chen@intel.com> wrote:
>>>> I'm wondering what's the underlying theory that we definitely can achieve
>>>> ±1ns accuracy? I tested it on a Sapphire Rapids @2100MHz TSC frequency,
>>>> and I can see delta_corrected=2 in ~2% cases.
>>>
>>> Hm. Thanks for testing!
>>>
>>> So the KVM clock is based on the guest TSC. Given a delta between the guest TSC T and some reference point in time R, the KVM clock is expressed as a(T-R)+r, where little r is the value of the KVM clock when the guest TSC was R, and (a) is the rate of the guest TSC.
>>>
>>> When we set the clock with KVM_SET_CLOCK_GUEST, we are changing the values of R and r to a new point in time. Call the new ones Q and q respectively.
>>>
>>> But we calculate precisely (within 1ns at least) what the KVM clock would have been with the *old* formula, and adjust our new offset (q) so that at our new reference TSC value Q, the formulae give exactly the same result.
>>>
>>> And because the *rates* are the same, they should continue to give the same results, ±1ns.
>>>
>>> Or such *was* my theory, at least. 
>>>
>>> Would be interesting to see it disproven with actual numbers for the old+new pvclock structs, so I can understand where the logic goes wrong.
>>>
>>> Were you using frequency scaling?
>>>
>> 
>> Oh, also please could you test the updated version I posted yesterday, from https://git.infradead.org/?p=users/dwmw2/linux.git;a=shortlog;h=refs/heads/clocks
>
>I failed to check out your branch, instead I downloaded the patch series
>from:
>https://lore.kernel.org/linux-kselftest/FABCFBD0-4B76-4662-9F7B-7E1A856BBBB6@infradead.org/T/#t
>
>However, the selftest hangs:

Odd. It locks up in kvm_arch_init_vm(). Maybe when I get back to my desk something will be obvious. But please could I have your .config?

If you're able to bisect and see which patch causes that, it would also be much appreciated. Thanks!

>[Apr19 16:15] kselftest: Running tests in kvm
>[Apr19 16:16] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
>[  +0.000628] rcu:      78-...0: (1 GPs behind)
>idle=3c8c/1/0x4000000000000000 softirq=5908/5913 fqs=14025
>[  +0.000468] rcu:      (detected by 104, t=60003 jiffies, g=60073,
>q=3100 ncpus=128)
>[  +0.000389] Sending NMI from CPU 104 to CPUs 78:
>[  +0.000360] NMI backtrace for cpu 78
>[  +0.000004] CPU: 78 PID: 33515 Comm: pvclock_test Tainted: G
>O       6.9.0-rc1zide-l0+ #194
>[  +0.000003] Hardware name: Inspur NF5280M7/NF5280M7, BIOS 05.08.01
>08/18/2023
>[  +0.000002] RIP: 0010:pvclock_update_vm_gtod_copy+0xb5/0x200 [kvm]
>[  +0.000079] Code: ea 83 e1 40 48 0f 45 c2 31 d2 48 3d 00 94 35 77 76
>0e 48 d1 e8 83 ea 01 48 3d 00 94 35 77 77 f2 48 3d 00 ca 9a 3b 89 c1 77
>0d <01> c9 83 c2 01 81 f9 00 ca 9a 3b 76 f3 88 93 8c 95 00 00 31 c0 ba
>[  +0.000002] RSP: 0018:ff368a58cfe07e30 EFLAGS: 00000087
>[  +0.000002] RAX: 0000000000000000 RBX: ff368a58e0ccd000 RCX:
>0000000000000000
>[  +0.000001] RDX: 000000005ca49a49 RSI: 00000000000029aa RDI:
>0000019ee77a1c00
>[  +0.000002] RBP: ff368a58cfe07e50 R08: 0000000000000001 R09:
>0000000000000000
>[  +0.000000] R10: ff26383d853ab400 R11: 0000000000000002 R12:
>0000000000000000
>[  +0.000001] R13: ff368a58e0cd6400 R14: 0000000000000293 R15:
>ff368a58e0cd69f0
>[  +0.000001] FS:  00007f6946473740(0000) GS:ff26384c7fb80000(0000)
>knlGS:0000000000000000
>[  +0.000001] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>[  +0.000001] CR2: 00007f69463bd445 CR3: 000000016f466006 CR4:
>0000000000f71ef0
>[  +0.000001] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
>0000000000000000
>[  +0.000000] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7:
>0000000000000400
>[  +0.000001] PKRU: 55555554
>[  +0.000001] Call Trace:
>[  +0.000004]  <NMI>
>[  +0.000003]  ? nmi_cpu_backtrace+0x87/0xf0
>[  +0.000008]  ? nmi_cpu_backtrace_handler+0x11/0x20
>[  +0.000005]  ? nmi_handle+0x5f/0x170
>[  +0.000005]  ? pvclock_update_vm_gtod_copy+0xb5/0x200 [kvm]
>[  +0.000045]  ? default_do_nmi+0x79/0x1a0
>[  +0.000004]  ? exc_nmi+0xf0/0x130
>[  +0.000001]  ? end_repeat_nmi+0xf/0x53
>[  +0.000006]  ? pvclock_update_vm_gtod_copy+0xb5/0x200 [kvm]
>[  +0.000041]  ? pvclock_update_vm_gtod_copy+0xb5/0x200 [kvm]
>[  +0.000040]  ? pvclock_update_vm_gtod_copy+0xb5/0x200 [kvm]
>[  +0.000039]  </NMI>
>[  +0.000000]  <TASK>
>[  +0.000001]  ? preempt_count_add+0x73/0xa0
>[  +0.000004]  kvm_arch_init_vm+0xf1/0x1e0 [kvm]
>[  +0.000049]  kvm_create_vm+0x370/0x650 [kvm]
>[  +0.000036]  kvm_dev_ioctl+0x88/0x180 [kvm]
>[  +0.000034]  __x64_sys_ioctl+0x8e/0xd0
>[  +0.000007]  do_syscall_64+0x5b/0x120
>[  +0.000003]  entry_SYSCALL_64_after_hwframe+0x6c/0x74
>[  +0.000003] RIP: 0033:0x7f694631a94f
>[  +0.000002] Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10
>00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f
>05 <41> 89 c0 3d 00 f0 ff ff 77 1f 48 8b 44 24 18 64 48 2b 04 25 28 00
>[  +0.000001] RSP: 002b:00007ffca91b2e50 EFLAGS: 00000246 ORIG_RAX:
>0000000000000010
>[  +0.000002] RAX: ffffffffffffffda RBX: 0000000000434480 RCX:
>00007f694631a94f
>[  +0.000001] RDX: 0000000000000000 RSI: 000000000000ae01 RDI:
>0000000000000005
>[  +0.000000] RBP: 0000000000000009 R08: 000000000041b198 R09:
>000000000041bfbf
>[  +0.000001] R10: 00007f69463d8882 R11: 0000000000000246 R12:
>0000000000434480
>[  +0.000000] R13: 000000000041e0f0 R14: 0000000000001000 R15:
>0000000000000207
>[  +0.000002]  </TASK>
David Woodhouse April 20, 2024, 4:03 p.m. UTC | #11
On Fri, 2024-04-19 at 16:54 -0700, Chen, Zide wrote:
> 
> However, the selftest hangs:
> 
> [Apr19 16:15] kselftest: Running tests in kvm
> [Apr19 16:16] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
> [  +0.000628] rcu:      78-...0: (1 GPs behind) idle=3c8c/1/0x4000000000000000 softirq=5908/5913 fqs=14025
> [  +0.000468] rcu:      (detected by 104, t=60003 jiffies, g=60073, q=3100 ncpus=128)
> [  +0.000389] Sending NMI from CPU 104 to CPUs 78:
> [  +0.000360] NMI backtrace for cpu 78
> [  +0.000004] CPU: 78 PID: 33515 Comm: pvclock_test Tainted: G O       6.9.0-rc1zide-l0+ #194
> [  +0.000003] Hardware name: Inspur NF5280M7/NF5280M7, BIOS 05.08.01 08/18/2023
> [  +0.000002] RIP: 0010:pvclock_update_vm_gtod_copy+0xb5/0x200 [kvm]

Ah, kvm_get_time_scale() doesn't much like being asked to scale to zero.

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a07b60351894..45fb99986cf9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3046,7 +3046,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
 		 * Copy from the field protected solely by ka->tsc_write_lock,
 		 * to the field protected by the ka->pvclock_sc seqlock.
 		 */
-		ka->master_tsc_scaling_ratio = ka->last_tsc_scaling_ratio;
+		ka->master_tsc_scaling_ratio = ka->last_tsc_scaling_ratio ? :
+			kvm_caps.default_tsc_scaling_ratio;
 
 		/*
 		 * Calculate the scaling factors precisely the same way
Chen, Zide April 22, 2024, 10:02 p.m. UTC | #12
On 4/20/2024 9:03 AM, David Woodhouse wrote:
> On Fri, 2024-04-19 at 16:54 -0700, Chen, Zide wrote:
>>
>> However, the selftest hangs:
>>
>> [Apr19 16:15] kselftest: Running tests in kvm
>> [Apr19 16:16] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
>> [  +0.000628] rcu:      78-...0: (1 GPs behind) idle=3c8c/1/0x4000000000000000 softirq=5908/5913 fqs=14025
>> [  +0.000468] rcu:      (detected by 104, t=60003 jiffies, g=60073, q=3100 ncpus=128)
>> [  +0.000389] Sending NMI from CPU 104 to CPUs 78:
>> [  +0.000360] NMI backtrace for cpu 78
>> [  +0.000004] CPU: 78 PID: 33515 Comm: pvclock_test Tainted: G O       6.9.0-rc1zide-l0+ #194
>> [  +0.000003] Hardware name: Inspur NF5280M7/NF5280M7, BIOS 05.08.01 08/18/2023
>> [  +0.000002] RIP: 0010:pvclock_update_vm_gtod_copy+0xb5/0x200 [kvm]
> 
> Ah, kvm_get_time_scale() doesn't much like being asked to scale to zero.
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index a07b60351894..45fb99986cf9 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3046,7 +3046,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
>  		 * Copy from the field protected solely by ka->tsc_write_lock,
>  		 * to the field protected by the ka->pvclock_sc seqlock.
>  		 */
> -		ka->master_tsc_scaling_ratio = ka->last_tsc_scaling_ratio;
> +		ka->master_tsc_scaling_ratio = ka->last_tsc_scaling_ratio ? :
> +			kvm_caps.default_tsc_scaling_ratio;
>  
>  		/*
>  		 * Calculate the scaling factors precisely the same way
> 		 * that kvm_guest_time_update() does.
>  		last_tsc_hz = kvm_scale_tsc(tsc_khz * 1000,
>                                           ka->last_tsc_scaling_ratio);

Should be ka->master_tsc_scaling_ratio?

If I restored the KVM_REQ_GLOBAL_CLOCK_UPDATE request from
kvm_arch_vcpu_load(), the selftest works for me, and I ran the test for
1000+ iterations, w/ or w/o TSC scaling, the TEST_ASSERT(delta_corrected
 <= ±1) never got hit. This is awesome!

However, without KVM_REQ_GLOBAL_CLOCK_UPDATE, it still fails on creating
a VM. Maybe the init sequence still needs some rework.

 BUG: unable to handle page fault for address: 005b29e3f221ccf0
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0
 Oops: 0000 [#1] PREEMPT SMP NOPTI
 CPU: 86 PID: 4118 Comm: pvclock_test Tainted
 Hardware name: Inspur NF5280M7/NF5280M7, BIOS 05.08.01 08/18/2023
 RIP: 0010:start_creating+0x80/0x190
 Code: ce ad 48 c7 c6 70 a1 ce ad 48 c7 c7 80 1c 9b ab e8 b5 10 d5 ff 4c
63 e0 45 85 e4 0f 85 cd 00 00 00 48 85 db 0f 84 b5 00 00 00 <48> 8b 43
30 48 8d b8 b8 >
 RSP: 0018:ff786eaacf3cfdd0 EFLAGS: 00010206
 RAX: 0000000000000000 RBX: 005b29e3f221ccc0 RCX: 0000000000000000
 RDX: 0000000000000001 RSI: ffffffffadcea170 RDI: 0000000000000000
 RBP: ffffffffc06ac8cf R08: ffffffffa6ea0fe0 R09: ffffffffc06a5940
 R10: ff786eaacf3cfe30 R11: 00000013a7b5feaa R12: 0000000000000000
 R13: 0000000000000124 R14: ff786eaacfa11000 R15: 00000000000081a4
 FS:  00007f0837c89740(0000) GS:ff4f44b6bfd80000(0000)
knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0
 CR2: 005b29e3f221ccf0 CR3: 000000014bdf8002 CR4: 0000000000f73ef0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
 PKRU: 55555554
 Call Trace:
  <TASK>
  ? __die+0x24/0x70
  ? page_fault_oops+0x81/0x150
  ? do_user_addr_fault+0x64/0x6c0
  ? exc_page_fault+0x8a/0x1a0
  ? asm_exc_page_fault+0x26/0x30
  ? start_creating+0x80/0x190
  __debugfs_create_file+0x43/0x1f0
  kvm_create_vm_debugfs+0x28b/0x2d0 [kvm]
  kvm_create_vm+0x457/0x650 [kvm]
  kvm_dev_ioctl+0x88/0x180 [kvm]
  __x64_sys_ioctl+0x8e/0xd0
  do_syscall_64+0x5b/0x120
  entry_SYSCALL_64_after_hwframe+0x71/0x79
 RIP: 0033:0x7f0837b1a94f
 Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89
44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <41> 89 c0
3d 00 f0 ff ff >
 RSP: 002b:00007ffe01be3fc0 EFLAGS: 00000246 ORIG_RAX
 RAX: ffffffffffffffda RBX: 0000000000434480 RCX: 00007f0837b1a94f
 RDX: 0000000000000000 RSI: 000000000000ae01 RDI: 0000000000000005
 RBP: 0000000000000009 R08: 000000000041b1a0 R09: 000000000041bfcf
 R10: 00007f0837bd8882 R11: 0000000000000246 R12: 0000000000434480
 R13: 000000000041e0f0 R14: 0000000000001000 R15: 0000000000000207
  </TASK>
 Modules linked in: kvm_intel(O) kvm(O) [last unloaded: kvm(O)]
 CR2: 005b29e3f221ccf0
David Woodhouse April 23, 2024, 7:49 a.m. UTC | #13
On Mon, 2024-04-22 at 15:02 -0700, Chen, Zide wrote:
> 
> 
> On 4/20/2024 9:03 AM, David Woodhouse wrote:
> > On Fri, 2024-04-19 at 16:54 -0700, Chen, Zide wrote:
> > > 
> > > However, the selftest hangs:
> > > 
> > > [Apr19 16:15] kselftest: Running tests in kvm
> > > [Apr19 16:16] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
> > > [  +0.000628] rcu:      78-...0: (1 GPs behind) idle=3c8c/1/0x4000000000000000 softirq=5908/5913 fqs=14025
> > > [  +0.000468] rcu:      (detected by 104, t=60003 jiffies, g=60073, q=3100 ncpus=128)
> > > [  +0.000389] Sending NMI from CPU 104 to CPUs 78:
> > > [  +0.000360] NMI backtrace for cpu 78
> > > [  +0.000004] CPU: 78 PID: 33515 Comm: pvclock_test Tainted: G O       6.9.0-rc1zide-l0+ #194
> > > [  +0.000003] Hardware name: Inspur NF5280M7/NF5280M7, BIOS 05.08.01 08/18/2023
> > > [  +0.000002] RIP: 0010:pvclock_update_vm_gtod_copy+0xb5/0x200 [kvm]
> > 
> > Ah, kvm_get_time_scale() doesn't much like being asked to scale to zero.
> > 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index a07b60351894..45fb99986cf9 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -3046,7 +3046,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
> >                  * Copy from the field protected solely by ka->tsc_write_lock,
> >                  * to the field protected by the ka->pvclock_sc seqlock.
> >                  */
> > -               ka->master_tsc_scaling_ratio = ka->last_tsc_scaling_ratio;
> > +               ka->master_tsc_scaling_ratio = ka->last_tsc_scaling_ratio ? :
> > +                       kvm_caps.default_tsc_scaling_ratio;
> >  
> >                 /*
> >                  * Calculate the scaling factors precisely the same way
> >                  * that kvm_guest_time_update() does.
> >                 last_tsc_hz = kvm_scale_tsc(tsc_khz * 1000,
> >                                           ka->last_tsc_scaling_ratio);
> 
> Should be ka->master_tsc_scaling_ratio?

Oops, yes. I'll actually do some proper testing on a host with TSC
scaling this week. Thanks.

> If I restored the KVM_REQ_GLOBAL_CLOCK_UPDATE request from
> kvm_arch_vcpu_load(), the selftest works for me, and I ran the test for
> 1000+ iterations, w/ or w/o TSC scaling, the TEST_ASSERT(delta_corrected
>  <= ±1) never got hit. This is awesome!
>
> However, without KVM_REQ_GLOBAL_CLOCK_UPDATE, it still fails on creating
> a VM. Maybe the init sequence still needs some rework.

That one confuses me. The crash is actually in debugfs, as it's
registering the per-vm or per-vcpu stats. I can't imagine *how* that's
occurring. Or see why the availability of TSC scaling would cause it to
show up for you and not me. Can I have your .config please?

First thought would be that there's some change in the KVM structures
and you have some stale object files using the old struct, but then I
realise I forgot to actually *remove* the now-unused
kvmclock_update_work from x86's struct kvm_arch anyway.

I'll try to reproduce, as I think I want to *know* what's going on
here, even if I am going to drop that patch as mentioned in 
https://lore.kernel.org/kvm/a6723ac9e0169839cb33e8022a47c2de213866ac.camel@infradead.org

Are you able to load that vmlinux in gdb and
(gdb) list *start_creating+0x80
(gdb) list *kvm_create_vm_debugfs+0x28b

Thanks again.

>  BUG: unable to handle page fault for address: 005b29e3f221ccf0
>  #PF: supervisor read access in kernel mode
>  #PF: error_code(0x0000) - not-present page
>  PGD 0
>  Oops: 0000 [#1] PREEMPT SMP NOPTI
>  CPU: 86 PID: 4118 Comm: pvclock_test Tainted
>  Hardware name: Inspur NF5280M7/NF5280M7, BIOS 05.08.01 08/18/2023
>  RIP: 0010:start_creating+0x80/0x190
>  Code: ce ad 48 c7 c6 70 a1 ce ad 48 c7 c7 80 1c 9b ab e8 b5 10 d5 ff 4c
> 63 e0 45 85 e4 0f 85 cd 00 00 00 48 85 db 0f 84 b5 00 00 00 <48> 8b 43
> 30 48 8d b8 b8 >
>  RSP: 0018:ff786eaacf3cfdd0 EFLAGS: 00010206
>  RAX: 0000000000000000 RBX: 005b29e3f221ccc0 RCX: 0000000000000000
>  RDX: 0000000000000001 RSI: ffffffffadcea170 RDI: 0000000000000000
>  RBP: ffffffffc06ac8cf R08: ffffffffa6ea0fe0 R09: ffffffffc06a5940
>  R10: ff786eaacf3cfe30 R11: 00000013a7b5feaa R12: 0000000000000000
>  R13: 0000000000000124 R14: ff786eaacfa11000 R15: 00000000000081a4
>  FS:  00007f0837c89740(0000) GS:ff4f44b6bfd80000(0000)
> knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0
>  CR2: 005b29e3f221ccf0 CR3: 000000014bdf8002 CR4: 0000000000f73ef0
>  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>  DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
>  PKRU: 55555554
>  Call Trace:
>   <TASK>
>   ? __die+0x24/0x70
>   ? page_fault_oops+0x81/0x150
>   ? do_user_addr_fault+0x64/0x6c0
>   ? exc_page_fault+0x8a/0x1a0
>   ? asm_exc_page_fault+0x26/0x30
>   ? start_creating+0x80/0x190
>   __debugfs_create_file+0x43/0x1f0
>   kvm_create_vm_debugfs+0x28b/0x2d0 [kvm]
>   kvm_create_vm+0x457/0x650 [kvm]
>   kvm_dev_ioctl+0x88/0x180 [kvm]
>   __x64_sys_ioctl+0x8e/0xd0
>   do_syscall_64+0x5b/0x120
>   entry_SYSCALL_64_after_hwframe+0x71/0x79
>  RIP: 0033:0x7f0837b1a94f
>  Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89
> 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <41> 89 c0
> 3d 00 f0 ff ff >
>  RSP: 002b:00007ffe01be3fc0 EFLAGS: 00000246 ORIG_RAX
>  RAX: ffffffffffffffda RBX: 0000000000434480 RCX: 00007f0837b1a94f
>  RDX: 0000000000000000 RSI: 000000000000ae01 RDI: 0000000000000005
>  RBP: 0000000000000009 R08: 000000000041b1a0 R09: 000000000041bfcf
>  R10: 00007f0837bd8882 R11: 0000000000000246 R12: 0000000000434480
>  R13: 000000000041e0f0 R14: 0000000000001000 R15: 0000000000000207
>   </TASK>
>  Modules linked in: kvm_intel(O) kvm(O) [last unloaded: kvm(O)]
>  CR2: 005b29e3f221ccf0
Chen, Zide April 23, 2024, 5:59 p.m. UTC | #14
On 4/23/2024 12:49 AM, David Woodhouse wrote:
>> If I restored the KVM_REQ_GLOBAL_CLOCK_UPDATE request from
>> kvm_arch_vcpu_load(), the selftest works for me, and I ran the test for
>> 1000+ iterations, w/ or w/o TSC scaling, the TEST_ASSERT(delta_corrected
>>  <= ±1) never got hit. This is awesome!
>>
>> However, without KVM_REQ_GLOBAL_CLOCK_UPDATE, it still fails on creating
>> a VM. Maybe the init sequence sill needs some rework.
> 
> That one confuses me. The crash is actually in debugfs, as it's
> registering the per-vm or per-vcpu stats. I can't imagine *how* that's
> occurring. Or see why the availability of TSC scaling would cause it to
> show up for you and not me. Can I have your .config please?
> 
> First thought would be that there's some change in the KVM structures
> and you have some stale object files using the old struct, but then I
> realise I forgot to actually *remove* the now-unused
> kvmclock_update_work from x86's struct kvm_arch anyway.
> 
> I'll try to reproduce, as I think I want to *know* what's going on
> here, even if I am going to drop that patch as mentioned in 
> https://lore.kernel.org/kvm/a6723ac9e0169839cb33e8022a47c2de213866ac.camel@infradead.org
> 
> Are you able to load that vmlinux in gdb and
> (gdb) list *start_creating+0x80
> (gdb) list *kvm_create_vm_debugfs+0x28b
> 
> Thanks again.

My apologies, it turns out the KVM_REQ_GLOBAL_CLOCK_UPDATE is not
needed. Today I can't reproduce the issue after removing it.  Yesterday
I thought it may miss something related to pfncache.

To be clear, with the above mentioned change to
kvm_scale_tsc(master_tsc_scaling_ratio), the selftest runs reliably
regardless TSC scaling is enabled or not.
David Woodhouse April 23, 2024, 9:02 p.m. UTC | #15
On 23 April 2024 18:59:21 BST, "Chen, Zide" <zide.chen@intel.com> wrote:
>
>
>On 4/23/2024 12:49 AM, David Woodhouse wrote:
>>> If I restored the KVM_REQ_GLOBAL_CLOCK_UPDATE request from
>>> kvm_arch_vcpu_load(), the selftest works for me, and I ran the test for
>>> 1000+ iterations, w/ or w/o TSC scaling, the TEST_ASSERT(delta_corrected
>>>  <= ±1) never got hit. This is awesome!
>>>
>>> However, without KVM_REQ_GLOBAL_CLOCK_UPDATE, it still fails on creating
>>> a VM. Maybe the init sequence sill needs some rework.
>> 
>> That one confuses me. The crash is actually in debugfs, as it's
>> registering the per-vm or per-vcpu stats. I can't imagine *how* that's
>> occurring. Or see why the availability of TSC scaling would cause it to
>> show up for you and not me. Can I have your .config please?
>> 
>> First thought would be that there's some change in the KVM structures
>> and you have some stale object files using the old struct, but then I
>> realise I forgot to actually *remove* the now-unused
>> kvmclock_update_work from x86's struct kvm_arch anyway.
>> 
>> I'll try to reproduce, as I think I want to *know* what's going on
>> here, even if I am going to drop that patch as mentioned in 
>> https://lore.kernel.org/kvm/a6723ac9e0169839cb33e8022a47c2de213866ac.camel@infradead.org
>> 
>> Are you able to load that vmlinux in gdb and
>> (gdb) list *start_creating+0x80
>> (gdb) list *kvm_create_vm_debugfs+0x28b
>> 
>> Thanks again.
>
>My apologies, it turns out the KVM_REQ_GLOBAL_CLOCK_UPDATE is not
>needed. Today I can't reproduce the issue after removing it.  Yesterday
>I thought it may miss something related to pfncache.
>
>To be clear, with the above mentioned change to
>kvm_scale_tsc(master_tsc_scaling_ratio), the selftest runs reliably
>regardless TSC scaling is enabled or not.

Thanks. That version is now in my git tree and I have tested it myself on Skylake. Then I got distracted by reverse-engineering kvm_get_time_scale() so I could actually add some comments to it.

I'm still going to have to put the clock updates back though, for the non-masterclock case.

While I'm ripping all this up I guess I ought to rename it to "reference clock" too?
David Woodhouse April 24, 2024, 12:58 p.m. UTC | #16
On Mon, 2024-04-22 at 15:02 -0700, Chen, Zide wrote:
> the selftest works for me, and I ran the test for 1000+ iterations,
> w/ or w/o TSC scaling, the TEST_ASSERT(delta_corrected <= ±1) never
> got hit. This is awesome!

I think that with further care we can get even better than that.

Let's look at where that ±1ns tolerance comes from.

Consider a 3GHz TSC. That gives us three ticks per nanosecond. Each TSC
value can be seen as (3n) (3n+1) or (3n+2) for a given nanosecond n.

If we take a new reference point at a (3n+2) TSC value and calculate
the KVM clock from that, we *know* we're going to round down and lose
two-thirds of a nanosecond.

So then we set the new KVM clock parameters to use that new reference
point, and that's why we have to allow a disruption of up to a single
nanosecond. In fact, I don't think it's ±1 ns, is it? It'll only ever
be in the same direction (rounding down)?

But if we're careful which *which* TSC value we use as the reference
point, we can reduce that error.

The TSC value we use should be *around* the current time, but what if
we were to evaluate maybe the previous 100 TSC values. Pass *each* of
them through the conversion to nanoseconds and use the one that comes
*closest* to a precise nanosecond (nnnnnnnn.000).

It's even fairly easy to calculate those, because of the way the KVM
clock ABI has us multiply and then shift right by 32 bits. We just need
to look at those low 32 bits (the fractional nanosecond) *before*
shifting them out of existence. Something like...

   uint64_t tsc_candidate, tsc_candidate_last, best_tsc;
   uint32_t frac_ns_min = 0xffffffff;
   uint64_t frac_ns;

   best_tsc = tsc_candidate = rdtsc();
   tsc_candidate_last = tsc_candidate - 100;

   while (tsc_candidate-- > tsc_candidate_last) {
      uint64_t guest_tsc = kvm_scale_tsc(tsc_candidate, ...);
      frac_ns = guest_tsc * hvclock->tsc_to_system_mul;
      /* Shift *after* multiplication, not before as pvclock_scale_cycles() does. */
      if (hvclock->tsc_shift < 0)
          frac_ns >>= -hvclock->tsc_shift;
      else
          frac_ns <<= hvclock->tsc_shift;

      if ( (uint32_t)frac_ns <= frac_ns_min ) {
          frac_ns_min = frac_ns;
          best_tsc = tsc_candidate;
      }
   }
   printk("Best TSC to use for reference point is %lld", best_tsc);

And then you calculate your CLOCK_MONOTONIC_RAW and guest KVM clock
from *that* host TSC value, and thus minimise the discrepancies due to
rounding down?

Aside from the fact that I literally just typed that into email and
it's barely even thought through let alone entirely untested... I'm
just not sure it's even worth the runtime cost, for that ±1 ns on a
rare case.

A slop of ±1ns is probably sufficient because over the past few years
we've already shifted the definition of the KVM clock to *not* be NTP-
corrected, and we leave guests to do fine-grained synchronization
through other means anyway.

But I see talk of people offering a PPS signal to *hundreds* of guests
on the same host simultaneously, just for them all to use it to
calibrate the same underlying oscillator. Which is a little bit insane.

We *should* have a way for the host to do that once and then expose the
precise time to its guests, in a much saner way than the KVM clock
does. I'll look at adding something along those lines to this series
too, which can be driven from the host's adjtimex() adjustments (which
KVM already consumes), and fed into each guest's timekeeping as a
PTP/PPS device or something.
diff mbox series

Patch

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 741c7dc16afc..02ee1205bbed 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -87,6 +87,7 @@  TEST_GEN_PROGS_x86_64 += x86_64/pmu_counters_test
 TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
 TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test
 TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test
+TEST_GEN_PROGS_x86_64 += x86_64/pvclock_test
 TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
 TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
 TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test
diff --git a/tools/testing/selftests/kvm/x86_64/pvclock_test.c b/tools/testing/selftests/kvm/x86_64/pvclock_test.c
new file mode 100644
index 000000000000..172ef4d19c60
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/pvclock_test.c
@@ -0,0 +1,223 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2024, Amazon.com, Inc. or its affiliates.
+ *
+ * Tests for pvclock API
+ * KVM_SET_CLOCK_GUEST/KVM_GET_CLOCK_GUEST
+ */
+#include <asm/pvclock.h>
+#include <asm/pvclock-abi.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+/* Test stages, reported by the guest to the host via GUEST_SYNC. */
+enum {
+	STAGE_FIRST_BOOT,
+	STAGE_UNCORRECTED,
+	STAGE_CORRECTED,
+	NUM_STAGES
+};
+
+/* Arbitrary GPA at which the guest's pvclock_vcpu_time_info is placed. */
+#define KVMCLOCK_GPA 0xc0000000ull
+#define KVMCLOCK_SIZE sizeof(struct pvclock_vcpu_time_info)
+
+/*
+ * Force KVM to refresh the guest's PV time info structure at @pvti_pa
+ * (guest physical address of the pvclock_vcpu_time_info).
+ */
+static void trigger_pvti_update(vm_paddr_t pvti_pa)
+{
+	/*
+	 * We need a way to trigger KVM to update the fields
+	 * in the PV time info. The easiest way to do this is
+	 * to temporarily switch to the old KVM system time
+	 * method and then switch back to the new one.
+	 */
+	wrmsr(MSR_KVM_SYSTEM_TIME, pvti_pa | KVM_MSR_ENABLED);
+	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED);
+}
+
+/*
+ * Guest entry point: record a copy of the PVTI (PV time info) at each of
+ * the three stages (boot, uncorrected, corrected), then compute the KVM
+ * clock from each copy using one shared TSC sample and report all three
+ * values to the host.
+ *
+ * pvti_pa is used directly as a virtual address; this relies on the
+ * identity mapping established by configure_pvclock().
+ */
+static void guest_code(vm_paddr_t pvti_pa)
+{
+	struct pvclock_vcpu_time_info *pvti_va =
+		(struct pvclock_vcpu_time_info *)pvti_pa;
+
+	struct pvclock_vcpu_time_info pvti_boot;
+	struct pvclock_vcpu_time_info pvti_uncorrected;
+	struct pvclock_vcpu_time_info pvti_corrected;
+	uint64_t cycles_boot;
+	uint64_t cycles_uncorrected;
+	uint64_t cycles_corrected;
+	uint64_t tsc_guest;
+
+	/*
+	 * Setup the KVMCLOCK in the guest & store the original
+	 * PV time structure that is used.
+	 */
+	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED);
+	pvti_boot = *pvti_va;
+	GUEST_SYNC(STAGE_FIRST_BOOT);
+
+	/*
+	 * Trigger an update of the PVTI, if we calculate
+	 * the KVM clock using this structure we'll see
+	 * a drift from the TSC.
+	 */
+	trigger_pvti_update(pvti_pa);
+	pvti_uncorrected = *pvti_va;
+	GUEST_SYNC(STAGE_UNCORRECTED);
+
+	/*
+	 * The test should have triggered the correction by this
+	 * point in time. We have a copy of each of the PVTI structs
+	 * at each stage now.
+	 *
+	 * Let's sample the timestamp at a SINGLE point in time and
+	 * then calculate what the KVM clock would be using the PVTI
+	 * from each stage.
+	 *
+	 * Then return each of these values to the tester.
+	 */
+	pvti_corrected = *pvti_va;
+	tsc_guest = rdtsc();
+
+	cycles_boot = __pvclock_read_cycles(&pvti_boot, tsc_guest);
+	cycles_uncorrected = __pvclock_read_cycles(&pvti_uncorrected, tsc_guest);
+	cycles_corrected = __pvclock_read_cycles(&pvti_corrected, tsc_guest);
+
+	/* Stage goes in args[1]; the three clock readings land in args[2..4]. */
+	GUEST_SYNC_ARGS(STAGE_CORRECTED, cycles_boot, cycles_uncorrected,
+			cycles_corrected, 0);
+}
+
+/*
+ * Host-side driver: step the vCPU through the three stages. At first boot
+ * the KVM clock state is saved (KVM_GET_CLOCK_GUEST) and drift is induced
+ * by sleeping; after the uncorrected stage the saved state is restored
+ * (KVM_SET_CLOCK_GUEST); finally the guest-reported corrected delta is
+ * asserted to be within +/-1ns.
+ */
+static void run_test(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+	uint64_t ucall_reason;
+	struct pvclock_vcpu_time_info pvti_before;
+	uint64_t before, uncorrected, corrected;
+	int64_t delta_uncorrected, delta_corrected;
+
+	/* Loop through each stage of the test. */
+	while (true) {
+
+		/* Start/restart the running vCPU code. */
+		vcpu_run(vcpu);
+		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+		/* Retrieve and verify our stage. */
+		ucall_reason = get_ucall(vcpu, &uc);
+		TEST_ASSERT(ucall_reason == UCALL_SYNC,
+			    "Unhandled ucall reason=%lu",
+			    ucall_reason);
+
+		/* Run host specific code relating to stage. */
+		switch (uc.args[1]) {
+		case STAGE_FIRST_BOOT:
+			/* Store the KVM clock values before an update. */
+			vm_ioctl(vm, KVM_GET_CLOCK_GUEST, &pvti_before);
+
+			/* Sleep for a set amount of time to induce drift. */
+			sleep(5);
+			break;
+
+		case STAGE_UNCORRECTED:
+			/* Restore the KVM clock values. */
+			vm_ioctl(vm, KVM_SET_CLOCK_GUEST, &pvti_before);
+			break;
+
+		case STAGE_CORRECTED:
+			/* Query the clock information and verify delta. */
+			before = uc.args[2];
+			uncorrected = uc.args[3];
+			corrected = uc.args[4];
+
+			delta_uncorrected = before - uncorrected;
+			delta_corrected = before - corrected;
+
+			pr_info("before=%lu uncorrected=%lu corrected=%lu\n",
+				before, uncorrected, corrected);
+
+			pr_info("delta_uncorrected=%ld delta_corrected=%ld\n",
+				delta_uncorrected, delta_corrected);
+
+			/* Corrected clock must agree with the boot PVTI to +/-1ns. */
+			TEST_ASSERT((delta_corrected <= 1) && (delta_corrected >= -1),
+				    "larger than expected delta detected = %ld", delta_corrected);
+			return;
+		}
+	}
+}
+
+#define CLOCKSOURCE_PATH "/sys/devices/system/clocksource/clocksource0/current_clocksource"
+
+/*
+ * Verify the host clocksource is TSC; any failure to read the sysfs file
+ * is treated as best-effort (assume TSC) rather than a test failure.
+ * Only an explicit non-"tsc" clocksource aborts the test.
+ */
+static void check_clocksource(void)
+{
+	char *clk_name;
+	struct stat st;
+	FILE *fp;
+
+	fp = fopen(CLOCKSOURCE_PATH, "r");
+	if (!fp) {
+		pr_info("failed to open clocksource file: %d; assuming TSC.\n",
+			errno);
+		return;
+	}
+
+	/* Also bail if st_size is 0: malloc(0)/fgets(.., 0, ..) are useless. */
+	if (fstat(fileno(fp), &st) || !st.st_size) {
+		pr_info("failed to stat clocksource file: %d; assuming TSC.\n",
+			errno);
+		goto out;
+	}
+
+	clk_name = malloc(st.st_size);
+	TEST_ASSERT(clk_name, "failed to allocate buffer to read file");
+
+	if (!fgets(clk_name, st.st_size, fp)) {
+		pr_info("failed to read clocksource file: %d; assuming TSC.\n",
+			ferror(fp));
+		goto out_free;
+	}
+
+	TEST_ASSERT(!strncmp(clk_name, "tsc\n", st.st_size),
+		    "clocksource not supported: %s", clk_name);
+out_free:
+	/* The original leaked clk_name on every path. */
+	free(clk_name);
+out:
+	fclose(fp);
+}
+
+/*
+ * Back KVMCLOCK_GPA with an anonymous memslot large enough for the PVTI,
+ * identity-map it in the guest page tables (so the guest can use the GPA
+ * as a VA directly), and pass the GPA to the guest as its only argument.
+ */
+static void configure_pvclock(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
+{
+	unsigned int gpages;
+
+	gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, KVMCLOCK_SIZE);
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+				    KVMCLOCK_GPA, 1, gpages, 0);
+	virt_map(vm, KVMCLOCK_GPA, KVMCLOCK_GPA, gpages);
+
+	vcpu_args_set(vcpu, 1, KVMCLOCK_GPA);
+}
+
+/*
+ * Halve the guest TSC frequency so the guest runs with TSC scaling
+ * enabled, which is what exacerbates the KVM clock drift under test.
+ */
+static void configure_scaled_tsc(struct kvm_vcpu *vcpu)
+{
+	uint64_t tsc_khz;
+
+	tsc_khz = __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL);
+	/* %lu, not %ld: tsc_khz is unsigned. */
+	pr_info("scaling tsc from %luKHz to %luKHz\n", tsc_khz, tsc_khz / 2);
+	tsc_khz /= 2;
+	vcpu_ioctl(vcpu, KVM_SET_TSC_KHZ, (void *)tsc_khz);
+}
+
+/*
+ * Entry point: require a TSC clocksource, create a single-vCPU VM with
+ * the pvclock region mapped and a scaled (halved) guest TSC, then run
+ * the drift/correction test.
+ */
+int main(void)
+{
+	struct kvm_vm *vm;
+	struct kvm_vcpu *vcpu;
+
+	check_clocksource();
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+	configure_pvclock(vm, vcpu);
+	configure_scaled_tsc(vcpu);
+
+	run_test(vm, vcpu);
+
+	return 0;
+}