[v2,1/8] KVM: x86/xen: improve accuracy of Xen timers

Message ID	20240227115648.3104-2-dwmw2@infradead.org (mailing list archive)
State	New, archived
Headers	show Received: from desiato.infradead.org (desiato.infradead.org [90.155.92.199]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BBF2C13956D for <kvm@vger.kernel.org>; Tue, 27 Feb 2024 11:56:54 +0000 (UTC) From: David Woodhouse <dwmw2@infradead.org> To: kvm@vger.kernel.org Cc: Sean Christopherson <seanjc@google.com>, Paul Durrant <paul@xen.org>, Paolo Bonzini <pbonzini@redhat.com>, Michal Luczaj <mhal@rbox.co>, David Woodhouse <dwmw@amazon.co.uk> Subject: [PATCH v2 1/8] KVM: x86/xen: improve accuracy of Xen timers Date: Tue, 27 Feb 2024 11:49:15 +0000 Message-ID: <20240227115648.3104-2-dwmw2@infradead.org> In-Reply-To: <20240227115648.3104-1-dwmw2@infradead.org> References: <20240227115648.3104-1-dwmw2@infradead.org> Precedence: bulk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sender: David Woodhouse <dwmw2@infradead.org>
Series	KVM: x86/xen updates \| expand [v2,0/8] KVM: x86/xen updates [v2,1/8] KVM: x86/xen: improve accuracy of Xen timers [v2,2/8] KVM: x86/xen: inject vCPU upcall vector when local APIC is enabled [v2,3/8] KVM: x86/xen: remove WARN_ON_ONCE() with false positives in evtchn delivery [v2,4/8] KVM: pfncache: simplify locking and make more self-contained [v2,5/8] KVM: x86/xen: fix recursive deadlock in timer injection [v2,6/8] KVM: x86/xen: split up kvm_xen_set_evtchn_fast() [v2,7/8] KVM: x86/xen: avoid blocking in hardirq context in kvm_xen_set_evtchn_fast() [v2,8/8] KVM: pfncache: clean up rwlock abuse

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2911e6383fef..89815a887e4d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2862,7 +2862,11 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, return v * clock->mult; } -static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) +/* + * As with get_kvmclock_base_ns(), this counts from boot time, at the + * frequency of CLOCK_MONOTONIC_RAW (hence adding gtos->offs_boot). + */ +static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -2881,6 +2885,29 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) return mode; } +/* + * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with + * no boot time offset. + */ +static int do_monotonic(s64 *t, u64 *tsc_timestamp) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + unsigned long seq; + int mode; + u64 ns; + + do { + seq = read_seqcount_begin(&gtod->seq); + ns = gtod->clock.base_cycles; + ns += vgettsc(&gtod->clock, tsc_timestamp, &mode); + ns >>= gtod->clock.shift; + ns += ktime_to_ns(gtod->clock.offset); + } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); + *t = ns; + + return mode; +} + static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; @@ -2902,18 +2929,42 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) return mode; } -/* returns true if host is using TSC based clocksource */ +/* + * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and + * reports the TSC value from which it do so. Returns true if host is + * using TSC based clocksource. + */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) { /* checked again under seqlock below */ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns, - tsc_timestamp)); + return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns, + tsc_timestamp)); } -/* returns true if host is using TSC based clocksource */ +/* + * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did + * so. Returns true if host is using TSC based clocksource. + */ +bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) +{ + /* checked again under seqlock below */ + if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) + return false; + + return gtod_is_based_on_tsc(do_monotonic(kernel_ns, + tsc_timestamp)); +} + +/* + * Calculates CLOCK_REALTIME and reports the TSC value from which it did + * so. Returns true if host is using TSC based clocksource. + * + * DO NOT USE this for anything related to migration. You want CLOCK_TAI + * for that. + */ static bool kvm_get_walltime_and_clockread(struct timespec64 *ts, u64 *tsc_timestamp) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2f7e19166658..56b7a78f45bf 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -294,6 +294,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); u64 get_kvmclock_ns(struct kvm *kvm); uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm); +bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp); int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 8a04e0ae9245..ccd2dc753fd6 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -24,6 +24,7 @@ #include <xen/interface/sched.h> #include <asm/xen/cpuid.h> +#include <asm/pvclock.h> #include "cpuid.h" #include "trace.h" @@ -149,8 +150,93 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns) +static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, + bool linux_wa) { + uint64_t guest_now; + int64_t kernel_now, delta; + + /* + * The guest provides the requested timeout in absolute nanoseconds + * of the KVM clock — as *it* sees it, based on the scaled TSC and + * the pvclock information provided by KVM. + * + * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW + * so use CLOCK_MONOTONIC. In the timescales covered by timers, the + * difference won't matter much as there is no cumulative effect. + * + * Calculate the time for some arbitrary point in time around "now" + * in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the + * delta between the kvmclock "now" value and the guest's requested + * timeout, apply the "Linux workaround" described below, and add + * the resulting delta to the CLOCK_MONOTONIC "now" value, to get + * the absolute CLOCK_MONOTONIC time at which the timer should + * fire. + */ + if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock && + static_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + uint64_t host_tsc, guest_tsc; + + if (!IS_ENABLED(CONFIG_64BIT) || + !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) { + /* + * Don't fall back to get_kvmclock_ns() because it's + * broken; it has a systemic error in its results + * because it scales directly from host TSC to + * nanoseconds, and doesn't scale first to guest TSC + * and then* to nanoseconds as the guest does. + * + * There is a small error introduced here because time + * continues to elapse between the ktime_get() and the + * subsequent rdtsc(). But not the systemic drift due + * to get_kvmclock_ns(). + */ + kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */ + host_tsc = rdtsc(); + } + + /* Calculate the guest kvmclock as the guest would do it. */ + guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc); + guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock, + guest_tsc); + } else { + /* + * Without CONSTANT_TSC, get_kvmclock_ns() is the only option. + * + * Also if the guest PV clock hasn't been set up yet, as is + * likely to be the case during migration when the vCPU has + * not been run yet. It would be possible to calculate the + * scaling factors properly in that case but there's not much + * point in doing so. The get_kvmclock_ns() drift accumulates + * over time, so it's OK to use it at startup. Besides, on + * migration there's going to be a little bit of skew in the + * precise moment at which timers fire anyway. Often they'll + * be in the "past" by the time the VM is running again after + * migration. + */ + guest_now = get_kvmclock_ns(vcpu->kvm); + kernel_now = ktime_get(); + } + + delta = guest_abs - guest_now; + + /* Xen has a 'Linux workaround' in do_set_timer_op() which + * checks for negative absolute timeout values (caused by + * integer overflow), and for values about 13 days in the + * future (2^50ns) which would be caused by jiffies + * overflow. For those cases, it sets the timeout 100ms in + * the future (not *too* soon, since if a guest really did + * set a long timeout on purpose we don't want to keep + * churning CPU time by waking it up). + */ + if (linux_wa) { + if ((unlikely((int64_t)guest_abs < 0 || + (delta > 0 && (uint32_t) (delta >> 50) != 0)))) { + delta = 100 * NSEC_PER_MSEC; + guest_abs = guest_now + delta; + } + } + /* * Avoid races with the old timer firing. Checking timer_expires * to avoid calling hrtimer_cancel() will only have false positives @@ -162,12 +248,11 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ atomic_set(&vcpu->arch.xen.timer_pending, 0); vcpu->arch.xen.timer_expires = guest_abs; - if (delta_ns <= 0) { + if (delta <= 0) { xen_timer_callback(&vcpu->arch.xen.timer); } else { - ktime_t ktime_now = ktime_get(); hrtimer_start(&vcpu->arch.xen.timer, - ktime_add_ns(ktime_now, delta_ns), + ktime_add_ns(kernel_now, delta), HRTIMER_MODE_ABS_HARD); } } @@ -998,8 +1083,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) /* Start the timer if the new value has a valid vector+expiry. */ if (data->u.timer.port && data->u.timer.expires_ns) kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, - data->u.timer.expires_ns - - get_kvmclock_ns(vcpu->kvm)); + false); r = 0; break; @@ -1472,7 +1556,6 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, { struct vcpu_set_singleshot_timer oneshot; struct x86_exception e; - s64 delta; if (!kvm_xen_timer_enabled(vcpu)) return false; @@ -1506,9 +1589,7 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, return true; } - /* A delta <= 0 results in an immediate callback, which is what we want */ - delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm); - kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta); + kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false); *r = 0; return true; @@ -1532,25 +1613,7 @@ static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout, return false; if (timeout) { - uint64_t guest_now = get_kvmclock_ns(vcpu->kvm); - int64_t delta = timeout - guest_now; - - /* Xen has a 'Linux workaround' in do_set_timer_op() which - * checks for negative absolute timeout values (caused by - * integer overflow), and for values about 13 days in the - * future (2^50ns) which would be caused by jiffies - * overflow. For those cases, it sets the timeout 100ms in - * the future (not *too* soon, since if a guest really did - * set a long timeout on purpose we don't want to keep - * churning CPU time by waking it up). - */ - if (unlikely((int64_t)timeout < 0 || - (delta > 0 && (uint32_t) (delta >> 50) != 0))) { - delta = 100 * NSEC_PER_MSEC; - timeout = guest_now + delta; - } - - kvm_xen_start_timer(vcpu, timeout, delta); + kvm_xen_start_timer(vcpu, timeout, true); } else { kvm_xen_stop_timer(vcpu); }

[v2,1/8] KVM: x86/xen: improve accuracy of Xen timers

Commit Message

Comments

Patch