diff mbox series

[v3] KVM: x86: Use current rather than snapshotted TSC frequency if it is constant

Message ID 20220421172352.188745-1-romanton@google.com (mailing list archive)
State New, archived
Headers show
Series [v3] KVM: x86: Use current rather than snapshotted TSC frequency if it is constant | expand

Commit Message

Anton Romanov April 21, 2022, 5:23 p.m. UTC
Don't snapshot tsc_khz into per-cpu cpu_tsc_khz if the host TSC is
constant, in which case the actual TSC frequency will never change and thus
capturing TSC during initialization is unnecessary; KVM can simply use
tsc_khz.  This value is snapshotted from
kvm_timer_init->kvmclock_cpu_online->tsc_khz_changed(NULL).

On CPUs with constant TSC, but not a hardware-specified TSC frequency,
snapshotting cpu_tsc_khz and using that to set a VM's target TSC frequency
can lead the VM to think its TSC frequency is not what it actually is if
refining the TSC completes after KVM snapshots tsc_khz.  The actual
frequency never changes, only the kernel's calculation of what that
frequency is changes.

Ideally, KVM would not be able to race with TSC refinement, or would have
a hook into tsc_refine_calibration_work() to get an alert when refinement
is complete.  Avoiding the race altogether isn't practical as refinement
takes a relative eternity; it's deliberately put on a work queue outside of
the normal boot sequence to avoid unnecessarily delaying boot.

Adding a hook is doable, but somewhat gross due to KVM's ability to be
built as a module.  And if the TSC is constant, which is likely the case
for every VMX/SVM-capable CPU produced in the last decade, the race can be
hit if and only if userspace is able to create a VM before TSC refinement
completes; refinement is slow, but not that slow.

For now, punt on a proper fix, as not taking a snapshot can help some use
cases and not taking a snapshot is arguably correct irrespective of the
race with refinement.

Signed-off-by: Anton Romanov <romanton@google.com>
---
v3:
    fixed typo
v2:
    fixed commit msg indentation
    added WARN_ON_ONCE in kvm_hyperv_tsc_notifier
    opened up condition in __get_kvmclock

 arch/x86/kvm/x86.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

Comments

Sean Christopherson May 11, 2022, 12:37 a.m. UTC | #1
On Thu, Apr 21, 2022, Anton Romanov wrote:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 547ba00ef64f..f6f6ddaa2f6a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2907,6 +2907,19 @@ static void kvm_update_masterclock(struct kvm *kvm)
>  	kvm_end_pvclock_update(kvm);
>  }
>  
> +/*
> + * If kvm is built into kernel it is possible that tsc_khz saved into
> + * per-cpu cpu_tsc_khz was yet unrefined value. If CPU provides CONSTANT_TSC it
> + * doesn't make sense to snapshot it anyway so just return tsc_khz


I wouldn't mention KVM being built into the kernel.  It is relevant to reproducing
the original bug, but KVM being built as a module doesn't 100% guarantee the race
can't be triggered.

Similarly, I wouldn't mention snapshotting, at least not without explaining _why_
it doesn't make sense.  Even if KVM were to get a notification when TSC calibration
completes, the correct behavior would be to prevent VM creation until calibration
completes.  I.e. regardless of races, snapshotting when TSC is constant is never
the right thing to do, so IMO bringing it up in a comment only adds confusion.

Something like...

/*
 * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
 * per-CPU value (which may be zero if a CPU is going offline).  Note, tsc_khz
 * can change during boot even if the TSC is constant, as it's possible for KVM
 * to be loaded before TSC calibration completes.  Ideally, KVM would get a
 * notification when calibration completes, but practically speaking calibration
 * will complete before userspace is alive enough to create VMs.
 */

> + */
> +static unsigned long get_cpu_tsc_khz(void)
> +{
> +	if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
> +		return tsc_khz;
> +	else
> +		return __this_cpu_read(cpu_tsc_khz);
> +}
> +
>  /* Called within read_seqcount_begin/retry for kvm->pvclock_sc.  */
>  static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
>  {
> @@ -2917,7 +2930,8 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
>  	get_cpu();
>  
>  	data->flags = 0;
> -	if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) {
> +	if (ka->use_master_clock &&
> +		(static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {

Align indentation, e.g.

	if (ka->use_master_clock &&
	    (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {

>  #ifdef CONFIG_X86_64
>  		struct timespec64 ts;
>  
> @@ -2931,7 +2945,7 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
>  		data->flags |= KVM_CLOCK_TSC_STABLE;
>  		hv_clock.tsc_timestamp = ka->master_cycle_now;
>  		hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
> -		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
> +		kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
>  				   &hv_clock.tsc_shift,
>  				   &hv_clock.tsc_to_system_mul);
>  		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
> @@ -3049,7 +3063,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>  
>  	/* Keep irq disabled to prevent changes to the clock */
>  	local_irq_save(flags);
> -	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
> +	tgt_tsc_khz = get_cpu_tsc_khz();
>  	if (unlikely(tgt_tsc_khz == 0)) {
>  		local_irq_restore(flags);
>  		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
> @@ -8646,9 +8660,12 @@ static void tsc_khz_changed(void *data)
>  	struct cpufreq_freqs *freq = data;
>  	unsigned long khz = 0;
>  
> +	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))

I think it'd be better to do this in kvmclock_cpu_online().  The other caller,
__kvmclock_cpufreq_notifier(), is unreachable if the TSC is constant, and seeing a
"TSC changed" call with a constant TSC is odd/confusing.  Might be worth adding
a WARN here to prevent future goofs though?

@@ -8856,7 +8858,8 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {

 static int kvmclock_cpu_online(unsigned int cpu)
 {
-       tsc_khz_changed(NULL);
+       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+               tsc_khz_changed(NULL);
        return 0;
 }



> +		return;
> +
>  	if (data)
>  		khz = freq->new;
> -	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
> +	else
>  		khz = cpufreq_quick_get(raw_smp_processor_id());
>  	if (!khz)
>  		khz = tsc_khz;
> @@ -8661,6 +8678,8 @@ static void kvm_hyperv_tsc_notifier(void)
>  	struct kvm *kvm;
>  	int cpu;
>  
> +	WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_TSC_RELIABLE));

Vitaly wasn't 100% certain that this is (or should be) unreachable on migration
even with a "reliable" TSC.  I would just drop the WARN, it doesn't change what
KVM needs to do for a constant TSC.  If we do want to add it, we should do so in
a separate commit as it's not strictly relevant, and so that it can be easily
reverted.

What should go in this patch is to not write cpu_tsc_khz if TSC is constant.  I
don't care about the cost of the write, I care about establishing an invariant that
cpu_tsc_khz isn't touched if the TSC is constant.

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 53e9a429dff0..9453f844f147 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8742,8 +8742,10 @@ static void kvm_hyperv_tsc_notifier(void)
        hyperv_stop_tsc_emulation();

        /* TSC frequency always matches when on Hyper-V */
-       for_each_present_cpu(cpu)
-               per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+               for_each_present_cpu(cpu)
+                       per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+       }
        kvm_max_guest_tsc_khz = tsc_khz;

        list_for_each_entry(kvm, &vm_list, vm_list) {


>  	mutex_lock(&kvm_lock);
>  	list_for_each_entry(kvm, &vm_list, vm_list)
>  		kvm_make_mclock_inprogress_request(kvm);
> -- 
> 2.36.0.rc2.479.g8af0fa9b8e-goog
>
diff mbox series

Patch

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 547ba00ef64f..f6f6ddaa2f6a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2907,6 +2907,19 @@  static void kvm_update_masterclock(struct kvm *kvm)
 	kvm_end_pvclock_update(kvm);
 }
 
+/*
+ * If kvm is built into kernel it is possible that tsc_khz saved into
+ * per-cpu cpu_tsc_khz was yet unrefined value. If CPU provides CONSTANT_TSC it
+ * doesn't make sense to snapshot it anyway so just return tsc_khz
+ */
+static unsigned long get_cpu_tsc_khz(void)
+{
+	if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
+		return tsc_khz;
+	else
+		return __this_cpu_read(cpu_tsc_khz);
+}
+
 /* Called within read_seqcount_begin/retry for kvm->pvclock_sc.  */
 static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
 {
@@ -2917,7 +2930,8 @@  static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
 	get_cpu();
 
 	data->flags = 0;
-	if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) {
+	if (ka->use_master_clock &&
+		(static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {
 #ifdef CONFIG_X86_64
 		struct timespec64 ts;
 
@@ -2931,7 +2945,7 @@  static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
 		data->flags |= KVM_CLOCK_TSC_STABLE;
 		hv_clock.tsc_timestamp = ka->master_cycle_now;
 		hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
-		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
+		kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
 				   &hv_clock.tsc_shift,
 				   &hv_clock.tsc_to_system_mul);
 		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
@@ -3049,7 +3063,7 @@  static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+	tgt_tsc_khz = get_cpu_tsc_khz();
 	if (unlikely(tgt_tsc_khz == 0)) {
 		local_irq_restore(flags);
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -8646,9 +8660,12 @@  static void tsc_khz_changed(void *data)
 	struct cpufreq_freqs *freq = data;
 	unsigned long khz = 0;
 
+	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+		return;
+
 	if (data)
 		khz = freq->new;
-	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+	else
 		khz = cpufreq_quick_get(raw_smp_processor_id());
 	if (!khz)
 		khz = tsc_khz;
@@ -8661,6 +8678,8 @@  static void kvm_hyperv_tsc_notifier(void)
 	struct kvm *kvm;
 	int cpu;
 
+	WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_TSC_RELIABLE));
+
 	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_make_mclock_inprogress_request(kvm);