@@ -747,6 +747,7 @@ struct cpu_tsc_vars
s64 tsc_offset;
u64 tsc_measure_base;
atomic_t tsc_synchronized;
+ u64 last_ref; /* reference TSC recorded by evict() */
};
static DEFINE_PER_CPU(struct cpu_tsc_vars, cpu_tsc_vars);
@@ -3570,42 +3571,83 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
-static void evict(void *info)
+/*
+ * evict(void *data)
+ *
+ * This function is meant to be called from an IPI and runs with
+ * interrupts disabled. It records the last good value of the
+ * reference TSC, updates the CPU frequency if @data points to a
+ * struct cpufreq_freqs, and marks the CPU as no longer synchronized.
+ * Because it arrives as an interrupt, it also kicks CPUs out of
+ * hardware virtualization; they then wait until their clock has been
+ * resynchronized before re-entering the guest.
+ */
+static void evict(void *data)
{
- /* nothing */
+ struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
+ WARN_ON(!irqs_disabled());
+ cv->last_ref = compute_ref_tsc();
+ if (data) {
+ struct cpufreq_freqs *freq = data;
+ cv->tsc_khz = freq->new;
+ }
+ atomic_set(&cv->tsc_synchronized, 0);
}
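
For readers of this hunk: the "wait until resynchronized" part happens on the VM entry path, which is assumed to spin on the per-cpu flag that evict() just cleared. A minimal sketch, with a hypothetical helper name (illustrative only, not part of this patch):

	static void wait_for_tsc_resync(void)
	{
		int cpu = raw_smp_processor_id();

		/* evict() cleared the flag; resync()/kvm_do_sync_tsc() sets it again */
		while (!atomic_read(&per_cpu(cpu_tsc_vars, cpu).tsc_synchronized))
			cpu_relax();
	}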
-static struct execute_work resync_work;
-static int work_scheduled;
-
-static void resync_user(struct work_struct *work)
+static long resync(void *unused)
{
+ struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
+ u64 tsc = 0;
int cpu;
- work_scheduled = 0;
- for_each_online_cpu(cpu)
- if (cpu != tsc_base_cpu)
- kvm_do_sync_tsc(cpu);
-}
+ /*
+ * First, make sure we are on the right CPU; between when the work got
+ * scheduled and when this runs, tsc_base_cpu could have changed. We
+ * lock out changes to tsc_base_cpu with get_cpu(). The CPU takedown
+ * code can't run until all non-preempt sections are finished, so being
+ * in a non-preempt section protects the base from changing. Tricky.
+ */
+ cpu = get_cpu();
+ if (cpu != tsc_base_cpu) {
+ put_cpu();
+ return -1;
+ }
-static void resync(void *info)
-{
- struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
- u64 tsc;
+ /*
+ * When the base CPU frequency changes independently of other
+ * cores, the reference TSC can fall behind. Rather than
+ * introduce a complicated and fragile locking scheme for this
+ * rare case, simply evict all CPUs from VM execution and take
+ * the highest global reference clock. This also allows us to
+ * recover from skew events which slightly advance other clocks
+ * relative to the base.
+ */
+ on_each_cpu(evict, NULL, 1);
+ for_each_online_cpu(cpu)
+ if (per_cpu(cpu_tsc_vars, cpu).last_ref > tsc)
+ tsc = per_cpu(cpu_tsc_vars, cpu).last_ref;
/* Fixup our own values to stay in sync with the reference */
- tsc = compute_ref_tsc();
cv->tsc_measure_base = native_read_tsc();
cv->tsc_offset = tsc;
+ cv->tsc_generation++; /* XXX needed? */
compute_best_multiplier(ref_tsc_khz, cv->tsc_khz, &cv->tsc_multiplier,
&cv->tsc_shift);
atomic_set(&cv->tsc_synchronized, 1);
- /* Then, get everybody else on board */
- if (!work_scheduled) {
- work_scheduled = 1;
- execute_in_process_context(resync_user, &resync_work);
- }
+ for_each_online_cpu(cpu)
+ kvm_do_sync_tsc(cpu);
+
+ put_cpu();
+ return 0;
+}
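
The (tsc_multiplier, tsc_shift) pair filled in by compute_best_multiplier() above is a fixed-point approximation of ref_tsc_khz / tsc_khz. A minimal sketch of how such a pair is assumed to be applied when converting a raw TSC delta into reference-clock units (illustrative only; the real conversion lives in compute_ref_tsc() and also has to cope with 64-bit multiply overflow):

	/* ref_delta ~= delta * ref_tsc_khz / tsc_khz == (delta * mult) >> shift */
	static inline u64 scale_tsc_delta(u64 delta, u32 mult, int shift)
	{
		return (delta * mult) >> shift;
	}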
+
+static DEFINE_MUTEX(resync_lock);
+
+static void resync_all(void)
+{
+ mutex_lock(&resync_lock);
+ while (work_on_cpu(tsc_base_cpu, resync, NULL) != 0)
+ ; /* retry: tsc_base_cpu may have changed before resync ran */
+ mutex_unlock(&resync_lock);
}
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -3614,8 +3656,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
struct cpufreq_freqs *freq = data;
struct kvm *kvm;
struct kvm_vcpu *vcpu;
- int i, send_ipi = 0;
- struct cpu_tsc_vars *cv = &per_cpu(cpu_tsc_vars, freq->cpu);
+ int i;
/*
* There is no way to precisely know the TSC value at which time the
@@ -3625,43 +3666,34 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
* any CPUs which may be running in hardware virtualization.
*
* We do this by setting cpu_tsc_synchronized to zero and polling for
- * this value to change when entering hardware virtualization.
+ * this value to change when entering hardware virtualization; the
+ * flag is cleared in the call to evict(), which also stores the new
+ * frequency.
*/
if (val == CPUFREQ_PRECHANGE) {
- get_online_cpus();
- atomic_set(&cv->tsc_synchronized, 0);
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
continue;
- if (vcpu->cpu != smp_processor_id())
- send_ipi++;
kvm_request_guest_time_update(vcpu);
}
}
spin_unlock(&kvm_lock);
- if (send_ipi) {
- /*
- * In case we update the frequency for another cpu
- * (which might be in guest context) send an interrupt
- * to kick the cpu out of guest context. Next time
- * guest context is entered kvmclock will be updated,
- * so the guest will not see stale values.
- */
- smp_call_function_single(freq->cpu, evict, NULL, 1);
- }
/*
- * The update of the frequency can't happen while a VM
- * is running, nor can it happen during init when we can
- * race against the init code setting the first known freq.
- * Just use the vm_tsc_lock for a mutex.
+ * When we update the frequency for another cpu (which might be
+ * in guest context) we must send an interrupt to kick that cpu
+ * out of guest context. The next time guest context is entered,
+ * kvmclock will be updated, so the guest will not see stale
+ * values.
+ *
+ * The frequency update can't happen while a VM is running, nor
+ * can it happen while a resync may be in progress. To avoid the
+ * locking problems that would otherwise arise, always update the
+ * tsc_khz value on the affected processor itself; this is done
+ * by passing the cpufreq data to evict().
*/
- spin_lock(&kvm_tsc_lock);
- cv->tsc_khz = freq->new;
- spin_unlock(&kvm_tsc_lock);
-
+ smp_call_function_single(freq->cpu, evict, data, 1);
return 0;
}
@@ -3673,8 +3705,9 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
* order in which this notifier is called; this is harmless.
*/
if (val == CPUFREQ_POSTCHANGE) {
+ get_online_cpus();
if (freq->cpu == tsc_base_cpu)
- smp_call_function_single(freq->cpu, resync, NULL, 1);
+ resync_all();
else if (cpu_is_tsc_synchronized(tsc_base_cpu))
/* Can't do this if base is not yet updated */
kvm_do_sync_tsc(freq->cpu);
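
cpu_is_tsc_synchronized() above is an existing helper in this file; for the purposes of this hunk it can be read as a plain test of the per-cpu flag, roughly as follows (a sketch, not its actual definition):

	static inline int cpu_is_tsc_synchronized(int cpu)
	{
		return atomic_read(&per_cpu(cpu_tsc_vars, cpu).tsc_synchronized) != 0;
	}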
@@ -3694,19 +3727,17 @@ static int kvm_x86_cpu_hotplug(struct notifier_block *notifier,
val &= ~CPU_TASKS_FROZEN;
switch (val) {
- case CPU_DOWN_PREPARE:
+ case CPU_DYING:
if (cpu == tsc_base_cpu) {
int new_cpu;
- spin_lock(&kvm_tsc_lock);
for_each_online_cpu(new_cpu)
if (new_cpu != tsc_base_cpu)
break;
tsc_base_cpu = new_cpu;
- spin_unlock(&kvm_tsc_lock);
}
break;
- case CPU_DYING:
+ case CPU_DOWN_PREPARE:
case CPU_UP_CANCELED:
atomic_set(&per_cpu(cpu_tsc_vars, cpu).tsc_synchronized, 0);
break;
@@ -3784,18 +3815,18 @@ static void kvm_timer_init(void)
unsigned long khz = cpufreq_get(cpu);
if (!khz)
khz = tsc_khz;
- spin_lock(&kvm_tsc_lock);
+ local_irq_disable();
if (!per_cpu(cpu_tsc_vars, cpu).tsc_khz)
per_cpu(cpu_tsc_vars, cpu).tsc_khz = khz;
- spin_unlock(&kvm_tsc_lock);
+ local_irq_enable();
}
} else {
for_each_possible_cpu(cpu)
per_cpu(cpu_tsc_vars, cpu).tsc_khz = tsc_khz;
}
tsc_base_cpu = get_cpu();
- resync(NULL);
put_cpu();
+ resync_all();
}
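
Not shown in this hunk: kvm_timer_init() is also where the cpufreq transition notifier gets registered so that kvmclock_cpufreq_notifier() above is actually called. That registration is assumed to follow the usual pattern (notifier block name illustrative):

	static struct notifier_block kvmclock_cpufreq_notifier_block = {
		.notifier_call = kvmclock_cpufreq_notifier,
	};

	cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
				  CPUFREQ_TRANSITION_NOTIFIER);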
int kvm_arch_init(void *opaque)