
[RFC: tsc virtualization 15/20] Fix longstanding races

Message ID 1260850127-9766-16-git-send-email-zamsden@redhat.com (mailing list archive)
State New, archived

Commit Message

Zachary Amsden Dec. 15, 2009, 4:08 a.m. UTC

Patch

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7bc20ac..7e2ba3e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -747,6 +747,7 @@  struct cpu_tsc_vars
 	s64			tsc_offset;
 	u64			tsc_measure_base;
 	atomic_t		tsc_synchronized;
+	u64			last_ref;
 };
 static DEFINE_PER_CPU(struct cpu_tsc_vars, cpu_tsc_vars);
 
@@ -3570,42 +3571,83 @@  int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
 
-static void evict(void *info)
+/*
+ * evict(struct cpufreq_freqs *data)
+ *
+ * This function is meant to be called from an IPI and runs with
+ * interrupts disabled.  It records the last good value of the
+ * reference TSC before optionally updating the CPU frequency and
+ * marking the CPU unsynchronized.  Since it is entered from an
+ * interrupt handler, it can be used to bring CPUs out of hardware
+ * virtualization and have them wait until their clock has been
+ * resynchronized.
+ */
+static void evict(void *data)
 {
-	/* nothing */
+	struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
+	WARN_ON(!irqs_disabled());
+	cv->last_ref = compute_ref_tsc();
+	if (data) {
+		struct cpufreq_freqs *freq = data;
+		cv->tsc_khz = freq->new;
+	}
+	atomic_set(&cv->tsc_synchronized, 0);
 }
 
-static struct execute_work resync_work;
-static int work_scheduled;
-
-static void resync_user(struct work_struct *work)
+static long resync(void *unused)
 {
+	struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
+	u64 tsc = 0;
 	int cpu;
 
-	work_scheduled = 0;
-	for_each_online_cpu(cpu)
-		if (cpu != tsc_base_cpu)
-			kvm_do_sync_tsc(cpu);
-}
+	/*
+	 * First, make sure we are on the right CPU; between when the work got
+	 * scheduled and when this runs, tsc_base_cpu could have changed.  We
+	 * lock out changes to tsc_base_cpu with get_cpu().  The CPU takedown
+	 * code can't run until all non-preempt sections are finished, so being
+	 * in a non-preempt section protects the base from changing.  Tricky.
+	 */
+	cpu = get_cpu();
+	if (cpu != tsc_base_cpu) {
+		put_cpu();
+		return -1;
+	}
 
-static void resync(void *info)
-{
-	struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
-	u64 tsc;
+	/*
+	 * When the base CPU frequency changes independently of other
+	 * cores, the reference TSC can fall behind.  Rather than
+	 * introduce a complicated and fragile locking scheme for this
+	 * rare case, simply evict all CPUs from VM execution and take
+	 * the highest global reference clock.  This also allows us to
+	 * recover from skew events which slightly advance other clocks
+	 * relative to the base.
+	 */
+	on_each_cpu(evict, NULL, 1);
+	for_each_online_cpu(cpu)
+		if (per_cpu(cpu_tsc_vars, cpu).last_ref > tsc)
+			tsc = per_cpu(cpu_tsc_vars, cpu).last_ref;
 
 	/* Fixup our own values to stay in sync with the reference */
-	tsc = compute_ref_tsc();
 	cv->tsc_measure_base = native_read_tsc();
 	cv->tsc_offset = tsc;
+	cv->tsc_generation++; /* XXX needed? */
 	compute_best_multiplier(ref_tsc_khz, cv->tsc_khz, &cv->tsc_multiplier,
 				&cv->tsc_shift);
 	atomic_set(&cv->tsc_synchronized, 1);
 
-	/* Then, get everybody else on board */
-	if (!work_scheduled) {
-		work_scheduled = 1;
-		execute_in_process_context(resync_user, &resync_work);
-	}
+	for_each_online_cpu(cpu)
+		kvm_do_sync_tsc(cpu);
+
+	put_cpu();
+	return 0;
+}
+
+static DEFINE_MUTEX(resync_lock);
+
+static void resync_all(void)
+{
+	mutex_lock(&resync_lock);
+	while (work_on_cpu(tsc_base_cpu, resync, NULL) != 0)
+		; /* do nothing */
+	mutex_unlock(&resync_lock);
 }
 
 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -3614,8 +3656,7 @@  static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	struct cpufreq_freqs *freq = data;
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
-	int i, send_ipi = 0;
-	struct cpu_tsc_vars *cv = &per_cpu(cpu_tsc_vars, freq->cpu);
+	int i;
 
 	/*
 	 * There is no way to precisely know the TSC value at which time the
@@ -3625,43 +3666,34 @@  static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	 * any CPUs which may be running in hardware virtualization.
 	 *
 	 * We do this by setting cpu_tsc_synchronized to zero and polling for
-	 * this value to change when entering hardware virtualization.
+	 * this value to change when entering hardware virtualization; this
+	 * will happen in the call to evict(), which also stores the new freq.
 	 */
 	if (val == CPUFREQ_PRECHANGE) {
-		get_online_cpus();
-		atomic_set(&cv->tsc_synchronized, 0);
 		spin_lock(&kvm_lock);
 		list_for_each_entry(kvm, &vm_list, vm_list) {
 			kvm_for_each_vcpu(i, vcpu, kvm) {
 				if (vcpu->cpu != freq->cpu)
 					continue;
-				if (vcpu->cpu != smp_processor_id())
-					send_ipi++;
 				kvm_request_guest_time_update(vcpu);
 			}
 		}
 		spin_unlock(&kvm_lock);
-		if (send_ipi) {
-			/*
-			 * In case we update the frequency for another cpu
-			 * (which might be in guest context) send an interrupt
-			 * to kick the cpu out of guest context.  Next time
-			 * guest context is entered kvmclock will be updated,
-			 * so the guest will not see stale values.
-			 */
-			smp_call_function_single(freq->cpu, evict, NULL, 1);
-		}
 
 		/*
-		 * The update of the frequency can't happen while a VM
-		 * is running, nor can it happen during init when we can
-		 * race against the init code setting the first known freq.
-		 * Just use the vm_tsc_lock for a mutex.
+		 * In case we update the frequency for another cpu (which might
+		 * be in guest context) we must send an interrupt to kick the
+		 * cpu out of guest context.  Next time guest context is
+		 * entered kvmclock will be updated, so the guest will not see
+		 * stale values.
+		 *
+		 * The update of the frequency can't happen while a VM is
+		 * running, nor can it happen while we might possibly be
+		 * running a resync.  To avoid locking problems which arise
+		 * otherwise, always update the tsc_khz value on the affected
+		 * processor; this is done by passing the freq data to evict
 		 */
-		spin_lock(&kvm_tsc_lock);
-		cv->tsc_khz = freq->new;
-		spin_unlock(&kvm_tsc_lock);
-
+		smp_call_function_single(freq->cpu, evict, data, 1);
 		return 0;
 	}
 
@@ -3673,8 +3705,9 @@  static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	 * order in which this notifier is called; this is harmless.
 	 */
 	if (val == CPUFREQ_POSTCHANGE) {
+		get_online_cpus();
 		if (freq->cpu == tsc_base_cpu)
-			smp_call_function_single(freq->cpu, resync, NULL, 1);
+			resync_all();
 		else if (cpu_is_tsc_synchronized(tsc_base_cpu))
 			/* Can't do this if base is not yet updated */
 			kvm_do_sync_tsc(freq->cpu);
@@ -3694,19 +3727,17 @@  static int kvm_x86_cpu_hotplug(struct notifier_block *notifier,
 
 	val &= ~CPU_TASKS_FROZEN;
 	switch (val) {
-	case CPU_DOWN_PREPARE:
+	case CPU_DYING:
 		if (cpu == tsc_base_cpu) {
 			int new_cpu;
-			spin_lock(&kvm_tsc_lock);
 			for_each_online_cpu(new_cpu)
 				if (new_cpu != tsc_base_cpu)
 					break;
 			tsc_base_cpu = new_cpu;
-			spin_unlock(&kvm_tsc_lock);
 		}
 		break;
 
-	case CPU_DYING:
+	case CPU_DOWN_PREPARE:
 	case CPU_UP_CANCELED:
 		atomic_set(&per_cpu(cpu_tsc_vars, cpu).tsc_synchronized, 0);
 		break;
@@ -3784,18 +3815,18 @@  static void kvm_timer_init(void)
 			unsigned long khz = cpufreq_get(cpu);
 			if (!khz)
 				khz = tsc_khz;
-			spin_lock(&kvm_tsc_lock);
+			local_irq_disable();
 			if (!per_cpu(cpu_tsc_vars, cpu).tsc_khz)
 				per_cpu(cpu_tsc_vars, cpu).tsc_khz = khz;
-			spin_unlock(&kvm_tsc_lock);
+			local_irq_enable();
 		}
 	} else {
 		for_each_possible_cpu(cpu)
 			per_cpu(cpu_tsc_vars, cpu).tsc_khz = tsc_khz;
 	}
 	tsc_base_cpu = get_cpu();
-	resync(NULL);
 	put_cpu();
+	resync_all();
 }
 
 int kvm_arch_init(void *opaque)
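
The mechanism the patch's comments describe, evict every CPU, take the highest last_ref as the new reference, then resynchronize the others, can be illustrated outside the kernel. The following is a minimal user-space sketch of that protocol, not the patch itself: cpu_tsc_vars, last_ref, tsc_offset and tsc_synchronized mirror fields in the patch, while NR_CPUS, read_ref_tsc() and the plain function calls standing in for IPIs and work_on_cpu() are illustrative assumptions.

/*
 * Minimal user-space sketch of the evict/resync protocol.  Kernel-only
 * details (preemption, per-cpu sections, kvm_do_sync_tsc) are omitted;
 * read_ref_tsc() is a stand-in for compute_ref_tsc().
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

struct cpu_tsc_vars {
	uint64_t	tsc_offset;
	atomic_int	tsc_synchronized;
	uint64_t	last_ref;
};

static struct cpu_tsc_vars cpu_tsc_vars[NR_CPUS];

/* Stand-in for compute_ref_tsc(): any per-CPU reference clock reading. */
static uint64_t read_ref_tsc(int cpu)
{
	return 1000000 + (uint64_t)cpu * 3;	/* arbitrary demo values */
}

/* Models evict(): record the last good reference and drop out of sync. */
static void evict(int cpu)
{
	struct cpu_tsc_vars *cv = &cpu_tsc_vars[cpu];

	cv->last_ref = read_ref_tsc(cpu);
	atomic_store(&cv->tsc_synchronized, 0);
}

/* Models resync() on the base CPU: adopt the highest reference seen. */
static void resync(int base_cpu)
{
	uint64_t tsc = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)	/* on_each_cpu(evict, ...) */
		evict(cpu);
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (cpu_tsc_vars[cpu].last_ref > tsc)
			tsc = cpu_tsc_vars[cpu].last_ref;

	cpu_tsc_vars[base_cpu].tsc_offset = tsc;
	atomic_store(&cpu_tsc_vars[base_cpu].tsc_synchronized, 1);

	/* The real resync() then runs kvm_do_sync_tsc() for the others. */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (cpu != base_cpu) {
			cpu_tsc_vars[cpu].tsc_offset = tsc;
			atomic_store(&cpu_tsc_vars[cpu].tsc_synchronized, 1);
		}
}

int main(void)
{
	resync(0);
	printf("new reference offset: %llu\n",
	       (unsigned long long)cpu_tsc_vars[0].tsc_offset);
	return 0;
}

Taking the maximum of the per-CPU references means the shared clock can only move forward across a frequency change or skew event, which is why the patch prefers a full eviction over a finer-grained locking scheme; CPUs that were evicted simply spin until tsc_synchronized becomes nonzero before re-entering hardware virtualization.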