diff mbox

[RFC: tsc virtualization 03/20] TSC offset framework

Message ID 1260850127-9766-4-git-send-email-zamsden@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Zachary Amsden Dec. 15, 2009, 4:08 a.m. UTC
None
diff mbox

Patch

different TSC offsets from each other.  The TSC delta is measured
(in cross-cache-directions for highest accuracy) and stored.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
---
 arch/x86/kvm/x86.c |  202 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 202 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index faa467d..95e43b9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -730,6 +730,196 @@  static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
 }
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+static DEFINE_PER_CPU(unsigned long, cpu_tsc_multiplier);
+static DEFINE_PER_CPU(int, cpu_tsc_shift);
+static DEFINE_PER_CPU(s64, cpu_tsc_offset);
+static DEFINE_PER_CPU(u64, cpu_tsc_measure_base);
+static int tsc_base_cpu = -1;
+static unsigned long ref_tsc_khz;
+
+static inline unsigned long div_precise(unsigned long hi, unsigned long lo,
+	unsigned long divisor, unsigned long *rptr)
+{
+	unsigned long quotient, remainder;
+	__asm__ ( "div %4"
+		: "=a" (quotient), "=d" (remainder)
+		: "0" (lo), "1" (hi), "rm" (divisor));
+	*rptr = remainder;
+	return quotient;
+}
+
+/*
+ * compute the best multiplier and shift pair m,s, such that for n,
+ *   n * a / b  = (n * m) >> s
+ */
+static void compute_best_multiplier(unsigned long a, unsigned long b,
+	unsigned long *m, int *s)
+{
+	int shift, bit;
+	unsigned long lo, hi, remainder, mult;
+
+	/*
+	 * By pre-shifting and using maximum machine width, we get the most
+	 * bits of precision.
+	 */
+	shift = BITS_PER_LONG + fls(b) - fls(a) - 1;
+	if (shift > BITS_PER_LONG)
+		shift = BITS_PER_LONG;
+	if (shift < 0)
+		shift = 0;
+	lo = a << shift;
+	hi = a >> (BITS_PER_LONG - shift);
+	mult = div_precise(hi, lo, b, &remainder);
+
+	/* See if it can be further simplified */
+	bit = __ffs(mult);
+	if (bit > shift)
+		bit = shift;
+	mult >>= bit;
+	shift -= bit;
+	*m = mult;
+	*s = shift;
+}
+
+static inline unsigned long mult_precise(unsigned long val, unsigned long mult,
+	int shift)
+{
+	unsigned long top, bot;
+
+	__asm__ ( "mul %3; shrd %1, %0" :
+		 "=&a" (bot), "=&d" (top) :
+		 "0" (mult), "rm" (val), "c" (shift));
+	return bot;
+}
+
+static inline u64 compute_ref_tsc(int cpu)
+{
+	u64 tsc = native_read_tsc() - per_cpu(cpu_tsc_measure_base, cpu);
+	tsc = mult_precise(tsc, per_cpu(cpu_tsc_multiplier, cpu),
+				per_cpu(cpu_tsc_shift, cpu));
+	return tsc + per_cpu(cpu_tsc_offset, cpu);
+}
+
+#define SYNC_TRIES 64
+
+/*
+ * sync_tsc_helper is a dual-entry coroutine meant to be run by only
+ * two CPUs at a time, one of which is a measuring CPU.  Both CPUs
+ * synchronize entry and exit as well as the central recording loop
+ * using only memory barriers and atomic variables to avoid lock latency.
+ *
+ * To discount cache latency effects, this routine will be called
+ * twice, once with the measure / recording CPUs reversed.  In addition,
+ * the first 4 and last 2 results will be discarded to allow branch
+ * prediction to become established (and to discount effects from
+ * a potentially early predicted loop exit).
+ *
+ * Because of this, we must be extra careful to guard the entrance
+ * and exit against the CPU switch.  I chose to use atomic instructions
+ * only at the end of the measure loop and use the same routine for
+ * both CPUs, with symmetric comparisons, and a statically placed
+ * recording array, hopefully maximizing the branch prediction and
+ * cache locality.  The results appear quite good; on known to be
+ * synchronized CPUs, I typically get < 10 TSC delta measured, with
+ * maximum observed error on the order of 100 cycles.
+ *
+ * This doesn't account for NUMA cache effects, and could potentially
+ * be improved there by moving the delta[] array to the stack of the
+ * measuring CPU.  In fact, this modification might be worth trying
+ * for non-NUMA systems as well, but this appears to be fine for now.
+ */
+static void sync_tsc_helper(int measure_cpu, u64 *delta, atomic_t *ready)
+{
+	int tries;
+	static u64 tsc_other;
+	int junk = 0;
+	u64 tsc;
+	int cpu = raw_smp_processor_id();
+
+	if (cpu == measure_cpu) {
+		atomic_set(ready, 0);
+		while (!atomic_read(ready))
+			/* wait */;
+	} else {
+		while (atomic_read(ready))
+			/* wait */;
+		atomic_set(ready, 1);
+	}
+	for (tries = 0; tries < SYNC_TRIES; tries++) {
+		mb();
+		if (cpu == measure_cpu) {
+			atomic_set(ready, 0);
+		} else {
+			while (atomic_read(ready))
+				/* wait */;
+		}
+		native_cpuid(&junk, &junk, &junk, &junk);
+		tsc = compute_ref_tsc(cpu);
+		rdtsc_barrier();
+		if (cpu == measure_cpu) {
+			while (!atomic_read(ready))
+				/* wait */;
+			rmb();
+			delta[tries] = tsc - tsc_other;
+		} else {
+			tsc_other = tsc;
+			wmb();
+			atomic_set(ready, 1);
+		}
+	}
+	if (cpu == measure_cpu)
+		atomic_dec(ready);
+	else
+		atomic_inc(ready);
+	while (atomic_read(ready) != 1)
+		/* wait */;
+	mb();
+}
+
+static void kvm_sync_tsc(void *cpup)
+{
+	int new_cpu = *(int *)cpup;
+	unsigned long flags;
+	static s64 delta[SYNC_TRIES*2];
+	static atomic_t ready = ATOMIC_INIT(1);
+
+	BUG_ON(tsc_base_cpu == -1);
+	pr_debug("%s: IN, cpu = %d, freq = %ldkHz, tsc_base_cpu = %d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_khz, raw_smp_processor_id()) , tsc_base_cpu);
+	local_irq_save(flags);
+	if (raw_smp_processor_id() == new_cpu) {
+		per_cpu(cpu_tsc_measure_base, new_cpu) = native_read_tsc();
+		per_cpu(cpu_tsc_offset, new_cpu) = 0;
+		compute_best_multiplier(ref_tsc_khz,
+			per_cpu(cpu_tsc_khz, new_cpu),
+			&per_cpu(cpu_tsc_multiplier, new_cpu),
+			&per_cpu(cpu_tsc_shift, new_cpu));
+	}
+	sync_tsc_helper(tsc_base_cpu, delta, &ready);
+	sync_tsc_helper(new_cpu, &delta[SYNC_TRIES], &ready);
+	if (raw_smp_processor_id() == new_cpu) {
+		int i;
+		s64 accumulator = 0;
+
+		/*
+		 * accumulate [4, SYNC_TRIES-2) of tsc{base} - tsc{new}
+		 * subtract   [4, SYNC_TRIES-2) of tsc{new} - tsc{base}
+		 *
+		 * this allows instruction cycle and cache differences to
+		 * cancel each other out and drops warm up/cool down variation
+		 *
+		 * Note the arithmetic must be signed because of the divide
+		 */
+
+		for (i = 4; i < SYNC_TRIES - 2; i++)
+			accumulator += delta[i];
+		for (i = 4; i < SYNC_TRIES - 2; i++)
+			accumulator -= delta[i+SYNC_TRIES];
+		accumulator = accumulator / (SYNC_TRIES*2-12);
+		per_cpu(cpu_tsc_offset, new_cpu) = accumulator;
+		pr_debug("%s: OUT, cpu = %d, cpu_tsc_offset = %lld, cpu_tsc_multiplier=%ld, cpu_tsc_shift=%d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_offset, new_cpu), per_cpu(cpu_tsc_multiplier, new_cpu), per_cpu(cpu_tsc_shift, new_cpu));
+	}
+	local_irq_restore(flags);
+}
 
 static void kvm_write_guest_time(struct kvm_vcpu *v)
 {
@@ -3352,6 +3542,18 @@  static void kvm_timer_init(void)
 		for_each_possible_cpu(cpu)
 			per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
 	}
+	tsc_base_cpu = get_cpu();
+	ref_tsc_khz = per_cpu(cpu_tsc_khz, tsc_base_cpu);
+	per_cpu(cpu_tsc_multiplier, tsc_base_cpu) = 1;
+	per_cpu(cpu_tsc_shift, tsc_base_cpu) = 0;
+	per_cpu(cpu_tsc_offset, tsc_base_cpu) = 0;
+	for_each_online_cpu(cpu)
+		if (cpu != tsc_base_cpu) {
+			smp_call_function_single(cpu, kvm_sync_tsc,
+						 (void *)&cpu, 0);
+			kvm_sync_tsc((void *)&cpu);
+		}
+	put_cpu();
 }
 
 int kvm_arch_init(void *opaque)