new file mode 100644
@@ -0,0 +1,341 @@
+#include "libcflat.h"
+#include "smp.h"
+#include "atomic.h"
+#include "processor.h"
+#include "kvmclock.h"
+
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#define likely(x) __builtin_expect(!!(x), 1)
+
+
+struct pvclock_vcpu_time_info __attribute__((aligned(4))) hv_clock[MAX_CPU];
+struct pvclock_wall_clock wall_clock;
+static unsigned char valid_flags = 0;
+static atomic64_t last_value = ATOMIC64_INIT(0);
+
+/*
+ * Scale a 64-bit delta by shifting and then multiplying by a 32-bit
+ * fraction, yielding a 64-bit result: (delta << shift) * mul_frac / 2^32.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2;
+#endif
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif defined(__x86_64__)
+ __asm__ (
+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+ return product;
+}
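+
+/*
+ * Purely illustrative reference for the asm above, assuming a compiler
+ * that provides __int128; this sketch is not used by the test itself,
+ * it just spells out the arithmetic the asm performs.
+ */
+#ifdef __x86_64__
+static inline u64 scale_delta_ref(u64 delta, u32 mul_frac, int shift)
+{
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+ /* 64x32 -> 96-bit product; keep bits 32..95 */
+ return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
+}
+#endif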
+
+#ifdef __i386__
+# define do_div(n,base) ({ \
+ u32 __base = (base); \
+ u32 __rem; \
+ __rem = ((u64)(n)) % __base; \
+ (n) = ((u64)(n)) / __base; \
+ __rem; \
+ })
+#else
+u32 __attribute__((weak)) __div64_32(u64 *n, u32 base)
+{
+ u64 rem = *n;
+ u64 b = base;
+ u64 res, d = 1;
+ u32 high = rem >> 32;
+
+ /* Reduce the thing a bit first */
+ res = 0;
+ if (high >= base) {
+ high /= base;
+ res = (u64) high << 32;
+ rem -= (u64) (high*base) << 32;
+ }
+
+ while ((s64)b > 0 && b < rem) {
+ b = b+b;
+ d = d+d;
+ }
+
+ do {
+ if (rem >= b) {
+ rem -= b;
+ res += d;
+ }
+ b >>= 1;
+ d >>= 1;
+ } while (d);
+
+ *n = res;
+ return rem;
+}
+
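+/*
+ * do_div(n, base) divides the u64 variable n by the 32-bit base in
+ * place and returns the 32-bit remainder: with n == 2700000000ULL and
+ * base == NSEC_PER_SEC it leaves n == 2 and returns 700000000. The
+ * dead pointer comparison below is a compile-time check that n really
+ * has type u64.
+ */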
+# define do_div(n,base) ({ \
+ u32 __base = (base); \
+ u32 __rem; \
+ (void)(((typeof((n)) *)0) == ((u64 *)0)); \
+ if (likely(((n) >> 32) == 0)) { \
+ __rem = (u32)(n) % __base; \
+ (n) = (u32)(n) / __base; \
+ } else \
+ __rem = __div64_32(&(n), __base); \
+ __rem; \
+ })
+#endif
+
+/**
+ * set_normalized_timespec - set timespec sec and nsec parts and normalize
+ *
+ * @ts: pointer to timespec variable to be set
+ * @sec: seconds to set
+ * @nsec: nanoseconds to set
+ *
+ * Set seconds and nanoseconds field of a timespec variable and
+ * normalize to the timespec storage format
+ *
+ * Note: The tv_nsec part is always in the range of
+ * 0 <= tv_nsec < NSEC_PER_SEC
+ * For negative values only the tv_sec field is negative!
+ */
+void set_normalized_timespec(struct timespec *ts, long sec, s64 nsec)
+{
+ while (nsec >= NSEC_PER_SEC) {
+ /*
+ * The following asm() prevents the compiler from
+ * optimising this loop into a modulo operation. See
+ * also __iter_div_u64_rem() in include/linux/time.h
+ */
+ asm("" : "+rm"(nsec));
+ nsec -= NSEC_PER_SEC;
+ ++sec;
+ }
+ while (nsec < 0) {
+ asm("" : "+rm"(nsec));
+ nsec += NSEC_PER_SEC;
+ --sec;
+ }
+ ts->tv_sec = sec;
+ ts->tv_nsec = nsec;
+}
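+
+/*
+ * For example, set_normalized_timespec(&ts, 0, 1500000000LL) stores
+ * { .tv_sec = 1, .tv_nsec = 500000000 }, and a nsec of -1 normalizes
+ * to { .tv_sec = -1, .tv_nsec = 999999999 }.
+ */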
+
+static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
+{
+ u64 delta = rdtsc() - shadow->tsc_timestamp;
+ return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+/*
+ * Reads a consistent set of time-base values from the hypervisor into
+ * a shadow data area. The hypervisor holds an odd version value while
+ * it updates the structure, so we retry until we observe the same even
+ * version on both sides of the copy.
+ */
+static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
+ struct pvclock_vcpu_time_info *src)
+{
+ do {
+ dst->version = src->version;
+ rmb(); /* fetch version before data */
+ dst->tsc_timestamp = src->tsc_timestamp;
+ dst->system_timestamp = src->system_time;
+ dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
+ dst->tsc_shift = src->tsc_shift;
+ dst->flags = src->flags;
+ rmb(); /* fetch data before checking version */
+ } while ((src->version & 1) || (dst->version != src->version));
+
+ return dst->version;
+}
+
+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+ struct pvclock_shadow_time shadow;
+ unsigned version;
+ cycle_t ret, offset;
+ u64 last;
+
+ do {
+ version = pvclock_get_time_values(&shadow, src);
+ barrier();
+ offset = pvclock_get_nsec_offset(&shadow);
+ ret = shadow.system_timestamp + offset;
+ barrier();
+ } while (version != src->version);
+
+ if ((valid_flags & PVCLOCK_RAW_CYCLE_BIT) ||
+ ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
+ (shadow.flags & PVCLOCK_TSC_STABLE_BIT)))
+ return ret;
+
+ /*
+ * Assumption here is that last_value, a global accumulator, always
+ * goes forward. If we are less than that, we should not be much
+ * smaller. We assume there is an error margin we're inside, and
+ * then the correction does not sacrifice accuracy.
+ *
+ * For reads: the global may have changed between test and return,
+ * but this means someone else updated the clock at a later time.
+ * We just need to make sure we are not seeing a backwards event.
+ *
+ * For updates: last_value = ret is not enough, since two vcpus
+ * could be updating at the same time, and one of them could be
+ * slightly behind, making the assumption that last_value always
+ * goes forward fail to hold.
+ */
+ last = atomic64_read(&last_value);
+ do {
+ if (ret < last)
+ return last;
+ last = atomic64_cmpxchg(&last_value, last, ret);
+ } while (unlikely(last != ret));
+
+ return ret;
+}
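+
+/*
+ * As a consequence, unless PVCLOCK_RAW_CYCLE_BIT was requested, two
+ * successive reads through this path never go backwards. A minimal
+ * self-check sketch (illustrative only, not used by the test):
+ */
+static inline int pvclock_read_is_monotonic(struct pvclock_vcpu_time_info *src)
+{
+ cycle_t t1 = pvclock_clocksource_read(src);
+ cycle_t t2 = pvclock_clocksource_read(src);
+
+ return t2 >= t1;
+}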
+
+cycle_t kvm_clock_read(void)
+{
+ struct pvclock_vcpu_time_info *src;
+ cycle_t ret;
+ int index = smp_id();
+
+ src = &hv_clock[index];
+ ret = pvclock_clocksource_read(src);
+ return ret;
+}
+
+void kvm_clock_init(void *data)
+{
+ int index = smp_id();
+ struct pvclock_vcpu_time_info *hvc = &hv_clock[index];
+
+ printf("kvm-clock: cpu %d, msr 0x%lx\n", index, (unsigned long)hvc);
+ /* bit 0 of the MSR value enables the per-vcpu clock */
+ wrmsr(MSR_KVM_SYSTEM_TIME, (unsigned long)hvc | 1);
+}
+
+void kvm_clock_clear(void *data)
+{
+ wrmsr(MSR_KVM_SYSTEM_TIME, 0LL);
+}
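+
+/*
+ * Typical use from a test's main(), assuming the usual kvm-unit-tests
+ * smp helpers cpu_count() and on_cpu() (a sketch, not part of this
+ * file); successive kvm_clock_read() values are expected to be
+ * non-decreasing:
+ *
+ *     for (i = 0; i < cpu_count(); i++)
+ *             on_cpu(i, kvm_clock_init, NULL);
+ *     t1 = kvm_clock_read();
+ *     t2 = kvm_clock_read();
+ *     for (i = 0; i < cpu_count(); i++)
+ *             on_cpu(i, kvm_clock_clear, NULL);
+ */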
+
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
+ struct pvclock_vcpu_time_info *vcpu_time,
+ struct timespec *ts)
+{
+ u32 version;
+ u64 delta;
+ struct timespec now;
+
+ /* get wallclock at system boot */
+ do {
+ version = wall_clock->version;
+ rmb(); /* fetch version before time */
+ now.tv_sec = wall_clock->sec;
+ now.tv_nsec = wall_clock->nsec;
+ rmb(); /* fetch time before checking version */
+ } while ((wall_clock->version & 1) || (version != wall_clock->version));
+
+ delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
+ delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+ now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+ now.tv_sec = delta;
+
+ set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+}
+
+void kvm_get_wallclock(struct timespec *ts)
+{
+ struct pvclock_vcpu_time_info *vcpu_time;
+ int index = smp_id();
+
+ /* writing the address into the MSR asks the host to fill wall_clock */
+ wrmsr(MSR_KVM_WALL_CLOCK, (unsigned long)&wall_clock);
+ vcpu_time = &hv_clock[index];
+ pvclock_read_wallclock(&wall_clock, vcpu_time, ts);
+}
+
+void pvclock_set_flags(unsigned char flags)
+{
+ valid_flags = flags;
+}
new file mode 100644
@@ -0,0 +1,60 @@
+#ifndef KVMCLOCK_H
+#define KVMCLOCK_H
+
+#define MSR_KVM_WALL_CLOCK 0x11
+#define MSR_KVM_SYSTEM_TIME 0x12
+
+#define MAX_CPU 64
+
+#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
+#define PVCLOCK_RAW_CYCLE_BIT (1 << 7) /* Get raw cycle */
+
+#define NSEC_PER_SEC 1000000000ULL
+
+typedef u64 cycle_t;
+
+struct pvclock_vcpu_time_info {
+ u32 version;
+ u32 pad0;
+ u64 tsc_timestamp;
+ u64 system_time;
+ u32 tsc_to_system_mul;
+ s8 tsc_shift;
+ u8 flags;
+ u8 pad[2];
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct pvclock_wall_clock {
+ u32 version;
+ u32 sec;
+ u32 nsec;
+} __attribute__((__packed__));
+
+/*
+ * These are periodically updated:
+ *    xen: via the magic shared_info page
+ *    kvm: via a gpa registered through an MSR
+ * and then copied here.
+ */
+struct pvclock_shadow_time {
+ u64 tsc_timestamp; /* TSC at last update of time vals. */
+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
+ u32 tsc_to_nsec_mul;
+ int tsc_shift;
+ u32 version;
+ u8 flags;
+};
+
+
+struct timespec {
+ long tv_sec;
+ long tv_nsec;
+};
+
+void pvclock_set_flags(unsigned char flags);
+cycle_t kvm_clock_read(void);
+void kvm_get_wallclock(struct timespec *ts);
+void kvm_clock_init(void *data);
+void kvm_clock_clear(void *data);
+
+#endif