[v5,01/13] KVM: x86: Report host tsc and realtime values in KVM_GET_CLOCK

Message ID 20210729173300.181775-2-oupton@google.com
State New, archived
Series KVM: Add idempotent controls for migrating system counter state

Commit Message

Oliver Upton July 29, 2021, 5:32 p.m. UTC
Handling the migration of TSCs correctly is difficult, in part because
Linux does not provide userspace with the ability to retrieve a (TSC,
realtime) clock pair for a single instant in time. In lieu of a more
convenient facility, KVM can report similar information in the kvm_clock
structure.

Provide userspace with a host TSC & realtime pair iff the realtime clock
is based on the TSC. If userspace provides KVM_SET_CLOCK with a valid
realtime value, advance the KVM clock by the amount of elapsed time. Do
not step the KVM clock backwards, though, as it is a monotonic
oscillator.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Oliver Upton <oupton@google.com>
---
 Documentation/virt/kvm/api.rst  |  42 +++++++--
 arch/x86/include/asm/kvm_host.h |   3 +
 arch/x86/kvm/x86.c              | 149 ++++++++++++++++++++------------
 include/uapi/linux/kvm.h        |   7 +-
 4 files changed, 137 insertions(+), 64 deletions(-)
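
Below is a minimal userspace sketch of the save/restore flow this enables
(hypothetical vm_fd and helper names, error handling elided; an illustration
of the proposed ABI, not part of the patch itself):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Source side: snapshot kvmclock plus the host realtime/TSC pair. */
	static int save_clock(int vm_fd, struct kvm_clock_data *data)
	{
		memset(data, 0, sizeof(*data));
		return ioctl(vm_fd, KVM_GET_CLOCK, data);
	}

	/*
	 * Destination side: restore the clock, letting KVM advance it by the
	 * realtime that elapsed since save_clock() ran.
	 */
	static int restore_clock(int vm_fd, struct kvm_clock_data *data)
	{
		/* KVM_SET_CLOCK rejects flags other than KVM_CLOCK_REALTIME. */
		data->flags &= KVM_CLOCK_REALTIME;
		return ioctl(vm_fd, KVM_SET_CLOCK, data);
	}

If the host's realtime clock moved forward by, say, 50 ms between the two
calls, KVM adds those 50 ms to the clock value rather than resuming the
guest's clock in the past.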

Comments

Sean Christopherson July 30, 2021, 5:48 p.m. UTC | #1
On Thu, Jul 29, 2021, Oliver Upton wrote:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 916c976e99ab..e052c7afaac4 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2782,17 +2782,24 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>  #endif
>  }
>  
> -u64 get_kvmclock_ns(struct kvm *kvm)
> +/*
> + * Returns true if realtime and TSC values were written back to the caller.
> + * Returns false if a clock triplet cannot be obtained, such as if the host's
> + * realtime clock is not based on the TSC.
> + */
> +static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
> +				      u64 *realtime_ns, u64 *tsc)
>  {
>  	struct kvm_arch *ka = &kvm->arch;
>  	struct pvclock_vcpu_time_info hv_clock;
>  	unsigned long flags;
> -	u64 ret;
> +	bool ret = false;
>  
>  	spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
>  	if (!ka->use_master_clock) {
>  		spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> -		return get_kvmclock_base_ns() + ka->kvmclock_offset;
> +		*kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
> +		return false;
>  	}
>  
>  	hv_clock.tsc_timestamp = ka->master_cycle_now;
> @@ -2803,18 +2810,36 @@ u64 get_kvmclock_ns(struct kvm *kvm)
>       get_cpu();
>
>       if (__this_cpu_read(cpu_tsc_khz)) {
> +             struct timespec64 ts;
> +             u64 tsc_val;
> +
>               kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
>                                  &hv_clock.tsc_shift,
>                                  &hv_clock.tsc_to_system_mul);
> -             ret = __pvclock_read_cycles(&hv_clock, rdtsc());
> +
> +             if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
> +                     *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
> +                     *tsc = tsc_val;
> +                     ret = true;
> +             }
> +
> +             *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val);

This is buggy, as tsc_val is not initialized if kvm_get_walltime_and_clockread()
returns false.  This also straight up fails to compile on 32-bit kernels because
kvm_get_walltime_and_clockread() is wrapped with CONFIG_X86_64.

A gross way to resolve both issues would be (see below regarding 'data'):

	if (__this_cpu_read(cpu_tsc_khz)) {
#ifdef CONFIG_X86_64
		struct timespec64 ts;

		if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
			data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
			data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
		} else
#endif
		data->host_tsc = rdtsc();

		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
				   &hv_clock.tsc_shift,
				   &hv_clock.tsc_to_system_mul);

		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
	} else {
		data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
	}


>       } else

Not your code, but this needs braces.

> -             ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
> +             *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
>
>       put_cpu();
>
>       return ret;
>  } 

...

> @@ -5820,6 +5845,68 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
>  }
>  #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
>  
> +static int kvm_vm_ioctl_get_clock(struct kvm *kvm,
> +				  void __user *argp)
> +{
> +	struct kvm_clock_data data;
> +
> +	memset(&data, 0, sizeof(data));
> +
> +	if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime,
> +				      &data.host_tsc))
> +		data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
> +
> +	if (kvm->arch.use_master_clock)
> +		data.flags |= KVM_CLOCK_TSC_STABLE;

I know nothing about the actual behavior, but this appears to have a potential
race (though it came from the existing code).  get_kvmclock_and_realtime() checks
arch.use_master_clock under pvclock_gtod_sync_lock, but this does not.  Even if
that's functionally ok, it's a needless complication.

Fixing that weirdness would also provide an opportunity to clean up the API,
e.g. the boolean return is not exactly straightforward.  The simplest approach
would be to take "struct kvm_clock_data *data" in get_kvmclock_and_realtime()
instead of multiple params.  Then that helper can set the flags accordingly, thus
avoiding the question of whether or not there's a race and eliminating the boolean
return.  E.g.

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e052c7afaac4..872a53a7c467 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2782,25 +2782,20 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
-/*
- * Returns true if realtime and TSC values were written back to the caller.
- * Returns false if a clock triplet cannot be obtained, such as if the host's
- * realtime clock is not based on the TSC.
- */
-static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
-                                     u64 *realtime_ns, u64 *tsc)
+static void get_kvmclock_and_realtime(struct kvm *kvm,
+                                     struct kvm_clock_data *data)
 {
        struct kvm_arch *ka = &kvm->arch;
        struct pvclock_vcpu_time_info hv_clock;
        unsigned long flags;
-       bool ret = false;
 
        spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        if (!ka->use_master_clock) {
                spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
-               *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
-               return false;
+               data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+               return;
        }
+       data->flags |= KVM_CLOCK_TSC_STABLE;
 
        hv_clock.tsc_timestamp = ka->master_cycle_now;
        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
@@ -2810,34 +2805,40 @@ static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
        get_cpu();
 
        if (__this_cpu_read(cpu_tsc_khz)) {
+#ifdef CONFIG_X86_64
                struct timespec64 ts;
-               u64 tsc_val;
+
+               if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
+                       data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
+                       data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
+               } else
+#endif
+               data->host_tsc = rdtsc();
 
                kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
                                   &hv_clock.tsc_shift,
                                   &hv_clock.tsc_to_system_mul);
 
-               if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
-                       *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
-                       *tsc = tsc_val;
-                       ret = true;
-               }
-
-               *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val);
-       } else
-               *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
+               data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
+       } else {
+               data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+       }
 
        put_cpu();
-
-       return ret;
 }
 
 u64 get_kvmclock_ns(struct kvm *kvm)
 {
-       u64 kvmclock_ns, realtime_ns, tsc;
+       struct kvm_clock_data data;
 
-       get_kvmclock_and_realtime(kvm, &kvmclock_ns, &realtime_ns, &tsc);
-       return kvmclock_ns;
+       /*
+        * Zero flags as it's accessed RMW, leave everything else uninitialized
+        * as clock is always written and no other fields are consumed.
+        */
+       data.flags = 0;
+
+       get_kvmclock_and_realtime(kvm, &data);
+       return data.clock;
 }
 
 static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
@@ -5852,12 +5853,7 @@ static int kvm_vm_ioctl_get_clock(struct kvm *kvm,
 
        memset(&data, 0, sizeof(data));
 
-       if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime,
-                                     &data.host_tsc))
-               data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
-
-       if (kvm->arch.use_master_clock)
-               data.flags |= KVM_CLOCK_TSC_STABLE;
+       get_kvmclock_and_realtime(kvm, &data);
 
        if (copy_to_user(argp, &data, sizeof(data)))
                return -EFAULT;

> +
> +	if (copy_to_user(argp, &data, sizeof(data)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int kvm_vm_ioctl_set_clock(struct kvm *kvm,
> +				  void __user *argp)
> +{
> +	struct kvm_arch *ka = &kvm->arch;
> +	struct kvm_clock_data data;
> +	u64 now_raw_ns;
> +
> +	if (copy_from_user(&data, argp, sizeof(data)))
> +		return -EFAULT;
> +
> +	if (data.flags & ~KVM_CLOCK_REALTIME)
> +		return -EINVAL;

Huh, this is an odd ABI: the output of KVM_GET_CLOCK isn't legal input to KVM_SET_CLOCK.
The existing code has the same behavior, so apparently it's ok, just odd.
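
Concretely, the round-trip this implies for userspace (hypothetical vm_fd,
error handling elided):

	struct kvm_clock_data data;

	ioctl(vm_fd, KVM_GET_CLOCK, &data);	/* may set KVM_CLOCK_TSC_STABLE etc. */
	data.flags &= KVM_CLOCK_REALTIME;	/* anything else gets -EINVAL from SET */
	ioctl(vm_fd, KVM_SET_CLOCK, &data);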

> +
> +	/*
> +	 * TODO: userspace has to take care of races with VCPU_RUN, so
> +	 * kvm_gen_update_masterclock() can be cut down to locked
> +	 * pvclock_update_vm_gtod_copy().
> +	 */
> +	kvm_gen_update_masterclock(kvm);
> +
> +	spin_lock_irq(&ka->pvclock_gtod_sync_lock);
> +	if (data.flags & KVM_CLOCK_REALTIME) {
> +		u64 now_real_ns = ktime_get_real_ns();
> +
> +		/*
> +		 * Avoid stepping the kvmclock backwards.
> +		 */
> +		if (now_real_ns > data.realtime)
> +			data.clock += now_real_ns - data.realtime;
> +	}
> +
> +	if (ka->use_master_clock)
> +		now_raw_ns = ka->master_kernel_ns;
> +	else
> +		now_raw_ns = get_kvmclock_base_ns();
> +	ka->kvmclock_offset = data.clock - now_raw_ns;
> +	spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
> +
> +	kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
> +	return 0;
> +}
> +
>  long kvm_arch_vm_ioctl(struct file *filp,
>  		       unsigned int ioctl, unsigned long arg)
>  {
> @@ -6064,57 +6151,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  	}
>  #endif
>  	case KVM_SET_CLOCK: {

The curly braces can be dropped, both here and in KVM_GET_CLOCK.

>  	}
>  	case KVM_GET_CLOCK: {

...

>  	}
Oliver Upton July 30, 2021, 6:24 p.m. UTC | #2
On Fri, Jul 30, 2021 at 10:48 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Jul 29, 2021, Oliver Upton wrote:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 916c976e99ab..e052c7afaac4 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -2782,17 +2782,24 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
> >  #endif
> >  }
> >
> > -u64 get_kvmclock_ns(struct kvm *kvm)
> > +/*
> > + * Returns true if realtime and TSC values were written back to the caller.
> > + * Returns false if a clock triplet cannot be obtained, such as if the host's
> > + * realtime clock is not based on the TSC.
> > + */
> > +static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
> > +                                   u64 *realtime_ns, u64 *tsc)
> >  {
> >       struct kvm_arch *ka = &kvm->arch;
> >       struct pvclock_vcpu_time_info hv_clock;
> >       unsigned long flags;
> > -     u64 ret;
> > +     bool ret = false;
> >
> >       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> >       if (!ka->use_master_clock) {
> >               spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> > -             return get_kvmclock_base_ns() + ka->kvmclock_offset;
> > +             *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
> > +             return false;
> >       }
> >
> >       hv_clock.tsc_timestamp = ka->master_cycle_now;
> > @@ -2803,18 +2810,36 @@ u64 get_kvmclock_ns(struct kvm *kvm)
> >       get_cpu();
> >
> >       if (__this_cpu_read(cpu_tsc_khz)) {
> > +             struct timespec64 ts;
> > +             u64 tsc_val;
> > +
> >               kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
> >                                  &hv_clock.tsc_shift,
> >                                  &hv_clock.tsc_to_system_mul);
> > -             ret = __pvclock_read_cycles(&hv_clock, rdtsc());
> > +
> > +             if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
> > +                     *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
> > +                     *tsc = tsc_val;
> > +                     ret = true;
> > +             }
> > +
> > +             *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val);
>
> This is buggy, as tsc_val is not initialized if kvm_get_walltime_and_clockread()
> returns false.  This also straight up fails to compile on 32-bit kernels because
> kvm_get_walltime_and_clockread() is wrapped with CONFIG_X86_64.

Pssh... works on my machine ;-)

>
> A gross way to resolve both issues would be (see below regarding 'data'):
>
>         if (__this_cpu_read(cpu_tsc_khz)) {
> #ifdef CONFIG_X86_64
>                 struct timespec64 ts;
>
>                 if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
>                         data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
>                         data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
>                 } else
> #endif
>                 data->host_tsc = rdtsc();
>
>                 kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
>                                    &hv_clock.tsc_shift,
>                                    &hv_clock.tsc_to_system_mul);
>
>                 data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
>         } else {
>                 data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
>         }
>

I'll mull over whether there's any cleaner way to fix it, but thanks for
the suggested fix! I missed the x86_64 ifdefs the first time around.

> >       } else
>
> Not your code, but this needs braces.

Ack.

>
> > -             ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
> > +             *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
> >
> >       put_cpu();
> >
> >       return ret;
> >  }
>
> ...
>
> > @@ -5820,6 +5845,68 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
> >  }
> >  #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
> >
> > +static int kvm_vm_ioctl_get_clock(struct kvm *kvm,
> > +                               void __user *argp)
> > +{
> > +     struct kvm_clock_data data;
> > +
> > +     memset(&data, 0, sizeof(data));
> > +
> > +     if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime,
> > +                                   &data.host_tsc))
> > +             data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
> > +
> > +     if (kvm->arch.use_master_clock)
> > +             data.flags |= KVM_CLOCK_TSC_STABLE;
>
> I know nothing about the actual behavior, but this appears to have a potential
> race (though it came from the existing code).  get_kvmclock_and_realtime() checks
> arch.use_master_clock under pvclock_gtod_sync_lock, but this does not.  Even if
> that's functionally ok, it's a needless complication.
>
> Fixing that weirdness would also provide an opportunity to clean up the API,
> e.g. the boolean return is not exactly straightforward.  The simplest approach
> would be to take "struct kvm_clock_data *data" in get_kvmclock_and_realtime()
> instead of multiple params.  Then that helper can set the flags accordingly, thus
> avoiding the question of whether or not there's a race and eliminating the boolean
> return.  E.g.

Yeah, agreed. I think it may be best to separate fixing the mess from
the new API here.

>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e052c7afaac4..872a53a7c467 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2782,25 +2782,20 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>  #endif
>  }
>
> -/*
> - * Returns true if realtime and TSC values were written back to the caller.
> - * Returns false if a clock triplet cannot be obtained, such as if the host's
> - * realtime clock is not based on the TSC.
> - */
> -static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
> -                                     u64 *realtime_ns, u64 *tsc)
> +static void get_kvmclock_and_realtime(struct kvm *kvm,
> +                                     struct kvm_clock_data *data)
>  {
>         struct kvm_arch *ka = &kvm->arch;
>         struct pvclock_vcpu_time_info hv_clock;
>         unsigned long flags;
> -       bool ret = false;
>
>         spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
>         if (!ka->use_master_clock) {
>                 spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> -               *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
> -               return false;
> +               data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
> +               return;
>         }
> +       data->flags |= KVM_CLOCK_TSC_STABLE;
>
>         hv_clock.tsc_timestamp = ka->master_cycle_now;
>         hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
> @@ -2810,34 +2805,40 @@ static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
>         get_cpu();
>
>         if (__this_cpu_read(cpu_tsc_khz)) {
> +#ifdef CONFIG_X86_64
>                 struct timespec64 ts;
> -               u64 tsc_val;
> +
> +               if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
> +                       data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
> +                       data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
> +               } else
> +#endif
> +               data->host_tsc = rdtsc();
>
>                 kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
>                                    &hv_clock.tsc_shift,
>                                    &hv_clock.tsc_to_system_mul);
>
> -               if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
> -                       *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
> -                       *tsc = tsc_val;
> -                       ret = true;
> -               }
> -
> -               *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val);
> -       } else
> -               *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
> +               data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
> +       } else {
> +               data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
> +       }
>
>         put_cpu();
> -
> -       return ret;
>  }
>
>  u64 get_kvmclock_ns(struct kvm *kvm)
>  {
> -       u64 kvmclock_ns, realtime_ns, tsc;
> +       struct kvm_clock_data data;
>
> -       get_kvmclock_and_realtime(kvm, &kvmclock_ns, &realtime_ns, &tsc);
> -       return kvmclock_ns;
> +       /*
> +        * Zero flags as it's accessed RMW, leave everything else uninitialized
> +        * as clock is always written and no other fields are consumed.
> +        */
> +       data.flags = 0;
> +
> +       get_kvmclock_and_realtime(kvm, &data);
> +       return data.clock;
>  }
>
>  static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
> @@ -5852,12 +5853,7 @@ static int kvm_vm_ioctl_get_clock(struct kvm *kvm,
>
>         memset(&data, 0, sizeof(data));
>
> -       if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime,
> -                                     &data.host_tsc))
> -               data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
> -
> -       if (kvm->arch.use_master_clock)
> -               data.flags |= KVM_CLOCK_TSC_STABLE;
> +       get_kvmclock_and_realtime(kvm, &data);
>
>         if (copy_to_user(argp, &data, sizeof(data)))
>                 return -EFAULT;
>
> > +
> > +     if (copy_to_user(argp, &data, sizeof(data)))
> > +             return -EFAULT;
> > +
> > +     return 0;
> > +}
> > +
> > +static int kvm_vm_ioctl_set_clock(struct kvm *kvm,
> > +                               void __user *argp)
> > +{
> > +     struct kvm_arch *ka = &kvm->arch;
> > +     struct kvm_clock_data data;
> > +     u64 now_raw_ns;
> > +
> > +     if (copy_from_user(&data, argp, sizeof(data)))
> > +             return -EFAULT;
> > +
> > +     if (data.flags & ~KVM_CLOCK_REALTIME)
> > +             return -EINVAL;
>
> Huh, this is an odd ABI: the output of KVM_GET_CLOCK isn't legal input to KVM_SET_CLOCK.
> The existing code has the same behavior, so apparently it's ok, just odd.

Quite the head scratcher to me as well /shrug

> > +
> > +     /*
> > +      * TODO: userspace has to take care of races with VCPU_RUN, so
> > +      * kvm_gen_update_masterclock() can be cut down to locked
> > +      * pvclock_update_vm_gtod_copy().
> > +      */
> > +     kvm_gen_update_masterclock(kvm);
> > +
> > +     spin_lock_irq(&ka->pvclock_gtod_sync_lock);
> > +     if (data.flags & KVM_CLOCK_REALTIME) {
> > +             u64 now_real_ns = ktime_get_real_ns();
> > +
> > +             /*
> > +              * Avoid stepping the kvmclock backwards.
> > +              */
> > +             if (now_real_ns > data.realtime)
> > +                     data.clock += now_real_ns - data.realtime;
> > +     }
> > +
> > +     if (ka->use_master_clock)
> > +             now_raw_ns = ka->master_kernel_ns;
> > +     else
> > +             now_raw_ns = get_kvmclock_base_ns();
> > +     ka->kvmclock_offset = data.clock - now_raw_ns;
> > +     spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
> > +
> > +     kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
> > +     return 0;
> > +}
> > +
> >  long kvm_arch_vm_ioctl(struct file *filp,
> >                      unsigned int ioctl, unsigned long arg)
> >  {
> > @@ -6064,57 +6151,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
> >       }
> >  #endif
> >       case KVM_SET_CLOCK: {
>
> The curly braces can be dropped, both here and in KVM_GET_CLOCK.

Ack.

> >       }
> >       case KVM_GET_CLOCK: {
>
> ...
>
> >       }

As always, thanks for the careful review Sean!

--
Best,
Oliver

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index dae68e68ca23..8d4a3471ad9e 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -993,20 +993,34 @@  such as migration.
 When KVM_CAP_ADJUST_CLOCK is passed to KVM_CHECK_EXTENSION, it returns the
 set of bits that KVM can return in struct kvm_clock_data's flag member.
 
-The only flag defined now is KVM_CLOCK_TSC_STABLE.  If set, the returned
-value is the exact kvmclock value seen by all VCPUs at the instant
-when KVM_GET_CLOCK was called.  If clear, the returned value is simply
-CLOCK_MONOTONIC plus a constant offset; the offset can be modified
-with KVM_SET_CLOCK.  KVM will try to make all VCPUs follow this clock,
-but the exact value read by each VCPU could differ, because the host
-TSC is not stable.
+FLAGS:
+
+KVM_CLOCK_TSC_STABLE.  If set, the returned value is the exact kvmclock
+value seen by all VCPUs at the instant when KVM_GET_CLOCK was called.
+If clear, the returned value is simply CLOCK_MONOTONIC plus a constant
+offset; the offset can be modified with KVM_SET_CLOCK.  KVM will try
+to make all VCPUs follow this clock, but the exact value read by each
+VCPU could differ, because the host TSC is not stable.
+
+KVM_CLOCK_REALTIME.  If set, the `realtime` field in the kvm_clock_data
+structure is populated with the value of the host's real time
+clocksource at the instant when KVM_GET_CLOCK was called. If clear,
+the `realtime` field does not contain a value.
+
+KVM_CLOCK_HOST_TSC.  If set, the `host_tsc` field in the kvm_clock_data
+structure is populated with the value of the host's timestamp counter (TSC)
+at the instant when KVM_GET_CLOCK was called. If clear, the `host_tsc` field
+does not contain a value.
 
 ::
 
   struct kvm_clock_data {
 	__u64 clock;  /* kvmclock current value */
 	__u32 flags;
-	__u32 pad[9];
+	__u32 pad0;
+	__u64 realtime;
+	__u64 host_tsc;
+	__u32 pad[4];
   };
 
 
@@ -1023,12 +1037,22 @@  Sets the current timestamp of kvmclock to the value specified in its parameter.
 In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios
 such as migration.
 
+FLAGS:
+
+KVM_CLOCK_REALTIME.  If set, KVM will compare the value of the `realtime` field
+with the value of the host's real time clocksource at the instant when
+KVM_SET_CLOCK was called. The difference in elapsed time is added to the final
+kvmclock value that will be provided to guests.
+
 ::
 
   struct kvm_clock_data {
 	__u64 clock;  /* kvmclock current value */
 	__u32 flags;
-	__u32 pad[9];
+	__u32 pad0;
+	__u64 realtime;
+	__u64 host_tsc;
+	__u32 pad[4];
   };
 
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 99f37781a6fc..65a20c64f959 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1932,4 +1932,7 @@  int kvm_cpu_dirty_log_size(void);
 
 int alloc_all_memslots_rmaps(struct kvm *kvm);
 
+#define KVM_CLOCK_VALID_FLAGS						\
+	(KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 916c976e99ab..e052c7afaac4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2782,17 +2782,24 @@  static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
-u64 get_kvmclock_ns(struct kvm *kvm)
+/*
+ * Returns true if realtime and TSC values were written back to the caller.
+ * Returns false if a clock triplet cannot be obtained, such as if the host's
+ * realtime clock is not based on the TSC.
+ */
+static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
+				      u64 *realtime_ns, u64 *tsc)
 {
 	struct kvm_arch *ka = &kvm->arch;
 	struct pvclock_vcpu_time_info hv_clock;
 	unsigned long flags;
-	u64 ret;
+	bool ret = false;
 
 	spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
 	if (!ka->use_master_clock) {
 		spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
-		return get_kvmclock_base_ns() + ka->kvmclock_offset;
+		*kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
+		return false;
 	}
 
 	hv_clock.tsc_timestamp = ka->master_cycle_now;
@@ -2803,18 +2810,36 @@  u64 get_kvmclock_ns(struct kvm *kvm)
 	get_cpu();
 
 	if (__this_cpu_read(cpu_tsc_khz)) {
+		struct timespec64 ts;
+		u64 tsc_val;
+
 		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
 				   &hv_clock.tsc_shift,
 				   &hv_clock.tsc_to_system_mul);
-		ret = __pvclock_read_cycles(&hv_clock, rdtsc());
+
+		if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
+			*realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
+			*tsc = tsc_val;
+			ret = true;
+		}
+
+		*kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val);
 	} else
-		ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
+		*kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
 
 	put_cpu();
 
 	return ret;
 }
 
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+	u64 kvmclock_ns, realtime_ns, tsc;
+
+	get_kvmclock_and_realtime(kvm, &kvmclock_ns, &realtime_ns, &tsc);
+	return kvmclock_ns;
+}
+
 static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
 				   struct gfn_to_hva_cache *cache,
 				   unsigned int offset)
@@ -4033,7 +4058,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = KVM_SYNC_X86_VALID_FIELDS;
 		break;
 	case KVM_CAP_ADJUST_CLOCK:
-		r = KVM_CLOCK_TSC_STABLE;
+		r = KVM_CLOCK_VALID_FLAGS;
 		break;
 	case KVM_CAP_X86_DISABLE_EXITS:
 		r |=  KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
@@ -5820,6 +5845,68 @@  int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
 }
 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
 
+static int kvm_vm_ioctl_get_clock(struct kvm *kvm,
+				  void __user *argp)
+{
+	struct kvm_clock_data data;
+
+	memset(&data, 0, sizeof(data));
+
+	if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime,
+				      &data.host_tsc))
+		data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
+
+	if (kvm->arch.use_master_clock)
+		data.flags |= KVM_CLOCK_TSC_STABLE;
+
+	if (copy_to_user(argp, &data, sizeof(data)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int kvm_vm_ioctl_set_clock(struct kvm *kvm,
+				  void __user *argp)
+{
+	struct kvm_arch *ka = &kvm->arch;
+	struct kvm_clock_data data;
+	u64 now_raw_ns;
+
+	if (copy_from_user(&data, argp, sizeof(data)))
+		return -EFAULT;
+
+	if (data.flags & ~KVM_CLOCK_REALTIME)
+		return -EINVAL;
+
+	/*
+	 * TODO: userspace has to take care of races with VCPU_RUN, so
+	 * kvm_gen_update_masterclock() can be cut down to locked
+	 * pvclock_update_vm_gtod_copy().
+	 */
+	kvm_gen_update_masterclock(kvm);
+
+	spin_lock_irq(&ka->pvclock_gtod_sync_lock);
+	if (data.flags & KVM_CLOCK_REALTIME) {
+		u64 now_real_ns = ktime_get_real_ns();
+
+		/*
+		 * Avoid stepping the kvmclock backwards.
+		 */
+		if (now_real_ns > data.realtime)
+			data.clock += now_real_ns - data.realtime;
+	}
+
+	if (ka->use_master_clock)
+		now_raw_ns = ka->master_kernel_ns;
+	else
+		now_raw_ns = get_kvmclock_base_ns();
+	ka->kvmclock_offset = data.clock - now_raw_ns;
+	spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
+
+	kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
+	return 0;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
 		       unsigned int ioctl, unsigned long arg)
 {
@@ -6064,57 +6151,11 @@  long kvm_arch_vm_ioctl(struct file *filp,
 	}
 #endif
 	case KVM_SET_CLOCK: {
-		struct kvm_arch *ka = &kvm->arch;
-		struct kvm_clock_data user_ns;
-		u64 now_ns;
-
-		r = -EFAULT;
-		if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
-			goto out;
-
-		r = -EINVAL;
-		if (user_ns.flags)
-			goto out;
-
-		r = 0;
-		/*
-		 * TODO: userspace has to take care of races with VCPU_RUN, so
-		 * kvm_gen_update_masterclock() can be cut down to locked
-		 * pvclock_update_vm_gtod_copy().
-		 */
-		kvm_gen_update_masterclock(kvm);
-
-		/*
-		 * This pairs with kvm_guest_time_update(): when masterclock is
-		 * in use, we use master_kernel_ns + kvmclock_offset to set
-		 * unsigned 'system_time' so if we use get_kvmclock_ns() (which
-		 * is slightly ahead) here we risk going negative on unsigned
-		 * 'system_time' when 'user_ns.clock' is very small.
-		 */
-		spin_lock_irq(&ka->pvclock_gtod_sync_lock);
-		if (kvm->arch.use_master_clock)
-			now_ns = ka->master_kernel_ns;
-		else
-			now_ns = get_kvmclock_base_ns();
-		ka->kvmclock_offset = user_ns.clock - now_ns;
-		spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
-
-		kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
+		r = kvm_vm_ioctl_set_clock(kvm, argp);
 		break;
 	}
 	case KVM_GET_CLOCK: {
-		struct kvm_clock_data user_ns;
-		u64 now_ns;
-
-		now_ns = get_kvmclock_ns(kvm);
-		user_ns.clock = now_ns;
-		user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
-		memset(&user_ns.pad, 0, sizeof(user_ns.pad));
-
-		r = -EFAULT;
-		if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
-			goto out;
-		r = 0;
+		r = kvm_vm_ioctl_get_clock(kvm, argp);
 		break;
 	}
 	case KVM_MEMORY_ENCRYPT_OP: {
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d9e4aabcb31a..53a49cb8616a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1223,11 +1223,16 @@  struct kvm_irqfd {
 
 /* Do not use 1, KVM_CHECK_EXTENSION returned it before we had flags.  */
 #define KVM_CLOCK_TSC_STABLE		2
+#define KVM_CLOCK_REALTIME		(1 << 2)
+#define KVM_CLOCK_HOST_TSC		(1 << 3)
 
 struct kvm_clock_data {
 	__u64 clock;
 	__u32 flags;
-	__u32 pad[9];
+	__u32 pad0;
+	__u64 realtime;
+	__u64 host_tsc;
+	__u32 pad[4];
 };
 
 /* For KVM_CAP_SW_TLB */