diff mbox

[qemu,V2,2/2] kvmclock: reduce kvmclock difference on migration

Message ID 20161117132432.367332082@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Marcelo Tosatti Nov. 17, 2016, 1:24 p.m. UTC
Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
indicates that KVM_GET_CLOCK returns a value as seen by the guest at
that moment.

For new machine types, use this value rather than reading 
from guest memory.

This reduces kvmclock difference on migration from 5s to 0.1s
(when max_downtime == 5s).

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
 include/hw/i386/pc.h   |    5 ++
 target-i386/kvm.c      |    7 +++
 target-i386/kvm_i386.h |    1 
 4 files changed, 107 insertions(+), 14 deletions(-)

v2: 
- improve variable names (Juan)
- consolidate code on kvm_get_clock function (Paolo)
- return mach_use_reliable_get_clock from needed function (Paolo)

Comments

Eduardo Habkost Nov. 17, 2016, 2:11 p.m. UTC | #1
On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> that moment.
> 
> For new machine types, use this value rather than reading 
> from guest memory.
> 
> This reduces kvmclock difference on migration from 5s to 0.1s
> (when max_downtime == 5s).
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
>  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
>  include/hw/i386/pc.h   |    5 ++
>  target-i386/kvm.c      |    7 +++
>  target-i386/kvm_i386.h |    1 
>  4 files changed, 107 insertions(+), 14 deletions(-)
> 
> v2: 
> - improve variable names (Juan)
> - consolidate code on kvm_get_clock function (Paolo)
> - return mach_use_reliable_get_clock from needed function (Paolo)
> 
> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> @@ -36,6 +36,12 @@
>  
>      uint64_t clock;
>      bool clock_valid;
> +
> +    /* whether machine supports reliable KVM_GET_CLOCK */
> +    bool mach_use_reliable_get_clock;
> +
> +    /* whether source host supported reliable KVM_GET_CLOCK */
> +    bool src_use_reliable_get_clock;
>  } KVMClockState;
>  
>  struct pvclock_vcpu_time_info {
> @@ -81,6 +87,19 @@
>      return nsec + time.system_time;
>  }
>  
> +static uint64_t kvm_get_clock(void)
> +{
> +    struct kvm_clock_data data;
> +    int ret;
> +
> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> +    if (ret < 0) {
> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> +                abort();
> +    }
> +    return data.clock;
> +}
> +
>  static void kvmclock_vm_state_change(void *opaque, int running,
>                                       RunState state)
>  {
> @@ -91,15 +110,37 @@
>  
>      if (running) {
>          struct kvm_clock_data data = {};
> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> +        uint64_t pvclock_via_mem = 0;
>  
> -        s->clock_valid = false;
> +        /* local (running VM) restore */
> +        if (s->clock_valid) {
> +            /*
> +             * if host does not support reliable KVM_GET_CLOCK,
> +             * read kvmclock value from memory
> +             */
> +            if (!kvm_has_adjust_clock_stable()) {
> +                pvclock_via_mem = kvmclock_current_nsec(s);
> +            }
> +        /* migration/savevm/init restore */
> +        } else {
> +            /*
> +             * use s->clock in case machine uses reliable
> +             * get clock and source host supported
> +             * reliable get clock
> +             */
> +            if (!(s->mach_use_reliable_get_clock &&
> +                  s->src_use_reliable_get_clock)) {
> +                pvclock_via_mem = kvmclock_current_nsec(s);
> +            }

The s->mach_use_reliable_get_clock check seems redundant.
src_use_reliable_get_clock is set only if
mach_use_reliable_get_clock is true.

> +        }
>  
> -        /* We can't rely on the migrated clock value, just discard it */
> -        if (time_at_migration) {
> -            s->clock = time_at_migration;
> +        /* We can't rely on the saved clock value, just discard it */
> +        if (pvclock_via_mem) {
> +            s->clock = pvclock_via_mem;
>          }
>  
> +        s->clock_valid = false;
> +
>          data.clock = s->clock;
>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>          if (ret < 0) {
> @@ -120,8 +161,6 @@
>              }
>          }
>      } else {
> -        struct kvm_clock_data data;
> -        int ret;
>  
>          if (s->clock_valid) {
>              return;
> @@ -129,13 +168,7 @@
>  
>          kvm_synchronize_all_tsc();
>  
> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> -        if (ret < 0) {
> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> -            abort();
> -        }
> -        s->clock = data.clock;
> -
> +        s->clock = kvm_get_clock();
>          /*
>           * If the VM is stopped, declare the clock state valid to
>           * avoid re-reading it on next vmsave (which would return
> @@ -152,22 +185,69 @@
>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>  }
>  
> +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    /*
> +     * On machine types that support reliable KVM_GET_CLOCK,
> +     * if host kernel does provide reliable KVM_GET_CLOCK,
> +     * set src_use_reliable_get_clock=true so that destination
> +     * avoids reading kvmclock from memory.
> +     */
> +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> +        s->src_use_reliable_get_clock = true;
> +    }

It feels fragile to change device state inside the .needed
function. Better to initialize src_use_reliable_get_clock on
kvmclock_realize()?

What exactly ensures src_use_reliable_get_clock is correctly
initialized on the migration destination as well?

> +
> +    return s->mach_use_reliable_get_clock;

If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
simply skip the section?

It looks like mach_use_reliable_get_clock and
src_use_reliable_get_clock could become a single field:

* use_reliable_get_clock ("x-use-reliable-get-clock" property)
  set to true by default (set on DEFINE_PROP_BOOL parameter)
* "x-use-reliable-get-clock" set to false by default on older
  machine-types
* use_reliable_get_clock forced to false on kvmclock_realize() if
  !kvm_has_adjust_clock_stable()
* kvmclock_reliable_get_clock.needed return s->use_reliable_get_clock


> +}
> +
> +static const VMStateDescription kvmclock_reliable_get_clock = {
> +    .name = "kvmclock/src_use_reliable_get_clock",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = kvmclock_src_use_reliable_get_clock,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    s->clock = kvm_get_clock();
> +}
> +
>  static const VMStateDescription kvmclock_vmsd = {
>      .name = "kvmclock",
>      .version_id = 1,
>      .minimum_version_id = 1,
> +    .pre_save = kvmclock_pre_save,
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT64(clock, KVMClockState),
>          VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription * []) {
> +        &kvmclock_reliable_get_clock,
> +        NULL
>      }
>  };
>  
> +static Property kvmclock_properties[] = {
> +    DEFINE_PROP_BOOL("mach_use_reliable_get_clock", KVMClockState,
> +                      mach_use_reliable_get_clock, true),

QOM property coding style is to use "-" instead of "_". Also,
please prefix it with "x-" to indicate it is not part of QEMU
stable external interface.

> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
>  static void kvmclock_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
>  
>      dc->realize = kvmclock_realize;
>      dc->vmsd = &kvmclock_vmsd;
> +    dc->props = kvmclock_properties;
>  }
>  
>  static const TypeInfo kvmclock_info = {
> Index: qemu-mig-advance-clock/include/hw/i386/pc.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-14 10:40:39.748116312 -0200
> +++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-14 10:40:48.059128649 -0200
> @@ -370,6 +370,11 @@
>  #define PC_COMPAT_2_7 \
>      HW_COMPAT_2_7 \
>      {\
> +        .driver   = "kvmclock",\
> +        .property = "mach_use_reliable_get_clock",\
> +        .value    = "off",\
> +    },\
> +    {\
>          .driver   = TYPE_X86_CPU,\
>          .property = "l3-cache",\
>          .value    = "off",\
> Index: qemu-mig-advance-clock/target-i386/kvm.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-14 10:40:39.750116314 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-14 10:40:48.061128652 -0200
> @@ -117,6 +117,13 @@
>      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
>  }
>  
> +bool kvm_has_adjust_clock_stable(void)
> +{
> +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> +
> +    return (ret == KVM_CLOCK_TSC_STABLE);
> +}
> +
>  bool kvm_allows_irq0_override(void)
>  {
>      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-14 10:40:39.751116316 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-14 10:40:48.061128652 -0200
> @@ -17,6 +17,7 @@
>  
>  bool kvm_allows_irq0_override(void);
>  bool kvm_has_smm(void);
> +bool kvm_has_adjust_clock_stable(void);
>  void kvm_synchronize_all_tsc(void);
>  void kvm_arch_reset_vcpu(X86CPU *cs);
>  void kvm_arch_do_init_vcpu(X86CPU *cs);
> 
>
Paolo Bonzini Nov. 17, 2016, 2:15 p.m. UTC | #2
On 17/11/2016 15:11, Eduardo Habkost wrote:
> On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
>> Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
>> indicates that KVM_GET_CLOCK returns a value as seen by the guest at
>> that moment.
>>
>> For new machine types, use this value rather than reading 
>> from guest memory.
>>
>> This reduces kvmclock difference on migration from 5s to 0.1s
>> (when max_downtime == 5s).
>>
>> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>>
>> ---
>>  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
>>  include/hw/i386/pc.h   |    5 ++
>>  target-i386/kvm.c      |    7 +++
>>  target-i386/kvm_i386.h |    1 
>>  4 files changed, 107 insertions(+), 14 deletions(-)
>>
>> v2: 
>> - improve variable names (Juan)
>> - consolidate code on kvm_get_clock function (Paolo)
>> - return mach_use_reliable_get_clock from needed function (Paolo)
>>
>> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
>> ===================================================================
>> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
>> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
>> @@ -36,6 +36,12 @@
>>  
>>      uint64_t clock;
>>      bool clock_valid;
>> +
>> +    /* whether machine supports reliable KVM_GET_CLOCK */
>> +    bool mach_use_reliable_get_clock;
>> +
>> +    /* whether source host supported reliable KVM_GET_CLOCK */
>> +    bool src_use_reliable_get_clock;
>>  } KVMClockState;
>>  
>>  struct pvclock_vcpu_time_info {
>> @@ -81,6 +87,19 @@
>>      return nsec + time.system_time;
>>  }
>>  
>> +static uint64_t kvm_get_clock(void)
>> +{
>> +    struct kvm_clock_data data;
>> +    int ret;
>> +
>> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
>> +    if (ret < 0) {
>> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
>> +                abort();
>> +    }
>> +    return data.clock;
>> +}
>> +
>>  static void kvmclock_vm_state_change(void *opaque, int running,
>>                                       RunState state)
>>  {
>> @@ -91,15 +110,37 @@
>>  
>>      if (running) {
>>          struct kvm_clock_data data = {};
>> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
>> +        uint64_t pvclock_via_mem = 0;
>>  
>> -        s->clock_valid = false;
>> +        /* local (running VM) restore */
>> +        if (s->clock_valid) {
>> +            /*
>> +             * if host does not support reliable KVM_GET_CLOCK,
>> +             * read kvmclock value from memory
>> +             */
>> +            if (!kvm_has_adjust_clock_stable()) {
>> +                pvclock_via_mem = kvmclock_current_nsec(s);
>> +            }
>> +        /* migration/savevm/init restore */
>> +        } else {
>> +            /*
>> +             * use s->clock in case machine uses reliable
>> +             * get clock and source host supported
>> +             * reliable get clock
>> +             */
>> +            if (!(s->mach_use_reliable_get_clock &&
>> +                  s->src_use_reliable_get_clock)) {
>> +                pvclock_via_mem = kvmclock_current_nsec(s);
>> +            }
> 
> The s->mach_use_reliable_get_clock check seems redundant.
> src_use_reliable_get_clock is set only if
> mach_use_reliable_get_clock is true.
> 
>> +        }
>>  
>> -        /* We can't rely on the migrated clock value, just discard it */
>> -        if (time_at_migration) {
>> -            s->clock = time_at_migration;
>> +        /* We can't rely on the saved clock value, just discard it */
>> +        if (pvclock_via_mem) {
>> +            s->clock = pvclock_via_mem;
>>          }
>>  
>> +        s->clock_valid = false;
>> +
>>          data.clock = s->clock;
>>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>>          if (ret < 0) {
>> @@ -120,8 +161,6 @@
>>              }
>>          }
>>      } else {
>> -        struct kvm_clock_data data;
>> -        int ret;
>>  
>>          if (s->clock_valid) {
>>              return;
>> @@ -129,13 +168,7 @@
>>  
>>          kvm_synchronize_all_tsc();
>>  
>> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
>> -        if (ret < 0) {
>> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
>> -            abort();
>> -        }
>> -        s->clock = data.clock;
>> -
>> +        s->clock = kvm_get_clock();
>>          /*
>>           * If the VM is stopped, declare the clock state valid to
>>           * avoid re-reading it on next vmsave (which would return
>> @@ -152,22 +185,69 @@
>>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>>  }
>>  
>> +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
>> +{
>> +    KVMClockState *s = opaque;
>> +
>> +    /*
>> +     * On machine types that support reliable KVM_GET_CLOCK,
>> +     * if host kernel does provide reliable KVM_GET_CLOCK,
>> +     * set src_use_reliable_get_clock=true so that destination
>> +     * avoids reading kvmclock from memory.
>> +     */
>> +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
>> +        s->src_use_reliable_get_clock = true;
>> +    }
> 
> It feels fragile to change device state inside the .needed
> function. Better to initialize src_use_reliable_get_clock on
> kvmclock_realize()?
> 
> What exactly ensures src_use_reliable_get_clock is correctly
> initialized on the migration destination as well?
> 
>> +
>> +    return s->mach_use_reliable_get_clock;
> 
> If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> simply skip the section?

This is what I asked for. :)

However, I was proposing a different way to initialize
src_use_reliable_get_clock.  I still have to understand exactly how
Marcelo's algorithm works because (based on the kvmclock code) it's more
trick than it seems.

Paolo
Marcelo Tosatti Nov. 17, 2016, 4:30 p.m. UTC | #3
On Thu, Nov 17, 2016 at 03:15:03PM +0100, Paolo Bonzini wrote:
> 
> 
> On 17/11/2016 15:11, Eduardo Habkost wrote:
> > On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> >> Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> >> indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> >> that moment.
> >>
> >> For new machine types, use this value rather than reading 
> >> from guest memory.
> >>
> >> This reduces kvmclock difference on migration from 5s to 0.1s
> >> (when max_downtime == 5s).
> >>
> >> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> >>
> >> ---
> >>  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
> >>  include/hw/i386/pc.h   |    5 ++
> >>  target-i386/kvm.c      |    7 +++
> >>  target-i386/kvm_i386.h |    1 
> >>  4 files changed, 107 insertions(+), 14 deletions(-)
> >>
> >> v2: 
> >> - improve variable names (Juan)
> >> - consolidate code on kvm_get_clock function (Paolo)
> >> - return mach_use_reliable_get_clock from needed function (Paolo)
> >>
> >> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> >> ===================================================================
> >> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> >> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> >> @@ -36,6 +36,12 @@
> >>  
> >>      uint64_t clock;
> >>      bool clock_valid;
> >> +
> >> +    /* whether machine supports reliable KVM_GET_CLOCK */
> >> +    bool mach_use_reliable_get_clock;
> >> +
> >> +    /* whether source host supported reliable KVM_GET_CLOCK */
> >> +    bool src_use_reliable_get_clock;
> >>  } KVMClockState;
> >>  
> >>  struct pvclock_vcpu_time_info {
> >> @@ -81,6 +87,19 @@
> >>      return nsec + time.system_time;
> >>  }
> >>  
> >> +static uint64_t kvm_get_clock(void)
> >> +{
> >> +    struct kvm_clock_data data;
> >> +    int ret;
> >> +
> >> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> >> +    if (ret < 0) {
> >> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> >> +                abort();
> >> +    }
> >> +    return data.clock;
> >> +}
> >> +
> >>  static void kvmclock_vm_state_change(void *opaque, int running,
> >>                                       RunState state)
> >>  {
> >> @@ -91,15 +110,37 @@
> >>  
> >>      if (running) {
> >>          struct kvm_clock_data data = {};
> >> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> >> +        uint64_t pvclock_via_mem = 0;
> >>  
> >> -        s->clock_valid = false;
> >> +        /* local (running VM) restore */
> >> +        if (s->clock_valid) {
> >> +            /*
> >> +             * if host does not support reliable KVM_GET_CLOCK,
> >> +             * read kvmclock value from memory
> >> +             */
> >> +            if (!kvm_has_adjust_clock_stable()) {
> >> +                pvclock_via_mem = kvmclock_current_nsec(s);
> >> +            }
> >> +        /* migration/savevm/init restore */
> >> +        } else {
> >> +            /*
> >> +             * use s->clock in case machine uses reliable
> >> +             * get clock and source host supported
> >> +             * reliable get clock
> >> +             */
> >> +            if (!(s->mach_use_reliable_get_clock &&
> >> +                  s->src_use_reliable_get_clock)) {
> >> +                pvclock_via_mem = kvmclock_current_nsec(s);
> >> +            }
> > 
> > The s->mach_use_reliable_get_clock check seems redundant.
> > src_use_reliable_get_clock is set only if
> > mach_use_reliable_get_clock is true.
> > 
> >> +        }
> >>  
> >> -        /* We can't rely on the migrated clock value, just discard it */
> >> -        if (time_at_migration) {
> >> -            s->clock = time_at_migration;
> >> +        /* We can't rely on the saved clock value, just discard it */
> >> +        if (pvclock_via_mem) {
> >> +            s->clock = pvclock_via_mem;
> >>          }
> >>  
> >> +        s->clock_valid = false;
> >> +
> >>          data.clock = s->clock;
> >>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> >>          if (ret < 0) {
> >> @@ -120,8 +161,6 @@
> >>              }
> >>          }
> >>      } else {
> >> -        struct kvm_clock_data data;
> >> -        int ret;
> >>  
> >>          if (s->clock_valid) {
> >>              return;
> >> @@ -129,13 +168,7 @@
> >>  
> >>          kvm_synchronize_all_tsc();
> >>  
> >> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> >> -        if (ret < 0) {
> >> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> >> -            abort();
> >> -        }
> >> -        s->clock = data.clock;
> >> -
> >> +        s->clock = kvm_get_clock();
> >>          /*
> >>           * If the VM is stopped, declare the clock state valid to
> >>           * avoid re-reading it on next vmsave (which would return
> >> @@ -152,22 +185,69 @@
> >>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >>  }
> >>  
> >> +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> >> +{
> >> +    KVMClockState *s = opaque;
> >> +
> >> +    /*
> >> +     * On machine types that support reliable KVM_GET_CLOCK,
> >> +     * if host kernel does provide reliable KVM_GET_CLOCK,
> >> +     * set src_use_reliable_get_clock=true so that destination
> >> +     * avoids reading kvmclock from memory.
> >> +     */
> >> +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> >> +        s->src_use_reliable_get_clock = true;
> >> +    }
> > 
> > It feels fragile to change device state inside the .needed
> > function. Better to initialize src_use_reliable_get_clock on
> > kvmclock_realize()?

Done. Added a new variable because the code was not complete
(stop/cont also runs locally).

> > What exactly ensures src_use_reliable_get_clock is correctly
> > initialized on the migration destination as well?
> > 
> >> +
> >> +    return s->mach_use_reliable_get_clock;
> > 
> > If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> > simply skip the section?
> 
> This is what I asked for. :)
> 
> However, I was proposing a different way to initialize
> src_use_reliable_get_clock.  I still have to understand exactly how
> Marcelo's algorithm works because (based on the kvmclock code) it's more
> trick than it seems.
> 
> Paolo

You asked me to return s->mach_use_reliable_get_clock:

>>> +     * avoids reading kvmclock from memory.
>>> +     */
>>> +    if (s->mach_use_reliable_get_clock &&
>>> kvm_has_adjust_clock_stable()) {
>>> +        s->src_use_reliable_get_clock = true;
>>> +    }
>>> +
>>> +    return s->src_use_reliable_get_clock;
>>> +}
>>
>> Here you can just return s->mach_use_reliable_get_clock.
>
> mach_use_reliable_get_clock can be true but host might not support it.

Yes, but the "needed" function is only required to avoid breaking
pc-i440fx-2.7 and earlier.  If you return true here, you can still
migrate a "false" value for src_use_reliable_get_clock.

===================

I don't see why avoid the subsection, since new machine type is 
incompatible anyway. So Eduardo on your suggestion to 
skip sending the subsection, what is the advantage?
Eduardo Habkost Nov. 17, 2016, 5:41 p.m. UTC | #4
On Thu, Nov 17, 2016 at 02:30:52PM -0200, Marcelo Tosatti wrote:
> On Thu, Nov 17, 2016 at 03:15:03PM +0100, Paolo Bonzini wrote:
> > On 17/11/2016 15:11, Eduardo Habkost wrote:
> > > On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
[...]
> > > What exactly ensures src_use_reliable_get_clock is correctly
> > > initialized on the migration destination as well?
> > > 
> > >> +
> > >> +    return s->mach_use_reliable_get_clock;
> > > 
> > > If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> > > simply skip the section?
> > 
> > This is what I asked for. :)
> > 
> > However, I was proposing a different way to initialize
> > src_use_reliable_get_clock.  I still have to understand exactly how
> > Marcelo's algorithm works because (based on the kvmclock code) it's more
> > trick than it seems.
> > 
> > Paolo
> 
> You asked me to return s->mach_use_reliable_get_clock:
> 
> >>> +     * avoids reading kvmclock from memory.
> >>> +     */
> >>> +    if (s->mach_use_reliable_get_clock &&
> >>> kvm_has_adjust_clock_stable()) {
> >>> +        s->src_use_reliable_get_clock = true;
> >>> +    }
> >>> +
> >>> +    return s->src_use_reliable_get_clock;
> >>> +}
> >>
> >> Here you can just return s->mach_use_reliable_get_clock.
> >
> > mach_use_reliable_get_clock can be true but host might not support it.
> 
> Yes, but the "needed" function is only required to avoid breaking
> pc-i440fx-2.7 and earlier.  If you return true here, you can still
> migrate a "false" value for src_use_reliable_get_clock.
> 
> ===================
> 
> I don't see why avoid the subsection, since new machine type is 
> incompatible anyway. So Eduardo on your suggestion to 
> skip sending the subsection, what is the advantage?

I believe the main (only?) advantage is that it can make the code
simpler: if you simply skip the section if the field is false,
you don't even need two separate fields.
diff mbox

Patch

Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
===================================================================
--- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
+++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
@@ -36,6 +36,12 @@ 
 
     uint64_t clock;
     bool clock_valid;
+
+    /* whether machine supports reliable KVM_GET_CLOCK */
+    bool mach_use_reliable_get_clock;
+
+    /* whether source host supported reliable KVM_GET_CLOCK */
+    bool src_use_reliable_get_clock;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -81,6 +87,19 @@ 
     return nsec + time.system_time;
 }
 
+static uint64_t kvm_get_clock(void)
+{
+    struct kvm_clock_data data;
+    int ret;
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
+    if (ret < 0) {
+        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
+                abort();
+    }
+    return data.clock;
+}
+
 static void kvmclock_vm_state_change(void *opaque, int running,
                                      RunState state)
 {
@@ -91,15 +110,37 @@ 
 
     if (running) {
         struct kvm_clock_data data = {};
-        uint64_t time_at_migration = kvmclock_current_nsec(s);
+        uint64_t pvclock_via_mem = 0;
 
-        s->clock_valid = false;
+        /* local (running VM) restore */
+        if (s->clock_valid) {
+            /*
+             * if host does not support reliable KVM_GET_CLOCK,
+             * read kvmclock value from memory
+             */
+            if (!kvm_has_adjust_clock_stable()) {
+                pvclock_via_mem = kvmclock_current_nsec(s);
+            }
+        /* migration/savevm/init restore */
+        } else {
+            /*
+             * use s->clock in case machine uses reliable
+             * get clock and source host supported
+             * reliable get clock
+             */
+            if (!(s->mach_use_reliable_get_clock &&
+                  s->src_use_reliable_get_clock)) {
+                pvclock_via_mem = kvmclock_current_nsec(s);
+            }
+        }
 
-        /* We can't rely on the migrated clock value, just discard it */
-        if (time_at_migration) {
-            s->clock = time_at_migration;
+        /* We can't rely on the saved clock value, just discard it */
+        if (pvclock_via_mem) {
+            s->clock = pvclock_via_mem;
         }
 
+        s->clock_valid = false;
+
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -120,8 +161,6 @@ 
             }
         }
     } else {
-        struct kvm_clock_data data;
-        int ret;
 
         if (s->clock_valid) {
             return;
@@ -129,13 +168,7 @@ 
 
         kvm_synchronize_all_tsc();
 
-        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
-        if (ret < 0) {
-            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
-            abort();
-        }
-        s->clock = data.clock;
-
+        s->clock = kvm_get_clock();
         /*
          * If the VM is stopped, declare the clock state valid to
          * avoid re-reading it on next vmsave (which would return
@@ -152,22 +185,69 @@ 
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static bool kvmclock_src_use_reliable_get_clock(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    /*
+     * On machine types that support reliable KVM_GET_CLOCK,
+     * if host kernel does provide reliable KVM_GET_CLOCK,
+     * set src_use_reliable_get_clock=true so that destination
+     * avoids reading kvmclock from memory.
+     */
+    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
+        s->src_use_reliable_get_clock = true;
+    }
+
+    return s->mach_use_reliable_get_clock;
+}
+
+static const VMStateDescription kvmclock_reliable_get_clock = {
+    .name = "kvmclock/src_use_reliable_get_clock",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = kvmclock_src_use_reliable_get_clock,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    s->clock = kvm_get_clock();
+}
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
     .version_id = 1,
     .minimum_version_id = 1,
+    .pre_save = kvmclock_pre_save,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &kvmclock_reliable_get_clock,
+        NULL
     }
 };
 
+static Property kvmclock_properties[] = {
+    DEFINE_PROP_BOOL("mach_use_reliable_get_clock", KVMClockState,
+                      mach_use_reliable_get_clock, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void kvmclock_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->realize = kvmclock_realize;
     dc->vmsd = &kvmclock_vmsd;
+    dc->props = kvmclock_properties;
 }
 
 static const TypeInfo kvmclock_info = {
Index: qemu-mig-advance-clock/include/hw/i386/pc.h
===================================================================
--- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-14 10:40:39.748116312 -0200
+++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-14 10:40:48.059128649 -0200
@@ -370,6 +370,11 @@ 
 #define PC_COMPAT_2_7 \
     HW_COMPAT_2_7 \
     {\
+        .driver   = "kvmclock",\
+        .property = "mach_use_reliable_get_clock",\
+        .value    = "off",\
+    },\
+    {\
         .driver   = TYPE_X86_CPU,\
         .property = "l3-cache",\
         .value    = "off",\
Index: qemu-mig-advance-clock/target-i386/kvm.c
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-14 10:40:39.750116314 -0200
+++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-14 10:40:48.061128652 -0200
@@ -117,6 +117,13 @@ 
     return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
 }
 
+bool kvm_has_adjust_clock_stable(void)
+{
+    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
+
+    return (ret == KVM_CLOCK_TSC_STABLE);
+}
+
 bool kvm_allows_irq0_override(void)
 {
     return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-14 10:40:39.751116316 -0200
+++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-14 10:40:48.061128652 -0200
@@ -17,6 +17,7 @@ 
 
 bool kvm_allows_irq0_override(void);
 bool kvm_has_smm(void);
+bool kvm_has_adjust_clock_stable(void);
 void kvm_synchronize_all_tsc(void);
 void kvm_arch_reset_vcpu(X86CPU *cs);
 void kvm_arch_do_init_vcpu(X86CPU *cs);