diff mbox

[4/4] kvm: Allow migration with invtsc

Message ID 1482866480-26208-5-git-send-email-ehabkost@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Eduardo Habkost Dec. 27, 2016, 7:21 p.m. UTC
Instead of blocking migration on the source when invtsc is
enabled, rely on the migration destination to ensure there's no
TSC frequency mismatch.

We can't allow migration unconditionally because we don't know if
the destination is a QEMU version that is really going to ensure
there's no TSC frequency mismatch. To ensure we are migrating to
a destination that won't ignore SET_TSC_KHZ errors, allow invtsc
migration only on pc-*-2.9 and newer.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
---
 include/hw/i386/pc.h |  7 ++++++-
 target/i386/cpu.h    |  1 +
 target/i386/cpu.c    |  1 +
 target/i386/kvm.c    | 15 +++++++++------
 4 files changed, 17 insertions(+), 7 deletions(-)

Comments

Marcelo Tosatti Jan. 4, 2017, 11:56 a.m. UTC | #1
On Tue, Dec 27, 2016 at 05:21:20PM -0200, Eduardo Habkost wrote:
> Instead of blocking migration on the source when invtsc is
> enabled, rely on the migration destination to ensure there's no
> TSC frequency mismatch.
> 
> We can't allow migration unconditionally because we don't know if
> the destination is a QEMU version that is really going to ensure
> there's no TSC frequency mismatch. To ensure we are migrating to
> a destination that won't ignore SET_TSC_KHZ errors, allow invtsc
> migration only on pc-*-2.9 and newer.
> 
> Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
> ---
>  include/hw/i386/pc.h |  7 ++++++-
>  target/i386/cpu.h    |  1 +
>  target/i386/cpu.c    |  1 +
>  target/i386/kvm.c    | 15 +++++++++------
>  4 files changed, 17 insertions(+), 7 deletions(-)
> 
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index ceeacca..4270923 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -375,7 +375,12 @@ int e820_get_num_entries(void);
>  bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
>  
>  #define PC_COMPAT_2_8 \
> -    HW_COMPAT_2_8
> +    HW_COMPAT_2_8 \
> +    {\
> +        .driver   = TYPE_X86_CPU,\
> +        .property = "invtsc-migration",\
> +        .value    = "off",\
> +    },
>  
>  #define PC_COMPAT_2_7 \
>      HW_COMPAT_2_7 \
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index a7f2f60..ec8cdbc 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -1208,6 +1208,7 @@ struct X86CPU {
>      bool expose_kvm;
>      bool migratable;
>      bool host_features;
> +    bool invtsc_migration;
>      uint32_t apic_id;
>  
>      /* if true the CPUID code directly forward host cache leaves to the guest */
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index b0640f1..cc93b81 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -3678,6 +3678,7 @@ static Property x86_cpu_properties[] = {
>      DEFINE_PROP_BOOL("cpuid-0xb", X86CPU, enable_cpuid_0xb, true),
>      DEFINE_PROP_BOOL("lmce", X86CPU, enable_lmce, false),
>      DEFINE_PROP_BOOL("l3-cache", X86CPU, enable_l3_cache, true),
> +    DEFINE_PROP_BOOL("invtsc-migration", X86CPU, invtsc_migration, true),
>      DEFINE_PROP_END_OF_LIST()
>  };
>  
> diff --git a/target/i386/kvm.c b/target/i386/kvm.c
> index 6a51399..2c3ee7b 100644
> --- a/target/i386/kvm.c
> +++ b/target/i386/kvm.c
> @@ -962,7 +962,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
>          has_msr_mcg_ext_ctl = has_msr_feature_control = true;
>      }
>  
> -    if (!env->user_tsc_khz) {
> +    if (!cpu->invtsc_migration && !env->user_tsc_khz) {
>          if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
>              invtsc_mig_blocker == NULL) {
>              /* for migration */
> @@ -972,6 +972,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
>              migrate_add_blocker(invtsc_mig_blocker);
>              /* for savevm */
>              vmstate_x86_cpu.unmigratable = 1;
> +        }
>      }
>  
>      cpuid_data.cpuid.padding = 0;
> @@ -2655,12 +2656,14 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
>      }
>  
>      if (level == KVM_PUT_FULL_STATE) {
> -        /* We don't check for kvm_arch_set_tsc_khz() errors here,
> -         * because TSC frequency mismatch shouldn't abort migration,
> -         * unless the user explicitly asked for a more strict TSC
> -         * setting (e.g. using an explicit "tsc-freq" option).
> +        /* Migration TSC frequency mismatch is fatal only if we are
> +         * actually reporting Invariant TSC to the guest.
>           */
> -        kvm_arch_set_tsc_khz(cpu);
> +        ret = kvm_arch_set_tsc_khz(cpu);
> +        if ((x86_cpu->env.features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
> +            ret < 0) {
> +            return ret;
> +        }
>      }

Will the guest continue in the source in this case?

I think this is past the point where migration has been declared
successful. 

Otherwise looks good.
Eduardo Habkost Jan. 4, 2017, 1:39 p.m. UTC | #2
On Wed, Jan 04, 2017 at 09:56:56AM -0200, Marcelo Tosatti wrote:
> On Tue, Dec 27, 2016 at 05:21:20PM -0200, Eduardo Habkost wrote:
> > Instead of blocking migration on the source when invtsc is
> > enabled, rely on the migration destination to ensure there's no
> > TSC frequency mismatch.
> > 
> > We can't allow migration unconditionally because we don't know if
> > the destination is a QEMU version that is really going to ensure
> > there's no TSC frequency mismatch. To ensure we are migrating to
> > a destination that won't ignore SET_TSC_KHZ errors, allow invtsc
> > migration only on pc-*-2.9 and newer.
> > 
> > Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
> > ---
> >  include/hw/i386/pc.h |  7 ++++++-
> >  target/i386/cpu.h    |  1 +
> >  target/i386/cpu.c    |  1 +
> >  target/i386/kvm.c    | 15 +++++++++------
> >  4 files changed, 17 insertions(+), 7 deletions(-)
> > 
> > diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> > index ceeacca..4270923 100644
> > --- a/include/hw/i386/pc.h
> > +++ b/include/hw/i386/pc.h
> > @@ -375,7 +375,12 @@ int e820_get_num_entries(void);
> >  bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
> >  
> >  #define PC_COMPAT_2_8 \
> > -    HW_COMPAT_2_8
> > +    HW_COMPAT_2_8 \
> > +    {\
> > +        .driver   = TYPE_X86_CPU,\
> > +        .property = "invtsc-migration",\
> > +        .value    = "off",\
> > +    },
> >  
> >  #define PC_COMPAT_2_7 \
> >      HW_COMPAT_2_7 \
> > diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> > index a7f2f60..ec8cdbc 100644
> > --- a/target/i386/cpu.h
> > +++ b/target/i386/cpu.h
> > @@ -1208,6 +1208,7 @@ struct X86CPU {
> >      bool expose_kvm;
> >      bool migratable;
> >      bool host_features;
> > +    bool invtsc_migration;
> >      uint32_t apic_id;
> >  
> >      /* if true the CPUID code directly forward host cache leaves to the guest */
> > diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> > index b0640f1..cc93b81 100644
> > --- a/target/i386/cpu.c
> > +++ b/target/i386/cpu.c
> > @@ -3678,6 +3678,7 @@ static Property x86_cpu_properties[] = {
> >      DEFINE_PROP_BOOL("cpuid-0xb", X86CPU, enable_cpuid_0xb, true),
> >      DEFINE_PROP_BOOL("lmce", X86CPU, enable_lmce, false),
> >      DEFINE_PROP_BOOL("l3-cache", X86CPU, enable_l3_cache, true),
> > +    DEFINE_PROP_BOOL("invtsc-migration", X86CPU, invtsc_migration, true),
> >      DEFINE_PROP_END_OF_LIST()
> >  };
> >  
> > diff --git a/target/i386/kvm.c b/target/i386/kvm.c
> > index 6a51399..2c3ee7b 100644
> > --- a/target/i386/kvm.c
> > +++ b/target/i386/kvm.c
> > @@ -962,7 +962,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
> >          has_msr_mcg_ext_ctl = has_msr_feature_control = true;
> >      }
> >  
> > -    if (!env->user_tsc_khz) {
> > +    if (!cpu->invtsc_migration && !env->user_tsc_khz) {
> >          if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
> >              invtsc_mig_blocker == NULL) {
> >              /* for migration */
> > @@ -972,6 +972,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
> >              migrate_add_blocker(invtsc_mig_blocker);
> >              /* for savevm */
> >              vmstate_x86_cpu.unmigratable = 1;
> > +        }
> >      }
> >  
> >      cpuid_data.cpuid.padding = 0;
> > @@ -2655,12 +2656,14 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
> >      }
> >  
> >      if (level == KVM_PUT_FULL_STATE) {
> > -        /* We don't check for kvm_arch_set_tsc_khz() errors here,
> > -         * because TSC frequency mismatch shouldn't abort migration,
> > -         * unless the user explicitly asked for a more strict TSC
> > -         * setting (e.g. using an explicit "tsc-freq" option).
> > +        /* Migration TSC frequency mismatch is fatal only if we are
> > +         * actually reporting Invariant TSC to the guest.
> >           */
> > -        kvm_arch_set_tsc_khz(cpu);
> > +        ret = kvm_arch_set_tsc_khz(cpu);
> > +        if ((x86_cpu->env.features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
> > +            ret < 0) {
> > +            return ret;
> > +        }
> >      }
> 
> Will the guest continue in the source in this case?
> 
> I think this is past the point where migration has been declared
> successful. 
> 
> Otherwise looks good.

Good point. I will make additional tests and see if there's some
other place where the kvm_arch_set_tsc_khz() call can be moved
to.
Eduardo Habkost Jan. 4, 2017, 7:59 p.m. UTC | #3
On Wed, Jan 04, 2017 at 11:39:16AM -0200, Eduardo Habkost wrote:
> On Wed, Jan 04, 2017 at 09:56:56AM -0200, Marcelo Tosatti wrote:
> > On Tue, Dec 27, 2016 at 05:21:20PM -0200, Eduardo Habkost wrote:
> > > Instead of blocking migration on the source when invtsc is
> > > enabled, rely on the migration destination to ensure there's no
> > > TSC frequency mismatch.
> > > 
> > > We can't allow migration unconditionally because we don't know if
> > > the destination is a QEMU version that is really going to ensure
> > > there's no TSC frequency mismatch. To ensure we are migrating to
> > > a destination that won't ignore SET_TSC_KHZ errors, allow invtsc
> > > migration only on pc-*-2.9 and newer.
> > > 
> > > Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
[...]
> > > @@ -2655,12 +2656,14 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
> > >      }
> > >  
> > >      if (level == KVM_PUT_FULL_STATE) {
> > > -        /* We don't check for kvm_arch_set_tsc_khz() errors here,
> > > -         * because TSC frequency mismatch shouldn't abort migration,
> > > -         * unless the user explicitly asked for a more strict TSC
> > > -         * setting (e.g. using an explicit "tsc-freq" option).
> > > +        /* Migration TSC frequency mismatch is fatal only if we are
> > > +         * actually reporting Invariant TSC to the guest.
> > >           */
> > > -        kvm_arch_set_tsc_khz(cpu);
> > > +        ret = kvm_arch_set_tsc_khz(cpu);
> > > +        if ((x86_cpu->env.features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
> > > +            ret < 0) {
> > > +            return ret;
> > > +        }
> > >      }
> > 
> > Will the guest continue in the source in this case?
> > 
> > I think this is past the point where migration has been declared
> > successful. 
> > 
> > Otherwise looks good.
> 
> Good point. I will make additional tests and see if there's some
> other place where the kvm_arch_set_tsc_khz() call can be moved
> to.

So, if we solve this and do something on (for example) post_load,
we still have a problem: device state is migrated after RAM. This
means QEMU will check for TSC scaling and abort migration very
late.

We could solve that by manually registering a SaveVMHandler that
will send the TSC frequency on save_live_setup, so migration is
aborted earlier.

But: this sounds like just a complex hack to work around the real
problems:

1) TSC frequency is guest-visible, and anything that affects
   guest ABI should depend on the VM configuration, not on host
   capabilities;
2) Setting TSC frequency depending on the host will make
   migratability unpredictable for management software: the same
   VM config could be migratable to host A when started on host
   B, but not migratable to host A when started on host C.

I suggest we allow migration with invtsc if and only if
tsc-frequency is set explicitly by management software. In other
words, apply only patches 1/4 and 2/4 from this series. After
that, we will need libvirt support for configuring tsc-frequency.
Marcelo Tosatti Jan. 4, 2017, 10:26 p.m. UTC | #4
On Wed, Jan 04, 2017 at 05:59:17PM -0200, Eduardo Habkost wrote:
> On Wed, Jan 04, 2017 at 11:39:16AM -0200, Eduardo Habkost wrote:
> > On Wed, Jan 04, 2017 at 09:56:56AM -0200, Marcelo Tosatti wrote:
> > > On Tue, Dec 27, 2016 at 05:21:20PM -0200, Eduardo Habkost wrote:
> > > > Instead of blocking migration on the source when invtsc is
> > > > enabled, rely on the migration destination to ensure there's no
> > > > TSC frequency mismatch.
> > > > 
> > > > We can't allow migration unconditionally because we don't know if
> > > > the destination is a QEMU version that is really going to ensure
> > > > there's no TSC frequency mismatch. To ensure we are migrating to
> > > > a destination that won't ignore SET_TSC_KHZ errors, allow invtsc
> > > > migration only on pc-*-2.9 and newer.
> > > > 
> > > > Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
> [...]
> > > > @@ -2655,12 +2656,14 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
> > > >      }
> > > >  
> > > >      if (level == KVM_PUT_FULL_STATE) {
> > > > -        /* We don't check for kvm_arch_set_tsc_khz() errors here,
> > > > -         * because TSC frequency mismatch shouldn't abort migration,
> > > > -         * unless the user explicitly asked for a more strict TSC
> > > > -         * setting (e.g. using an explicit "tsc-freq" option).
> > > > +        /* Migration TSC frequency mismatch is fatal only if we are
> > > > +         * actually reporting Invariant TSC to the guest.
> > > >           */
> > > > -        kvm_arch_set_tsc_khz(cpu);
> > > > +        ret = kvm_arch_set_tsc_khz(cpu);
> > > > +        if ((x86_cpu->env.features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
> > > > +            ret < 0) {
> > > > +            return ret;
> > > > +        }
> > > >      }
> > > 
> > > Will the guest continue in the source in this case?
> > > 
> > > I think this is past the point where migration has been declared
> > > successful. 
> > > 
> > > Otherwise looks good.
> > 
> > Good point. I will make additional tests and see if there's some
> > other place where the kvm_arch_set_tsc_khz() call can be moved
> > to.
> 
> So, if we solve this and do something on (for example) post_load,
> we still have a problem: device state is migrated after RAM. This
> means QEMU will check for TSC scaling and abort migration very
> late.
> 
> We could solve that by manually registering a SaveVMHandler that
> will send the TSC frequency on save_live_setup, so migration is
> aborted earlier.
> 
> But: this sounds like just a complex hack to work around the real
> problems:
> 
> 1) TSC frequency is guest-visible, and anything that affects
>    guest ABI should depend on the VM configuration, not on host
>    capabilities;

Well not really: the TSC frequency where the guest starts 
is the frequency the guest software expects.
So it does depend on host capabilities.

> 2) Setting TSC frequency depending on the host will make
>    migratability unpredictable for management software: the same
>    VM config could be migratable to host A when started on host
>    B, but not migratable to host A when started on host C.

Well, just check the frequency.

> I suggest we allow migration with invtsc if and only if
> tsc-frequency is set explicitly by management software. In other
> words, apply only patches 1/4 and 2/4 from this series. After
> that, we will need libvirt support for configuring tsc-frequency.

I don't like that for the following reasons:

* It moves low level complexity from QEMU/KVM to libvirt 
   (libvirt has to call KVM_GET_TSC_KHZ from a vcpu thread).
* It makes it difficult to run QEMU manually (i use QEMU
   manually all the time).
* It requires patching libvirt.

In my experience things work better when the functionality is
not split across libvirt/qemu.

Can't this be fixed in QEMU? Just check that destination host supports
TSC scaling before migration (or that KVM_GET_TSC_KHZ return value
matches on source and destination).
Eduardo Habkost Jan. 5, 2017, 1:36 a.m. UTC | #5
On Wed, Jan 04, 2017 at 08:26:27PM -0200, Marcelo Tosatti wrote:
> On Wed, Jan 04, 2017 at 05:59:17PM -0200, Eduardo Habkost wrote:
> > On Wed, Jan 04, 2017 at 11:39:16AM -0200, Eduardo Habkost wrote:
> > > On Wed, Jan 04, 2017 at 09:56:56AM -0200, Marcelo Tosatti wrote:
> > > > On Tue, Dec 27, 2016 at 05:21:20PM -0200, Eduardo Habkost wrote:
> > > > > Instead of blocking migration on the source when invtsc is
> > > > > enabled, rely on the migration destination to ensure there's no
> > > > > TSC frequency mismatch.
> > > > > 
> > > > > We can't allow migration unconditionally because we don't know if
> > > > > the destination is a QEMU version that is really going to ensure
> > > > > there's no TSC frequency mismatch. To ensure we are migrating to
> > > > > a destination that won't ignore SET_TSC_KHZ errors, allow invtsc
> > > > > migration only on pc-*-2.9 and newer.
> > > > > 
> > > > > Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
> > [...]
> > > > > @@ -2655,12 +2656,14 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
> > > > >      }
> > > > >  
> > > > >      if (level == KVM_PUT_FULL_STATE) {
> > > > > -        /* We don't check for kvm_arch_set_tsc_khz() errors here,
> > > > > -         * because TSC frequency mismatch shouldn't abort migration,
> > > > > -         * unless the user explicitly asked for a more strict TSC
> > > > > -         * setting (e.g. using an explicit "tsc-freq" option).
> > > > > +        /* Migration TSC frequency mismatch is fatal only if we are
> > > > > +         * actually reporting Invariant TSC to the guest.
> > > > >           */
> > > > > -        kvm_arch_set_tsc_khz(cpu);
> > > > > +        ret = kvm_arch_set_tsc_khz(cpu);
> > > > > +        if ((x86_cpu->env.features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
> > > > > +            ret < 0) {
> > > > > +            return ret;
> > > > > +        }
> > > > >      }
> > > > 
> > > > Will the guest continue in the source in this case?
> > > > 
> > > > I think this is past the point where migration has been declared
> > > > successful. 
> > > > 
> > > > Otherwise looks good.
> > > 
> > > Good point. I will make additional tests and see if there's some
> > > other place where the kvm_arch_set_tsc_khz() call can be moved
> > > to.
> > 
> > So, if we solve this and do something on (for example) post_load,
> > we still have a problem: device state is migrated after RAM. This
> > means QEMU will check for TSC scaling and abort migration very
> > late.
> > 
> > We could solve that by manually registering a SaveVMHandler that
> > will send the TSC frequency on save_live_setup, so migration is
> > aborted earlier.
> > 
> > But: this sounds like just a complex hack to work around the real
> > problems:
> > 
> > 1) TSC frequency is guest-visible, and anything that affects
> >    guest ABI should depend on the VM configuration, not on host
> >    capabilities;
> 
> Well not really: the TSC frequency where the guest starts 
> is the frequency the guest software expects.
> So it does depend on host capabilities.

Could you explain where this expectation comes from? I can see
multiple cases where choosing the TSC frequency where the VM
starts is not the best option.

I am considering two possible scenarios below. You probably have
another scenario in mind, so it would be useful if you could
describe it so we can consider possible solutions.


Scenario 1:

You have two hosts: A and B, both with TSC scaling. They have
different frequencies. The VM can be started on either one of
them, and can be migrated to either one depending on the policy
of management software.

Maybe B is a backup host, the VM is expected to run most times on
host A, and we want to use the TSC frequency from host A every
time. Maybe it's the opposite and we want to use the frequency of
B. Maybe we want to use the lowest frequency of both, maybe the
highest one. We (QEMU developers) can recommend policy to libvirt
developers, but a given QEMU process doesn't have information to
decide what's best.


Scenario 2:

Host A has TSC scaling, host B doesn't have TSC scaling. We want
to be able to start the VM on host A, and migrate to B. In this
case, the only possible solution is to use B's frequency when
starting the VM. The QEMU process doesn't have enough information
to make that decision.


> 
> > 2) Setting TSC frequency depending on the host will make
> >    migratability unpredictable for management software: the same
> >    VM config could be migratable to host A when started on host
> >    B, but not migratable to host A when started on host C.
> 
> Well, just check the frequency.

What do you mean by "just check the frequency"? What exactly
should management software do?

Whatever management software do, if you just use the source host
frequency you can get into a situation where you can start a VM
on host A and migrate it to B, but can't start the VM on host B
and migrate it to A. This would be confusing for users, and
probably break assumptions of existing management software.

> 
> > I suggest we allow migration with invtsc if and only if
> > tsc-frequency is set explicitly by management software. In other
> > words, apply only patches 1/4 and 2/4 from this series. After
> > that, we will need libvirt support for configuring tsc-frequency.
> 
> I don't like that for the following reasons:
> 
> * It moves low level complexity from QEMU/KVM to libvirt 
>    (libvirt has to call KVM_GET_TSC_KHZ from a vcpu thread).

It doesn't. It could ask QEMU for that information (the new
query-cpu-model-expansion QMP command would allow that).

Or, alternatively, it could just let the user choose the
frequency. It's probably acceptable for many use cases where
invtsc+migration is important.

> * It makes it difficult to run QEMU manually (i use QEMU
>    manually all the time).

I believe the set of people that: 1) need invtsc; 2) need
migration to work; and 3) run QEMU manually will be able to add
"tsc-frequency=XXX" to the command-line easily. :)

> * It requires patching libvirt.
> 
> In my experience things work better when the functionality is
> not split across libvirt/qemu.

In my experience things break when management software is the
only component able to make a decision but we don't provide
mechanisms to let management software make that decision.

The TSC frequency affects migratability to hosts. Choose the
wrong frequency, and you might not be able to migrate. Only
management software knows to which hosts the VM could be migrated
in the future, and which TSC frequency would allow that.

> 
> Can't this be fixed in QEMU? Just check that destination host supports
> TSC scaling before migration (or that KVM_GET_TSC_KHZ return value
> matches on source and destination).
> 

This solves only one use case: where you want to expose the
starting host TSC frequency to the guest. It doesn't cover any
scenario where the TSC frequency of the starting host isn't the
best one. (See the two scenarios I described above)

You seem to have a specific use case in mind. If you describe it,
we could decide if it's worth the extra migration code complexity
to deal with invtsc migration without explicit tsc-frequency. By
now, I am not convinced yet.

Maybe your use case just needs a explicit "invtsc-migration=on"
command-line flag without a mechanism to abort migration on
mismatch? I can't tell.

Note that even if we follow your suggestion and implement an
alternative version of patch 4/4 to cover your use case, I will
strongly recommend libvirt developers to support configuring TSC
frequency if they want to support invtsc + migration without
surprising/unpredictable restrictions on migratability.
Marcelo Tosatti Jan. 5, 2017, 10:48 a.m. UTC | #6
On Wed, Jan 04, 2017 at 11:36:31PM -0200, Eduardo Habkost wrote:
> On Wed, Jan 04, 2017 at 08:26:27PM -0200, Marcelo Tosatti wrote:
> > On Wed, Jan 04, 2017 at 05:59:17PM -0200, Eduardo Habkost wrote:
> > > On Wed, Jan 04, 2017 at 11:39:16AM -0200, Eduardo Habkost wrote:
> > > > On Wed, Jan 04, 2017 at 09:56:56AM -0200, Marcelo Tosatti wrote:
> > > > > On Tue, Dec 27, 2016 at 05:21:20PM -0200, Eduardo Habkost wrote:
> > > > > > Instead of blocking migration on the source when invtsc is
> > > > > > enabled, rely on the migration destination to ensure there's no
> > > > > > TSC frequency mismatch.
> > > > > > 
> > > > > > We can't allow migration unconditionally because we don't know if
> > > > > > the destination is a QEMU version that is really going to ensure
> > > > > > there's no TSC frequency mismatch. To ensure we are migrating to
> > > > > > a destination that won't ignore SET_TSC_KHZ errors, allow invtsc
> > > > > > migration only on pc-*-2.9 and newer.
> > > > > > 
> > > > > > Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
> > > [...]
> > > > > > @@ -2655,12 +2656,14 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
> > > > > >      }
> > > > > >  
> > > > > >      if (level == KVM_PUT_FULL_STATE) {
> > > > > > -        /* We don't check for kvm_arch_set_tsc_khz() errors here,
> > > > > > -         * because TSC frequency mismatch shouldn't abort migration,
> > > > > > -         * unless the user explicitly asked for a more strict TSC
> > > > > > -         * setting (e.g. using an explicit "tsc-freq" option).
> > > > > > +        /* Migration TSC frequency mismatch is fatal only if we are
> > > > > > +         * actually reporting Invariant TSC to the guest.
> > > > > >           */
> > > > > > -        kvm_arch_set_tsc_khz(cpu);
> > > > > > +        ret = kvm_arch_set_tsc_khz(cpu);
> > > > > > +        if ((x86_cpu->env.features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
> > > > > > +            ret < 0) {
> > > > > > +            return ret;
> > > > > > +        }
> > > > > >      }
> > > > > 
> > > > > Will the guest continue in the source in this case?
> > > > > 
> > > > > I think this is past the point where migration has been declared
> > > > > successful. 
> > > > > 
> > > > > Otherwise looks good.
> > > > 
> > > > Good point. I will make additional tests and see if there's some
> > > > other place where the kvm_arch_set_tsc_khz() call can be moved
> > > > to.
> > > 
> > > So, if we solve this and do something on (for example) post_load,
> > > we still have a problem: device state is migrated after RAM. This
> > > means QEMU will check for TSC scaling and abort migration very
> > > late.
> > > 
> > > We could solve that by manually registering a SaveVMHandler that
> > > will send the TSC frequency on save_live_setup, so migration is
> > > aborted earlier.
> > > 
> > > But: this sounds like just a complex hack to work around the real
> > > problems:
> > > 
> > > 1) TSC frequency is guest-visible, and anything that affects
> > >    guest ABI should depend on the VM configuration, not on host
> > >    capabilities;
> > 
> > Well not really: the TSC frequency where the guest starts 
> > is the frequency the guest software expects.
> > So it does depend on host capabilities.
> 
> Could you explain where this expectation comes from? I can see
> multiple cases where choosing the TSC frequency where the VM
> starts is not the best option.

1. Boot guest.
2. Calibrate TSC.
3. Use delay() with TSC calibration above, or
use TSC to measure the passage of time (TSC clock 
in userspace).

> I am considering two possible scenarios below. You probably have
> another scenario in mind, so it would be useful if you could
> describe it so we can consider possible solutions.
> 
> 
> Scenario 1:
> 
> You have two hosts: A and B, both with TSC scaling. They have
> different frequencies. The VM can be started on either one of
> them, and can be migrated to either one depending on the policy
> of management software.
> 
> Maybe B is a backup host, the VM is expected to run most times on
> host A, and we want to use the TSC frequency from host A every
> time. Maybe it's the opposite and we want to use the frequency of
> B. Maybe we want to use the lowest frequency of both, maybe the
> highest one. We (QEMU developers) can recommend policy to libvirt
> developers, but a given QEMU process doesn't have information to
> decide what's best.

I can't see any practical scenario here, you will always want
to start with TSC frequency of the host where the VM was started.

If i am mistaken, please describe a practical case.

(If a practical scenario comes up, and there is a use-case
for setting the TSC frequency on startup, lets say 
a Windows VM which fails to boot if the TSC frequency
is too high, then it should be supported... But the
only known scenario to me, applying to 99.999% of cases, 
is to expose the TSC frequency where the guest booted at).

> Scenario 2:
> 
> Host A has TSC scaling, host B doesn't have TSC scaling. We want
> to be able to start the VM on host A, and migrate to B. In this
> case, the only possible solution is to use B's frequency when
> starting the VM. The QEMU process doesn't have enough information
> to make that decision.

That is a good point. But again, its a special case and 
should be supported by -cpu xxx,tsc-frequency=zzzz.

However, for the vast majority of 99.999% cases, the issue
can be handled entirely in QEMU, without libvirt involvement,
and without adding extra steps to the management software.

> > > 2) Setting TSC frequency depending on the host will make
> > >    migratability unpredictable for management software: the same
> > >    VM config could be migratable to host A when started on host
> > >    B, but not migratable to host A when started on host C.
> > 
> > Well, just check the frequency.
> 
> What do you mean by "just check the frequency"? What exactly
> should management software do?

VM booted on host-A can be migrated to host-B if TSC
frequency matches.

> Whatever management software do, if you just use the source host
> frequency you can get into a situation where you can start a VM
> on host A and migrate it to B, but can't start the VM on host B
> and migrate it to A. This would be confusing for users, and
> probably break assumptions of existing management software.

Well this is a side effect of invariant TSC and migration.

Look, i agree with all your points, but my point is this: i personally
prefer to handle the 99.999% case, which is the TSC frequency exposed is the one
from the host where the guest booted, in QEMU entirely (for example, to
make life easier for people who run qemu manually such as myself).

> > > I suggest we allow migration with invtsc if and only if
> > > tsc-frequency is set explicitly by management software. In other
> > > words, apply only patches 1/4 and 2/4 from this series. After
> > > that, we will need libvirt support for configuring tsc-frequency.
> > 
> > I don't like that for the following reasons:
> > 
> > * It moves low level complexity from QEMU/KVM to libvirt 
> >    (libvirt has to call KVM_GET_TSC_KHZ from a vcpu thread).
> 
> It doesn't. It could ask QEMU for that information (the new
> query-cpu-model-expansion QMP command would allow that).
> 
> Or, alternatively, it could just let the user choose the
> frequency. 

Again, you want to expose the host where the VM booted in most
cases (except the ones you list above).

> It's probably acceptable for many use cases where
> invtsc+migration is important.

Ok, thats better. Can you add a patch to your series with the steps 
as how mgmt software should proceed ?

> > * It makes it difficult to run QEMU manually (i use QEMU
> >    manually all the time).
> 
> I believe the set of people that: 1) need invtsc; 2) need
> migration to work; and 3) run QEMU manually will be able to add
> "tsc-frequency=XXX" to the command-line easily. :)

Ok, so migration is only allowed if tsc-frequency= is specified.

> > * It requires patching libvirt.
> > 
> > In my experience things work better when the functionality is
> > not split across libvirt/qemu.
> 
> In my experience things break when management software is the
> only component able to make a decision but we don't provide
> mechanisms to let management software make that decision.
> 
> The TSC frequency affects migratability to hosts. Choose the
> wrong frequency, and you might not be able to migrate. Only
> management software knows to which hosts the VM could be migrated
> in the future, and which TSC frequency would allow that.

True.

> > Can't this be fixed in QEMU? Just check that destination host supports
> > TSC scaling before migration (or that KVM_GET_TSC_KHZ return value
> > matches on source and destination).
> > 
> 
> This solves only one use case: where you want to expose the
> starting host TSC frequency to the guest. It doesn't cover any
> scenario where the TSC frequency of the starting host isn't the
> best one. (See the two scenarios I described above)

True.

> You seem to have a specific use case in mind. If you describe it,
> we could decide if it's worth the extra migration code complexity
> to deal with invtsc migration without explicit tsc-frequency. By
> now, I am not convinced yet.

I don't have any specific use case in mind. I'm just trying to
avoid moving complexity to libvirt which in my experience is a
the best thing to do.

> Maybe your use case just needs a explicit "invtsc-migration=on"
> command-line flag without a mechanism to abort migration on
> mismatch? I can't tell.

Again, there is no special case.

> Note that even if we follow your suggestion and implement an
> alternative version of patch 4/4 to cover your use case, I will
> strongly recommend libvirt developers to support configuring TSC
> frequency if they want to support invtsc + migration without
> surprising/unpredictable restrictions on migratability.

Well, alright. If you make the TSC frequency of the host
available to mgmt software as you describe, and write the steps mgmt
software should take, i'm good.

Thanks for the help Eduardo.
Marcelo Tosatti Jan. 5, 2017, 10:50 a.m. UTC | #7
On Thu, Jan 05, 2017 at 08:48:32AM -0200, Marcelo Tosatti wrote:
> > Note that even if we follow your suggestion and implement an
> > alternative version of patch 4/4 to cover your use case, I will
> > strongly recommend libvirt developers to support configuring TSC
> > frequency if they want to support invtsc + migration without
> > surprising/unpredictable restrictions on migratability.
> 
> Well, alright. If you make the TSC frequency of the host
> available to mgmt software as you describe, and write the steps mgmt
> software should take, i'm good.
> 
> Thanks for the help Eduardo.
> 

I'll be happy to not have to add a weird script to 

1. Query the host TSC frequency.
2. Pass that to -cpu xxx,tsc-frequency=ZZZ

To my migration scripts. 

Probably easy to make that the default behaviour...
Eduardo Habkost Jan. 5, 2017, 12:19 p.m. UTC | #8
On Thu, Jan 05, 2017 at 08:48:32AM -0200, Marcelo Tosatti wrote:
> On Wed, Jan 04, 2017 at 11:36:31PM -0200, Eduardo Habkost wrote:
> > On Wed, Jan 04, 2017 at 08:26:27PM -0200, Marcelo Tosatti wrote:
> > > On Wed, Jan 04, 2017 at 05:59:17PM -0200, Eduardo Habkost wrote:
> > > > On Wed, Jan 04, 2017 at 11:39:16AM -0200, Eduardo Habkost wrote:
> > > > > On Wed, Jan 04, 2017 at 09:56:56AM -0200, Marcelo Tosatti wrote:
> > > > > > On Tue, Dec 27, 2016 at 05:21:20PM -0200, Eduardo Habkost wrote:
> > > > > > > Instead of blocking migration on the source when invtsc is
> > > > > > > enabled, rely on the migration destination to ensure there's no
> > > > > > > TSC frequency mismatch.
> > > > > > > 
> > > > > > > We can't allow migration unconditionally because we don't know if
> > > > > > > the destination is a QEMU version that is really going to ensure
> > > > > > > there's no TSC frequency mismatch. To ensure we are migrating to
> > > > > > > a destination that won't ignore SET_TSC_KHZ errors, allow invtsc
> > > > > > > migration only on pc-*-2.9 and newer.
> > > > > > > 
> > > > > > > Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
> > > > [...]
> > > > > > > @@ -2655,12 +2656,14 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
> > > > > > >      }
> > > > > > >  
> > > > > > >      if (level == KVM_PUT_FULL_STATE) {
> > > > > > > -        /* We don't check for kvm_arch_set_tsc_khz() errors here,
> > > > > > > -         * because TSC frequency mismatch shouldn't abort migration,
> > > > > > > -         * unless the user explicitly asked for a more strict TSC
> > > > > > > -         * setting (e.g. using an explicit "tsc-freq" option).
> > > > > > > +        /* Migration TSC frequency mismatch is fatal only if we are
> > > > > > > +         * actually reporting Invariant TSC to the guest.
> > > > > > >           */
> > > > > > > -        kvm_arch_set_tsc_khz(cpu);
> > > > > > > +        ret = kvm_arch_set_tsc_khz(cpu);
> > > > > > > +        if ((x86_cpu->env.features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
> > > > > > > +            ret < 0) {
> > > > > > > +            return ret;
> > > > > > > +        }
> > > > > > >      }
> > > > > > 
> > > > > > Will the guest continue in the source in this case?
> > > > > > 
> > > > > > I think this is past the point where migration has been declared
> > > > > > successful. 
> > > > > > 
> > > > > > Otherwise looks good.
> > > > > 
> > > > > Good point. I will make additional tests and see if there's some
> > > > > other place where the kvm_arch_set_tsc_khz() call can be moved
> > > > > to.
> > > > 
> > > > So, if we solve this and do something on (for example) post_load,
> > > > we still have a problem: device state is migrated after RAM. This
> > > > means QEMU will check for TSC scaling and abort migration very
> > > > late.
> > > > 
> > > > We could solve that by manually registering a SaveVMHandler that
> > > > will send the TSC frequency on save_live_setup, so migration is
> > > > aborted earlier.
> > > > 
> > > > But: this sounds like just a complex hack to work around the real
> > > > problems:
> > > > 
> > > > 1) TSC frequency is guest-visible, and anything that affects
> > > >    guest ABI should depend on the VM configuration, not on host
> > > >    capabilities;
> > > 
> > > Well not really: the TSC frequency where the guest starts 
> > > is the frequency the guest software expects.
> > > So it does depend on host capabilities.
> > 
> > Could you explain where this expectation comes from? I can see
> > multiple cases where choosing the TSC frequency where the VM
> > starts is not the best option.
> 
> 1. Boot guest.
> 2. Calibrate TSC.
> 3. Use delay() with TSC calibration above, or
> use TSC to measure the passage of time (TSC clock 
> in userspace).

If TSC scaling is available, using a different frequency should
be safe, shouldn't it? Otherwise, migrating with TSC scaling
wouldn't be safe either.

Anyway: I don't disagree the starting host frequency is a good
default. It is. But I don't think it's the best option on all
cases.

> 
> > I am considering two possible scenarios below. You probably have
> > another scenario in mind, so it would be useful if you could
> > describe it so we can consider possible solutions.
> > 
> > 
> > Scenario 1:
> > 
> > You have two hosts: A and B, both with TSC scaling. They have
> > different frequencies. The VM can be started on either one of
> > them, and can be migrated to either one depending on the policy
> > of management software.
> > 
> > Maybe B is a backup host, the VM is expected to run most times on
> > host A, and we want to use the TSC frequency from host A every
> > time. Maybe it's the opposite and we want to use the frequency of
> > B. Maybe we want to use the lowest frequency of both, maybe the
> > highest one. We (QEMU developers) can recommend policy to libvirt
> > developers, but a given QEMU process doesn't have information to
> > decide what's best.
> 
> I can't see any practical scenario here, you will always want
> to start with TSC frequency of the host where the VM was started.
> 
> If i am mistaken, please describe a practical case.
> 
> (If a practical scenario comes up, and there is a use-case
> for setting the TSC frequency on startup, lets say 
> a Windows VM which fails to boot if the TSC frequency
> is too high, then it should be supported... But the
> only known scenario to me, applying to 99.999% of cases, 
> is to expose the TSC frequency where the guest booted at).

I don't have any specific case: my point is that I can't tell
what's the best frequency if I don't know where the hosts are
expected to be migrated to.

You claim that using the starting host frequency is the best
option on the vast majority of cases. Maybe it's true, and it
would be a good default. The only problem is that this default
affects migratability:

> 
> > Scenario 2:
> > 
> > Host A has TSC scaling, host B doesn't have TSC scaling. We want
> > to be able to start the VM on host A, and migrate to B. In this
> > case, the only possible solution is to use B's frequency when
> > starting the VM. The QEMU process doesn't have enough information
> > to make that decision.
> 
> That is a good point. But again, its a special case and 
> should be supported by -cpu xxx,tsc-frequency=zzzz.
> 
> However, for the vast majority of 99.999% cases, the issue
> can be handled entirely in QEMU, without libvirt involvement,
> and without adding extra steps to the management software.

I agree it should cover most cases. The only problem here is that
it can break migration in unexpected ways.

Then my point is: assuming that libvirt will prefer to require
explicit TSC frequency configuration to enable invtsc migration
(instead of getting unpredictable migration compatibility), is
the added complexity to migration code worth the effort, if
choosing an explicit frequency is safer and more predictable? I
believe this is where we disagree.

> 
> > > > 2) Setting TSC frequency depending on the host will make
> > > >    migratability unpredictable for management software: the same
> > > >    VM config could be migratable to host A when started on host
> > > >    B, but not migratable to host A when started on host C.
> > > 
> > > Well, just check the frequency.
> > 
> > What do you mean by "just check the frequency"? What exactly
> > should management software do?
> 
> VM booted on host-A can be migrated to host-B if TSC
> frequency matches.

Except when TSC scaling is available. Then you may or may not
migrate from A to B, and you don't know that unless you actually
try to migrate.

> 
> > Whatever management software do, if you just use the source host
> > frequency you can get into a situation where you can start a VM
> > on host A and migrate it to B, but can't start the VM on host B
> > and migrate it to A. This would be confusing for users, and
> > probably break assumptions of existing management software.
> 
> Well this is a side effect of invariant TSC and migration.
> 
> Look, i agree with all your points, but my point is this: i personally
> prefer to handle the 99.999% case, which is the TSC frequency exposed is the one
> from the host where the guest booted, in QEMU entirely (for example, to
> make life easier for people who run qemu manually such as myself).

Right. I don't mind not covering 100% of cases. But I worry if
the cases not covered by us behave unexpectedly and
unpredictably. If we caused a failure every time an
unsafe/unsupported config was used, it would be OK. But making
the ability to migrate unpredictable is a more serious issue IMO.

I believe we both agree about how the final version should
behave, but disagree about what is more important in the first
version:

* My proposal for the first version means:
  * People would have to configure TSC frequency manually if
    they want invtsc + migration (until we also provide a
    mechanism to query the host TSC frequency so
    management/scripts could choose it as default);
  * Adding more code to libvirt.
* Your proposal for the first version means:
  * The ability to migrate won't be predictable by libvirt;
  * Extra complexity on the migration code to ensure
    we abort migration on mismatch.

We seem to be weighting those issues differently. To me, having
predictability on migration ability since the first version is
more important to me than making configuration easier (on the
first version).

> 
> > > > I suggest we allow migration with invtsc if and only if
> > > > tsc-frequency is set explicitly by management software. In other
> > > > words, apply only patches 1/4 and 2/4 from this series. After
> > > > that, we will need libvirt support for configuring tsc-frequency.
> > > 
> > > I don't like that for the following reasons:
> > > 
> > > * It moves low level complexity from QEMU/KVM to libvirt 
> > >    (libvirt has to call KVM_GET_TSC_KHZ from a vcpu thread).
> > 
> > It doesn't. It could ask QEMU for that information (the new
> > query-cpu-model-expansion QMP command would allow that).
> > 
> > Or, alternatively, it could just let the user choose the
> > frequency. 
> 
> Again, you want to expose the host where the VM booted in most
> cases (except the ones you list above).
> 
> > It's probably acceptable for many use cases where
> > invtsc+migration is important.
> 
> Ok, thats better. Can you add a patch to your series with the steps 
> as how mgmt software should proceed ?
> 
> > > * It makes it difficult to run QEMU manually (i use QEMU
> > >    manually all the time).
> > 
> > I believe the set of people that: 1) need invtsc; 2) need
> > migration to work; and 3) run QEMU manually will be able to add
> > "tsc-frequency=XXX" to the command-line easily. :)
> 
> Ok, so migration is only allowed if tsc-frequency= is specified.
> 
> > > * It requires patching libvirt.
> > > 
> > > In my experience things work better when the functionality is
> > > not split across libvirt/qemu.
> > 
> > In my experience things break when management software is the
> > only component able to make a decision but we don't provide
> > mechanisms to let management software make that decision.
> > 
> > The TSC frequency affects migratability to hosts. Choose the
> > wrong frequency, and you might not be able to migrate. Only
> > management software knows to which hosts the VM could be migrated
> > in the future, and which TSC frequency would allow that.
> 
> True.
> 
> > > Can't this be fixed in QEMU? Just check that destination host supports
> > > TSC scaling before migration (or that KVM_GET_TSC_KHZ return value
> > > matches on source and destination).
> > > 
> > 
> > This solves only one use case: where you want to expose the
> > starting host TSC frequency to the guest. It doesn't cover any
> > scenario where the TSC frequency of the starting host isn't the
> > best one. (See the two scenarios I described above)
> 
> True.
> 
> > You seem to have a specific use case in mind. If you describe it,
> > we could decide if it's worth the extra migration code complexity
> > to deal with invtsc migration without explicit tsc-frequency. By
> > now, I am not convinced yet.
> 
> I don't have any specific use case in mind. I'm just trying to
> avoid moving complexity to libvirt which in my experience is a
> the best thing to do.

I think both our proposals make things harder for libvirt in
different ways. I just want to make the complexity explicit for
libvirt, instead of giving them the illusion of safety (making
migration break in ways libvirt can't detect).

Anyway, your points are still valid and it would be still useful
to do what you propose. I will give it a try on a new version of
patch 4/4 that can abort migration earlier. But note that I
expect some pushback from other maintainers because of the
complexity of the code. If that happens, be aware that I won't be
able to justify the added complexity because I would still think
it's not worth it.

> 
> > Maybe your use case just needs a explicit "invtsc-migration=on"
> > command-line flag without a mechanism to abort migration on
> > mismatch? I can't tell.
> 
> Again, there is no special case.
> 
> > Note that even if we follow your suggestion and implement an
> > alternative version of patch 4/4 to cover your use case, I will
> > strongly recommend libvirt developers to support configuring TSC
> > frequency if they want to support invtsc + migration without
> > surprising/unpredictable restrictions on migratability.
> 
> Well, alright. If you make the TSC frequency of the host
> available to mgmt software as you describe, and write the steps mgmt
> software should take, i'm good.

I plan to. The problem is that the mechanism to query the host
frequency may be unavailable in the first version.

> 
> Thanks for the help Eduardo.
>
Daniel P. Berrangé Jan. 5, 2017, 12:33 p.m. UTC | #9
On Thu, Jan 05, 2017 at 10:19:50AM -0200, Eduardo Habkost wrote:
> On Thu, Jan 05, 2017 at 08:48:32AM -0200, Marcelo Tosatti wrote:
> > On Wed, Jan 04, 2017 at 11:36:31PM -0200, Eduardo Habkost wrote:
> > > I am considering two possible scenarios below. You probably have
> > > another scenario in mind, so it would be useful if you could
> > > describe it so we can consider possible solutions.
> > > 
> > > 
> > > Scenario 1:
> > > 
> > > You have two hosts: A and B, both with TSC scaling. They have
> > > different frequencies. The VM can be started on either one of
> > > them, and can be migrated to either one depending on the policy
> > > of management software.
> > > 
> > > Maybe B is a backup host, the VM is expected to run most times on
> > > host A, and we want to use the TSC frequency from host A every
> > > time. Maybe it's the opposite and we want to use the frequency of
> > > B. Maybe we want to use the lowest frequency of both, maybe the
> > > highest one. We (QEMU developers) can recommend policy to libvirt
> > > developers, but a given QEMU process doesn't have information to
> > > decide what's best.
> > 
> > I can't see any practical scenario here, you will always want
> > to start with TSC frequency of the host where the VM was started.
> > 
> > If i am mistaken, please describe a practical case.
> > 
> > (If a practical scenario comes up, and there is a use-case
> > for setting the TSC frequency on startup, lets say 
> > a Windows VM which fails to boot if the TSC frequency
> > is too high, then it should be supported... But the
> > only known scenario to me, applying to 99.999% of cases, 
> > is to expose the TSC frequency where the guest booted at).
> 
> I don't have any specific case: my point is that I can't tell
> what's the best frequency if I don't know where the hosts are
> expected to be migrated to.
> 
> You claim that using the starting host frequency is the best
> option on the vast majority of cases. Maybe it's true, and it
> would be a good default. The only problem is that this default
> affects migratability:

Yep, this is the same scenario as CPU model choice. While
using "host" model may be a reasonable default if all the
hosts in your cluster use identical CPUs. If you have a
mix of CPUs though, and you care about migration, you'll
want to determine a common denominator instead. The same
applies to TSC frequency choice.

So from that POV, the libvirt capabilities XML for a host
needs to have a way to report the supported TSC frequency
for the host. The mgmt app can then figure out what explicit
TSC freq to set for guests that need to be migratable to
maximise runnability across the hosts.

> > > Scenario 2:
> > > 
> > > Host A has TSC scaling, host B doesn't have TSC scaling. We want
> > > to be able to start the VM on host A, and migrate to B. In this
> > > case, the only possible solution is to use B's frequency when
> > > starting the VM. The QEMU process doesn't have enough information
> > > to make that decision.
> > 
> > That is a good point. But again, its a special case and 
> > should be supported by -cpu xxx,tsc-frequency=zzzz.
> > 
> > However, for the vast majority of 99.999% cases, the issue
> > can be handled entirely in QEMU, without libvirt involvement,
> > and without adding extra steps to the management software.
> 
> I agree it should cover most cases. The only problem here is that
> it can break migration in unexpected ways.
> 
> Then my point is: assuming that libvirt will prefer to require
> explicit TSC frequency configuration to enable invtsc migration
> (instead of getting unpredictable migration compatibility), is
> the added complexity to migration code worth the effort, if
> choosing an explicit frequency is safer and more predictable? I
> believe this is where we disagree.

I believe that if libvirt detects that QEMU supports the 'tsc-frequency'
option, then libvirt should set it by default in the XML, if not already
set by the mgmt app. That way, libvirt can validate TSC freq comapt
for migration before it even launches QEMU in the target host.

Regards,
Daniel
Eduardo Habkost Jan. 5, 2017, 12:48 p.m. UTC | #10
On Thu, Jan 05, 2017 at 12:33:51PM +0000, Daniel P. Berrange wrote:
> On Thu, Jan 05, 2017 at 10:19:50AM -0200, Eduardo Habkost wrote:
> > On Thu, Jan 05, 2017 at 08:48:32AM -0200, Marcelo Tosatti wrote:
> > > On Wed, Jan 04, 2017 at 11:36:31PM -0200, Eduardo Habkost wrote:
[...]
> > > > Scenario 2:
> > > > 
> > > > Host A has TSC scaling, host B doesn't have TSC scaling. We want
> > > > to be able to start the VM on host A, and migrate to B. In this
> > > > case, the only possible solution is to use B's frequency when
> > > > starting the VM. The QEMU process doesn't have enough information
> > > > to make that decision.
> > > 
> > > That is a good point. But again, its a special case and 
> > > should be supported by -cpu xxx,tsc-frequency=zzzz.
> > > 
> > > However, for the vast majority of 99.999% cases, the issue
> > > can be handled entirely in QEMU, without libvirt involvement,
> > > and without adding extra steps to the management software.
> > 
> > I agree it should cover most cases. The only problem here is that
> > it can break migration in unexpected ways.
> > 
> > Then my point is: assuming that libvirt will prefer to require
> > explicit TSC frequency configuration to enable invtsc migration
> > (instead of getting unpredictable migration compatibility), is
> > the added complexity to migration code worth the effort, if
> > choosing an explicit frequency is safer and more predictable? I
> > believe this is where we disagree.
> 
> I believe that if libvirt detects that QEMU supports the 'tsc-frequency'
> option, then libvirt should set it by default in the XML, if not already
> set by the mgmt app. That way, libvirt can validate TSC freq comapt
> for migration before it even launches QEMU in the target host.

If you do this unconditionally, you have another problem: if
tsc-frequency is set explicitly, migration is only possible if
TSC frequency of the destination matches[1], or if TSC scaling is
supported by the destination. It's a good idea to set a TSC
frequency only if invtsc is enabled explicitly in the config.

[1] Currently the frequency needs to match exactly. That's a
separate issue: we should probably add a knob to allow a slight
variation in TSC frequency (e.g. <1% difference).
Daniel P. Berrangé Jan. 5, 2017, 1 p.m. UTC | #11
On Thu, Jan 05, 2017 at 10:48:57AM -0200, Eduardo Habkost wrote:
> On Thu, Jan 05, 2017 at 12:33:51PM +0000, Daniel P. Berrange wrote:
> > On Thu, Jan 05, 2017 at 10:19:50AM -0200, Eduardo Habkost wrote:
> > > On Thu, Jan 05, 2017 at 08:48:32AM -0200, Marcelo Tosatti wrote:
> > > > On Wed, Jan 04, 2017 at 11:36:31PM -0200, Eduardo Habkost wrote:
> [...]
> > > > > Scenario 2:
> > > > > 
> > > > > Host A has TSC scaling, host B doesn't have TSC scaling. We want
> > > > > to be able to start the VM on host A, and migrate to B. In this
> > > > > case, the only possible solution is to use B's frequency when
> > > > > starting the VM. The QEMU process doesn't have enough information
> > > > > to make that decision.
> > > > 
> > > > That is a good point. But again, its a special case and 
> > > > should be supported by -cpu xxx,tsc-frequency=zzzz.
> > > > 
> > > > However, for the vast majority of 99.999% cases, the issue
> > > > can be handled entirely in QEMU, without libvirt involvement,
> > > > and without adding extra steps to the management software.
> > > 
> > > I agree it should cover most cases. The only problem here is that
> > > it can break migration in unexpected ways.
> > > 
> > > Then my point is: assuming that libvirt will prefer to require
> > > explicit TSC frequency configuration to enable invtsc migration
> > > (instead of getting unpredictable migration compatibility), is
> > > the added complexity to migration code worth the effort, if
> > > choosing an explicit frequency is safer and more predictable? I
> > > believe this is where we disagree.
> > 
> > I believe that if libvirt detects that QEMU supports the 'tsc-frequency'
> > option, then libvirt should set it by default in the XML, if not already
> > set by the mgmt app. That way, libvirt can validate TSC freq comapt
> > for migration before it even launches QEMU in the target host.
> 
> If you do this unconditionally, you have another problem: if
> tsc-frequency is set explicitly, migration is only possible if
> TSC frequency of the destination matches[1], or if TSC scaling is
> supported by the destination. It's a good idea to set a TSC
> frequency only if invtsc is enabled explicitly in the config.

If we don't set tsc-frequency and the TSC frequency doesn't
match, does that mean the guest migration succeed, but suddenly
sees different TSC frequency ?

I guess we we allowed that historically we can't break that
now, so setting it only if invtsc is set seems reasonable.

> 
> [1] Currently the frequency needs to match exactly. That's a
> separate issue: we should probably add a knob to allow a slight
> variation in TSC frequency (e.g. <1% difference).

Regards,
Daniel
Eduardo Habkost Jan. 5, 2017, 1:11 p.m. UTC | #12
On Thu, Jan 05, 2017 at 01:00:32PM +0000, Daniel P. Berrange wrote:
> On Thu, Jan 05, 2017 at 10:48:57AM -0200, Eduardo Habkost wrote:
> > On Thu, Jan 05, 2017 at 12:33:51PM +0000, Daniel P. Berrange wrote:
> > > On Thu, Jan 05, 2017 at 10:19:50AM -0200, Eduardo Habkost wrote:
> > > > On Thu, Jan 05, 2017 at 08:48:32AM -0200, Marcelo Tosatti wrote:
> > > > > On Wed, Jan 04, 2017 at 11:36:31PM -0200, Eduardo Habkost wrote:
> > [...]
> > > > > > Scenario 2:
> > > > > > 
> > > > > > Host A has TSC scaling, host B doesn't have TSC scaling. We want
> > > > > > to be able to start the VM on host A, and migrate to B. In this
> > > > > > case, the only possible solution is to use B's frequency when
> > > > > > starting the VM. The QEMU process doesn't have enough information
> > > > > > to make that decision.
> > > > > 
> > > > > That is a good point. But again, its a special case and 
> > > > > should be supported by -cpu xxx,tsc-frequency=zzzz.
> > > > > 
> > > > > However, for the vast majority of 99.999% cases, the issue
> > > > > can be handled entirely in QEMU, without libvirt involvement,
> > > > > and without adding extra steps to the management software.
> > > > 
> > > > I agree it should cover most cases. The only problem here is that
> > > > it can break migration in unexpected ways.
> > > > 
> > > > Then my point is: assuming that libvirt will prefer to require
> > > > explicit TSC frequency configuration to enable invtsc migration
> > > > (instead of getting unpredictable migration compatibility), is
> > > > the added complexity to migration code worth the effort, if
> > > > choosing an explicit frequency is safer and more predictable? I
> > > > believe this is where we disagree.
> > > 
> > > I believe that if libvirt detects that QEMU supports the 'tsc-frequency'
> > > option, then libvirt should set it by default in the XML, if not already
> > > set by the mgmt app. That way, libvirt can validate TSC freq comapt
> > > for migration before it even launches QEMU in the target host.
> > 
> > If you do this unconditionally, you have another problem: if
> > tsc-frequency is set explicitly, migration is only possible if
> > TSC frequency of the destination matches[1], or if TSC scaling is
> > supported by the destination. It's a good idea to set a TSC
> > frequency only if invtsc is enabled explicitly in the config.
> 
> If we don't set tsc-frequency and the TSC frequency doesn't
> match, does that mean the guest migration succeed, but suddenly
> sees different TSC frequency ?

If TSC scaling is unavailable, yes. If the destination host
supports TSC scaling, we automatically keep the original TSC
frequency on migration.

> 
> I guess we we allowed that historically we can't break that
> now, so setting it only if invtsc is set seems reasonable.

I don't think we really had a choice, KVM would be much less
useful if we didn't allow migration between hosts with different
frequencies.

> 
> > 
> > [1] Currently the frequency needs to match exactly. That's a
> > separate issue: we should probably add a knob to allow a slight
> > variation in TSC frequency (e.g. <1% difference).
> 
> Regards,
> Daniel
> -- 
> |: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
> |: http://libvirt.org              -o-             http://virt-manager.org :|
> |: http://entangle-photo.org       -o-    http://search.cpan.org/~danberr/ :|
Marcelo Tosatti Jan. 6, 2017, 10:31 a.m. UTC | #13
On Thu, Jan 05, 2017 at 10:19:50AM -0200, Eduardo Habkost wrote:
> On Thu, Jan 05, 2017 at 08:48:32AM -0200, Marcelo Tosatti wrote:
> > On Wed, Jan 04, 2017 at 11:36:31PM -0200, Eduardo Habkost wrote:
> > > On Wed, Jan 04, 2017 at 08:26:27PM -0200, Marcelo Tosatti wrote:
> > > > On Wed, Jan 04, 2017 at 05:59:17PM -0200, Eduardo Habkost wrote:
> > > > > On Wed, Jan 04, 2017 at 11:39:16AM -0200, Eduardo Habkost wrote:
> > > > > > On Wed, Jan 04, 2017 at 09:56:56AM -0200, Marcelo Tosatti wrote:
> > > > > > > On Tue, Dec 27, 2016 at 05:21:20PM -0200, Eduardo Habkost wrote:
> > > > > > > > Instead of blocking migration on the source when invtsc is
> > > > > > > > enabled, rely on the migration destination to ensure there's no
> > > > > > > > TSC frequency mismatch.
> > > > > > > > 
> > > > > > > > We can't allow migration unconditionally because we don't know if
> > > > > > > > the destination is a QEMU version that is really going to ensure
> > > > > > > > there's no TSC frequency mismatch. To ensure we are migrating to
> > > > > > > > a destination that won't ignore SET_TSC_KHZ errors, allow invtsc
> > > > > > > > migration only on pc-*-2.9 and newer.
> > > > > > > > 
> > > > > > > > Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
> > > > > [...]
> > > > > > > > @@ -2655,12 +2656,14 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
> > > > > > > >      }
> > > > > > > >  
> > > > > > > >      if (level == KVM_PUT_FULL_STATE) {
> > > > > > > > -        /* We don't check for kvm_arch_set_tsc_khz() errors here,
> > > > > > > > -         * because TSC frequency mismatch shouldn't abort migration,
> > > > > > > > -         * unless the user explicitly asked for a more strict TSC
> > > > > > > > -         * setting (e.g. using an explicit "tsc-freq" option).
> > > > > > > > +        /* Migration TSC frequency mismatch is fatal only if we are
> > > > > > > > +         * actually reporting Invariant TSC to the guest.
> > > > > > > >           */
> > > > > > > > -        kvm_arch_set_tsc_khz(cpu);
> > > > > > > > +        ret = kvm_arch_set_tsc_khz(cpu);
> > > > > > > > +        if ((x86_cpu->env.features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
> > > > > > > > +            ret < 0) {
> > > > > > > > +            return ret;
> > > > > > > > +        }
> > > > > > > >      }
> > > > > > > 
> > > > > > > Will the guest continue in the source in this case?
> > > > > > > 
> > > > > > > I think this is past the point where migration has been declared
> > > > > > > successful. 
> > > > > > > 
> > > > > > > Otherwise looks good.
> > > > > > 
> > > > > > Good point. I will make additional tests and see if there's some
> > > > > > other place where the kvm_arch_set_tsc_khz() call can be moved
> > > > > > to.
> > > > > 
> > > > > So, if we solve this and do something on (for example) post_load,
> > > > > we still have a problem: device state is migrated after RAM. This
> > > > > means QEMU will check for TSC scaling and abort migration very
> > > > > late.
> > > > > 
> > > > > We could solve that by manually registering a SaveVMHandler that
> > > > > will send the TSC frequency on save_live_setup, so migration is
> > > > > aborted earlier.
> > > > > 
> > > > > But: this sounds like just a complex hack to work around the real
> > > > > problems:
> > > > > 
> > > > > 1) TSC frequency is guest-visible, and anything that affects
> > > > >    guest ABI should depend on the VM configuration, not on host
> > > > >    capabilities;
> > > > 
> > > > Well not really: the TSC frequency where the guest starts 
> > > > is the frequency the guest software expects.
> > > > So it does depend on host capabilities.
> > > 
> > > Could you explain where this expectation comes from? I can see
> > > multiple cases where choosing the TSC frequency where the VM
> > > starts is not the best option.
> > 
> > 1. Boot guest.
> > 2. Calibrate TSC.
> > 3. Use delay() with TSC calibration above, or
> > use TSC to measure the passage of time (TSC clock 
> > in userspace).
> 
> If TSC scaling is available, using a different frequency should
> be safe, shouldn't it? Otherwise, migrating with TSC scaling
> wouldn't be safe either.

Yes, but if you don't have TSC scaling, you have to boot with the
host frequency.

> Anyway: I don't disagree the starting host frequency is a good
> default. It is. But I don't think it's the best option on all
> cases.

Agree.

> > 
> > > I am considering two possible scenarios below. You probably have
> > > another scenario in mind, so it would be useful if you could
> > > describe it so we can consider possible solutions.
> > > 
> > > 
> > > Scenario 1:
> > > 
> > > You have two hosts: A and B, both with TSC scaling. They have
> > > different frequencies. The VM can be started on either one of
> > > them, and can be migrated to either one depending on the policy
> > > of management software.
> > > 
> > > Maybe B is a backup host, the VM is expected to run most times on
> > > host A, and we want to use the TSC frequency from host A every
> > > time. Maybe it's the opposite and we want to use the frequency of
> > > B. Maybe we want to use the lowest frequency of both, maybe the
> > > highest one. We (QEMU developers) can recommend policy to libvirt
> > > developers, but a given QEMU process doesn't have information to
> > > decide what's best.
> > 
> > I can't see any practical scenario here, you will always want
> > to start with TSC frequency of the host where the VM was started.
> > 
> > If i am mistaken, please describe a practical case.
> > 
> > (If a practical scenario comes up, and there is a use-case
> > for setting the TSC frequency on startup, lets say 
> > a Windows VM which fails to boot if the TSC frequency
> > is too high, then it should be supported... But the
> > only known scenario to me, applying to 99.999% of cases, 
> > is to expose the TSC frequency where the guest booted at).
> 
> I don't have any specific case: my point is that I can't tell
> what's the best frequency if I don't know where the hosts are
> expected to be migrated to.
> 
> You claim that using the starting host frequency is the best
> option on the vast majority of cases. Maybe it's true, and it
> would be a good default. The only problem is that this default
> affects migratability:
> 
> > 
> > > Scenario 2:
> > > 
> > > Host A has TSC scaling, host B doesn't have TSC scaling. We want
> > > to be able to start the VM on host A, and migrate to B. In this
> > > case, the only possible solution is to use B's frequency when
> > > starting the VM. The QEMU process doesn't have enough information
> > > to make that decision.
> > 
> > That is a good point. But again, its a special case and 
> > should be supported by -cpu xxx,tsc-frequency=zzzz.
> > 
> > However, for the vast majority of 99.999% cases, the issue
> > can be handled entirely in QEMU, without libvirt involvement,
> > and without adding extra steps to the management software.
> 
> I agree it should cover most cases. The only problem here is that
> it can break migration in unexpected ways.
> 
> Then my point is: assuming that libvirt will prefer to require
> explicit TSC frequency configuration to enable invtsc migration
> (instead of getting unpredictable migration compatibility), is
> the added complexity to migration code worth the effort, if
> choosing an explicit frequency is safer and more predictable? I
> believe this is where we disagree.
> 
> > 
> > > > > 2) Setting TSC frequency depending on the host will make
> > > > >    migratability unpredictable for management software: the same
> > > > >    VM config could be migratable to host A when started on host
> > > > >    B, but not migratable to host A when started on host C.
> > > > 
> > > > Well, just check the frequency.
> > > 
> > > What do you mean by "just check the frequency"? What exactly
> > > should management software do?
> > 
> > VM booted on host-A can be migrated to host-B if TSC
> > frequency matches.
> 
> Except when TSC scaling is available. Then you may or may not
> migrate from A to B, and you don't know that unless you actually
> try to migrate.
> 
> > 
> > > Whatever management software do, if you just use the source host
> > > frequency you can get into a situation where you can start a VM
> > > on host A and migrate it to B, but can't start the VM on host B
> > > and migrate it to A. This would be confusing for users, and
> > > probably break assumptions of existing management software.
> > 
> > Well this is a side effect of invariant TSC and migration.
> > 
> > Look, i agree with all your points, but my point is this: i personally
> > prefer to handle the 99.999% case, which is the TSC frequency exposed is the one
> > from the host where the guest booted, in QEMU entirely (for example, to
> > make life easier for people who run qemu manually such as myself).
> 
> Right. I don't mind not covering 100% of cases. But I worry if
> the cases not covered by us behave unexpectedly and
> unpredictably. If we caused a failure every time an
> unsafe/unsupported config was used, it would be OK. But making
> the ability to migrate unpredictable is a more serious issue IMO.
> 
> I believe we both agree about how the final version should
> behave, but disagree about what is more important in the first
> version:
> 
> * My proposal for the first version means:
>   * People would have to configure TSC frequency manually if
>     they want invtsc + migration (until we also provide a
>     mechanism to query the host TSC frequency so
>     management/scripts could choose it as default);
>   * Adding more code to libvirt.
> * Your proposal for the first version means:
>   * The ability to migrate won't be predictable by libvirt;
>   * Extra complexity on the migration code to ensure
>     we abort migration on mismatch.
> 
> We seem to be weighting those issues differently. To me, having
> predictability on migration ability since the first version is
> more important to me than making configuration easier (on the
> first version).
> 
> > 
> > > > > I suggest we allow migration with invtsc if and only if
> > > > > tsc-frequency is set explicitly by management software. In other
> > > > > words, apply only patches 1/4 and 2/4 from this series. After
> > > > > that, we will need libvirt support for configuring tsc-frequency.
> > > > 
> > > > I don't like that for the following reasons:
> > > > 
> > > > * It moves low level complexity from QEMU/KVM to libvirt 
> > > >    (libvirt has to call KVM_GET_TSC_KHZ from a vcpu thread).
> > > 
> > > It doesn't. It could ask QEMU for that information (the new
> > > query-cpu-model-expansion QMP command would allow that).
> > > 
> > > Or, alternatively, it could just let the user choose the
> > > frequency. 
> > 
> > Again, you want to expose the host where the VM booted in most
> > cases (except the ones you list above).
> > 
> > > It's probably acceptable for many use cases where
> > > invtsc+migration is important.
> > 
> > Ok, thats better. Can you add a patch to your series with the steps 
> > as how mgmt software should proceed ?
> > 
> > > > * It makes it difficult to run QEMU manually (i use QEMU
> > > >    manually all the time).
> > > 
> > > I believe the set of people that: 1) need invtsc; 2) need
> > > migration to work; and 3) run QEMU manually will be able to add
> > > "tsc-frequency=XXX" to the command-line easily. :)
> > 
> > Ok, so migration is only allowed if tsc-frequency= is specified.
> > 
> > > > * It requires patching libvirt.
> > > > 
> > > > In my experience things work better when the functionality is
> > > > not split across libvirt/qemu.
> > > 
> > > In my experience things break when management software is the
> > > only component able to make a decision but we don't provide
> > > mechanisms to let management software make that decision.
> > > 
> > > The TSC frequency affects migratability to hosts. Choose the
> > > wrong frequency, and you might not be able to migrate. Only
> > > management software knows to which hosts the VM could be migrated
> > > in the future, and which TSC frequency would allow that.
> > 
> > True.
> > 
> > > > Can't this be fixed in QEMU? Just check that destination host supports
> > > > TSC scaling before migration (or that KVM_GET_TSC_KHZ return value
> > > > matches on source and destination).
> > > > 
> > > 
> > > This solves only one use case: where you want to expose the
> > > starting host TSC frequency to the guest. It doesn't cover any
> > > scenario where the TSC frequency of the starting host isn't the
> > > best one. (See the two scenarios I described above)
> > 
> > True.
> > 
> > > You seem to have a specific use case in mind. If you describe it,
> > > we could decide if it's worth the extra migration code complexity
> > > to deal with invtsc migration without explicit tsc-frequency. By
> > > now, I am not convinced yet.
> > 
> > I don't have any specific use case in mind. I'm just trying to
> > avoid moving complexity to libvirt which in my experience is a
> > the best thing to do.
> 
> I think both our proposals make things harder for libvirt in
> different ways. I just want to make the complexity explicit for
> libvirt, instead of giving them the illusion of safety (making
> migration break in ways libvirt can't detect).
> 
> Anyway, your points are still valid and it would be still useful
> to do what you propose. I will give it a try on a new version of
> patch 4/4 that can abort migration earlier. But note that I
> expect some pushback from other maintainers because of the
> complexity of the code. If that happens, be aware that I won't be
> able to justify the added complexity because I would still think
> it's not worth it.
> 
> > 
> > > Maybe your use case just needs a explicit "invtsc-migration=on"
> > > command-line flag without a mechanism to abort migration on
> > > mismatch? I can't tell.
> > 
> > Again, there is no special case.
> > 
> > > Note that even if we follow your suggestion and implement an
> > > alternative version of patch 4/4 to cover your use case, I will
> > > strongly recommend libvirt developers to support configuring TSC
> > > frequency if they want to support invtsc + migration without
> > > surprising/unpredictable restrictions on migratability.
> > 
> > Well, alright. If you make the TSC frequency of the host
> > available to mgmt software as you describe, and write the steps mgmt
> > software should take, i'm good.
> 
> I plan to. The problem is that the mechanism to query the host
> frequency may be unavailable in the first version.

Well just export KVM_GET_TSC_KHZ in a QMP command right? Its pretty
easy.

Let me know if you need any help coding or testing.
Eduardo Habkost Jan. 8, 2017, 3:49 p.m. UTC | #14
On Fri, Jan 06, 2017 at 08:31:27AM -0200, Marcelo Tosatti wrote:
> On Thu, Jan 05, 2017 at 10:19:50AM -0200, Eduardo Habkost wrote:
> > On Thu, Jan 05, 2017 at 08:48:32AM -0200, Marcelo Tosatti wrote:
> > > On Wed, Jan 04, 2017 at 11:36:31PM -0200, Eduardo Habkost wrote:
> > > > On Wed, Jan 04, 2017 at 08:26:27PM -0200, Marcelo Tosatti wrote:
> > > > > On Wed, Jan 04, 2017 at 05:59:17PM -0200, Eduardo Habkost wrote:
> > > > > > On Wed, Jan 04, 2017 at 11:39:16AM -0200, Eduardo Habkost wrote:
> > > > > > > On Wed, Jan 04, 2017 at 09:56:56AM -0200, Marcelo Tosatti wrote:
> > > > > > > > On Tue, Dec 27, 2016 at 05:21:20PM -0200, Eduardo Habkost wrote:
[...]
> > > > > Can't this be fixed in QEMU? Just check that destination host supports
> > > > > TSC scaling before migration (or that KVM_GET_TSC_KHZ return value
> > > > > matches on source and destination).
> > > > > 
> > > > 
> > > > This solves only one use case: where you want to expose the
> > > > starting host TSC frequency to the guest. It doesn't cover any
> > > > scenario where the TSC frequency of the starting host isn't the
> > > > best one. (See the two scenarios I described above)
> > > 
> > > True.
> > > 
> > > > You seem to have a specific use case in mind. If you describe it,
> > > > we could decide if it's worth the extra migration code complexity
> > > > to deal with invtsc migration without explicit tsc-frequency. By
> > > > now, I am not convinced yet.
> > > 
> > > I don't have any specific use case in mind. I'm just trying to
> > > avoid moving complexity to libvirt which in my experience is a
> > > the best thing to do.
> > 
> > I think both our proposals make things harder for libvirt in
> > different ways. I just want to make the complexity explicit for
> > libvirt, instead of giving them the illusion of safety (making
> > migration break in ways libvirt can't detect).
> > 
> > Anyway, your points are still valid and it would be still useful
> > to do what you propose. I will give it a try on a new version of
> > patch 4/4 that can abort migration earlier. But note that I
> > expect some pushback from other maintainers because of the
> > complexity of the code. If that happens, be aware that I won't be
> > able to justify the added complexity because I would still think
> > it's not worth it.

I tried to move the destination TSC-scaling check to post_load,
and the bad news is that post_load is also too late to abort
migration (destination aborts, but source shows migration as
completed).

I'm CCing Juan, Amit and David to see if they have any
suggestion.

Summarizing the problem for them: on some cases we want to allow
migration only if the destination host has a matching TSC
frequency or if TSC scaling is supported by the destination host.
I don't know what's the best way to implement that check. We
could either:

* Send TSC frequency/scaling info from the the source host, and
  make the source host abort migration. Is there a good mechanism
  for that?
* Make the destination host abort migration. I don't know if
  there's a safe mechanism for that.



While we figure that out, I will submit v2 without patches 3/4
and 4/4, because patch 2/2 (allowing migration if tsc-freq is
explicitly set) is simple and useful. After that, we can add:

* The mechanism to query the host TSC frequency (see below)
* The mechanism you asked for, to allow migration without
  explicit TSC frequency if we know the destination host supports
  TSC scaling or has a matching TSC frequency (more complex, and
  maybe unnecessary if we implement the previous item)


> > 
> > > 
> > > > Maybe your use case just needs a explicit "invtsc-migration=on"
> > > > command-line flag without a mechanism to abort migration on
> > > > mismatch? I can't tell.
> > > 
> > > Again, there is no special case.
> > > 
> > > > Note that even if we follow your suggestion and implement an
> > > > alternative version of patch 4/4 to cover your use case, I will
> > > > strongly recommend libvirt developers to support configuring TSC
> > > > frequency if they want to support invtsc + migration without
> > > > surprising/unpredictable restrictions on migratability.
> > > 
> > > Well, alright. If you make the TSC frequency of the host
> > > available to mgmt software as you describe, and write the steps mgmt
> > > software should take, i'm good.
> > 
> > I plan to. The problem is that the mechanism to query the host
> > frequency may be unavailable in the first version.
> 
> Well just export KVM_GET_TSC_KHZ in a QMP command right? Its pretty
> easy.

I plan to expose it as part of query-cpu-model-expansion (work in
progress, right now) when querying the "host" CPU model.

> 
> Let me know if you need any help coding or testing.

Thanks!
Eduardo Habkost Jan. 8, 2017, 8:28 p.m. UTC | #15
On Fri, Jan 06, 2017 at 08:31:27AM -0200, Marcelo Tosatti wrote:
[...]
> > > 
> > > > Maybe your use case just needs a explicit "invtsc-migration=on"
> > > > command-line flag without a mechanism to abort migration on
> > > > mismatch? I can't tell.
> > > 
> > > Again, there is no special case.
> > > 
> > > > Note that even if we follow your suggestion and implement an
> > > > alternative version of patch 4/4 to cover your use case, I will
> > > > strongly recommend libvirt developers to support configuring TSC
> > > > frequency if they want to support invtsc + migration without
> > > > surprising/unpredictable restrictions on migratability.
> > > 
> > > Well, alright. If you make the TSC frequency of the host
> > > available to mgmt software as you describe, and write the steps mgmt
> > > software should take, i'm good.
> > 
> > I plan to. The problem is that the mechanism to query the host
> > frequency may be unavailable in the first version.
> 
> Well just export KVM_GET_TSC_KHZ in a QMP command right? Its pretty
> easy.
> 
> Let me know if you need any help coding or testing.

I just found out that KVM doesn't provide something that QEMU and
libvirt need: the value of kvm_max_guest_tsc_khz. Without it, we
have no way to know if a given VM is really migratable to a host.

Could we add a KVM_CAP_MAX_TSC_KHZ capability for that?
Dr. David Alan Gilbert Jan. 9, 2017, 10:04 a.m. UTC | #16
* Eduardo Habkost (ehabkost@redhat.com) wrote:
> On Fri, Jan 06, 2017 at 08:31:27AM -0200, Marcelo Tosatti wrote:
> > On Thu, Jan 05, 2017 at 10:19:50AM -0200, Eduardo Habkost wrote:
> > > On Thu, Jan 05, 2017 at 08:48:32AM -0200, Marcelo Tosatti wrote:
> > > > On Wed, Jan 04, 2017 at 11:36:31PM -0200, Eduardo Habkost wrote:
> > > > > On Wed, Jan 04, 2017 at 08:26:27PM -0200, Marcelo Tosatti wrote:
> > > > > > On Wed, Jan 04, 2017 at 05:59:17PM -0200, Eduardo Habkost wrote:
> > > > > > > On Wed, Jan 04, 2017 at 11:39:16AM -0200, Eduardo Habkost wrote:
> > > > > > > > On Wed, Jan 04, 2017 at 09:56:56AM -0200, Marcelo Tosatti wrote:
> > > > > > > > > On Tue, Dec 27, 2016 at 05:21:20PM -0200, Eduardo Habkost wrote:
> [...]
> > > > > > Can't this be fixed in QEMU? Just check that destination host supports
> > > > > > TSC scaling before migration (or that KVM_GET_TSC_KHZ return value
> > > > > > matches on source and destination).
> > > > > > 
> > > > > 
> > > > > This solves only one use case: where you want to expose the
> > > > > starting host TSC frequency to the guest. It doesn't cover any
> > > > > scenario where the TSC frequency of the starting host isn't the
> > > > > best one. (See the two scenarios I described above)
> > > > 
> > > > True.
> > > > 
> > > > > You seem to have a specific use case in mind. If you describe it,
> > > > > we could decide if it's worth the extra migration code complexity
> > > > > to deal with invtsc migration without explicit tsc-frequency. By
> > > > > now, I am not convinced yet.
> > > > 
> > > > I don't have any specific use case in mind. I'm just trying to
> > > > avoid moving complexity to libvirt which in my experience is a
> > > > the best thing to do.
> > > 
> > > I think both our proposals make things harder for libvirt in
> > > different ways. I just want to make the complexity explicit for
> > > libvirt, instead of giving them the illusion of safety (making
> > > migration break in ways libvirt can't detect).
> > > 
> > > Anyway, your points are still valid and it would be still useful
> > > to do what you propose. I will give it a try on a new version of
> > > patch 4/4 that can abort migration earlier. But note that I
> > > expect some pushback from other maintainers because of the
> > > complexity of the code. If that happens, be aware that I won't be
> > > able to justify the added complexity because I would still think
> > > it's not worth it.
> 
> I tried to move the destination TSC-scaling check to post_load,
> and the bad news is that post_load is also too late to abort
> migration (destination aborts, but source shows migration as
> completed).
> 
> I'm CCing Juan, Amit and David to see if they have any
> suggestion.

Yes, that's not too surprising; we don't have any 'finished' flag in
the stream; it's just a one directional stream where the source claims
to have finished after it's stuffed the last byte into the socket 
(I think before it's closed it).

> Summarizing the problem for them: on some cases we want to allow
> migration only if the destination host has a matching TSC
> frequency or if TSC scaling is supported by the destination host.
> I don't know what's the best way to implement that check. We
> could either:
> 
> * Send TSC frequency/scaling info from the the source host, and
>   make the source host abort migration. Is there a good mechanism
>   for that?

Normal migration is just a 1-directional stream, so we dont have a way
to get the TSC frequency back to the source host.

> * Make the destination host abort migration. I don't know if
>   there's a safe mechanism for that.

The only safe way to do it is to do it early; preferably before the RAM
so that you know (via buffer depths) that the source can't have got to
the end yet (and that's assuming these are constant values that are
checked, not something guest configurable).
We do have a 'global_state' in migration.c - but it's just for migration
state rather than device state.
I don't think we currently have any hook for sending device state early;
the setup phase hooks on devices are only called for iterative devices
(i.e. RAM/block etc); I think to do what you need we need to either
add a subsection into vmstate_globalstate in migration.c (which we somehow
have to make device/machine independent) or we have to find a way to
have setup sections for non-iterative devices.

One worry; is this going to affect loading of snapshots on the same
host?  If I take a snapshot, and then come back on a nice warm day
is the TSC on the host going to be sufficiently different the snapshot
wont load?

Dave

> 
> 
> 
> While we figure that out, I will submit v2 without patches 3/4
> and 4/4, because patch 2/2 (allowing migration if tsc-freq is
> explicitly set) is simple and useful. After that, we can add:
> 
> * The mechanism to query the host TSC frequency (see below)
> * The mechanism you asked for, to allow migration without
>   explicit TSC frequency if we know the destination host supports
>   TSC scaling or has a matching TSC frequency (more complex, and
>   maybe unnecessary if we implement the previous item)
> 
> 
> > > 
> > > > 
> > > > > Maybe your use case just needs a explicit "invtsc-migration=on"
> > > > > command-line flag without a mechanism to abort migration on
> > > > > mismatch? I can't tell.
> > > > 
> > > > Again, there is no special case.
> > > > 
> > > > > Note that even if we follow your suggestion and implement an
> > > > > alternative version of patch 4/4 to cover your use case, I will
> > > > > strongly recommend libvirt developers to support configuring TSC
> > > > > frequency if they want to support invtsc + migration without
> > > > > surprising/unpredictable restrictions on migratability.
> > > > 
> > > > Well, alright. If you make the TSC frequency of the host
> > > > available to mgmt software as you describe, and write the steps mgmt
> > > > software should take, i'm good.
> > > 
> > > I plan to. The problem is that the mechanism to query the host
> > > frequency may be unavailable in the first version.
> > 
> > Well just export KVM_GET_TSC_KHZ in a QMP command right? Its pretty
> > easy.
> 
> I plan to expose it as part of query-cpu-model-expansion (work in
> progress, right now) when querying the "host" CPU model.
> 
> > 
> > Let me know if you need any help coding or testing.
> 
> Thanks!
> 
> -- 
> Eduardo
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Paolo Bonzini Jan. 9, 2017, 2:58 p.m. UTC | #17
On 08/01/2017 21:28, Eduardo Habkost wrote:
>> Well just export KVM_GET_TSC_KHZ in a QMP command right? Its pretty
>> easy.
>>
>> Let me know if you need any help coding or testing.
> I just found out that KVM doesn't provide something that QEMU and
> libvirt need: the value of kvm_max_guest_tsc_khz. Without it, we
> have no way to know if a given VM is really migratable to a host.
> 
> Could we add a KVM_CAP_MAX_TSC_KHZ capability for that?

The ratio is really quite high, 256x the host frequency for AMD and
65536x for Intel.  Anything below 2^32 Hz (above that, there would
probably be other failures) is safe.

Paolo
Paolo Bonzini Jan. 10, 2017, 4:36 p.m. UTC | #18
On 05/01/2017 11:48, Marcelo Tosatti wrote:
>> Host A has TSC scaling, host B doesn't have TSC scaling. We want
>> to be able to start the VM on host A, and migrate to B. In this
>> case, the only possible solution is to use B's frequency when
>> starting the VM. The QEMU process doesn't have enough information
>> to make that decision.
> That is a good point. But again, its a special case and 
> should be supported by -cpu xxx,tsc-frequency=zzzz.

I don't think this is a scenario that can work reliably.  The computed
TSC frequency may vary by 0.5% or so on every boot (e.g. you may get
2497000 kHz or 2511000 kHz for a 2.5 GHz TSC).  You can start the VM on
host A, reboot host B, and then you'll be unable to migrate.

Paolo
Paolo Bonzini Jan. 10, 2017, 4:38 p.m. UTC | #19
On 05/01/2017 14:00, Daniel P. Berrange wrote:
>> If you do this unconditionally, you have another problem: if
>> tsc-frequency is set explicitly, migration is only possible if
>> TSC frequency of the destination matches[1], or if TSC scaling is
>> supported by the destination. It's a good idea to set a TSC
>> frequency only if invtsc is enabled explicitly in the config.
> 
> If we don't set tsc-frequency and the TSC frequency doesn't
> match, does that mean the guest migration succeed, but suddenly
> sees different TSC frequency ?

That's the reason why kvmclock exists (or more precisely, the reason why
kvmclock is still useful even when hosts have invtsc).

Paolo

> I guess we we allowed that historically we can't break that
> now, so setting it only if invtsc is set seems reasonable.
Eduardo Habkost Jan. 11, 2017, 11:58 a.m. UTC | #20
On Tue, Jan 10, 2017 at 05:36:48PM +0100, Paolo Bonzini wrote:
> 
> 
> On 05/01/2017 11:48, Marcelo Tosatti wrote:
> >> Host A has TSC scaling, host B doesn't have TSC scaling. We want
> >> to be able to start the VM on host A, and migrate to B. In this
> >> case, the only possible solution is to use B's frequency when
> >> starting the VM. The QEMU process doesn't have enough information
> >> to make that decision.
> > That is a good point. But again, its a special case and 
> > should be supported by -cpu xxx,tsc-frequency=zzzz.
> 
> I don't think this is a scenario that can work reliably.  The computed
> TSC frequency may vary by 0.5% or so on every boot (e.g. you may get
> 2497000 kHz or 2511000 kHz for a 2.5 GHz TSC).  You can start the VM on
> host A, reboot host B, and then you'll be unable to migrate.

Right, so it means invtsc migration without TSC scaling will be
possible in practice only if we tolerate a small variance on TSC
frequency on migration. The question is: should we? Can we
tolerate a 0.5% variance on TSC frequency and still expose invtsc
to the guest?
Eduardo Habkost Jan. 11, 2017, 1:26 p.m. UTC | #21
On Mon, Jan 09, 2017 at 03:58:11PM +0100, Paolo Bonzini wrote:
> 
> 
> On 08/01/2017 21:28, Eduardo Habkost wrote:
> >> Well just export KVM_GET_TSC_KHZ in a QMP command right? Its pretty
> >> easy.
> >>
> >> Let me know if you need any help coding or testing.
> > I just found out that KVM doesn't provide something that QEMU and
> > libvirt need: the value of kvm_max_guest_tsc_khz. Without it, we
> > have no way to know if a given VM is really migratable to a host.
> > 
> > Could we add a KVM_CAP_MAX_TSC_KHZ capability for that?
> 
> The ratio is really quite high, 256x the host frequency for AMD and
> 65536x for Intel.  Anything below 2^32 Hz (above that, there would
> probably be other failures) is safe.

2^32 Hz (~4.3 GHz) sounds like a limit likely to be hit by future
CPUs. Which kind of failures do you think we could see?
Paolo Bonzini Jan. 11, 2017, 2:06 p.m. UTC | #22
On 11/01/2017 14:26, Eduardo Habkost wrote:
> On Mon, Jan 09, 2017 at 03:58:11PM +0100, Paolo Bonzini wrote:
>>
>>
>> On 08/01/2017 21:28, Eduardo Habkost wrote:
>>>> Well just export KVM_GET_TSC_KHZ in a QMP command right? Its pretty
>>>> easy.
>>>>
>>>> Let me know if you need any help coding or testing.
>>> I just found out that KVM doesn't provide something that QEMU and
>>> libvirt need: the value of kvm_max_guest_tsc_khz. Without it, we
>>> have no way to know if a given VM is really migratable to a host.
>>>
>>> Could we add a KVM_CAP_MAX_TSC_KHZ capability for that?
>>
>> The ratio is really quite high, 256x the host frequency for AMD and
>> 65536x for Intel.  Anything below 2^32 Hz (above that, there would
>> probably be other failures) is safe.
> 
> 2^32 Hz (~4.3 GHz) sounds like a limit likely to be hit by future
> CPUs. Which kind of failures do you think we could see?

Just integer overflows.  Current CPUs have been at 3.9 GHz for a while now.

Paolo
Marcelo Tosatti Jan. 18, 2017, 11:55 a.m. UTC | #23
On Tue, Jan 10, 2017 at 05:36:48PM +0100, Paolo Bonzini wrote:
> 
> 
> On 05/01/2017 11:48, Marcelo Tosatti wrote:
> >> Host A has TSC scaling, host B doesn't have TSC scaling. We want
> >> to be able to start the VM on host A, and migrate to B. In this
> >> case, the only possible solution is to use B's frequency when
> >> starting the VM. The QEMU process doesn't have enough information
> >> to make that decision.
> > That is a good point. But again, its a special case and 
> > should be supported by -cpu xxx,tsc-frequency=zzzz.
> 
> I don't think this is a scenario that can work reliably.  The computed
> TSC frequency may vary by 0.5% or so on every boot (e.g. you may get
> 2497000 kHz or 2511000 kHz for a 2.5 GHz TSC).  You can start the VM on
> host A, reboot host B, and then you'll be unable to migrate.
> 
> Paolo

Including an acceptable error in the comparison seems to be acceptable to work around that
case.
Eduardo Habkost Jan. 18, 2017, 12:43 p.m. UTC | #24
On Wed, Jan 18, 2017 at 09:55:43AM -0200, Marcelo Tosatti wrote:
> On Tue, Jan 10, 2017 at 05:36:48PM +0100, Paolo Bonzini wrote:
> > 
> > 
> > On 05/01/2017 11:48, Marcelo Tosatti wrote:
> > >> Host A has TSC scaling, host B doesn't have TSC scaling. We want
> > >> to be able to start the VM on host A, and migrate to B. In this
> > >> case, the only possible solution is to use B's frequency when
> > >> starting the VM. The QEMU process doesn't have enough information
> > >> to make that decision.
> > > That is a good point. But again, its a special case and 
> > > should be supported by -cpu xxx,tsc-frequency=zzzz.
> > 
> > I don't think this is a scenario that can work reliably.  The computed
> > TSC frequency may vary by 0.5% or so on every boot (e.g. you may get
> > 2497000 kHz or 2511000 kHz for a 2.5 GHz TSC).  You can start the VM on
> > host A, reboot host B, and then you'll be unable to migrate.
> > 
> > Paolo
> 
> Including an acceptable error in the comparison seems to be
> acceptable to work around that case.

How much error is acceptable when exposing the invtsc flag to the
guest?
diff mbox

Patch

diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index ceeacca..4270923 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -375,7 +375,12 @@  int e820_get_num_entries(void);
 bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
 
 #define PC_COMPAT_2_8 \
-    HW_COMPAT_2_8
+    HW_COMPAT_2_8 \
+    {\
+        .driver   = TYPE_X86_CPU,\
+        .property = "invtsc-migration",\
+        .value    = "off",\
+    },
 
 #define PC_COMPAT_2_7 \
     HW_COMPAT_2_7 \
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index a7f2f60..ec8cdbc 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1208,6 +1208,7 @@  struct X86CPU {
     bool expose_kvm;
     bool migratable;
     bool host_features;
+    bool invtsc_migration;
     uint32_t apic_id;
 
     /* if true the CPUID code directly forward host cache leaves to the guest */
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index b0640f1..cc93b81 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -3678,6 +3678,7 @@  static Property x86_cpu_properties[] = {
     DEFINE_PROP_BOOL("cpuid-0xb", X86CPU, enable_cpuid_0xb, true),
     DEFINE_PROP_BOOL("lmce", X86CPU, enable_lmce, false),
     DEFINE_PROP_BOOL("l3-cache", X86CPU, enable_l3_cache, true),
+    DEFINE_PROP_BOOL("invtsc-migration", X86CPU, invtsc_migration, true),
     DEFINE_PROP_END_OF_LIST()
 };
 
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 6a51399..2c3ee7b 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -962,7 +962,7 @@  int kvm_arch_init_vcpu(CPUState *cs)
         has_msr_mcg_ext_ctl = has_msr_feature_control = true;
     }
 
-    if (!env->user_tsc_khz) {
+    if (!cpu->invtsc_migration && !env->user_tsc_khz) {
         if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
             invtsc_mig_blocker == NULL) {
             /* for migration */
@@ -972,6 +972,7 @@  int kvm_arch_init_vcpu(CPUState *cs)
             migrate_add_blocker(invtsc_mig_blocker);
             /* for savevm */
             vmstate_x86_cpu.unmigratable = 1;
+        }
     }
 
     cpuid_data.cpuid.padding = 0;
@@ -2655,12 +2656,14 @@  int kvm_arch_put_registers(CPUState *cpu, int level)
     }
 
     if (level == KVM_PUT_FULL_STATE) {
-        /* We don't check for kvm_arch_set_tsc_khz() errors here,
-         * because TSC frequency mismatch shouldn't abort migration,
-         * unless the user explicitly asked for a more strict TSC
-         * setting (e.g. using an explicit "tsc-freq" option).
+        /* Migration TSC frequency mismatch is fatal only if we are
+         * actually reporting Invariant TSC to the guest.
          */
-        kvm_arch_set_tsc_khz(cpu);
+        ret = kvm_arch_set_tsc_khz(cpu);
+        if ((x86_cpu->env.features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
+            ret < 0) {
+            return ret;
+        }
     }
 
     ret = kvm_getput_regs(x86_cpu, 1);