diff mbox

[05/23] hyperv: ensure VP index equal to QEMU cpu_index

Message ID 20170606181948.16238-6-rkagan@virtuozzo.com (mailing list archive)
State New, archived
Headers show

Commit Message

Roman Kagan June 6, 2017, 6:19 p.m. UTC
Hyper-V identifies vcpus by the virtual processor (VP) index which is
normally queried by the guest via HV_X64_MSR_VP_INDEX msr.

It has to be owned by QEMU in order to preserve it across migration.

However, the current implementation in KVM doesn't allow to set this
msr, and KVM uses its own notion of VP index.  Fortunately, the way
vcpus are created in QEMU/KVM basically guarantees that the KVM value is
equal to QEMU cpu_index.

So, for back and forward compatibility, attempt to set the msr at vcpu
init time to cpu_index, but ignore the errors; then query the msr value
from KVM and assert that it's equal to cpu_index.  On current kernels
this will work by luck; future ones will accept the value from
userspace.

Signed-off-by: Roman Kagan <rkagan@virtuozzo.com>
---
 target/i386/hyperv.h |  2 ++
 target/i386/hyperv.c |  5 +++++
 target/i386/kvm.c    | 26 ++++++++++++++++++++++++++
 3 files changed, 33 insertions(+)

Comments

Eduardo Habkost June 13, 2017, 6:57 p.m. UTC | #1
On Tue, Jun 06, 2017 at 09:19:30PM +0300, Roman Kagan wrote:
> Hyper-V identifies vcpus by the virtual processor (VP) index which is
> normally queried by the guest via HV_X64_MSR_VP_INDEX msr.
> 
> It has to be owned by QEMU in order to preserve it across migration.
> 
> However, the current implementation in KVM doesn't allow to set this
> msr, and KVM uses its own notion of VP index.  Fortunately, the way
> vcpus are created in QEMU/KVM basically guarantees that the KVM value is
> equal to QEMU cpu_index.

This might not be true in the future.  cpu_index is not a good
identifier for CPUs, and we would like to remove it in the
future.

But it looks like we have no choice, see extra comments below:

> 
> So, for back and forward compatibility, attempt to set the msr at vcpu
> init time to cpu_index, but ignore the errors; then query the msr value
> from KVM and assert that it's equal to cpu_index.  On current kernels
> this will work by luck; future ones will accept the value from
> userspace.
> 
> Signed-off-by: Roman Kagan <rkagan@virtuozzo.com>
> ---
>  target/i386/hyperv.h |  2 ++
>  target/i386/hyperv.c |  5 +++++
>  target/i386/kvm.c    | 26 ++++++++++++++++++++++++++
>  3 files changed, 33 insertions(+)
> 
> diff --git a/target/i386/hyperv.h b/target/i386/hyperv.h
> index 0c3b562..35da0b1 100644
> --- a/target/i386/hyperv.h
> +++ b/target/i386/hyperv.h
> @@ -39,4 +39,6 @@ void kvm_hv_sint_route_destroy(HvSintRoute *sint_route);
>  
>  int kvm_hv_sint_route_set_sint(HvSintRoute *sint_route);
>  
> +uint32_t hyperv_vp_index(X86CPU *cpu);
> +
>  #endif
> diff --git a/target/i386/hyperv.c b/target/i386/hyperv.c
> index 227185c..27de5bc 100644
> --- a/target/i386/hyperv.c
> +++ b/target/i386/hyperv.c
> @@ -16,6 +16,11 @@
>  #include "hyperv.h"
>  #include "hyperv_proto.h"
>  
> +uint32_t hyperv_vp_index(X86CPU *cpu)
> +{
> +    return CPU(cpu)->cpu_index;
> +}
> +

You are introducing a wrapper, but you are using cs->cpu_index
directly at hyperv_set_vp_index().  The knowledge that vp_index
== cpu_index is duplicated on two places, and the functions risk
getting out of sync.

I suggest using the wrapper inside hyperv_set_vp_index() too.


>  int kvm_hv_handle_exit(X86CPU *cpu, struct kvm_hyperv_exit *exit)
>  {
>      CPUX86State *env = &cpu->env;
> diff --git a/target/i386/kvm.c b/target/i386/kvm.c
> index 251aa95..eb9cde4 100644
> --- a/target/i386/kvm.c
> +++ b/target/i386/kvm.c
> @@ -610,11 +610,37 @@ static int kvm_arch_set_tsc_khz(CPUState *cs)
>      return 0;
>  }
>  
> +static void hyperv_set_vp_index(CPUState *cs)
> +{
> +    struct {
> +        struct kvm_msrs info;
> +        struct kvm_msr_entry entries[1];
> +    } msr_data;
> +    int ret;
> +
> +    msr_data.info.nmsrs = 1;
> +    msr_data.entries[0].index = HV_X64_MSR_VP_INDEX;
> +
> +    /*
> +     * Some kernels don't support setting HV_X64_MSR_VP_INDEX.  However,
> +     * they use VP_INDEX equal to cpu_index due to the way vcpus are created.
> +     * So ignore the errors from SET_MSRS, but verify that GET_MSRS returns the
> +     * expected value.
> +     */

Oh.  This sounds like a problem.  As reference for others, this
is the KVM code in kvm_hv_get_msr():

	case HV_X64_MSR_VP_INDEX: {
		int r;
		struct kvm_vcpu *v;

		kvm_for_each_vcpu(r, v, vcpu->kvm) {
			if (v == vcpu) {
				data = r;
				break;
			}
		}
		break;
	}

The problem with that is that it will break as soon as we create
VCPUs in a different order.  Unsolvable on hosts that don't allow
HV_X64_MSR_VP_INDEX to be set, however.

> +    msr_data.entries[0].data = cs->cpu_index;
> +    kvm_vcpu_ioctl(cs, KVM_SET_MSRS, &msr_data);
> +    ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data);
> +    assert(ret == 1);
> +    assert(msr_data.entries[0].data == cs->cpu_index);

If KVM already initializes the MSR to cpu_index and we will abort
if it was not set to anything except cpu_index here, why exactly
do we need the KVM_SET_MSRS call?

> +}
> +
>  static int hyperv_handle_properties(CPUState *cs)
>  {
>      X86CPU *cpu = X86_CPU(cs);
>      CPUX86State *env = &cpu->env;
>  
> +    hyperv_set_vp_index(cs);
> +

The purpose of hyperv_handle_properties() is just to initialize
cpu->features[] CPUID data based on the hyperv options.  I suggest
creating a new hyperv_vcpu_init() or x86_cpu_hyperv_init()
function for these initialization steps.

>      if (cpu->hyperv_time &&
>              kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) <= 0) {
>          cpu->hyperv_time = false;
> -- 
> 2.9.4
>
Roman Kagan June 14, 2017, 11:25 a.m. UTC | #2
On Tue, Jun 13, 2017 at 03:57:52PM -0300, Eduardo Habkost wrote:
> On Tue, Jun 06, 2017 at 09:19:30PM +0300, Roman Kagan wrote:
> > Hyper-V identifies vcpus by the virtual processor (VP) index which is
> > normally queried by the guest via HV_X64_MSR_VP_INDEX msr.
> > 
> > It has to be owned by QEMU in order to preserve it across migration.
> > 
> > However, the current implementation in KVM doesn't allow to set this
> > msr, and KVM uses its own notion of VP index.  Fortunately, the way
> > vcpus are created in QEMU/KVM basically guarantees that the KVM value is
> > equal to QEMU cpu_index.
> 
> This might not be true in the future.  cpu_index is not a good
> identifier for CPUs, and we would like to remove it in the
> future.
> 
> But it looks like we have no choice, see extra comments below:

> > +static void hyperv_set_vp_index(CPUState *cs)
> > +{
> > +    struct {
> > +        struct kvm_msrs info;
> > +        struct kvm_msr_entry entries[1];
> > +    } msr_data;
> > +    int ret;
> > +
> > +    msr_data.info.nmsrs = 1;
> > +    msr_data.entries[0].index = HV_X64_MSR_VP_INDEX;
> > +
> > +    /*
> > +     * Some kernels don't support setting HV_X64_MSR_VP_INDEX.  However,
> > +     * they use VP_INDEX equal to cpu_index due to the way vcpus are created.
> > +     * So ignore the errors from SET_MSRS, but verify that GET_MSRS returns the
> > +     * expected value.
> > +     */
> 
> Oh.  This sounds like a problem.  As reference for others, this
> is the KVM code in kvm_hv_get_msr():
> 
> 	case HV_X64_MSR_VP_INDEX: {
> 		int r;
> 		struct kvm_vcpu *v;
> 
> 		kvm_for_each_vcpu(r, v, vcpu->kvm) {
> 			if (v == vcpu) {
> 				data = r;
> 				break;
> 			}
> 		}
> 		break;
> 	}
> 
> The problem with that is that it will break as soon as we create
> VCPUs in a different order.  Unsolvable on hosts that don't allow
> HV_X64_MSR_VP_INDEX to be set, however.

Right, thanks for putting together a detailed explanation.

This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
by QEMU.  I'm going to post a patch to KVM fixing that.

Meanwhile QEMU needs a way to maintain its notion of vp_index that is
  1) in sync with kernel's notion
  2) also with kernels that don't support setting the msr
  3) persistent across migrations

cpu_index looked like a perfect candidate.

> > +    msr_data.entries[0].data = cs->cpu_index;
> > +    kvm_vcpu_ioctl(cs, KVM_SET_MSRS, &msr_data);
> > +    ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data);
> > +    assert(ret == 1);
> > +    assert(msr_data.entries[0].data == cs->cpu_index);
> 
> If KVM already initializes the MSR to cpu_index and we will abort
> if it was not set to anything except cpu_index here, why exactly
> do we need the KVM_SET_MSRS call?

The kernel made no obligations to keep initializing its vp_index
identically to QEMU's cpu_index.  So we'd better check and abort if that
got out of sync.  Once KVM gains the ability to learn vp_index from QEMU
we'll no longer depend on that as we'll do explicit synchronization.

> > +}
> > +
> >  static int hyperv_handle_properties(CPUState *cs)
> >  {
> >      X86CPU *cpu = X86_CPU(cs);
> >      CPUX86State *env = &cpu->env;
> >  
> > +    hyperv_set_vp_index(cs);
> > +
> 
> The purpose of hyperv_handle_properties() is just to initialize
> cpu->features[] CPUID data based on the hyperv options.  I suggest
> creating a new hyperv_vcpu_init() or x86_cpu_hyperv_init()
> function for these initialization steps.

Sounds like a good idea, thanks.

Roman.
Paolo Bonzini June 14, 2017, 11:26 a.m. UTC | #3
On 14/06/2017 13:25, Roman Kagan wrote:
>> The problem with that is that it will break as soon as we create
>> VCPUs in a different order.  Unsolvable on hosts that don't allow
>> HV_X64_MSR_VP_INDEX to be set, however.
> Right, thanks for putting together a detailed explanation.
> 
> This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
> by QEMU.  I'm going to post a patch to KVM fixing that.
> 
> Meanwhile QEMU needs a way to maintain its notion of vp_index that is
>   1) in sync with kernel's notion
>   2) also with kernels that don't support setting the msr
>   3) persistent across migrations
> 
> cpu_index looked like a perfect candidate.
> 

What you want is the APIC id, which _is_ cpu_index but may not be in the
future.  But the APIC id is also the KVM vcpu_id, so there's no need to
have VP_INDEX maintained by QEMU.

Paolo
Eduardo Habkost June 14, 2017, 1 p.m. UTC | #4
On Wed, Jun 14, 2017 at 02:25:07PM +0300, Roman Kagan wrote:
> On Tue, Jun 13, 2017 at 03:57:52PM -0300, Eduardo Habkost wrote:
> > On Tue, Jun 06, 2017 at 09:19:30PM +0300, Roman Kagan wrote:
> > > Hyper-V identifies vcpus by the virtual processor (VP) index which is
> > > normally queried by the guest via HV_X64_MSR_VP_INDEX msr.
> > > 
> > > It has to be owned by QEMU in order to preserve it across migration.
> > > 
> > > However, the current implementation in KVM doesn't allow to set this
> > > msr, and KVM uses its own notion of VP index.  Fortunately, the way
> > > vcpus are created in QEMU/KVM basically guarantees that the KVM value is
> > > equal to QEMU cpu_index.
> > 
> > This might not be true in the future.  cpu_index is not a good
> > identifier for CPUs, and we would like to remove it in the
> > future.
> > 
> > But it looks like we have no choice, see extra comments below:
> 
> > > +static void hyperv_set_vp_index(CPUState *cs)
> > > +{
> > > +    struct {
> > > +        struct kvm_msrs info;
> > > +        struct kvm_msr_entry entries[1];
> > > +    } msr_data;
> > > +    int ret;
> > > +
> > > +    msr_data.info.nmsrs = 1;
> > > +    msr_data.entries[0].index = HV_X64_MSR_VP_INDEX;
> > > +
> > > +    /*
> > > +     * Some kernels don't support setting HV_X64_MSR_VP_INDEX.  However,
> > > +     * they use VP_INDEX equal to cpu_index due to the way vcpus are created.
> > > +     * So ignore the errors from SET_MSRS, but verify that GET_MSRS returns the
> > > +     * expected value.
> > > +     */
> > 
> > Oh.  This sounds like a problem.  As reference for others, this
> > is the KVM code in kvm_hv_get_msr():
> > 
> > 	case HV_X64_MSR_VP_INDEX: {
> > 		int r;
> > 		struct kvm_vcpu *v;
> > 
> > 		kvm_for_each_vcpu(r, v, vcpu->kvm) {
> > 			if (v == vcpu) {
> > 				data = r;
> > 				break;
> > 			}
> > 		}
> > 		break;
> > 	}
> > 
> > The problem with that is that it will break as soon as we create
> > VCPUs in a different order.  Unsolvable on hosts that don't allow
> > HV_X64_MSR_VP_INDEX to be set, however.
> 
> Right, thanks for putting together a detailed explanation.
> 
> This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
> by QEMU.  I'm going to post a patch to KVM fixing that.
> 
> Meanwhile QEMU needs a way to maintain its notion of vp_index that is
>   1) in sync with kernel's notion
>   2) also with kernels that don't support setting the msr
>   3) persistent across migrations
> 
> cpu_index looked like a perfect candidate.

I would like to be able to stop using cpu_index as a persistent
VCPU identifier in the future.  I would also like to be able to
create VCPUs in any order without breaking guest ABI.  But it
seems to be impossible to do that without breaking vp_index on
older kernels.

So cpu_index looks like the only candidate we have.

> 
> > > +    msr_data.entries[0].data = cs->cpu_index;
> > > +    kvm_vcpu_ioctl(cs, KVM_SET_MSRS, &msr_data);
> > > +    ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data);
> > > +    assert(ret == 1);
> > > +    assert(msr_data.entries[0].data == cs->cpu_index);
> > 
> > If KVM already initializes the MSR to cpu_index and we will abort
> > if it was not set to anything except cpu_index here, why exactly
> > do we need the KVM_SET_MSRS call?
> 
> The kernel made no obligations to keep initializing its vp_index
> identically to QEMU's cpu_index.  So we'd better check and abort if that
> got out of sync.  Once KVM gains the ability to learn vp_index from QEMU
> we'll no longer depend on that as we'll do explicit synchronization.

I think the kernel now has an obligation to keep initializing
HV_X64_MSR_VP_INDEX in a compatible way, or it will break older
QEMU versions that don't set the MSR.

But if you don't think we can rely on that, then the KVM_SET_MSRS
call won't hurt.
Igor Mammedov June 14, 2017, 1 p.m. UTC | #5
On Wed, 14 Jun 2017 13:26:44 +0200
Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 14/06/2017 13:25, Roman Kagan wrote:
> >> The problem with that is that it will break as soon as we create
> >> VCPUs in a different order.  Unsolvable on hosts that don't allow
> >> HV_X64_MSR_VP_INDEX to be set, however.  
> > Right, thanks for putting together a detailed explanation.
> > 
> > This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
> > by QEMU.  I'm going to post a patch to KVM fixing that.
> > 
> > Meanwhile QEMU needs a way to maintain its notion of vp_index that is
> >   1) in sync with kernel's notion
> >   2) also with kernels that don't support setting the msr
> >   3) persistent across migrations
> > 
> > cpu_index looked like a perfect candidate.
> >   
> 
> What you want is the APIC id,

> which _is_ cpu_index but may not be in the
depending on the topology, cpu_index won't be the same as the APIC ID/vcpu_id
(e.g. with AMD's odd core counts).

> future.  But the APIC id is also the KVM vcpu_id, so there's no need to
> have VP_INDEX maintained by QEMU.
agreed it'd be better to reuse vcpu_id/apic id as interface between
qemu/kvm/guest instead of adding additional cpu_index concept in ABI

> 
> Paolo
Eduardo Habkost June 14, 2017, 1:01 p.m. UTC | #6
On Wed, Jun 14, 2017 at 01:26:44PM +0200, Paolo Bonzini wrote:
> 
> 
> On 14/06/2017 13:25, Roman Kagan wrote:
> >> The problem with that is that it will break as soon as we create
> >> VCPUs in a different order.  Unsolvable on hosts that don't allow
> >> HV_X64_MSR_VP_INDEX to be set, however.
> > Right, thanks for putting together a detailed explanation.
> > 
> > This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
> > by QEMU.  I'm going to post a patch to KVM fixing that.
> > 
> > Meanwhile QEMU needs a way to maintain its notion of vp_index that is
> >   1) in sync with kernel's notion
> >   2) also with kernels that don't support setting the msr
> >   3) persistent across migrations
> > 
> > cpu_index looked like a perfect candidate.
> > 
> 
> What you want is the APIC id, which _is_ cpu_index but may not be in the
> future.  But the APIC id is also the KVM vcpu_id, so there's no need to
> have VP_INDEX maintained by QEMU.

No, KVM really uses the VCPU _index_ for HV_X64_MSR_VP_INDEX:

kvm_hv_get_msr():

        case HV_X64_MSR_VP_INDEX: {
                int r;
                struct kvm_vcpu *v;

                kvm_for_each_vcpu(r, v, vcpu->kvm) {
                        if (v == vcpu) {
                                data = r;
                                break;
                        }
                }
                break;
        }
Igor Mammedov June 14, 2017, 1:11 p.m. UTC | #7
On Wed, 14 Jun 2017 10:01:49 -0300
Eduardo Habkost <ehabkost@redhat.com> wrote:

> On Wed, Jun 14, 2017 at 01:26:44PM +0200, Paolo Bonzini wrote:
> > 
> > 
> > On 14/06/2017 13:25, Roman Kagan wrote:  
> > >> The problem with that is that it will break as soon as we create
> > >> VCPUs in a different order.  Unsolvable on hosts that don't allow
> > >> HV_X64_MSR_VP_INDEX to be set, however.  
> > > Right, thanks for putting together a detailed explanation.
> > > 
> > > This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
> > > by QEMU.  I'm going to post a patch to KVM fixing that.
> > > 
> > > Meanwhile QEMU needs a way to maintain its notion of vp_index that is
> > >   1) in sync with kernel's notion
> > >   2) also with kernels that don't support setting the msr
> > >   3) persistent across migrations
> > > 
> > > cpu_index looked like a perfect candidate.
> > >   
> > 
> > What you want is the APIC id, which _is_ cpu_index but may not be in the
> > future.  But the APIC id is also the KVM vcpu_id, so there's no need to
> > have VP_INDEX maintained by QEMU.  
> 
> No, KVM really uses the VCPU _index_ for HV_X64_MSR_VP_INDEX:
and as you pointed out that works just by luck;
as soon as CPUs are created out of order,
the returned value won't match cpu_index.

So instead of spreading this nonsense out to QEMU, is it possible
to fix kernel(kvm+guest) to use apic_id instead?


> kvm_hv_get_msr():
> 
>         case HV_X64_MSR_VP_INDEX: {
>                 int r;
>                 struct kvm_vcpu *v;
> 
>                 kvm_for_each_vcpu(r, v, vcpu->kvm) {
>                         if (v == vcpu) {
>                                 data = r;
>                                 break;
>                         }
>                 }
>                 break;
>         }
> 
>
Paolo Bonzini June 14, 2017, 1:17 p.m. UTC | #8
On 14/06/2017 15:11, Igor Mammedov wrote:
>> No, KVM really uses the VCPU _index_ for HV_X64_MSR_VP_INDEX:
>
> and as you pointed out that works just by luck,
> as soon as we there would be out of order created CPUs
> returned value won't match cpu_index.
> 
> So instead of spreading this nonsense out to QEMU, is it possible
> to fix kernel(kvm+guest) to use apic_id instead?

Yes, definitely.  At this point let's add a new "KVM_CAP_HYPERV_SYNIC2"
capability and declare the old one broken, that will make things easier.

Paolo
Eduardo Habkost June 14, 2017, 1:19 p.m. UTC | #9
On Wed, Jun 14, 2017 at 03:11:17PM +0200, Igor Mammedov wrote:
> On Wed, 14 Jun 2017 10:01:49 -0300
> Eduardo Habkost <ehabkost@redhat.com> wrote:
> 
> > On Wed, Jun 14, 2017 at 01:26:44PM +0200, Paolo Bonzini wrote:
> > > 
> > > 
> > > On 14/06/2017 13:25, Roman Kagan wrote:  
> > > >> The problem with that is that it will break as soon as we create
> > > >> VCPUs in a different order.  Unsolvable on hosts that don't allow
> > > >> HV_X64_MSR_VP_INDEX to be set, however.  
> > > > Right, thanks for putting together a detailed explanation.
> > > > 
> > > > This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
> > > > by QEMU.  I'm going to post a patch to KVM fixing that.
> > > > 
> > > > Meanwhile QEMU needs a way to maintain its notion of vp_index that is
> > > >   1) in sync with kernel's notion
> > > >   2) also with kernels that don't support setting the msr
> > > >   3) persistent across migrations
> > > > 
> > > > cpu_index looked like a perfect candidate.
> > > >   
> > > 
> > > What you want is the APIC id, which _is_ cpu_index but may not be in the
> > > future.  But the APIC id is also the KVM vcpu_id, so there's no need to
> > > have VP_INDEX maintained by QEMU.  
> > 
> > No, KVM really uses the VCPU _index_ for HV_X64_MSR_VP_INDEX:
> and as you pointed out that works just by luck,
> as soon as we there would be out of order created CPUs
> returned value won't match cpu_index.

This is true, which makes it even worse.

> 
> So instead of spreading this nonsense out to QEMU, is it possible
> to fix kernel(kvm+guest) to use apic_id instead?

It is possible to fix the kernel, but if we want to support older
kernels, QEMU has no choice but creating the VCPUs always in the
same order.
Eduardo Habkost June 14, 2017, 1:22 p.m. UTC | #10
On Wed, Jun 14, 2017 at 03:17:54PM +0200, Paolo Bonzini wrote:
> 
> 
> On 14/06/2017 15:11, Igor Mammedov wrote:
> >> No, KVM really uses the VCPU _index_ for HV_X64_MSR_VP_INDEX:
> >
> > and as you pointed out that works just by luck,
> > as soon as we there would be out of order created CPUs
> > returned value won't match cpu_index.
> > 
> > So instead of spreading this nonsense out to QEMU, is it possible
> > to fix kernel(kvm+guest) to use apic_id instead?
> 
> Yes, definitely.  At this point let's add a new "KVM_CAP_HYPERV_SYNIC2"
> capability and declare the old one broken, that will make things easier.

What do we need a capability for?  Can't we just call
KVM_SET_MSRS using arch_id (== apic_id) and warn the user if it
doesn't work?
Igor Mammedov June 14, 2017, 1:24 p.m. UTC | #11
On Wed, 14 Jun 2017 10:00:15 -0300
Eduardo Habkost <ehabkost@redhat.com> wrote:

> On Wed, Jun 14, 2017 at 02:25:07PM +0300, Roman Kagan wrote:
> > On Tue, Jun 13, 2017 at 03:57:52PM -0300, Eduardo Habkost wrote:  
> > > On Tue, Jun 06, 2017 at 09:19:30PM +0300, Roman Kagan wrote:  
> > > > Hyper-V identifies vcpus by the virtual processor (VP) index which is
> > > > normally queried by the guest via HV_X64_MSR_VP_INDEX msr.
> > > > 
> > > > It has to be owned by QEMU in order to preserve it across migration.
> > > > 
> > > > However, the current implementation in KVM doesn't allow to set this
> > > > msr, and KVM uses its own notion of VP index.  Fortunately, the way
> > > > vcpus are created in QEMU/KVM basically guarantees that the KVM value is
> > > > equal to QEMU cpu_index.  
> > > 
> > > This might not be true in the future.  cpu_index is not a good
> > > identifier for CPUs, and we would like to remove it in the
> > > future.
> > > 
> > > But it looks like we have no choice, see extra comments below:  
> >   
> > > > +static void hyperv_set_vp_index(CPUState *cs)
> > > > +{
> > > > +    struct {
> > > > +        struct kvm_msrs info;
> > > > +        struct kvm_msr_entry entries[1];
> > > > +    } msr_data;
> > > > +    int ret;
> > > > +
> > > > +    msr_data.info.nmsrs = 1;
> > > > +    msr_data.entries[0].index = HV_X64_MSR_VP_INDEX;
> > > > +
> > > > +    /*
> > > > +     * Some kernels don't support setting HV_X64_MSR_VP_INDEX.  However,
> > > > +     * they use VP_INDEX equal to cpu_index due to the way vcpus are created.
> > > > +     * So ignore the errors from SET_MSRS, but verify that GET_MSRS returns the
> > > > +     * expected value.
> > > > +     */  
> > > 
> > > Oh.  This sounds like a problem.  As reference for others, this
> > > is the KVM code in kvm_hv_get_msr():
> > > 
> > > 	case HV_X64_MSR_VP_INDEX: {
> > > 		int r;
> > > 		struct kvm_vcpu *v;
> > > 
> > > 		kvm_for_each_vcpu(r, v, vcpu->kvm) {
> > > 			if (v == vcpu) {
> > > 				data = r;
> > > 				break;
> > > 			}
> > > 		}
> > > 		break;
> > > 	}
> > > 
> > > The problem with that is that it will break as soon as we create
> > > VCPUs in a different order.  Unsolvable on hosts that don't allow
> > > HV_X64_MSR_VP_INDEX to be set, however.  
> > 
> > Right, thanks for putting together a detailed explanation.
> > 
> > This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
> > by QEMU.  I'm going to post a patch to KVM fixing that.
> > 
> > Meanwhile QEMU needs a way to maintain its notion of vp_index that is
> >   1) in sync with kernel's notion
> >   2) also with kernels that don't support setting the msr
> >   3) persistent across migrations
> > 
> > cpu_index looked like a perfect candidate.  
> 
> I would like to be able to stop using cpu_index as a persistent
> VCPU identifier in the future.  I would also like to be able to
> create VCPUs in any order without breaking guest ABI.  But it
> seems to be impossible to do that without breaking vp_index on
> older kernels.
> 
> So cpu_index looks like the only candidate we have.
> 
> >   
> > > > +    msr_data.entries[0].data = cs->cpu_index;
> > > > +    kvm_vcpu_ioctl(cs, KVM_SET_MSRS, &msr_data);
> > > > +    ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data);
> > > > +    assert(ret == 1);
> > > > +    assert(msr_data.entries[0].data == cs->cpu_index);  
> > > 
> > > If KVM already initializes the MSR to cpu_index and we will abort
> > > if it was not set to anything except cpu_index here, why exactly
> > > do we need the KVM_SET_MSRS call?  
> > 
> > The kernel made no obligations to keep initializing its vp_index
> > identically to QEMU's cpu_index.  So we'd better check and abort if that
> > got out of sync.  Once KVM gains the ability to learn vp_index from QEMU
> > we'll no longer depend on that as we'll do explicit synchronization.  
> 
> I think the kernel now has an obligation to keep initializing
> HV_X64_MSR_VP_INDEX in a compatible way, or it will break older
> QEMU versions that don't set the MSR.
> 
> But if you don't think we can rely on that, then the KVM_SET_MSRS
> call won't hurt.

if we can tell apart old index based vp_index kernel and new that supports
MSR setting (add cap check perhaps) then we should be able to
 - leave the old vp_index behavior broken as it is now (for old kernels, old QEMU, and new QEMU with old machine types),
     i.e. do not set the vp_index MSR in QEMU
 - in case of (new QEMU new machine type/new kernel) use APIC ID which would work
   with out of order CPU creation
Eduardo Habkost June 14, 2017, 1:35 p.m. UTC | #12
On Wed, Jun 14, 2017 at 03:24:50PM +0200, Igor Mammedov wrote:
> On Wed, 14 Jun 2017 10:00:15 -0300
> Eduardo Habkost <ehabkost@redhat.com> wrote:
> 
> > On Wed, Jun 14, 2017 at 02:25:07PM +0300, Roman Kagan wrote:
> > > On Tue, Jun 13, 2017 at 03:57:52PM -0300, Eduardo Habkost wrote:  
> > > > On Tue, Jun 06, 2017 at 09:19:30PM +0300, Roman Kagan wrote:  
> > > > > Hyper-V identifies vcpus by the virtual processor (VP) index which is
> > > > > normally queried by the guest via HV_X64_MSR_VP_INDEX msr.
> > > > > 
> > > > > It has to be owned by QEMU in order to preserve it across migration.
> > > > > 
> > > > > However, the current implementation in KVM doesn't allow to set this
> > > > > msr, and KVM uses its own notion of VP index.  Fortunately, the way
> > > > > vcpus are created in QEMU/KVM basically guarantees that the KVM value is
> > > > > equal to QEMU cpu_index.  
> > > > 
> > > > This might not be true in the future.  cpu_index is not a good
> > > > identifier for CPUs, and we would like to remove it in the
> > > > future.
> > > > 
> > > > But it looks like we have no choice, see extra comments below:  
> > >   
> > > > > +static void hyperv_set_vp_index(CPUState *cs)
> > > > > +{
> > > > > +    struct {
> > > > > +        struct kvm_msrs info;
> > > > > +        struct kvm_msr_entry entries[1];
> > > > > +    } msr_data;
> > > > > +    int ret;
> > > > > +
> > > > > +    msr_data.info.nmsrs = 1;
> > > > > +    msr_data.entries[0].index = HV_X64_MSR_VP_INDEX;
> > > > > +
> > > > > +    /*
> > > > > +     * Some kernels don't support setting HV_X64_MSR_VP_INDEX.  However,
> > > > > +     * they use VP_INDEX equal to cpu_index due to the way vcpus are created.
> > > > > +     * So ignore the errors from SET_MSRS, but verify that GET_MSRS returns the
> > > > > +     * expected value.
> > > > > +     */  
> > > > 
> > > > Oh.  This sounds like a problem.  As reference for others, this
> > > > is the KVM code in kvm_hv_get_msr():
> > > > 
> > > > 	case HV_X64_MSR_VP_INDEX: {
> > > > 		int r;
> > > > 		struct kvm_vcpu *v;
> > > > 
> > > > 		kvm_for_each_vcpu(r, v, vcpu->kvm) {
> > > > 			if (v == vcpu) {
> > > > 				data = r;
> > > > 				break;
> > > > 			}
> > > > 		}
> > > > 		break;
> > > > 	}
> > > > 
> > > > The problem with that is that it will break as soon as we create
> > > > VCPUs in a different order.  Unsolvable on hosts that don't allow
> > > > HV_X64_MSR_VP_INDEX to be set, however.  
> > > 
> > > Right, thanks for putting together a detailed explanation.
> > > 
> > > This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
> > > by QEMU.  I'm going to post a patch to KVM fixing that.
> > > 
> > > Meanwhile QEMU needs a way to maintain its notion of vp_index that is
> > >   1) in sync with kernel's notion
> > >   2) also with kernels that don't support setting the msr
> > >   3) persistent across migrations
> > > 
> > > cpu_index looked like a perfect candidate.  
> > 
> > I would like to be able to stop using cpu_index as a persistent
> > VCPU identifier in the future.  I would also like to be able to
> > create VCPUs in any order without breaking guest ABI.  But it
> > seems to be impossible to do that without breaking vp_index on
> > older kernels.
> > 
> > So cpu_index looks like the only candidate we have.
> > 
> > >   
> > > > > +    msr_data.entries[0].data = cs->cpu_index;
> > > > > +    kvm_vcpu_ioctl(cs, KVM_SET_MSRS, &msr_data);
> > > > > +    ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data);
> > > > > +    assert(ret == 1);
> > > > > +    assert(msr_data.entries[0].data == cs->cpu_index);  
> > > > 
> > > > If KVM already initializes the MSR to cpu_index and we will abort
> > > > if it was not set to anything except cpu_index here, why exactly
> > > > do we need the KVM_SET_MSRS call?  
> > > 
> > > The kernel made no obligations to keep initializing its vp_index
> > > identically to QEMU's cpu_index.  So we'd better check and abort if that
> > > got out of sync.  Once KVM gains the ability to learn vp_index from QEMU
> > > we'll no longer depend on that as we'll do explicit synchronization.  
> > 
> > I think the kernel now has an obligation to keep initializing
> > HV_X64_MSR_VP_INDEX in a compatible way, or it will break older
> > QEMU versions that don't set the MSR.
> > 
> > But if you don't think we can rely on that, then the KVM_SET_MSRS
> > call won't hurt.
> 
> if we can tell apart old index based vp_index kernel and new that supports
> MSR setting (add cap check perhaps) then we should be able to
>  - leave out old vp_index broken as it is now (for old kernels/old QEMU and/ new QEMU old machine types)
>      i.e. do not set vp_index MSR in QEMU
>  - in case of (new QEMU new machine type/new kernel) use APIC ID which would work
>    with out of order CPU creation

I agree with the proposal, but I don't see the need for a
capability check:

Old machine-types can simply not call KVM_SET_MSRS, as you
suggested.

New machine-types can just call KVM_SET_MSRS unconditionally, and
warn the user in case the it fails and the APIC ID really doesn't
match the index in KVM (which in practice will happen only if the
user is also configuring a CPU topology explicitly).
Paolo Bonzini June 14, 2017, 1:37 p.m. UTC | #13
On 14/06/2017 15:22, Eduardo Habkost wrote:
>> Yes, definitely.  At this point let's add a new "KVM_CAP_HYPERV_SYNIC2"
>> capability and declare the old one broken, that will make things easier.
> What do we need a capability for?  Can't we just call
> KVM_SET_MSRS using arch_id (== apic_id) and warn the user if it
> doesn't work?

As mentioned in the review of patch 16/23, we may want to break
backwards compatibility (meaning we drop the old capability and
introduce a new one) for other reasons.

Paolo
Igor Mammedov June 14, 2017, 1:38 p.m. UTC | #14
On Wed, 14 Jun 2017 10:22:16 -0300
Eduardo Habkost <ehabkost@redhat.com> wrote:

> On Wed, Jun 14, 2017 at 03:17:54PM +0200, Paolo Bonzini wrote:
> > 
> > 
> > On 14/06/2017 15:11, Igor Mammedov wrote:  
> > >> No, KVM really uses the VCPU _index_ for HV_X64_MSR_VP_INDEX:  
> > >
> > > and as you pointed out that works just by luck,
> > > as soon as we there would be out of order created CPUs
> > > returned value won't match cpu_index.
> > > 
> > > So instead of spreading this nonsense out to QEMU, is it possible
> > > to fix kernel(kvm+guest) to use apic_id instead?  
> > 
> > Yes, definitely.  At this point let's add a new "KVM_CAP_HYPERV_SYNIC2"
> > capability and declare the old one broken, that will make things easier.  
> 
> What do we need a capability for?
> Can't we just call
> KVM_SET_MSRS using arch_id (== apic_id) and warn the user if it
> doesn't work?
a warning won't solve the issue, and it's called rather late in the vcpu
creation chain without any error propagation, so it's not possible (hard)
to gracefully fail cpu hotplug.

with KVM_CAP_HYPERV_SYNIC2 we could easily turn on a compat property for
the SynIC device (needs QOMification) that would turn on the broken compat
logic if the capability is not present/old machine type.
Eduardo Habkost June 14, 2017, 1:45 p.m. UTC | #15
On Wed, Jun 14, 2017 at 03:38:59PM +0200, Igor Mammedov wrote:
> On Wed, 14 Jun 2017 10:22:16 -0300
> Eduardo Habkost <ehabkost@redhat.com> wrote:
> 
> > On Wed, Jun 14, 2017 at 03:17:54PM +0200, Paolo Bonzini wrote:
> > > 
> > > 
> > > On 14/06/2017 15:11, Igor Mammedov wrote:  
> > > >> No, KVM really uses the VCPU _index_ for HV_X64_MSR_VP_INDEX:  
> > > >
> > > > and as you pointed out that works just by luck,
> > > > as soon as we there would be out of order created CPUs
> > > > returned value won't match cpu_index.
> > > > 
> > > > So instead of spreading this nonsense out to QEMU, is it possible
> > > > to fix kernel(kvm+guest) to use apic_id instead?  
> > > 
> > > Yes, definitely.  At this point let's add a new "KVM_CAP_HYPERV_SYNIC2"
> > > capability and declare the old one broken, that will make things easier.  
> > 
> > What do we need a capability for?
> > Can't we just call
> > KVM_SET_MSRS using arch_id (== apic_id) and warn the user if it
> > doesn't work?
> warn won't solve issue, and it's called rather late in vcpu creation
> chain without any error propagation so it's not possible (hard) to
> gracefully fail cpu hotplug.

The issue is unsolvable on old kernels (and I don't think we want
to prevent the VM from starting), hence the warning.  But the
ability to report errors gracefully on CPU hotplug is a very good
reason.  Good point.

> 
> with KVM_CAP_HYPERV_SYNIC2 we could easily turn on compat property for
> SYNC device (needs QOMification) that would turn on broken compat
> logic if capability is not present/old machine type.

I was going to propose enabling the compat logic if KVM_SET_MSRS
failed, but you have a good point about KVM_SET_MSRS being called
very late.
Igor Mammedov June 14, 2017, 3:31 p.m. UTC | #16
On Wed, 14 Jun 2017 10:35:36 -0300
Eduardo Habkost <ehabkost@redhat.com> wrote:

> On Wed, Jun 14, 2017 at 03:24:50PM +0200, Igor Mammedov wrote:
> > On Wed, 14 Jun 2017 10:00:15 -0300
> > Eduardo Habkost <ehabkost@redhat.com> wrote:
> >   
> > > On Wed, Jun 14, 2017 at 02:25:07PM +0300, Roman Kagan wrote:  
> > > > On Tue, Jun 13, 2017 at 03:57:52PM -0300, Eduardo Habkost wrote:    
> > > > > On Tue, Jun 06, 2017 at 09:19:30PM +0300, Roman Kagan wrote:    
> > > > > > Hyper-V identifies vcpus by the virtual processor (VP) index which is
> > > > > > normally queried by the guest via HV_X64_MSR_VP_INDEX msr.
> > > > > > 
> > > > > > It has to be owned by QEMU in order to preserve it across migration.
> > > > > > 
> > > > > > However, the current implementation in KVM doesn't allow to set this
> > > > > > msr, and KVM uses its own notion of VP index.  Fortunately, the way
> > > > > > vcpus are created in QEMU/KVM basically guarantees that the KVM value is
> > > > > > equal to QEMU cpu_index.    
> > > > > 
> > > > > This might not be true in the future.  cpu_index is not a good
> > > > > identifier for CPUs, and we would like to remove it in the
> > > > > future.
> > > > > 
> > > > > But it looks like we have no choice, see extra comments below:    
> > > >     
> > > > > > +static void hyperv_set_vp_index(CPUState *cs)
> > > > > > +{
> > > > > > +    struct {
> > > > > > +        struct kvm_msrs info;
> > > > > > +        struct kvm_msr_entry entries[1];
> > > > > > +    } msr_data;
> > > > > > +    int ret;
> > > > > > +
> > > > > > +    msr_data.info.nmsrs = 1;
> > > > > > +    msr_data.entries[0].index = HV_X64_MSR_VP_INDEX;
> > > > > > +
> > > > > > +    /*
> > > > > > +     * Some kernels don't support setting HV_X64_MSR_VP_INDEX.  However,
> > > > > > +     * they use VP_INDEX equal to cpu_index due to the way vcpus are created.
> > > > > > +     * So ignore the errors from SET_MSRS, but verify that GET_MSRS returns the
> > > > > > +     * expected value.
> > > > > > +     */    
> > > > > 
> > > > > Oh.  This sounds like a problem.  As reference for others, this
> > > > > is the KVM code in kvm_hv_get_msr():
> > > > > 
> > > > > 	case HV_X64_MSR_VP_INDEX: {
> > > > > 		int r;
> > > > > 		struct kvm_vcpu *v;
> > > > > 
> > > > > 		kvm_for_each_vcpu(r, v, vcpu->kvm) {
> > > > > 			if (v == vcpu) {
> > > > > 				data = r;
> > > > > 				break;
> > > > > 			}
> > > > > 		}
> > > > > 		break;
> > > > > 	}
> > > > > 
> > > > > The problem with that is that it will break as soon as we create
> > > > > VCPUs in a different order.  Unsolvable on hosts that don't allow
> > > > > HV_X64_MSR_VP_INDEX to be set, however.    
> > > > 
> > > > Right, thanks for putting together a detailed explanation.
> > > > 
> > > > This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
> > > > by QEMU.  I'm going to post a patch to KVM fixing that.
> > > > 
> > > > Meanwhile QEMU needs a way to maintain its notion of vp_index that is
> > > >   1) in sync with kernel's notion
> > > >   2) also with kernels that don't support setting the msr
> > > >   3) persistent across migrations
> > > > 
> > > > cpu_index looked like a perfect candidate.    
> > > 
> > > I would like to be able to stop using cpu_index as a persistent
> > > VCPU identifier in the future.  I would also like to be able to
> > > create VCPUs in any order without breaking guest ABI.  But it
> > > seems to be impossible to do that without breaking vp_index on
> > > older kernels.
> > > 
> > > So cpu_index looks like the only candidate we have.
> > >   
> > > >     
> > > > > > +    msr_data.entries[0].data = cs->cpu_index;
> > > > > > +    kvm_vcpu_ioctl(cs, KVM_SET_MSRS, &msr_data);
> > > > > > +    ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data);
> > > > > > +    assert(ret == 1);
> > > > > > +    assert(msr_data.entries[0].data == cs->cpu_index);    
> > > > > 
> > > > > If KVM already initializes the MSR to cpu_index and we will abort
> > > > > if it was not set to anything except cpu_index here, why exactly
> > > > > do we need the KVM_SET_MSRS call?    
> > > > 
> > > > The kernel made no obligations to keep initializing its vp_index
> > > > identically to QEMU's cpu_index.  So we'd better check and abort if that
> > > > got out of sync.  Once KVM gains the ability to learn vp_index from QEMU
> > > > we'll no longer depend on that as we'll do explicit synchronization.    
> > > 
> > > I think the kernel now has an obligation to keep initializing
> > > HV_X64_MSR_VP_INDEX in a compatible way, or it will break older
> > > QEMU versions that don't set the MSR.
> > > 
> > > But if you don't think we can rely on that, then the KVM_SET_MSRS
> > > call won't hurt.  
> > 
> > if we can tell apart old index based vp_index kernel and new that supports
> > MSR setting (add cap check perhaps) then we should be able to
> >  - leave out old vp_index broken as it is now (for old kernels/old QEMU and/ new QEMU old machine types)
> >      i.e. do not set vp_index MSR in QEMU
> >  - in case of (new QEMU new machine type/new kernel) use APIC ID which would work
> >    with out of order CPU creation  
> 
> I agree with the proposal, but I don't see the need for a
> capability check:
> 
> Old machine-types can simply not call KVM_SET_MSRS, as you
> suggested.
> 
> New machine-types can just call KVM_SET_MSRS unconditionally, and
> warn the user in case the it fails and the APIC ID really doesn't
> match the index in KVM (which in practice will happen only if the
> user is also configuring a CPU topology explicitly).

I don't know anything about hyperv so here goes a question:
   a new machine booted on a new kernel (using the APIC ID) and
   then migrated to new qemu on an old kernel.
   Will it work or break? If it breaks we need a cap check, and if not
   we probably could do without it.
Roman Kagan June 14, 2017, 6:40 p.m. UTC | #17
On Wed, Jun 14, 2017 at 10:45:23AM -0300, Eduardo Habkost wrote:
> On Wed, Jun 14, 2017 at 03:38:59PM +0200, Igor Mammedov wrote:
> > On Wed, 14 Jun 2017 10:22:16 -0300
> > Eduardo Habkost <ehabkost@redhat.com> wrote:
> > 
> > > On Wed, Jun 14, 2017 at 03:17:54PM +0200, Paolo Bonzini wrote:
> > > > 
> > > > 
> > > > On 14/06/2017 15:11, Igor Mammedov wrote:  
> > > > >> No, KVM really uses the VCPU _index_ for HV_X64_MSR_VP_INDEX:  
> > > > >
> > > > > and as you pointed out that works just by luck,
> > > > > as soon as we there would be out of order created CPUs
> > > > > returned value won't match cpu_index.
> > > > > 
> > > > > So instead of spreading this nonsense out to QEMU, is it possible
> > > > > to fix kernel(kvm+guest) to use apic_id instead?  
> > > > 
> > > > Yes, definitely.  At this point let's add a new "KVM_CAP_HYPERV_SYNIC2"
> > > > capability and declare the old one broken, that will make things easier.  
> > > 
> > > What do we need a capability for?
> > > Can't we just call
> > > KVM_SET_MSRS using arch_id (== apic_id) and warn the user if it
> > > doesn't work?
> > warn won't solve issue, and it's called rather late in vcpu creation
> > chain without any error propagation so it's not possible (hard) to
> > gracefully fail cpu hotplug.
> 
> The issue is unsolvable on old kernels (and I don't think we want
> to prevent the VM from starting), hence the warning.  But the
> ability to report errors gracefully on CPU hotplug is a very good
> reason.  Good point.
> 
> > 
> > with KVM_CAP_HYPERV_SYNIC2 we could easily turn on compat property for
> > SYNC device (needs QOMification) that would turn on broken compat
> > logic if capability is not present/old machine type.
> 
> I was going to propose enabling the compat logic if KVM_SET_MSRS
> failed, but you have a good point about KVM_SET_MSRS being called
> very late.

Thanks for this discussion, I didn't realize all the implications
(including that hotplug issue too).

One more data point is that until now there was no use for vp_index in
QEMU, so it didn't care how KVM managed it.  In KVM the only
vp_index-aware path that the guests could trigger was exactly reading of
HV_X64_MSR_VP_INDEX.

So let me try to sum up (to make sure I understand it right):

1) we add KVM_CAP_HYPERV_SYNIC2 to KVM; when QEMU enables it KVM
   switches to using vcpu_id as vp_index and stops zeroing synic pages

2) new QEMU refuses to start in non-compat mode when
   KVM_CAP_HYPERV_SYNIC2 is not supported

3) old QEMU or new QEMU in compat mode enables KVM_CAP_HYPERV_SYNIC
   making KVM keep using internal cpu index as vp_index and zeroing
   synic pages

4) new QEMU in compat mode refuses vmbus or sint route creation

Is this what is proposed?  My only problem here is that KVM will have to
somehow guarantee stable numbering in the old synic mode.  How can this
be ensured?  And should it be at all?

Thanks,
Roman.
Eduardo Habkost June 14, 2017, 6:59 p.m. UTC | #18
On Wed, Jun 14, 2017 at 09:40:37PM +0300, Roman Kagan wrote:
> On Wed, Jun 14, 2017 at 10:45:23AM -0300, Eduardo Habkost wrote:
> > On Wed, Jun 14, 2017 at 03:38:59PM +0200, Igor Mammedov wrote:
> > > On Wed, 14 Jun 2017 10:22:16 -0300
> > > Eduardo Habkost <ehabkost@redhat.com> wrote:
> > > 
> > > > On Wed, Jun 14, 2017 at 03:17:54PM +0200, Paolo Bonzini wrote:
> > > > > 
> > > > > 
> > > > > On 14/06/2017 15:11, Igor Mammedov wrote:  
> > > > > >> No, KVM really uses the VCPU _index_ for HV_X64_MSR_VP_INDEX:  
> > > > > >
> > > > > > and as you pointed out that works just by luck,
> > > > > > as soon as we there would be out of order created CPUs
> > > > > > returned value won't match cpu_index.
> > > > > > 
> > > > > > So instead of spreading this nonsense out to QEMU, is it possible
> > > > > > to fix kernel(kvm+guest) to use apic_id instead?  
> > > > > 
> > > > > Yes, definitely.  At this point let's add a new "KVM_CAP_HYPERV_SYNIC2"
> > > > > capability and declare the old one broken, that will make things easier.  
> > > > 
> > > > What do we need a capability for?
> > > > Can't we just call
> > > > KVM_SET_MSRS using arch_id (== apic_id) and warn the user if it
> > > > doesn't work?
> > > warn won't solve issue, and it's called rather late in vcpu creation
> > > chain without any error propagation so it's not possible (hard) to
> > > gracefully fail cpu hotplug.
> > 
> > The issue is unsolvable on old kernels (and I don't think we want
> > to prevent the VM from starting), hence the warning.  But the
> > ability to report errors gracefully on CPU hotplug is a very good
> > reason.  Good point.
> > 
> > > 
> > > with KVM_CAP_HYPERV_SYNIC2 we could easily turn on compat property for
> > > SYNC device (needs QOMification) that would turn on broken compat
> > > logic if capability is not present/old machine type.
> > 
> > I was going to propose enabling the compat logic if KVM_SET_MSRS
> > failed, but you have a good point about KVM_SET_MSRS being called
> > very late.
> 
> Thanks for this discussion, I didn't realize all the implications
> (including that hotplug issue too).
> 
> One more data point is that until now there was no use for vp_index in
> QEMU, so it didn't care how KVM managed it.  In KVM the only
> vp_index-aware path that the guests could trigger was exactly reading of
> HV_X64_MSR_VP_INDEX.
> 
> So let me try to sum up (to make sure I understand it right);
> 
> 1) we add KVM_CAP_HYPERV_SYNIC2 to KVM; when QEMU enables it KVM
>    switches to using vcpu_id as vp_index and stops zeroing synic pages

If we want to keep KVM code simpler, we could make QEMU
explicitly initialize vp_index using vcpu_id (== arch_id == apic_id)
if KVM_CAP_HYPERV_SYNIC2 is reported as supported.

> 
> 2) new QEMU refuses to start in non-compat mode when
>    KVM_CAP_HYPERV_SYNIC2 is not supported

It depends on which cases are considered "in non-compat mode".
Getting a VM not runnable just because the machine-type was
updated is not desirable, especially considering that in most
cases we will create the VCPUs in the same order and things would
keep working.  Probably the best we can do in this case is to
automatically enable compat mode, but print a warning saying
future QEMU versions might break if the kernel is not upgraded.

> 
> 3) old QEMU or new QEMU in compat mode enables KVM_CAP_HYPERV_SYNIC
>    making KVM keep using internal cpu index as vp_index and zeroing
>    synic pages

Maybe we need a warning on this case too, because QEMU won't
guarantee it will always create VCPUs in the same order in the
future.

> 
> 4) new QEMU in compat mode refuses vmbus or sint route creation

Also: new QEMU in compat mode refuses CPU hotplug.

> 
> Is this what is proposed?  My only problem here is that KVM will have to
> somehow guarantee stable numbering in the old synic mode.  How can this
> be ensured?  And should it be at all?

As far as I can see, KVM can only guarantee stable numbering in
the old mode if QEMU creates the VCPUs in the same order.  QEMU
can do a reasonable effort to not break that unnecessarily, but
users should be aware that old mode is deprecated and can break.
Paolo Bonzini June 15, 2017, 8:26 a.m. UTC | #19
On 14/06/2017 20:59, Eduardo Habkost wrote:
> On Wed, Jun 14, 2017 at 09:40:37PM +0300, Roman Kagan wrote:
>> One more data point is that until now there was no use for vp_index in
>> QEMU, so it didn't care how KVM managed it.  In KVM the only
>> vp_index-aware path that the guests could trigger was exactly reading of
>> HV_X64_MSR_VP_INDEX.
>>
>> So let me try to sum up (to make sure I understand it right);
>>
>> 1) we add KVM_CAP_HYPERV_SYNIC2 to KVM; when QEMU enables it KVM
>>    switches to using vcpu_id as vp_index and stops zeroing synic pages
> 
> If we want to keep KVM code simpler, we could make QEMU
> explicitly initialize vp_index using vcpu_id (== arch_id == apic_id)
> if KVM_CAP_HYPERV_SYNIC2 is reported as supported.
>
>> 2) new QEMU refuses to start in non-compat mode when
>>    KVM_CAP_HYPERV_SYNIC2 is not supported
> 
> It depends on which cases are considered "in non-compat mode".
> Getting a VM not runnable just because the machine-type was
> updated is not desirable, especially considering that on most
> cases we will create the VCPUs on the same order and things would
> keep working.  Probably the best we can do on this case is to
> automatically enable compat mode, but print a warning saying
> future QEMU versions might break if the kernel is not upgraded.

Anything that specifies hv_synic can be broken.  There was really no
reason to specify it for anything except experimenting.

So QEMU never has to enable KVM_CAP_HYPERV_SYNIC.  No compat mode is
necessary.  In fact, I don't see any reason to _keep_
KVM_CAP_HYPERV_SYNIC in the kernel, either.  This means that KVM can
also switch unconditionally the vp_index to vcpu_id, because old QEMU +
new kernel won't even start.

Paolo
Roman Kagan June 15, 2017, 11:40 a.m. UTC | #20
On Thu, Jun 15, 2017 at 10:26:58AM +0200, Paolo Bonzini wrote:
> On 14/06/2017 20:59, Eduardo Habkost wrote:
> > On Wed, Jun 14, 2017 at 09:40:37PM +0300, Roman Kagan wrote:
> >> One more data point is that until now there was no use for vp_index in
> >> QEMU, so it didn't care how KVM managed it.  In KVM the only
> >> vp_index-aware path that the guests could trigger was exactly reading of
> >> HV_X64_MSR_VP_INDEX.
> >>
> >> So let me try to sum up (to make sure I understand it right);
> >>
> >> 1) we add KVM_CAP_HYPERV_SYNIC2 to KVM; when QEMU enables it KVM
> >>    switches to using vcpu_id as vp_index and stops zeroing synic pages
> > 
> > If we want to keep KVM code simpler, we could make QEMU
> > explicitly initialize vp_index using vcpu_id (== arch_id == apic_id)
> > if KVM_CAP_HYPERV_SYNIC2 is reported as supported.
> >
> >> 2) new QEMU refuses to start in non-compat mode when
> >>    KVM_CAP_HYPERV_SYNIC2 is not supported
> > 
> > It depends on which cases are considered "in non-compat mode".
> > Getting a VM not runnable just because the machine-type was
> > updated is not desirable, especially considering that on most
> > cases we will create the VCPUs on the same order and things would
> > keep working.  Probably the best we can do on this case is to
> > automatically enable compat mode, but print a warning saying
> > future QEMU versions might break if the kernel is not upgraded.
> 
> Anything that specifies hv_synic can be broken.  There was really no
> reason to specify it for anything except experimenting.

Hyper-V SynIC timers depend on it, and they work since QEMU 2.6 / KVM
4.5 and even supported by libvirt since 1.3.3.

> So QEMU never has to enable KVM_CAP_HYPERV_SYNIC.  No compat mode is
> necessary.  In fact, I don't see any reason to _keep_
> KVM_CAP_HYPERV_SYNIC in the kernel, either.  This means that KVM can
> also switch unconditionally the vp_index to vcpu_id, because old QEMU +
> new kernel won't even start.

I'm afraid this is too harsh on users...

Roman.
Paolo Bonzini June 15, 2017, 11:42 a.m. UTC | #21
On 15/06/2017 13:40, Roman Kagan wrote:
> On Thu, Jun 15, 2017 at 10:26:58AM +0200, Paolo Bonzini wrote:
>> On 14/06/2017 20:59, Eduardo Habkost wrote:
>>> On Wed, Jun 14, 2017 at 09:40:37PM +0300, Roman Kagan wrote:
>>>> One more data point is that until now there was no use for vp_index in
>>>> QEMU, so it didn't care how KVM managed it.  In KVM the only
>>>> vp_index-aware path that the guests could trigger was exactly reading of
>>>> HV_X64_MSR_VP_INDEX.
>>>>
>>>> So let me try to sum up (to make sure I understand it right);
>>>>
>>>> 1) we add KVM_CAP_HYPERV_SYNIC2 to KVM; when QEMU enables it KVM
>>>>    switches to using vcpu_id as vp_index and stops zeroing synic pages
>>>
>>> If we want to keep KVM code simpler, we could make QEMU
>>> explicitly initialize vp_index using vcpu_id (== arch_id == apic_id)
>>> if KVM_CAP_HYPERV_SYNIC2 is reported as supported.
>>>
>>>> 2) new QEMU refuses to start in non-compat mode when
>>>>    KVM_CAP_HYPERV_SYNIC2 is not supported
>>>
>>> It depends on which cases are considered "in non-compat mode".
>>> Getting a VM not runnable just because the machine-type was
>>> updated is not desirable, especially considering that on most
>>> cases we will create the VCPUs on the same order and things would
>>> keep working.  Probably the best we can do on this case is to
>>> automatically enable compat mode, but print a warning saying
>>> future QEMU versions might break if the kernel is not upgraded.
>>
>> Anything that specifies hv_synic can be broken.  There was really no
>> reason to specify it for anything except experimenting.
> 
> Hyper-V SynIC timers depend on it, and they work since QEMU 2.6 / KVM
> 4.5 and even supported by libvirt since 1.3.3.

But who is using them, and why would they be doing that?  What is the
advantage of using SynIC timers?

Paolo

>> So QEMU never has to enable KVM_CAP_HYPERV_SYNIC.  No compat mode is
>> necessary.  In fact, I don't see any reason to _keep_
>> KVM_CAP_HYPERV_SYNIC in the kernel, either.  This means that KVM can
>> also switch unconditionally the vp_index to vcpu_id, because old QEMU +
>> new kernel won't even start.
> 
> I'm afraid this is too harsh on users...
> 
> Roman.
>
Roman Kagan June 15, 2017, 12:03 p.m. UTC | #22
On Thu, Jun 15, 2017 at 01:42:56PM +0200, Paolo Bonzini wrote:
> 
> 
> On 15/06/2017 13:40, Roman Kagan wrote:
> > On Thu, Jun 15, 2017 at 10:26:58AM +0200, Paolo Bonzini wrote:
> >> On 14/06/2017 20:59, Eduardo Habkost wrote:
> >>> On Wed, Jun 14, 2017 at 09:40:37PM +0300, Roman Kagan wrote:
> >>>> One more data point is that until now there was no use for vp_index in
> >>>> QEMU, so it didn't care how KVM managed it.  In KVM the only
> >>>> vp_index-aware path that the guests could trigger was exactly reading of
> >>>> HV_X64_MSR_VP_INDEX.
> >>>>
> >>>> So let me try to sum up (to make sure I understand it right);
> >>>>
> >>>> 1) we add KVM_CAP_HYPERV_SYNIC2 to KVM; when QEMU enables it KVM
> >>>>    switches to using vcpu_id as vp_index and stops zeroing synic pages
> >>>
> >>> If we want to keep KVM code simpler, we could make QEMU
> >>> explicitly initialize vp_index using vcpu_id (== arch_id == apic_id)
> >>> if KVM_CAP_HYPERV_SYNIC2 is reported as supported.
> >>>
> >>>> 2) new QEMU refuses to start in non-compat mode when
> >>>>    KVM_CAP_HYPERV_SYNIC2 is not supported
> >>>
> >>> It depends on which cases are considered "in non-compat mode".
> >>> Getting a VM not runnable just because the machine-type was
> >>> updated is not desirable, especially considering that on most
> >>> cases we will create the VCPUs on the same order and things would
> >>> keep working.  Probably the best we can do on this case is to
> >>> automatically enable compat mode, but print a warning saying
> >>> future QEMU versions might break if the kernel is not upgraded.
> >>
> >> Anything that specifies hv_synic can be broken.  There was really no
> >> reason to specify it for anything except experimenting.
> > 
> > Hyper-V SynIC timers depend on it, and they work since QEMU 2.6 / KVM
> > 4.5 and even supported by libvirt since 1.3.3.
> 
> But who is using them, and why would they be doing that?  What is the
> advantage of using SynIC timers?

I guess because they are lighter-weight than HPET, and Windows used to
ignore the APIC timer, so it would choose SynIC timers if available.
Dunno...

Roman.
Roman Kagan June 15, 2017, 12:41 p.m. UTC | #23
On Wed, Jun 14, 2017 at 03:00:27PM +0200, Igor Mammedov wrote:
> On Wed, 14 Jun 2017 13:26:44 +0200
> Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> > On 14/06/2017 13:25, Roman Kagan wrote:
> > >> The problem with that is that it will break as soon as we create
> > >> VCPUs in a different order.  Unsolvable on hosts that don't allow
> > >> HV_X64_MSR_VP_INDEX to be set, however.  
> > > Right, thanks for putting together a detailed explanation.
> > > 
> > > This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
> > > by QEMU.  I'm going to post a patch to KVM fixing that.
> > > 
> > > Meanwhile QEMU needs a way to maintain its notion of vp_index that is
> > >   1) in sync with kernel's notion
> > >   2) also with kernels that don't support setting the msr
> > >   3) persistent across migrations
> > > 
> > > cpu_index looked like a perfect candidate.
> > >   
> > 
> > What you want is the APIC id,
> 
> > which _is_ cpu_index but may not be in the
> depending on topology cpu_index won't be the same as APIC ID/vcpu_id
> /AMDs odd core count/.

So vcpu_id can be sparse?

> > future.  But the APIC id is also the KVM vcpu_id, so there's no need to
> > have VP_INDEX maintained by QEMU.
> agreed it'd be better to reuse vcpu_id/apic id as interface between
> qemu/kvm/guest instead of adding additional cpu_index concept in ABI

Having consulted the spec, I'm not so confident any more this is the
right move.

> 7.8.1 Virtual Processor Index
> 
> Virtual processors are identified by using an index (VP index). The
> maximum number of virtual processors per partition supported by the
> current implementation of the hypervisor can be obtained through CPUID
> leaf 0x40000005. A virtual processor index must be less than the
> maximum number of virtual processors per partition.

This seems to imply that VP index should be dense.  As if they use it
directly as an index into an array whose length is equal to the max
number of vcpus.  (BTW the value we report for it is currently hardcoded
to 0x40 which probably needs fixing, too.)

I'm starting to drift back to adding SET_MSRS support to
HV_X64_MSR_VP_INDEX in KVM to delegate the control to QEMU, and having
QEMU use cpu_index as its value...  :-/

Roman.
Paolo Bonzini June 15, 2017, 1:22 p.m. UTC | #24
On 15/06/2017 14:41, Roman Kagan wrote:
> On Wed, Jun 14, 2017 at 03:00:27PM +0200, Igor Mammedov wrote:
>> On Wed, 14 Jun 2017 13:26:44 +0200
>> Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>>> On 14/06/2017 13:25, Roman Kagan wrote:
>>>>> The problem with that is that it will break as soon as we create
>>>>> VCPUs in a different order.  Unsolvable on hosts that don't allow
>>>>> HV_X64_MSR_VP_INDEX to be set, however.  
>>>> Right, thanks for putting together a detailed explanation.
>>>>
>>>> This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
>>>> by QEMU.  I'm going to post a patch to KVM fixing that.
>>>>
>>>> Meanwhile QEMU needs a way to maintain its notion of vp_index that is
>>>>   1) in sync with kernel's notion
>>>>   2) also with kernels that don't support setting the msr
>>>>   3) persistent across migrations
>>>>
>>>> cpu_index looked like a perfect candidate.
>>>>   
>>>
>>> What you want is the APIC id,
>>
>>> which _is_ cpu_index but may not be in the
>> depending on topology cpu_index won't be the same as APIC ID/vcpu_id
>> /AMDs odd core count/.
> 
> So vcpu_id can be sparse?

Yes.

> Having consulted the spec, I'm not so confident any more this is the
> right move.
> 
>> 7.8.1 Virtual Processor Index
>>
>> Virtual processors are identified by using an index (VP index). The
>> maximum number of virtual processors per partition supported by the
>> current implementation of the hypervisor can be obtained through CPUID
>> leaf 0x40000005. A virtual processor index must be less than the
>> maximum number of virtual processors per partition.
> 
> This seems to imply that VP index should be dense.  As if they use it
> directly as an index into an array whose length is equal to the max
> number of vcpus.  (BTW the value we report for it is currently hardcoded
> to 0x40 which probably needs fixing, too.)
> 
> I'm starting to drift back to adding SET_MSRS support to
> HV_X64_MSR_VP_INDEX in KVM to delegate the control to QEMU, and having
> QEMU use cpu_index as its value...  :-/

Yes, definitely.

Paolo
Igor Mammedov June 15, 2017, 1:27 p.m. UTC | #25
On Thu, 15 Jun 2017 15:41:08 +0300
Roman Kagan <rkagan@virtuozzo.com> wrote:

> On Wed, Jun 14, 2017 at 03:00:27PM +0200, Igor Mammedov wrote:
> > On Wed, 14 Jun 2017 13:26:44 +0200
> > Paolo Bonzini <pbonzini@redhat.com> wrote:
> >   
> > > On 14/06/2017 13:25, Roman Kagan wrote:  
> > > >> The problem with that is that it will break as soon as we create
> > > >> VCPUs in a different order.  Unsolvable on hosts that don't allow
> > > >> HV_X64_MSR_VP_INDEX to be set, however.    
> > > > Right, thanks for putting together a detailed explanation.
> > > > 
> > > > This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
> > > > by QEMU.  I'm going to post a patch to KVM fixing that.
> > > > 
> > > > Meanwhile QEMU needs a way to maintain its notion of vp_index that is
> > > >   1) in sync with kernel's notion
> > > >   2) also with kernels that don't support setting the msr
> > > >   3) persistent across migrations
> > > > 
> > > > cpu_index looked like a perfect candidate.
> > > >     
> > > 
> > > What you want is the APIC id,  
> >   
> > > which _is_ cpu_index but may not be in the  
> > depending on topology cpu_index won't be the same as APIC ID/vcpu_id
> > /AMDs odd core count/.  
> 
> So vcpu_id can be sparse?
yes

I suppose the kernel even has a cpu->apic_id map somewhere that you can
reuse instead of the custom map hv_context.vp_index[]

> > > future.  But the APIC id is also the KVM vcpu_id, so there's no need to
> > > have VP_INDEX maintained by QEMU.  
> > agreed it'd be better to reuse vcpu_id/apic id as interface between
> > qemu/kvm/guest instead of adding additional cpu_index concept in ABI  
> 
> Having consulted the spec, I'm not so confident any more this is the
> right move.
> 
> > 7.8.1 Virtual Processor Index
> > 
> > Virtual processors are identified by using an index (VP index). The
> > maximum number of virtual processors per partition supported by the
> > current implementation of the hypervisor can be obtained through CPUID
> > leaf 0x40000005. A virtual processor index must be less than the
> > maximum number of virtual processors per partition.
> 
> This seems to imply that VP index should be dense.  As if they use it
> directly as an index into an array whose length is equal to the max
> number of vcpus.  (BTW the value we report for it is currently hardcoded
> to 0x40 which probably needs fixing, too.)
the question here is where "maximum number of virtual processors" comes from

if it's possible to tell the guest the value explicitly,
we can use pcms->apic_id_limit for it.
 
> I'm starting to drift back to adding SET_MSRS support to
> HV_X64_MSR_VP_INDEX in KVM to delegate the control to QEMU, and having
> QEMU use cpu_index as its value...  :-/
> 
> Roman.
Roman Kagan June 15, 2017, 4:05 p.m. UTC | #26
On Thu, Jun 15, 2017 at 03:27:28PM +0200, Igor Mammedov wrote:
> On Thu, 15 Jun 2017 15:41:08 +0300
> Roman Kagan <rkagan@virtuozzo.com> wrote:
> > On Wed, Jun 14, 2017 at 03:00:27PM +0200, Igor Mammedov wrote:
> > > On Wed, 14 Jun 2017 13:26:44 +0200
> > > Paolo Bonzini <pbonzini@redhat.com> wrote:
> > >   
> > > > On 14/06/2017 13:25, Roman Kagan wrote:  
> > > > >> The problem with that is that it will break as soon as we create
> > > > >> VCPUs in a different order.  Unsolvable on hosts that don't allow
> > > > >> HV_X64_MSR_VP_INDEX to be set, however.    
> > > > > Right, thanks for putting together a detailed explanation.
> > > > > 
> > > > > This was a thinko back then, not to have HV_X64_MSR_VP_INDEX maintained
> > > > > by QEMU.  I'm going to post a patch to KVM fixing that.
> > > > > 
> > > > > Meanwhile QEMU needs a way to maintain its notion of vp_index that is
> > > > >   1) in sync with kernel's notion
> > > > >   2) also with kernels that don't support setting the msr
> > > > >   3) persistent across migrations
> > > > > 
> > > > > cpu_index looked like a perfect candidate.
> > > > >     
> > > > 
> > > > What you want is the APIC id,  
> > >   
> > > > which _is_ cpu_index but may not be in the  
> > > depending on topology cpu_index won't be the same as APIC ID/vcpu_id
> > > /AMDs odd core count/.  
> > 
> > So vcpu_id can be sparse?
> yes
> 
> I suppose kernel even has cpu->apic_id map somewhere that you can reuse
> instead of custom map hv_context.vp_index[]

Right, but this is not the problem.  Whether apic_id matches the
expected vp_index semantics is.

> > > > future.  But the APIC id is also the KVM vcpu_id, so there's no need to
> > > > have VP_INDEX maintained by QEMU.  
> > > agreed it'd be better to reuse vcpu_id/apic id as interface between
> > > qemu/kvm/guest instead of adding additional cpu_index concept in ABI  
> > 
> > Having consulted the spec, I'm not so confident any more this is the
> > right move.
> > 
> > > 7.8.1 Virtual Processor Index
> > > 
> > > Virtual processors are identified by using an index (VP index). The
> > > maximum number of virtual processors per partition supported by the
> > > current implementation of the hypervisor can be obtained through CPUID
> > > leaf 0x40000005. A virtual processor index must be less than the
> > > maximum number of virtual processors per partition.
> > 
> > This seems to imply that VP index should be dense.  As if they use it
> > directly as an index into an array whose length is equal to the max
> > number of vcpus.  (BTW the value we report for it is currently hardcoded
> > to 0x40 which probably needs fixing, too.)
> the question here is where "maximum number of virtual processors" comes from

We provide it in a corresponding cpuid leaf.

> if it's possible to tell guest value explicitly,
> we can use pcms->apic_id_limit for it.

But, depending on the apic_id encoding scheme, this can be arbitrarily
bigger than the actual maximum number of vcpus.  Which can result in
guest confusion or inefficiency.

> > I'm starting to drift back to adding SET_MSRS support to
> > HV_X64_MSR_VP_INDEX in KVM to delegate the control to QEMU, and having
> > QEMU use cpu_index as its value...  :-/

Roman.
Eduardo Habkost June 18, 2017, 3:29 p.m. UTC | #27
On Thu, Jun 15, 2017 at 07:05:29PM +0300, Roman Kagan wrote:
> On Thu, Jun 15, 2017 at 03:27:28PM +0200, Igor Mammedov wrote:
[...]
> > > > > future.  But the APIC id is also the KVM vcpu_id, so there's no need to
> > > > > have VP_INDEX maintained by QEMU.  
> > > > agreed it'd be better to reuse vcpu_id/apic id as interface between
> > > > qemu/kvm/guest instead of adding additional cpu_index concept in ABI  
> > > 
> > > Having consulted the spec, I'm not so confident any more this is the
> > > right move.
> > > 
> > > > 7.8.1 Virtual Processor Index
> > > > 
> > > > Virtual processors are identified by using an index (VP index). The
> > > > maximum number of virtual processors per partition supported by the
> > > > current implementation of the hypervisor can be obtained through CPUID
> > > > leaf 0x40000005. A virtual processor index must be less than the
> > > > maximum number of virtual processors per partition.
> > > 
> > > This seems to imply that VP index should be dense.  As if they use it
> > > directly as an index into an array whose length is equal to the max
> > > number of vcpus.  (BTW the value we report for it is currently hardcoded
> > > to 0x40 which probably needs fixing, too.)
> > the question here is where "maximum number of virtual processors" comes from
> 
> We provide it in a corresponding cpuid leaf.
> 
> > if it's possible to tell guest value explicitly,
> > we can use pcms->apic_id_limit for it.
> 
> But, depending on the apic_id encoding scheme, this can be arbitrarily
> bigger than the actual maximum number of vcpus.  Which can result in
> guest confusion or inefficiency.

Note that it will be bigger only if threads-per-core or
cores-per-socket are not powers of 2.  If your use case doesn't
require configuration of a virtual thread/core topology, the APIC
IDs can always be contiguous.
diff mbox

Patch

diff --git a/target/i386/hyperv.h b/target/i386/hyperv.h
index 0c3b562..35da0b1 100644
--- a/target/i386/hyperv.h
+++ b/target/i386/hyperv.h
@@ -39,4 +39,6 @@  void kvm_hv_sint_route_destroy(HvSintRoute *sint_route);
 
 int kvm_hv_sint_route_set_sint(HvSintRoute *sint_route);
 
+uint32_t hyperv_vp_index(X86CPU *cpu);
+
 #endif
diff --git a/target/i386/hyperv.c b/target/i386/hyperv.c
index 227185c..27de5bc 100644
--- a/target/i386/hyperv.c
+++ b/target/i386/hyperv.c
@@ -16,6 +16,11 @@ 
 #include "hyperv.h"
 #include "hyperv_proto.h"
 
+uint32_t hyperv_vp_index(X86CPU *cpu)
+{
+    return CPU(cpu)->cpu_index;
+}
+
 int kvm_hv_handle_exit(X86CPU *cpu, struct kvm_hyperv_exit *exit)
 {
     CPUX86State *env = &cpu->env;
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 251aa95..eb9cde4 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -610,11 +610,37 @@  static int kvm_arch_set_tsc_khz(CPUState *cs)
     return 0;
 }
 
+static void hyperv_set_vp_index(CPUState *cs)
+{
+    struct {
+        struct kvm_msrs info;
+        struct kvm_msr_entry entries[1];
+    } msr_data;
+    int ret;
+
+    msr_data.info.nmsrs = 1;
+    msr_data.entries[0].index = HV_X64_MSR_VP_INDEX;
+
+    /*
+     * Some kernels don't support setting HV_X64_MSR_VP_INDEX.  However,
+     * they use VP_INDEX equal to cpu_index due to the way vcpus are created.
+     * So ignore the errors from SET_MSRS, but verify that GET_MSRS returns the
+     * expected value.
+     */
+    msr_data.entries[0].data = cs->cpu_index;
+    kvm_vcpu_ioctl(cs, KVM_SET_MSRS, &msr_data);
+    ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data);
+    assert(ret == 1);
+    assert(msr_data.entries[0].data == cs->cpu_index);
+}
+
 static int hyperv_handle_properties(CPUState *cs)
 {
     X86CPU *cpu = X86_CPU(cs);
     CPUX86State *env = &cpu->env;
 
+    hyperv_set_vp_index(cs);
+
     if (cpu->hyperv_time &&
             kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) <= 0) {
         cpu->hyperv_time = false;