diff mbox series

[v5,03/26] x86/hyperv: Update 'struct hv_enlightened_vmcs' definition

Message ID 20220802160756.339464-4-vkuznets@redhat.com (mailing list archive)
State New, archived
Headers show
Series KVM: VMX: Support updated eVMCSv1 revision + use vmcs_config for L1 VMX MSRs | expand

Commit Message

Vitaly Kuznetsov Aug. 2, 2022, 4:07 p.m. UTC
Updated Hyper-V Enlightened VMCS specification lists several new
fields for the following features:

- PerfGlobalCtrl
- EnclsExitingBitmap
- Tsc Scaling
- GuestLbrCtl
- CET
- SSP

Update the definition. The updated definition is available only when
CPUID.0x4000000A.EBX BIT(0) is '1'. Add a define for it as well.

Note: The latest TLFS is available at
https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/tlfs

Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 arch/x86/include/asm/hyperv-tlfs.h | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

Comments

Sean Christopherson Aug. 18, 2022, 3:21 p.m. UTC | #1
On Tue, Aug 02, 2022, Vitaly Kuznetsov wrote:
> Updated Hyper-V Enlightened VMCS specification lists several new
> fields for the following features:
> 
> - PerfGlobalCtrl
> - EnclsExitingBitmap
> - Tsc Scaling
> - GuestLbrCtl
> - CET
> - SSP
> 
> Update the definition. The updated definition is available only when
> CPUID.0x4000000A.EBX BIT(0) is '1'. Add a define for it as well.
> 
> Note: The latest TLFS is available at
> https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/tlfs
> 
> Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> ---
>  arch/x86/include/asm/hyperv-tlfs.h | 26 ++++++++++++++++++++++++--
>  1 file changed, 24 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
> index 6f0acc45e67a..ebc27017fa48 100644
> --- a/arch/x86/include/asm/hyperv-tlfs.h
> +++ b/arch/x86/include/asm/hyperv-tlfs.h
> @@ -138,6 +138,17 @@
>  #define HV_X64_NESTED_GUEST_MAPPING_FLUSH		BIT(18)
>  #define HV_X64_NESTED_MSR_BITMAP			BIT(19)
>  
> +/*
> + * Nested quirks. These are HYPERV_CPUID_NESTED_FEATURES.EBX bits.

The "quirks" part is very confusing, largely because KVM has a well-established
quirks mechanism.  I also don't see "quirks" anywhere in the TLFS documentation.
Can the "Nested quirks" part simply be dropped?

> + *
> + * Note: HV_X64_NESTED_EVMCS1_2022_UPDATE is not currently documented in any
> + * published TLFS version. When the bit is set, nested hypervisor can use
> + * 'updated' eVMCSv1 specification (perf_global_ctrl, s_cet, ssp, lbr_ctl,
> + * encls_exiting_bitmap, tsc_multiplier fields which were missing in 2016
> + * specification).
> + */
> +#define HV_X64_NESTED_EVMCS1_2022_UPDATE		BIT(0)

This bit is now defined[*], but the docs says it's only for perf_global_ctrl.  Are
we expecting an update to the TLFS?

	Indicates support for the GuestPerfGlobalCtrl and HostPerfGlobalCtrl fields
	in the enlightened VMCS.

[*] https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/feature-discovery#hypervisor-nested-virtualization-features---0x4000000a

> +
>  /*
>   * This is specific to AMD and specifies that enlightened TLB flush is
>   * supported. If guest opts in to this feature, ASID invalidations only
> @@ -559,9 +570,20 @@ struct hv_enlightened_vmcs {
>  	u64 partition_assist_page;
>  	u64 padding64_4[4];
>  	u64 guest_bndcfgs;
> -	u64 padding64_5[7];
> +	u64 guest_ia32_perf_global_ctrl;
> +	u64 guest_ia32_s_cet;
> +	u64 guest_ssp;
> +	u64 guest_ia32_int_ssp_table_addr;
> +	u64 guest_ia32_lbr_ctl;
> +	u64 padding64_5[2];
>  	u64 xss_exit_bitmap;
> -	u64 padding64_6[7];
> +	u64 encls_exiting_bitmap;
> +	u64 host_ia32_perf_global_ctrl;
> +	u64 tsc_multiplier;
> +	u64 host_ia32_s_cet;
> +	u64 host_ssp;
> +	u64 host_ia32_int_ssp_table_addr;
> +	u64 padding64_6;
>  } __packed;
>  
>  #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE			0
> -- 
> 2.35.3
>
Vitaly Kuznetsov Aug. 18, 2022, 3:29 p.m. UTC | #2
Sean Christopherson <seanjc@google.com> writes:

> On Tue, Aug 02, 2022, Vitaly Kuznetsov wrote:
>> Updated Hyper-V Enlightened VMCS specification lists several new
>> fields for the following features:
>> 
>> - PerfGlobalCtrl
>> - EnclsExitingBitmap
>> - Tsc Scaling
>> - GuestLbrCtl
>> - CET
>> - SSP
>> 
>> Update the definition. The updated definition is available only when
>> CPUID.0x4000000A.EBX BIT(0) is '1'. Add a define for it as well.
>> 
>> Note: The latest TLFS is available at
>> https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/tlfs
>> 
>> Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> ---
>>  arch/x86/include/asm/hyperv-tlfs.h | 26 ++++++++++++++++++++++++--
>>  1 file changed, 24 insertions(+), 2 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
>> index 6f0acc45e67a..ebc27017fa48 100644
>> --- a/arch/x86/include/asm/hyperv-tlfs.h
>> +++ b/arch/x86/include/asm/hyperv-tlfs.h
>> @@ -138,6 +138,17 @@
>>  #define HV_X64_NESTED_GUEST_MAPPING_FLUSH		BIT(18)
>>  #define HV_X64_NESTED_MSR_BITMAP			BIT(19)
>>  
>> +/*
>> + * Nested quirks. These are HYPERV_CPUID_NESTED_FEATURES.EBX bits.
>
> The "quirks" part is very confusing, largely because KVM has a well-established
> quirks mechanism.  I also don't see "quirks" anywhere in the TLFS documentation.
> Can the "Nested quirks" part simply be dropped?
>

Yes, it's completely made up (just like I made up
HV_X64_NESTED_EVMCS1_2022_UPDATE), let's drop it.

>> + *
>> + * Note: HV_X64_NESTED_EVMCS1_2022_UPDATE is not currently documented in any
>> + * published TLFS version. When the bit is set, nested hypervisor can use
>> + * 'updated' eVMCSv1 specification (perf_global_ctrl, s_cet, ssp, lbr_ctl,
>> + * encls_exiting_bitmap, tsc_multiplier fields which were missing in 2016
>> + * specification).
>> + */
>> +#define HV_X64_NESTED_EVMCS1_2022_UPDATE		BIT(0)
>
> This bit is now defined[*], but the docs says it's only for perf_global_ctrl.  Are
> we expecting an update to the TLFS?
>
> 	Indicates support for the GuestPerfGlobalCtrl and HostPerfGlobalCtrl fields
> 	in the enlightened VMCS.
>
> [*] https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/feature-discovery#hypervisor-nested-virtualization-features---0x4000000a
>

Oh well, better this than nothing. I'll ping the people who told me
about this bit that their description is incomplete.

>> +
>>  /*
>>   * This is specific to AMD and specifies that enlightened TLB flush is
>>   * supported. If guest opts in to this feature, ASID invalidations only
>> @@ -559,9 +570,20 @@ struct hv_enlightened_vmcs {
>>  	u64 partition_assist_page;
>>  	u64 padding64_4[4];
>>  	u64 guest_bndcfgs;
>> -	u64 padding64_5[7];
>> +	u64 guest_ia32_perf_global_ctrl;
>> +	u64 guest_ia32_s_cet;
>> +	u64 guest_ssp;
>> +	u64 guest_ia32_int_ssp_table_addr;
>> +	u64 guest_ia32_lbr_ctl;
>> +	u64 padding64_5[2];
>>  	u64 xss_exit_bitmap;
>> -	u64 padding64_6[7];
>> +	u64 encls_exiting_bitmap;
>> +	u64 host_ia32_perf_global_ctrl;
>> +	u64 tsc_multiplier;
>> +	u64 host_ia32_s_cet;
>> +	u64 host_ssp;
>> +	u64 host_ia32_int_ssp_table_addr;
>> +	u64 padding64_6;
>>  } __packed;
>>  
>>  #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE			0
>> -- 
>> 2.35.3
>> 
>
Sean Christopherson Aug. 18, 2022, 5:57 p.m. UTC | #3
On Thu, Aug 18, 2022, Vitaly Kuznetsov wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> > On Tue, Aug 02, 2022, Vitaly Kuznetsov wrote:
> >> + * Note: HV_X64_NESTED_EVMCS1_2022_UPDATE is not currently documented in any
> >> + * published TLFS version. When the bit is set, nested hypervisor can use
> >> + * 'updated' eVMCSv1 specification (perf_global_ctrl, s_cet, ssp, lbr_ctl,
> >> + * encls_exiting_bitmap, tsc_multiplier fields which were missing in 2016
> >> + * specification).
> >> + */
> >> +#define HV_X64_NESTED_EVMCS1_2022_UPDATE		BIT(0)
> >
> > This bit is now defined[*], but the docs says it's only for perf_global_ctrl.  Are
> > we expecting an update to the TLFS?
> >
> > 	Indicates support for the GuestPerfGlobalCtrl and HostPerfGlobalCtrl fields
> > 	in the enlightened VMCS.
> >
> > [*] https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/feature-discovery#hypervisor-nested-virtualization-features---0x4000000a
> >
> 
> Oh well, better this than nothing. I'll ping the people who told me
> about this bit that their description is incomplete.

Not that it changes anything, but I'd rather have no documentation.  I'd much rather
KVM say "this is the undocumented behavior" than "the document behavior is wrong".
Vitaly Kuznetsov Aug. 22, 2022, 9:18 a.m. UTC | #4
Sean Christopherson <seanjc@google.com> writes:

> On Thu, Aug 18, 2022, Vitaly Kuznetsov wrote:
>> Sean Christopherson <seanjc@google.com> writes:
>> 
>> > On Tue, Aug 02, 2022, Vitaly Kuznetsov wrote:
>> >> + * Note: HV_X64_NESTED_EVMCS1_2022_UPDATE is not currently documented in any
>> >> + * published TLFS version. When the bit is set, nested hypervisor can use
>> >> + * 'updated' eVMCSv1 specification (perf_global_ctrl, s_cet, ssp, lbr_ctl,
>> >> + * encls_exiting_bitmap, tsc_multiplier fields which were missing in 2016
>> >> + * specification).
>> >> + */
>> >> +#define HV_X64_NESTED_EVMCS1_2022_UPDATE		BIT(0)
>> >
>> > This bit is now defined[*], but the docs says it's only for perf_global_ctrl.  Are
>> > we expecting an update to the TLFS?
>> >
>> > 	Indicates support for the GuestPerfGlobalCtrl and HostPerfGlobalCtrl fields
>> > 	in the enlightened VMCS.
>> >
>> > [*] https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/feature-discovery#hypervisor-nested-virtualization-features---0x4000000a
>> >
>> 
>> Oh well, better this than nothing. I'll ping the people who told me
>> about this bit that their description is incomplete.
>
> Not that it changes anything, but I'd rather have no documentation.  I'd much rather
> KVM say "this is the undocumented behavior" than "the document behavior is wrong".
>

So I reached out to Microsoft and their answer was that for all these new
eVMCS fields (including *PerfGlobalCtrl) observing architectural VMX
MSRs should be enough. *PerfGlobalCtrl case is special because of Win11
bug (if we expose the feature in VMX feature MSRs but don't set
CPUID.0x4000000A.EBX BIT(0) it just doesn't boot).

What I'm still concerned about is future proofing KVM for new
features. When something is getting added to KVM for which no eVMCS
field is currently defined, both Hyper-V-on-KVM and KVM-on-Hyper-V cases
should be taken care of. It would probably be better to reverse our
filtering, explicitly listing features supported in eVMCS. The lists are
going to be fairly long but at least we won't have to take care of any
new architectural feature added to KVM.
Sean Christopherson Aug. 22, 2022, 3:55 p.m. UTC | #5
On Mon, Aug 22, 2022, Vitaly Kuznetsov wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> > On Thu, Aug 18, 2022, Vitaly Kuznetsov wrote:
> >> Sean Christopherson <seanjc@google.com> writes:
> >> 
> >> > On Tue, Aug 02, 2022, Vitaly Kuznetsov wrote:
> >> >> + * Note: HV_X64_NESTED_EVMCS1_2022_UPDATE is not currently documented in any
> >> >> + * published TLFS version. When the bit is set, nested hypervisor can use
> >> >> + * 'updated' eVMCSv1 specification (perf_global_ctrl, s_cet, ssp, lbr_ctl,
> >> >> + * encls_exiting_bitmap, tsc_multiplier fields which were missing in 2016
> >> >> + * specification).
> >> >> + */
> >> >> +#define HV_X64_NESTED_EVMCS1_2022_UPDATE		BIT(0)
> >> >
> >> > This bit is now defined[*], but the docs says it's only for perf_global_ctrl.  Are
> >> > we expecting an update to the TLFS?
> >> >
> >> > 	Indicates support for the GuestPerfGlobalCtrl and HostPerfGlobalCtrl fields
> >> > 	in the enlightened VMCS.
> >> >
> >> > [*] https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/feature-discovery#hypervisor-nested-virtualization-features---0x4000000a
> >> >
> >> 
> >> Oh well, better this than nothing. I'll ping the people who told me
> >> about this bit that their description is incomplete.
> >
> > Not that it changes anything, but I'd rather have no documentation.  I'd much rather
> > KVM say "this is the undocumented behavior" than "the document behavior is wrong".
> >
> 
> So I reached out to Microsoft and their answer was that for all these new
> eVMCS fields (including *PerfGlobalCtrl) observing architectural VMX
> MSRs should be enough. *PerfGlobalCtrl case is special because of Win11
> bug (if we expose the feature in VMX feature MSRs but don't set
> CPUID.0x4000000A.EBX BIT(0) it just doesn't boot).

I.e. TSC_SCALING shouldn't be gated on the flag?  If so, then the 2-D array approach
is overkill since (a) the CPUID flag only controls PERF_GLOBAL_CTRL and (b) we aren't
expecting any more flags in the future.

What about this for an implementation?

static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu)
{
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);

	/*
	 * Filtering VMX controls for eVMCS compatibility should only be done
	 * for guest accesses, and all such accesses should be gated on Hyper-V
	 * being enabled and initialized.
	 */
	if (WARN_ON_ONCE(!hv_vcpu))
		return false;

	return hv_vcpu->cpuid_cache.nested_ebx & HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
}

static u32 evmcs_get_unsupported_ctls(struct kvm_vcpu *vcpu, u32 msr_index)
{
	u32 unsupported_ctrls;

	switch (msr_index) {
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		unsupported_ctrls = EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
		if (!evmcs_has_perf_global_ctrl(vcpu))
			unsupported_ctrls |= VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
		return unsupported_ctrls;
	case MSR_IA32_VMX_ENTRY_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		unsupported_ctrls = EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
		if (!evmcs_has_perf_global_ctrl(vcpu))
			unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
		return unsupported_ctrls;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return EVMCS1_UNSUPPORTED_2NDEXEC;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		return EVMCS1_UNSUPPORTED_PINCTRL;
	case MSR_IA32_VMX_VMFUNC:
		return EVMCS1_UNSUPPORTED_VMFUNC;
	default:
		KVM_BUG_ON(1, vcpu->kvm);
		return 0;
	}
}

void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
	u64 unsupported_ctrls = evmcs_get_unsupported_ctls(vcpu, msr_index);

	if (msr_index == MSR_IA32_VMX_VMFUNC)
		*pdata &= ~unsupported_ctrls;
	else
		*pdata &= ~(unsupported_ctrls << 32);
}


> What I'm still concerned about is future proofing KVM for new
> features. When something is getting added to KVM for which no eVMCS
> field is currently defined, both Hyper-V-on-KVM and KVM-on-Hyper-V cases
> should be taken care of. It would probably be better to reverse our
> filtering, explicitly listing features supported in eVMCS. The lists are
> going to be fairly long but at least we won't have to take care of any
> new architectural feature added to KVM.

Having the filtering be opt-in crossed my mind as well.  Reversing the filtering
can be done after this series though, correct?
Sean Christopherson Aug. 22, 2022, 4:13 p.m. UTC | #6
On Mon, Aug 22, 2022, Vitaly Kuznetsov wrote:
> So I reached out to Microsoft and their answer was that for all these new
> eVMCS fields (including *PerfGlobalCtrl) observing architectural VMX
> MSRs should be enough. *PerfGlobalCtrl case is special because of Win11
> bug (if we expose the feature in VMX feature MSRs but don't set
> CPUID.0x4000000A.EBX BIT(0) it just doesn't boot).

Does this mean that KVM-on-HyperV needs to avoid using the PERF_GLOBAL_CTRL fields
when the bit is not set?
Vitaly Kuznetsov Aug. 22, 2022, 4:21 p.m. UTC | #7
Sean Christopherson <seanjc@google.com> writes:

> On Mon, Aug 22, 2022, Vitaly Kuznetsov wrote:
>> Sean Christopherson <seanjc@google.com> writes:
>> 
>> > On Thu, Aug 18, 2022, Vitaly Kuznetsov wrote:
>> >> Sean Christopherson <seanjc@google.com> writes:
>> >> 
>> >> > On Tue, Aug 02, 2022, Vitaly Kuznetsov wrote:
>> >> >> + * Note: HV_X64_NESTED_EVMCS1_2022_UPDATE is not currently documented in any
>> >> >> + * published TLFS version. When the bit is set, nested hypervisor can use
>> >> >> + * 'updated' eVMCSv1 specification (perf_global_ctrl, s_cet, ssp, lbr_ctl,
>> >> >> + * encls_exiting_bitmap, tsc_multiplier fields which were missing in 2016
>> >> >> + * specification).
>> >> >> + */
>> >> >> +#define HV_X64_NESTED_EVMCS1_2022_UPDATE		BIT(0)
>> >> >
>> >> > This bit is now defined[*], but the docs says it's only for perf_global_ctrl.  Are
>> >> > we expecting an update to the TLFS?
>> >> >
>> >> > 	Indicates support for the GuestPerfGlobalCtrl and HostPerfGlobalCtrl fields
>> >> > 	in the enlightened VMCS.
>> >> >
>> >> > [*] https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/feature-discovery#hypervisor-nested-virtualization-features---0x4000000a
>> >> >
>> >> 
>> >> Oh well, better this than nothing. I'll ping the people who told me
>> >> about this bit that their description is incomplete.
>> >
>> > Not that it changes anything, but I'd rather have no documentation.  I'd much rather
>> > KVM say "this is the undocumented behavior" than "the document behavior is wrong".
>> >
>> 
>> So I reached out to Microsoft and their answer was that for all these new
>> eVMCS fields (including *PerfGlobalCtrl) observing architectural VMX
>> MSRs should be enough. *PerfGlobalCtrl case is special because of Win11
>> bug (if we expose the feature in VMX feature MSRs but don't set
>> CPUID.0x4000000A.EBX BIT(0) it just doesn't boot).
>
> I.e. TSC_SCALING shouldn't be gated on the flag?  If so, then the 2-D array approach
> is overkill since (a) the CPUID flag only controls PERF_GLOBAL_CTRL and (b) we aren't
> expecting any more flags in the future.
>

Unfortunately, we have to gate the presence of these new features on
something, otherwise VMM has no way to specify which particular eVMCS
"revision" it wants (TL;DR: we will break migration).

My initial implementation was inventing 'eVMCS revision' concept:
https://lore.kernel.org/kvm/20220629150625.238286-7-vkuznets@redhat.com/

which is needed if we don't gate all these new fields on CPUID.0x4000000A.EBX BIT(0).

Going forward, we will still (likely) need something when new fields show up.

> What about this for an implementation?
>
> static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu)
> {
> 	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
>
> 	/*
> 	 * Filtering VMX controls for eVMCS compatibility should only be done
> 	 * for guest accesses, and all such accesses should be gated on Hyper-V
> 	 * being enabled and initialized.
> 	 */
> 	if (WARN_ON_ONCE(!hv_vcpu))
> 		return false;
>
> 	return hv_vcpu->cpuid_cache.nested_ebx & HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
> }
>
> static u32 evmcs_get_unsupported_ctls(struct kvm_vcpu *vcpu, u32 msr_index)
> {
> 	u32 unsupported_ctrls;
>
> 	switch (msr_index) {
> 	case MSR_IA32_VMX_EXIT_CTLS:
> 	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
> 		unsupported_ctrls = EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
> 		if (!evmcs_has_perf_global_ctrl(vcpu))
> 			unsupported_ctrls |= VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
> 		return unsupported_ctrls;
> 	case MSR_IA32_VMX_ENTRY_CTLS:
> 	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
> 		unsupported_ctrls = EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
> 		if (!evmcs_has_perf_global_ctrl(vcpu))
> 			unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
> 		return unsupported_ctrls;
> 	case MSR_IA32_VMX_PROCBASED_CTLS2:
> 		return EVMCS1_UNSUPPORTED_2NDEXEC;
> 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
> 	case MSR_IA32_VMX_PINBASED_CTLS:
> 		return EVMCS1_UNSUPPORTED_PINCTRL;
> 	case MSR_IA32_VMX_VMFUNC:
> 		return EVMCS1_UNSUPPORTED_VMFUNC;
> 	default:
> 		KVM_BUG_ON(1, vcpu->kvm);
> 		return 0;
> 	}
> }
>
> void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
> {
> 	u64 unsupported_ctrls = evmcs_get_unsupported_ctls(vcpu, msr_index);
>
> 	if (msr_index == MSR_IA32_VMX_VMFUNC)
> 		*pdata &= ~unsupported_ctrls;
> 	else
> 		*pdata &= ~(unsupported_ctrls << 32);
> }
>

It's smaller and I like it but it would only work in conjunction with
KVM_CAP_HYPERV_ENLIGHTENED_VMCS2...

>
>> What I'm still concerned about is future proofing KVM for new
>> features. When something is getting added to KVM for which no eVMCS
>> field is currently defined, both Hyper-V-on-KVM and KVM-on-Hyper-V cases
>> should be taken care of. It would probably be better to reverse our
>> filtering, explicitly listing features supported in eVMCS. The lists are
>> going to be fairly long but at least we won't have to take care of any
>> new architectural feature added to KVM.
>
> Having the filtering be opt-in crossed my mind as well.  Reversing the filtering
> can be done after this series though, correct?
>

Yes, that's my plan, Get this in to fix the immediate issue with 2022
features and probably reverse the filtering before Microsoft releases
something else :-)
Vitaly Kuznetsov Aug. 22, 2022, 4:24 p.m. UTC | #8
Sean Christopherson <seanjc@google.com> writes:

> On Mon, Aug 22, 2022, Vitaly Kuznetsov wrote:
>> So I reached out to Microsoft and their answer was that for all these new
>> eVMCS fields (including *PerfGlobalCtrl) observing architectural VMX
>> MSRs should be enough. *PerfGlobalCtrl case is special because of Win11
>> bug (if we expose the feature in VMX feature MSRs but don't set
>> CPUID.0x4000000A.EBX BIT(0) it just doesn't boot).
>
> Does this mean that KVM-on-HyperV needs to avoid using the PERF_GLOBAL_CTRL fields
> when the bit is not set?

It doesn't have to, based on the reply I got from Microsoft, if 
PERF_GLOBAL_CTRL is exposed in architectural VMX feature MSRs than eVMCS
fields are guaranteed to be present. The PV bit is 'extra'.
Sean Christopherson Aug. 22, 2022, 5:01 p.m. UTC | #9
On Mon, Aug 22, 2022, Vitaly Kuznetsov wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> > On Mon, Aug 22, 2022, Vitaly Kuznetsov wrote:
> >> So I reached out to Microsoft and their answer was that for all these new
> >> eVMCS fields (including *PerfGlobalCtrl) observing architectural VMX
> >> MSRs should be enough. *PerfGlobalCtrl case is special because of Win11
> >> bug (if we expose the feature in VMX feature MSRs but don't set
> >> CPUID.0x4000000A.EBX BIT(0) it just doesn't boot).
> >
> > I.e. TSC_SCALING shouldn't be gated on the flag?  If so, then the 2-D array approach
> > is overkill since (a) the CPUID flag only controls PERF_GLOBAL_CTRL and (b) we aren't
> > expecting any more flags in the future.
> >
> 
> Unfortunately, we have to gate the presence of these new features on
> something, otherwise VMM has no way to specify which particular eVMCS
> "revision" it wants (TL;DR: we will break migration).
> 
> My initial implementation was inventing 'eVMCS revision' concept:
> https://lore.kernel.org/kvm/20220629150625.238286-7-vkuznets@redhat.com/
> 
> which is needed if we don't gate all these new fields on CPUID.0x4000000A.EBX BIT(0).
> 
> Going forward, we will still (likely) need something when new fields show up.

My comments from that thread still apply.  Adding "revisions" or feature flags
isn't maintanable, e.g. at best KVM will end up with a ridiculous number of flags.

Looking at QEMU, which I strongly suspect is the only VMM that enables
KVM_CAP_HYPERV_ENLIGHTENED_VMCS, it does the sane thing of enabling the capability
before grabbing the VMX MSRs.

So, why not simply apply filtering for host accesses as well?  E.g.

		/*
		 * New Enlightened VMCS fields always lag behind their hardware
		 * counterparts, filter out fields that are not yet defined.
		 */
		if (vmx->nested.enlightened_vmcs_enabled)
			nested_evmcs_filter_control_msr(vcpu, msr_info);

and then the eVMCS can end up being:

static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu)
{
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);

	/*
	 * PERF_GLOBAL_CTRL is filtered only for guest accesses, and all guest
	 * accesses should be gated on Hyper-V being enabled and initialized.
	 */
	if (WARN_ON_ONCE(!hv_vcpu))
		return false;

	return hv_vcpu->cpuid_cache.nested_ebx & HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
}

static u32 evmcs_get_unsupported_ctls(struct kvm_vcpu *vcpu, u32 msr_index,
				      bool host_initiated)
{
	u32 unsupported_ctrls;

	switch (msr_index) {
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		unsupported_ctrls = EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
		if (!host_initiated && !evmcs_has_perf_global_ctrl(vcpu))
			unsupported_ctrls |= VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
		return unsupported_ctrls;
	case MSR_IA32_VMX_ENTRY_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		unsupported_ctrls = EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
		if (!host_initiated && !evmcs_has_perf_global_ctrl(vcpu))
			unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
		return unsupported_ctrls;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return EVMCS1_UNSUPPORTED_2NDEXEC;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		return EVMCS1_UNSUPPORTED_PINCTRL;
	case MSR_IA32_VMX_VMFUNC:
		return EVMCS1_UNSUPPORTED_VMFUNC;
	default:
		KVM_BUG_ON(1, vcpu->kvm);
		return 0;
	}
}

void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu,
				     struct msr_data *msr_info)
{
	u64 unsupported_ctrls;
	
	if (!msr_info->host_initiated && !vcpu->arch.hyperv_enabled)
		return;

	unsupported_ctrls = evmcs_get_unsupported_ctls(vcpu, msr_info->index,
						       msr_info->host_initiated);
	if (msr_info->index == MSR_IA32_VMX_VMFUNC)
		msr_info->data &= ~unsupported_ctrls;
	else
		msr_info->data &= ~(unsupported_ctrls << 32);
}

static bool nested_evmcs_is_valid_controls(struct kvm_vcpu *vcpu,
					   u32 msr_index, u32 val)
{
	return val & evmcs_get_unsupported_ctls(vcpu, msr_index, false);
}
Vitaly Kuznetsov Aug. 22, 2022, 5:46 p.m. UTC | #10
Sean Christopherson <seanjc@google.com> writes:

> On Mon, Aug 22, 2022, Vitaly Kuznetsov wrote:
>> Sean Christopherson <seanjc@google.com> writes:
>> 
>> > On Mon, Aug 22, 2022, Vitaly Kuznetsov wrote:
>> >> So I reached out to Microsoft and their answer was that for all these new
>> >> eVMCS fields (including *PerfGlobalCtrl) observing architectural VMX
>> >> MSRs should be enough. *PerfGlobalCtrl case is special because of Win11
>> >> bug (if we expose the feature in VMX feature MSRs but don't set
>> >> CPUID.0x4000000A.EBX BIT(0) it just doesn't boot).
>> >
>> > I.e. TSC_SCALING shouldn't be gated on the flag?  If so, then the 2-D array approach
>> > is overkill since (a) the CPUID flag only controls PERF_GLOBAL_CTRL and (b) we aren't
>> > expecting any more flags in the future.
>> >
>> 
>> Unfortunately, we have to gate the presence of these new features on
>> something, otherwise VMM has no way to specify which particular eVMCS
>> "revision" it wants (TL;DR: we will break migration).
>> 
>> My initial implementation was inventing 'eVMCS revision' concept:
>> https://lore.kernel.org/kvm/20220629150625.238286-7-vkuznets@redhat.com/
>> 
>> which is needed if we don't gate all these new fields on CPUID.0x4000000A.EBX BIT(0).
>> 
>> Going forward, we will still (likely) need something when new fields show up.
>
> My comments from that thread still apply.  Adding "revisions" or feature flags
> isn't maintanable, e.g. at best KVM will end up with a ridiculous number of flags.
>
> Looking at QEMU, which I strongly suspect is the only VMM that enables
> KVM_CAP_HYPERV_ENLIGHTENED_VMCS, it does the sane thing of enabling the capability
> before grabbing the VMX MSRs.
>
> So, why not simply apply filtering for host accesses as well?

(I understand that using QEMU to justify KVM's behavior is flawed but...)

QEMU's migration depends on the assumption that identical QEMU's command
lines create identical (from guest PoV) configurations. Assume we have
(simplified)

"-cpu CascadeLake-Sever,hv-evmcs"

on both source and destination but source host is newer, i.e. its KVM
knows about TSC Scaling in eVMCS and destination host has no idea about
it. If we just apply filtering upon vCPU creation, guest visible MSR
values are going to be different, right? Ok, assuming QEMU also migrates
VMX feature MSRs (TODO: check if that's true), we will be able to fail
mirgration late (which is already much worse than not being able to
create the desired configuration on destination, 'fail early') if we use
in-KVM filtering to throw an error to userspace. But if we blindly
filter control MSRs on the destination, 'TscScaling' will just disapper
undreneath the guest. This is unlikely to work.

In any case, what we need, is an option for VMM (read: QEMU) to create
the configuration with 'TscScaling' filtered out even KVM supports the
bit in eVMCS. This way the guest will be able to migrate backwards to an
older KVM which doesn't support it, i.e.

'-cpu CascadeLake-Sever,hv-evmcs'
 creates the 'origin' eVMCS configuration, no TscScaling

'-cpu CascadeLake-Sever,hv-evmcs,hv-evmcs-2022' creates the updated one.

KVM_CAP_HYPERV_ENLIGHTENED_VMCS is bad as it only takes 'eVMCS' version
as a parameter (as we assumed it will always change when new fields are
added, but that turned out to be false). That's why I suggested
KVM_CAP_HYPERV_ENLIGHTENED_VMCS2.

For the issue at hand, 'hv-evmcs-2022' can just set CPUID.0x4000000A.EBX
BIT(0) and then we gate all new fields' existence on it. It doesn't
matter much if we filter host accesses or not in this scheme.

Going all the way back, I'd certainly made the filtering apply to host
writes throwing an error when eVMCS is enabled (and I'd made it per-VM
and mandate that it is enabled prior to getting MSRs) but that doesn't
seem to help us much now.
Sean Christopherson Aug. 22, 2022, 6:32 p.m. UTC | #11
On Mon, Aug 22, 2022, Vitaly Kuznetsov wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> > On Mon, Aug 22, 2022, Vitaly Kuznetsov wrote:
> >> My initial implementation was inventing 'eVMCS revision' concept:
> >> https://lore.kernel.org/kvm/20220629150625.238286-7-vkuznets@redhat.com/
> >> 
> >> which is needed if we don't gate all these new fields on CPUID.0x4000000A.EBX BIT(0).
> >> 
> >> Going forward, we will still (likely) need something when new fields show up.
> >
> > My comments from that thread still apply.  Adding "revisions" or feature flags
> > isn't maintanable, e.g. at best KVM will end up with a ridiculous number of flags.
> >
> > Looking at QEMU, which I strongly suspect is the only VMM that enables
> > KVM_CAP_HYPERV_ENLIGHTENED_VMCS, it does the sane thing of enabling the capability
> > before grabbing the VMX MSRs.
> >
> > So, why not simply apply filtering for host accesses as well?
> 
> (I understand that using QEMU to justify KVM's behavior is flawed but...)
> 
> QEMU's migration depends on the assumption that identical QEMU's command
> lines create identical (from guest PoV) configurations. Assume we have
> (simplified)
> 
> "-cpu CascadeLake-Sever,hv-evmcs"
> 
> on both source and destination but source host is newer, i.e. its KVM
> knows about TSC Scaling in eVMCS and destination host has no idea about
> it. If we just apply filtering upon vCPU creation, guest visible MSR
> values are going to be different, right? Ok, assuming QEMU also migrates
> VMX feature MSRs (TODO: check if that's true), we will be able to fail
> mirgration late (which is already much worse than not being able to
> create the desired configuration on destination, 'fail early') if we use
> in-KVM filtering to throw an error to userspace. But if we blindly
> filter control MSRs on the destination, 'TscScaling' will just disapper
> undreneath the guest. This is unlikely to work.

But all of that holds true irrespetive of eVMCS.  If QEMU attempts to migrate a
nested guest from a KVM that supports TSC_SCALING to a KVM that doesn't support
TSC_SCALING, then TSC_SCALING is going to disappear and VM-Entry on the dest will
fail.  I.e. QEMU _must_ be able to detect the incompatibility and not attempt
the migration.  And with that code in place, QEMU doesn't need to do anything new
for eVMCS, it Just Works.

> In any case, what we need, is an option for VMM (read: QEMU) to create
> the configuration with 'TscScaling' filtered out even KVM supports the
> bit in eVMCS. This way the guest will be able to migrate backwards to an
> older KVM which doesn't support it, i.e.
> 
> '-cpu CascadeLake-Sever,hv-evmcs'
>  creates the 'origin' eVMCS configuration, no TscScaling
> 
> '-cpu CascadeLake-Sever,hv-evmcs,hv-evmcs-2022' creates the updated one.

Again, this conundrum exists irrespective of eVMCS.  Properly solve the problem
for regular nVMX and eVMCS should naturally work.

> KVM_CAP_HYPERV_ENLIGHTENED_VMCS is bad as it only takes 'eVMCS' version
> as a parameter (as we assumed it will always change when new fields are
> added, but that turned out to be false). That's why I suggested
> KVM_CAP_HYPERV_ENLIGHTENED_VMCS2.

Enumerating features via versions is such a bad API though, e.g. if there's a
bug with nested TSC_SCALING, userspace can't disable just nested TSC_SCALING
without everything else under the inscrutable "hv-evmcs-2022" being collateral
damage.
Vitaly Kuznetsov Aug. 23, 2022, 7:33 a.m. UTC | #12
Sean Christopherson <seanjc@google.com> writes:

> On Mon, Aug 22, 2022, Vitaly Kuznetsov wrote:
>> Sean Christopherson <seanjc@google.com> writes:
>> 
>> > On Mon, Aug 22, 2022, Vitaly Kuznetsov wrote:
>> >> My initial implementation was inventing 'eVMCS revision' concept:
>> >> https://lore.kernel.org/kvm/20220629150625.238286-7-vkuznets@redhat.com/
>> >> 
>> >> which is needed if we don't gate all these new fields on CPUID.0x4000000A.EBX BIT(0).
>> >> 
>> >> Going forward, we will still (likely) need something when new fields show up.
>> >
>> > My comments from that thread still apply.  Adding "revisions" or feature flags
>> > isn't maintanable, e.g. at best KVM will end up with a ridiculous number of flags.
>> >
>> > Looking at QEMU, which I strongly suspect is the only VMM that enables
>> > KVM_CAP_HYPERV_ENLIGHTENED_VMCS, it does the sane thing of enabling the capability
>> > before grabbing the VMX MSRs.
>> >
>> > So, why not simply apply filtering for host accesses as well?
>> 
>> (I understand that using QEMU to justify KVM's behavior is flawed but...)
>> 
>> QEMU's migration depends on the assumption that identical QEMU's command
>> lines create identical (from guest PoV) configurations. Assume we have
>> (simplified)
>> 
>> "-cpu CascadeLake-Sever,hv-evmcs"
>> 
>> on both source and destination but source host is newer, i.e. its KVM
>> knows about TSC Scaling in eVMCS and destination host has no idea about
>> it. If we just apply filtering upon vCPU creation, guest visible MSR
>> values are going to be different, right? Ok, assuming QEMU also migrates
>> VMX feature MSRs (TODO: check if that's true), we will be able to fail
>> mirgration late (which is already much worse than not being able to
>> create the desired configuration on destination, 'fail early') if we use
>> in-KVM filtering to throw an error to userspace. But if we blindly
>> filter control MSRs on the destination, 'TscScaling' will just disapper
>> undreneath the guest. This is unlikely to work.
>
> But all of that holds true irrespetive of eVMCS.  If QEMU attempts to migrate a
> nested guest from a KVM that supports TSC_SCALING to a KVM that doesn't support
> TSC_SCALING, then TSC_SCALING is going to disappear and VM-Entry on the dest will
> fail.  I.e. QEMU _must_ be able to detect the incompatibility and not attempt
> the migration.  And with that code in place, QEMU doesn't need to do anything new
> for eVMCS, it Just Works.

I'm obviously missing something. "-cpu CascadeLake-Sever" presumes
cetain features, including VMX features (e.g. TSC_SCALING), an attempt
to create such vCPU on a CPU which doesn't support it will lead to
immediate failure. So two VMs created on different hosts with

-cpu CascadeLake-Sever

are guaranteed to look exactly the same from guest PoV. This is not true
for '-cpu CascadeLake-Sever,hv-evmcs' (if we do it the way you suggest)
as 'hv-evmcs' will be a *different* filter on each host (which is going
to depend on KVM version, not even on the host's hardware).

>
>> In any case, what we need, is an option for VMM (read: QEMU) to create
>> the configuration with 'TscScaling' filtered out even KVM supports the
>> bit in eVMCS. This way the guest will be able to migrate backwards to an
>> older KVM which doesn't support it, i.e.
>> 
>> '-cpu CascadeLake-Sever,hv-evmcs'
>>  creates the 'origin' eVMCS configuration, no TscScaling
>> 
>> '-cpu CascadeLake-Sever,hv-evmcs,hv-evmcs-2022' creates the updated one.
>
> Again, this conundrum exists irrespective of eVMCS.  Properly solve the problem
> for regular nVMX and eVMCS should naturally work.

I don't think we have this problem for VMX features as named CPU models
in QEMU encode all of them explicitly, they *must* be present whenever
such vCPU is created.

>
>> KVM_CAP_HYPERV_ENLIGHTENED_VMCS is bad as it only takes 'eVMCS' version
>> as a parameter (as we assumed it will always change when new fields are
>> added, but that turned out to be false). That's why I suggested
>> KVM_CAP_HYPERV_ENLIGHTENED_VMCS2.
>
> Enumerating features via versions is such a bad API though, e.g. if there's a
> bug with nested TSC_SCALING, userspace can't disable just nested TSC_SCALING
> without everything else under the inscrutable "hv-evmcs-2022" being collateral
> damage.

Why? Something like 

"-cpu CascadeLake-Sever,hv-evmcs,hv-evmcs-2022,-vmx-tsc-scaling" 

should work well, no? 'hv-evmcs*' are just filters, if the VMX feature
is not there -- it's not there.

We can certainly make this fine-grained and introduce
KVM_CAP_HYPERV_EVMCS_TSC_SCALING instead and make a 'hv-*' flag for it,
however, with Hyper-V and Windows I really don't like 'frankenstein'
Hyper-V configurations which look nothing like genuine Hyper-V as nobody
at Microsoft tests new Windows builds with such configurations. And yes,
bugs were discovered in the past (e.g. PerfGlobalCtrl and Win11).
Sean Christopherson Aug. 23, 2022, 3 p.m. UTC | #13
On Tue, Aug 23, 2022, Vitaly Kuznetsov wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> > On Mon, Aug 22, 2022, Vitaly Kuznetsov wrote:
> >> QEMU's migration depends on the assumption that identical QEMU's command
> >> lines create identical (from guest PoV) configurations. Assume we have
> >> (simplified)
> >> 
> >> "-cpu CascadeLake-Sever,hv-evmcs"
> >> 
> >> on both source and destination but source host is newer, i.e. its KVM
> >> knows about TSC Scaling in eVMCS and destination host has no idea about
> >> it. If we just apply filtering upon vCPU creation, guest visible MSR
> >> values are going to be different, right? Ok, assuming QEMU also migrates
> >> VMX feature MSRs (TODO: check if that's true), we will be able to fail
> >> mirgration late (which is already much worse than not being able to
> >> create the desired configuration on destination, 'fail early') if we use
> >> in-KVM filtering to throw an error to userspace. But if we blindly
> >> filter control MSRs on the destination, 'TscScaling' will just disapper
> >> undreneath the guest. This is unlikely to work.
> >
> > But all of that holds true irrespetive of eVMCS.  If QEMU attempts to migrate a
> > nested guest from a KVM that supports TSC_SCALING to a KVM that doesn't support
> > TSC_SCALING, then TSC_SCALING is going to disappear and VM-Entry on the dest will
> > fail.  I.e. QEMU _must_ be able to detect the incompatibility and not attempt
> > the migration.  And with that code in place, QEMU doesn't need to do anything new
> > for eVMCS, it Just Works.
> 
> I'm obviously missing something. "-cpu CascadeLake-Sever" presumes
> cetain features, including VMX features (e.g. TSC_SCALING), an attempt
> to create such vCPU on a CPU which doesn't support it will lead to
> immediate failure. So two VMs created on different hosts with
> 
> -cpu CascadeLake-Sever
> 
> are guaranteed to look exactly the same from guest PoV. This is not true
> for '-cpu CascadeLake-Sever,hv-evmcs' (if we do it the way you suggest)
> as 'hv-evmcs' will be a *different* filter on each host (which is going
> to depend on KVM version, not even on the host's hardware).

We're talking about nested VMX, i.e. exposing TSC_SCALING to L1.  QEMU's CLX
definition doesn't include TSC_SCALING.  In fact, none of QEMU's predefined CPU
models supports TSC_SCALING, precisely because KVM didn't support exposing the
feature to L1 until relatively recently.

$ git grep VMX_SECONDARY_EXEC_TSC_SCALING
target/i386/cpu.h:#define VMX_SECONDARY_EXEC_TSC_SCALING              0x02000000
target/i386/kvm/kvm.c:    if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {

> >> In any case, what we need, is an option for VMM (read: QEMU) to create
> >> the configuration with 'TscScaling' filtered out even KVM supports the
> >> bit in eVMCS. This way the guest will be able to migrate backwards to an
> >> older KVM which doesn't support it, i.e.
> >> 
> >> '-cpu CascadeLake-Sever,hv-evmcs'
> >>  creates the 'origin' eVMCS configuration, no TscScaling
> >> 
> >> '-cpu CascadeLake-Sever,hv-evmcs,hv-evmcs-2022' creates the updated one.

Ah, I see what you're worried about.  Your concern is that QEMU will add a VMX
feature to a predefined CPU model, but only later gain eVMCS support, and so
"CascadeLake-Server,hv-evmcs" will do different things depending on the KVM
version.

But again, that's already reality.  Run "-cpu CascadeLake-Server" against a KVM
from before commits:

  28c1c9fabf48 ("KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES")
  1eaafe91a0df ("kvm: x86: IA32_ARCH_CAPABILITIES is always supported")

and it will fail.  There are undoubtedly many other features that are similarly
affected, just go back far enough in KVM time.

Or simply run "-cpu CascadeLake-Server" on pre-CLX hardware.  Anything that KVM
doesn't fully emulate will not be present.

> > Again, this conundrum exists irrespective of eVMCS.  Properly solve the problem
> > for regular nVMX and eVMCS should naturally work.
> 
> I don't think we have this problem for VMX features as named CPU models
> in QEMU encode all of them explicitly, they *must* be present whenever
> such vCPU is created.

Yes, and if KVM doesn't support features that CascadeLake-Server requires, spawning
the VM will fail on the destination, as it should.  My point is that this behavior
is not unique to eVMCS.

QEMU/Libvirt must also be prepared for rejection, because it is flat out impossible
to ensure that KVM+hardware supports a specific feature.

> >> KVM_CAP_HYPERV_ENLIGHTENED_VMCS is bad as it only takes 'eVMCS' version
> >> as a parameter (as we assumed it will always change when new fields are
> >> added, but that turned out to be false). That's why I suggested
> >> KVM_CAP_HYPERV_ENLIGHTENED_VMCS2.
> >
> > Enumerating features via versions is such a bad API though, e.g. if there's a
> > bug with nested TSC_SCALING, userspace can't disable just nested TSC_SCALING
> > without everything else under the inscrutable "hv-evmcs-2022" being collateral
> > damage.
> 
> Why? Something like 
> 
> "-cpu CascadeLake-Sever,hv-evmcs,hv-evmcs-2022,-vmx-tsc-scaling" 
> 
> should work well, no? 'hv-evmcs*' are just filters, if the VMX feature
> is not there -- it's not there.

Because it's completely unnecessary, adds non-trivial maintenance burden to KVM,
and requires explicit documentation to explain to userspace what "hv-evmcs-2022"
means.

It's unnecessary because if the user is concerned about eVMCS features showing up
in the future, then they should do:

  -cpu CascadeLake-Server,hv-evmcs,-vmx-tsc-scaling,-<any other VMX features not eVMCS-friendly>

If QEMU wants to make that more user friendly, then define CascadeLake-Server-eVMCS
or whatever so that the features that are unlikely be supported for eVMCS are off by
default.  This is no different than QEMU not including nested TSC_SCALING in any of
the predefined models; the developers _know_ KVM doesn't widely support TSC_SCALING,
so it was omitted, even though a real CLX CPU is guaranteed to support TSC_SCALING.

It's non-trivial maintenance for KVM because it would require defining new versions
every time an eVMCS field is added, allowing userspace to specify and restrict
features based on arbitrary versions, and do all of that without conflicting with
whatever PV enumeration Microsoft adds.
Sean Christopherson Aug. 23, 2022, 3:31 p.m. UTC | #14
On Tue, Aug 23, 2022, Sean Christopherson wrote:
> On Tue, Aug 23, 2022, Vitaly Kuznetsov wrote:
> > >> In any case, what we need, is an option for VMM (read: QEMU) to create
> > >> the configuration with 'TscScaling' filtered out even KVM supports the
> > >> bit in eVMCS. This way the guest will be able to migrate backwards to an
> > >> older KVM which doesn't support it, i.e.
> > >> 
> > >> '-cpu CascadeLake-Sever,hv-evmcs'
> > >>  creates the 'origin' eVMCS configuration, no TscScaling
> > >> 
> > >> '-cpu CascadeLake-Sever,hv-evmcs,hv-evmcs-2022' creates the updated one.
> 
> Ah, I see what you're worried about.  Your concern is that QEMU will add a VMX
> feature to a predefined CPU model, but only later gain eVMCS support, and so
> "CascadeLake-Server,hv-evmcs" will do different things depending on the KVM
> version.
> 
> But again, that's already reality.  Run "-cpu CascadeLake-Server" against a KVM
> from before commits:
> 
>   28c1c9fabf48 ("KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES")
>   1eaafe91a0df ("kvm: x86: IA32_ARCH_CAPABILITIES is always supported")
> 
> and it will fail.  There are undoubtedly many other features that are similarly
> affected, just go back far enough in KVM time.

The one potential issue I see is that KVM currently silently hides TSC_SCALING
and PERF_GLOBAL_CTRL, i.e. migrating from new KVM to old KVM may "succeed" and
then later fail a nested VM-Entry.

PERF_GLOBAL_CTRL is solved because Microsoft has conveniently provided a CPUID
bit.

TSC_SCALING is unlikely to be a problem since it's so new, but if we're worried
about someone doing e.g. "-cpu CascadeLake-Server,hv-evmcs,+vmx-tsc-scaling", then
we can add a KVM quirk to silently hide TSC_SCALING from the guest when eVMCS is
enabled.
Vitaly Kuznetsov Aug. 23, 2022, 4:54 p.m. UTC | #15
Sean Christopherson <seanjc@google.com> writes:

> We're talking about nested VMX, i.e. exposing TSC_SCALING to L1.  QEMU's CLX
> definition doesn't include TSC_SCALING.  In fact, none of QEMU's predefined CPU
> models supports TSC_SCALING, precisely because KVM didn't support exposing the
> feature to L1 until relatively recently.
>
> $ git grep VMX_SECONDARY_EXEC_TSC_SCALING
> target/i386/cpu.h:#define VMX_SECONDARY_EXEC_TSC_SCALING              0x02000000
> target/i386/kvm/kvm.c:    if (f[FEAT_VMX_SECONDARY_CTLS] &  VMX_SECONDARY_EXEC_TSC_SCALING) {

(sorry for my persistence but I still believe there are issues which we
won't be able to solve if we take the suggested approach).

You got me. Indeed, "vmx-tsc-scaling" feature is indeed not set for
named CPU models so my example was flawed. Let's swap it with
VMX_VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL /
VMX_VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL which a bunch of named models
have. So I do the same,

'-cpu CascadeLake-Sever,hv-evmcs'

on both the source host which knows about these eVMCS fields and the
destination host which doesn't.

First problem: CPUID. On the source host, we will have
CPUID.0x4000000A.EBX BIT(0) = 1, and "=0" on the destination. I don't
think we migrate CPUID data (can be wrong, though).

Second, assuming VMX feature MSRs are actually migrated, we must fail on
the destnation because VMX_VM_{ENTRY,EXIT}_LOAD_IA32_PERF_GLOBAL_CTRL is
trying to get set. We can do this in KVM but note: currently, KVM
filters guest reads but not host's so when you're trying to migrate from
a non-fixed KVM, VMX_VM_{ENTRY,EXIT}_LOAD_IA32_PERF_GLOBAL_CTRL are
actually present! So how do we distinguinsh in KVM between these two
cases, i.e. how do we know if
VMX_VM_{ENTRY,EXIT}_LOAD_IA32_PERF_GLOBAL_CTRL were filtered out on the
source (old kvm) or not (new KVM)?

...
>
> Because it's completely unnecessary, adds non-trivial maintenance burden to KVM,
> and requires explicit documentation to explain to userspace what "hv-evmcs-2022"
> means.
>
> It's unnecessary because if the user is concerned about eVMCS features showing up
> in the future, then they should do:
>
>   -cpu CascadeLake-Server,hv-evmcs,-vmx-tsc-scaling,-<any other VMX features not eVMCS-friendly>
>
> If QEMU wants to make that more user friendly, then define CascadeLake-Server-eVMCS
> or whatever so that the features that are unlikely be supported for eVMCS are off by
> default.

I completely agree that what I'm trying to achieve here could've been
done in QEMU from day 1 but we now have what we have: KVM silently
filtering out certain VMX features and zero indication to userspace
VMM whether filtering is being done or not (besides this
CPUID.0x4000000A.EBX BIT(0) bit but I'm not even sure we analyze
source's CPUID data upon migration).

>  This is no different than QEMU not including nested TSC_SCALING in any of
> the predefined models; the developers _know_ KVM doesn't widely support TSC_SCALING,
> so it was omitted, even though a real CLX CPU is guaranteed to support TSC_SCALING.
>

Out of curiosity, what happens if someone sends the following patch to
QEMU:

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 1db1278a599b..2278f4522b44 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -3191,6 +3191,12 @@ static const X86CPUDefinition builtin_x86_defs[] = {
                   { "vmx-xsaves", "on" },
                   { /* end of list */ }
               },
+            { .version = 6,
+              .note = "ARCH_CAPABILITIES, EPT switching, XSAVES, no TSX, TSC_SCALING",
+              .props = (PropValue[]) {
+                  { "vmx-tsc-scaling", "on" },
+                  { /* end of list */ }
+              },
             },
             { /* end of list */ }
         }

Will Paolo remember about eVMCS and reject it?

> It's non-trivial maintenance for KVM because it would require defining new versions
> every time an eVMCS field is added, allowing userspace to specify and restrict
> features based on arbitrary versions, and do all of that without conflicting with
> whatever PV enumeration Microsoft adds.

The update at hand comes with a feature bit so no mater what we do, we
will need a new QEMU flag to support this feature bit. My suggestion was
just that we stretch its definition a bit and encode not only
PERF_GLOBAL_CTRL but all fields which were added. At the same time we
can switch to filtering host reads and failing host writes for what's
missing (and to do so we'll likely need to invert the logic and
explicitly list what eVMCS supports) so we're better prepared to the
next update.
Sean Christopherson Aug. 23, 2022, 8:16 p.m. UTC | #16
On Tue, Aug 23, 2022, Vitaly Kuznetsov wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> > We're talking about nested VMX, i.e. exposing TSC_SCALING to L1.  QEMU's CLX
> > definition doesn't include TSC_SCALING.  In fact, none of QEMU's predefined CPU
> > models supports TSC_SCALING, precisely because KVM didn't support exposing the
> > feature to L1 until relatively recently.
> >
> > $ git grep VMX_SECONDARY_EXEC_TSC_SCALING
> > target/i386/cpu.h:#define VMX_SECONDARY_EXEC_TSC_SCALING              0x02000000
> > target/i386/kvm/kvm.c:    if (f[FEAT_VMX_SECONDARY_CTLS] &  VMX_SECONDARY_EXEC_TSC_SCALING) {
> 
> (sorry for my persistence but I still believe there are issues which we
> won't be able to solve if we take the suggested approach).
> 
> You got me. Indeed, "vmx-tsc-scaling" feature is indeed not set for
> named CPU models so my example was flawed. Let's swap it with
> VMX_VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL /
> VMX_VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL which a bunch of named models
> have. So I do the same,
> 
> '-cpu CascadeLake-Sever,hv-evmcs'
> 
> on both the source host which knows about these eVMCS fields and the
> destination host which doesn't.
 
> First problem: CPUID. On the source host, we will have
> CPUID.0x4000000A.EBX BIT(0) = 1, and "=0" on the destination. I don't
> think we migrate CPUID data (can be wrong, though).

Huh?  Why would the source have CPUID.0x4000000A.EBX.BIT(0) = 1?  If QEMU is
automatically parroting all KVM-supported Hyper-V features back into KVM via
KVM_SET_CPUID2 _and_ expects the resulting VM to be migratable, then that's a
QEMU bug.

The CPUID bits that matter _have_ to be "migrated", in the sense that the source
and destination absolutely must have compatible CPUID definitions.  The Linux kernel
does not support refreshing CPUID, where as userspace might depending on when the
userspace application starts up[*].  Dropping or adding CPUID bits across migration
is all but guaranteed to cause breakage, e.g. drop the PCID bit and KVM will start
injection #GPs on the destination.

[*] https://lore.kernel.org/lkml/Yvn5BNXfOm3uA7WA@worktop.programming.kicks-ass.net

> Second, assuming VMX feature MSRs are actually migrated, we must fail on

VMX feature MSRs are basically CPL-only CPUID leafs, i.e. they too must be "migrated",
where migrated can be an actual save/restore or QEMU ensuring that the destination
ends up with the same configuration as the source.

> the destnation because VMX_VM_{ENTRY,EXIT}_LOAD_IA32_PERF_GLOBAL_CTRL is
> trying to get set. We can do this in KVM but note: currently, KVM
> filters guest reads but not host's so when you're trying to migrate from
> a non-fixed KVM, VMX_VM_{ENTRY,EXIT}_LOAD_IA32_PERF_GLOBAL_CTRL are
> actually present! So how do we distinguinsh in KVM between these two
> cases, i.e. how do we know if
> VMX_VM_{ENTRY,EXIT}_LOAD_IA32_PERF_GLOBAL_CTRL were filtered out on the
> source (old kvm) or not (new KVM)?

PERF_GLOBAL_CTRL is "solved" because Microsoft provided a CPUID bit.  First, fix
KVM to filter KVM_GET_MSRS when eVMCS is enabled.  Then, expose PERF_GLOBAL_CTRL
to the guest if and only if the new CPUID bit is set.

That guarantees that userspace has to explicitly enable exposure of the fields.
And again, if QEMU is blindly reflecting Hyper-V CPUID leafs, that's a QEMU bug.

But peeking ahead, I think we're in violent agreement on these points.

> > Because it's completely unnecessary, adds non-trivial maintenance burden to KVM,
> > and requires explicit documentation to explain to userspace what "hv-evmcs-2022"
> > means.
> >
> > It's unnecessary because if the user is concerned about eVMCS features showing up
> > in the future, then they should do:
> >
> >   -cpu CascadeLake-Server,hv-evmcs,-vmx-tsc-scaling,-<any other VMX features not eVMCS-friendly>
> >
> > If QEMU wants to make that more user friendly, then define CascadeLake-Server-eVMCS
> > or whatever so that the features that are unlikely be supported for eVMCS are off by
> > default.
> 
> I completely agree that what I'm trying to achieve here could've been
> done in QEMU from day 1 but we now have what we have: KVM silently
> filtering out certain VMX features and zero indication to userspace
> VMM whether filtering is being done or not (besides this
> CPUID.0x4000000A.EBX BIT(0) bit but I'm not even sure we analyze
> source's CPUID data upon migration).
>
> >  This is no different than QEMU not including nested TSC_SCALING in any of
> > the predefined models; the developers _know_ KVM doesn't widely support TSC_SCALING,
> > so it was omitted, even though a real CLX CPU is guaranteed to support TSC_SCALING.
> >
> 
> Out of curiosity, what happens if someone sends the following patch to
> QEMU:
> 
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index 1db1278a599b..2278f4522b44 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -3191,6 +3191,12 @@ static const X86CPUDefinition builtin_x86_defs[] = {
>                    { "vmx-xsaves", "on" },
>                    { /* end of list */ }
>                },
> +            { .version = 6,
> +              .note = "ARCH_CAPABILITIES, EPT switching, XSAVES, no TSX, TSC_SCALING",
> +              .props = (PropValue[]) {
> +                  { "vmx-tsc-scaling", "on" },
> +                  { /* end of list */ }
> +              },
>              },
>              { /* end of list */ }
>          }
> 
> Will Paolo remember about eVMCS and reject it?

Ah, I see.  If QEMU adds vmx-tsc-scaling in the future, then creating a VM will
not fail as it should if QEMU runs with an older KVM that silently hides
TSC_SCALING.

Argh.  There's another problem.  KVM will break userspace if KVM starts enforcing
writes to VMX MSRs.  This isn't solvable without new uAPI.  We can handle
PERF_GLOBAL_CTRL and TSC_SCALING by enabling the enforcement after they're no
longer marked unsupported, but that doesn't address all the other controls that
are unsupported.  E.g. PML is in many of QEMU's named CPU models but is unsupported
when eVMCS is enabled.

This might end up looking at lot like your "versioning" approach, except that there
will be exactly two versions: legacy and enforced (or whatever we want to call 'em).

I suspect this may force QEMU to have eVMCS-specific named CPU models.  I don't see
any way around that, "CascadeLake-Server,hv-evmcs" really ends up being a wildly
different vCPU than vanilla "CascadeLake-Server".

> > It's non-trivial maintenance for KVM because it would require defining new versions
> > every time an eVMCS field is added, allowing userspace to specify and restrict
> > features based on arbitrary versions, and do all of that without conflicting with
> > whatever PV enumeration Microsoft adds.
> 
> The update at hand comes with a feature bit so no mater what we do, we
> will need a new QEMU flag to support this feature bit. My suggestion was
> just that we stretch its definition a bit and encode not only
> PERF_GLOBAL_CTRL but all fields which were added.

I really don't think KVM should take liberties with others' "architectural" CPUID
bits.  IMO, redefining Hyper-V's CPUID bits is no different than redefining Intel
or AMD's CPUID bits.

I'm pretty sure it's a moot point though, because we can't gate userspace behavior
on guest CPUID.

> At the same time we can switch to filtering host reads and failing host
> writes for what's missing (and to do so we'll likely need to invert the logic
> and explicitly list what eVMCS supports) so we're better prepared to the next update.

Yep.  Inverting the logic may not be strictly necessary, i.e. might be able to go
on top, but it definitely should be done sooner than later.

As above, we also have to snapshot the "legacy" controls and restrict the guest to
the legacy controls when KVM is _not_ enforcing userspace accesses.

Let me package up what I have so far, do some (very) light testing, and post it as
RFC so that we can make this less theoretical, and so that I can hand things back
off to you.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 6f0acc45e67a..ebc27017fa48 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -138,6 +138,17 @@ 
 #define HV_X64_NESTED_GUEST_MAPPING_FLUSH		BIT(18)
 #define HV_X64_NESTED_MSR_BITMAP			BIT(19)
 
+/*
+ * Nested quirks. These are HYPERV_CPUID_NESTED_FEATURES.EBX bits.
+ *
+ * Note: HV_X64_NESTED_EVMCS1_2022_UPDATE is not currently documented in any
+ * published TLFS version. When the bit is set, nested hypervisor can use
+ * 'updated' eVMCSv1 specification (perf_global_ctrl, s_cet, ssp, lbr_ctl,
+ * encls_exiting_bitmap, tsc_multiplier fields which were missing in 2016
+ * specification).
+ */
+#define HV_X64_NESTED_EVMCS1_2022_UPDATE		BIT(0)
+
 /*
  * This is specific to AMD and specifies that enlightened TLB flush is
  * supported. If guest opts in to this feature, ASID invalidations only
@@ -559,9 +570,20 @@  struct hv_enlightened_vmcs {
 	u64 partition_assist_page;
 	u64 padding64_4[4];
 	u64 guest_bndcfgs;
-	u64 padding64_5[7];
+	u64 guest_ia32_perf_global_ctrl;
+	u64 guest_ia32_s_cet;
+	u64 guest_ssp;
+	u64 guest_ia32_int_ssp_table_addr;
+	u64 guest_ia32_lbr_ctl;
+	u64 padding64_5[2];
 	u64 xss_exit_bitmap;
-	u64 padding64_6[7];
+	u64 encls_exiting_bitmap;
+	u64 host_ia32_perf_global_ctrl;
+	u64 tsc_multiplier;
+	u64 host_ia32_s_cet;
+	u64 host_ssp;
+	u64 host_ia32_int_ssp_table_addr;
+	u64 padding64_6;
 } __packed;
 
 #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE			0