
[v2,1/7] KVM: nVMX: Introduce nested_evmcs_is_used()

Message ID 20210517135054.1914802-2-vkuznets@redhat.com (mailing list archive)
State New, archived
Series KVM: nVMX: Fixes for nested state migration when eVMCS is in use

Commit Message

Vitaly Kuznetsov May 17, 2021, 1:50 p.m. UTC
Unlike regular set_current_vmptr(), nested_vmx_handle_enlightened_vmptrld()
can not be called directly from vmx_set_nested_state() as KVM may not have
all the information yet (e.g. HV_X64_MSR_VP_ASSIST_PAGE MSR may not be
restored yet). Enlightened VMCS is mapped later while getting nested state
pages. In the meantime, vmx->nested.hv_evmcs remains NULL and using it
for various checks is incorrect. In particular, if KVM_GET_NESTED_STATE is
called right after KVM_SET_NESTED_STATE, KVM_STATE_NESTED_EVMCS flag in the
resulting state will be unset (and such state will later fail to load).

Introduce nested_evmcs_is_used() and use 'is_guest_mode(vcpu) &&
vmx->nested.current_vmptr == -1ull' check to detect not-yet-mapped eVMCS
after restore.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

Comments

Maxim Levitsky May 24, 2021, 12:11 p.m. UTC | #1
On Mon, 2021-05-17 at 15:50 +0200, Vitaly Kuznetsov wrote:
> Unlike regular set_current_vmptr(), nested_vmx_handle_enlightened_vmptrld()
> can not be called directly from vmx_set_nested_state() as KVM may not have
> all the information yet (e.g. HV_X64_MSR_VP_ASSIST_PAGE MSR may not be
> restored yet). Enlightened VMCS is mapped later while getting nested state
> pages. In the meantime, vmx->nested.hv_evmcs remains NULL and using it
> for various checks is incorrect. In particular, if KVM_GET_NESTED_STATE is
> called right after KVM_SET_NESTED_STATE, KVM_STATE_NESTED_EVMCS flag in the
> resulting state will be unset (and such state will later fail to load).
> 
> Introduce nested_evmcs_is_used() and use 'is_guest_mode(vcpu) &&
> vmx->nested.current_vmptr == -1ull' check to detect not-yet-mapped eVMCS
> after restore.
> 
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> ---
>  arch/x86/kvm/vmx/nested.c | 31 ++++++++++++++++++++++++++-----
>  1 file changed, 26 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 6058a65a6ede..3080e00c8f90 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -141,6 +141,27 @@ static void init_vmcs_shadow_fields(void)
>  	max_shadow_read_write_fields = j;
>  }
>  
> +static inline bool nested_evmcs_is_used(struct vcpu_vmx *vmx)
> +{
> +	struct kvm_vcpu *vcpu = &vmx->vcpu;
> +
> +	if (vmx->nested.hv_evmcs)
> +		return true;
> +
> +	/*
> +	 * After KVM_SET_NESTED_STATE, enlightened VMCS is mapped during
> +	 * KVM_REQ_GET_NESTED_STATE_PAGES handling and until the request is
> +	 * processed vmx->nested.hv_evmcs is NULL. It is, however, possible to
> +	 * detect such state by checking 'nested.current_vmptr == -1ull' when
> +	 * vCPU is in guest mode, it is only possible with eVMCS.
> +	 */
> +	if (unlikely(vmx->nested.enlightened_vmcs_enabled && is_guest_mode(vcpu) &&
> +		     (vmx->nested.current_vmptr == -1ull)))
> +		return true;
> +
> +	return false;
> +}


I think that this is a valid way to solve the issue,
but it feels like there might be a better way.
I don't mind accepting this patch as is, though.

So here are my 2 cents about this:

First of all, after studying how evmcs works, I take back my words
about needing to migrate its contents. 

It is indeed enough to migrate its physical address, 
or maybe even just a flag that evmcs is loaded
(and to my surprise we already do this - KVM_STATE_NESTED_EVMCS)

So how about just having a boolean flag that indicates that evmcs is in use, 
but doesn't imply that we know its address or that it is mapped 
to host address space, something like 'vmx->nested.enlightened_vmcs_loaded'

On migration, that flag is saved and restored as KVM_STATE_NESTED_EVMCS;
otherwise it is set when we load an evmcs and cleared when it is released.

Then, as far as I can see, we can use this flag in nested_evmcs_is_used()
since none of its callers touch the evmcs itself and thus don't need it
to be mapped.
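
Roughly, something like this (untested, just to illustrate the idea; the
flag name and the exact places where it gets set are of course up for
discussion):

static inline bool nested_evmcs_is_used(struct vcpu_vmx *vmx)
{
        /* A pure flag: doesn't imply the eVMCS GPA is known or mapped */
        return vmx->nested.enlightened_vmcs_loaded;
}

/* vmx_set_nested_state(): */
vmx->nested.enlightened_vmcs_loaded =
        !!(kvm_state->flags & KVM_STATE_NESTED_EVMCS);

/* nested_vmx_handle_enlightened_vmptrld(), after a successful map: */
vmx->nested.enlightened_vmcs_loaded = true;

/* nested_release_evmcs(): */
vmx->nested.enlightened_vmcs_loaded = false;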

What do you think?

Best regards,
	Maxim Levitsky





>  /*
>   * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
>   * set the success or error code of an emulated VMX instruction (as specified
> @@ -187,7 +208,7 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
>  	 * failValid writes the error number to the current VMCS, which
>  	 * can't be done if there isn't a current VMCS.
>  	 */
> -	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
> +	if (vmx->nested.current_vmptr == -1ull && !nested_evmcs_is_used(vmx))
>  		return nested_vmx_failInvalid(vcpu);
>  
>  	return nested_vmx_failValid(vcpu, vm_instruction_error);
> @@ -2208,7 +2229,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
>  	u32 exec_control;
>  	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
>  
> -	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
> +	if (vmx->nested.dirty_vmcs12 || nested_evmcs_is_used(vmx))
>  		prepare_vmcs02_early_rare(vmx, vmcs12);
>  
>  	/*
> @@ -3437,7 +3458,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
>  
>  	load_vmcs12_host_state(vcpu, vmcs12);
>  	vmcs12->vm_exit_reason = exit_reason.full;
> -	if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
> +	if (enable_shadow_vmcs || nested_evmcs_is_used(vmx))
>  		vmx->nested.need_vmcs12_to_shadow_sync = true;
>  	return NVMX_VMENTRY_VMEXIT;
>  }
> @@ -4032,7 +4053,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  
> -	if (vmx->nested.hv_evmcs)
> +	if (nested_evmcs_is_used(vmx))
>  		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
>  
>  	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
> @@ -6056,7 +6077,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
>  		if (vmx_has_valid_vmcs12(vcpu)) {
>  			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
>  
> -			if (vmx->nested.hv_evmcs)
> +			if (nested_evmcs_is_used(vmx))
>  				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
>  
>  			if (is_guest_mode(vcpu) &&
Vitaly Kuznetsov May 24, 2021, 12:35 p.m. UTC | #2
Maxim Levitsky <mlevitsk@redhat.com> writes:

> On Mon, 2021-05-17 at 15:50 +0200, Vitaly Kuznetsov wrote:
>> Unlike regular set_current_vmptr(), nested_vmx_handle_enlightened_vmptrld()
>> can not be called directly from vmx_set_nested_state() as KVM may not have
>> all the information yet (e.g. HV_X64_MSR_VP_ASSIST_PAGE MSR may not be
>> restored yet). Enlightened VMCS is mapped later while getting nested state
>> pages. In the meantime, vmx->nested.hv_evmcs remains NULL and using it
>> for various checks is incorrect. In particular, if KVM_GET_NESTED_STATE is
>> called right after KVM_SET_NESTED_STATE, KVM_STATE_NESTED_EVMCS flag in the
>> resulting state will be unset (and such state will later fail to load).
>> 
>> Introduce nested_evmcs_is_used() and use 'is_guest_mode(vcpu) &&
>> vmx->nested.current_vmptr == -1ull' check to detect not-yet-mapped eVMCS
>> after restore.
>> 
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> ---
>>  arch/x86/kvm/vmx/nested.c | 31 ++++++++++++++++++++++++++-----
>>  1 file changed, 26 insertions(+), 5 deletions(-)
>> 
>> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
>> index 6058a65a6ede..3080e00c8f90 100644
>> --- a/arch/x86/kvm/vmx/nested.c
>> +++ b/arch/x86/kvm/vmx/nested.c
>> @@ -141,6 +141,27 @@ static void init_vmcs_shadow_fields(void)
>>  	max_shadow_read_write_fields = j;
>>  }
>>  
>> +static inline bool nested_evmcs_is_used(struct vcpu_vmx *vmx)
>> +{
>> +	struct kvm_vcpu *vcpu = &vmx->vcpu;
>> +
>> +	if (vmx->nested.hv_evmcs)
>> +		return true;
>> +
>> +	/*
>> +	 * After KVM_SET_NESTED_STATE, enlightened VMCS is mapped during
>> +	 * KVM_REQ_GET_NESTED_STATE_PAGES handling and until the request is
>> +	 * processed vmx->nested.hv_evmcs is NULL. It is, however, possible to
>> +	 * detect such state by checking 'nested.current_vmptr == -1ull' when
>> +	 * vCPU is in guest mode, it is only possible with eVMCS.
>> +	 */
>> +	if (unlikely(vmx->nested.enlightened_vmcs_enabled && is_guest_mode(vcpu) &&
>> +		     (vmx->nested.current_vmptr == -1ull)))
>> +		return true;
>> +
>> +	return false;
>> +}
>
>
> I think that this is a valid way to solve the issue,
> but it feels like there might be a better way.
> I don't mind though to accept this patch as is.
>
> So here are my 2 cents about this:
>
> First of all after studying how evmcs works I take my words back
> about needing to migrate its contents. 
>
> It is indeed enough to migrate its physical address, 
> or maybe even just a flag that evmcs is loaded
> (and to my surprise we already do this - KVM_STATE_NESTED_EVMCS)
>
> So how about just having a boolean flag that indicates that evmcs is in use, 
> but doesn't imply that we know its address or that it is mapped 
> to host address space, something like 'vmx->nested.enlightened_vmcs_loaded'
>
> On migration that flag saved and restored as the KVM_STATE_NESTED_EVMCS,
> otherwise it set when we load an evmcs and cleared when it is released.
>
> Then as far as I can see we can use this flag in nested_evmcs_is_used
> since all its callers don't touch evmcs, thus don't need it to be
> mapped.
>
> What do you think?
>

First, we need to be compatible with older KVMs which don't have the
flag, and this is problematic: currently, we always expect vmcs12 to
carry valid contents. This is challenging.

Second, vCPU can be migrated in three different states:
1) While L2 was running ('true' nested state is in VMCS02)
2) While L1 was running ('true' nested state is in eVMCS)
3) Right after an exit from L2 to L1 was forced
('need_vmcs12_to_shadow_sync = true') ('true' nested state is in
VMCS12).

The current solution is to always use VMCS12 as a container to transfer
the state, and conceptually it is at least easier to understand.

We can, indeed, transfer eVMCS (or VMCS12) in case 2) through guest
memory, and I even tried that, but it was making the code more complex,
so eventually I gave up and decided to preserve the 'always use VMCS12
as a container' status quo.
Paolo Bonzini May 24, 2021, 1:54 p.m. UTC | #3
On 17/05/21 15:50, Vitaly Kuznetsov wrote:
> Unlike regular set_current_vmptr(), nested_vmx_handle_enlightened_vmptrld()
> can not be called directly from vmx_set_nested_state() as KVM may not have
> all the information yet (e.g. HV_X64_MSR_VP_ASSIST_PAGE MSR may not be
> restored yet). Enlightened VMCS is mapped later while getting nested state
> pages. In the meantime, vmx->nested.hv_evmcs remains NULL and using it
> for various checks is incorrect. In particular, if KVM_GET_NESTED_STATE is
> called right after KVM_SET_NESTED_STATE, KVM_STATE_NESTED_EVMCS flag in the
> resulting state will be unset (and such state will later fail to load).
> 
> Introduce nested_evmcs_is_used() and use 'is_guest_mode(vcpu) &&
> vmx->nested.current_vmptr == -1ull' check to detect not-yet-mapped eVMCS
> after restore.
> 
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>

Would it make sense to instead use hv_evmcs_ptr, making the unused
value -1 instead of 0?

I even had this untested patch already lying around as a cleanup I
never bothered to submit:

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index a19cfcb625da..dd3e4ddaaf26 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -187,7 +187,7 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
  	 * failValid writes the error number to the current VMCS, which
  	 * can't be done if there isn't a current VMCS.
  	 */
-	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
+	if (vmx->nested.current_vmptr == -1ull && vmx->nested.hv_evmcs_vmptr == -1)
  		return nested_vmx_failInvalid(vcpu);
  
  	return nested_vmx_failValid(vcpu, vm_instruction_error);
@@ -221,11 +221,11 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
  {
  	struct vcpu_vmx *vmx = to_vmx(vcpu);
  
-	if (!vmx->nested.hv_evmcs)
+	if (vmx->nested.hv_evmcs_vmptr == -1)
  		return;
  
  	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, NULL, true);
-	vmx->nested.hv_evmcs_vmptr = 0;
+	vmx->nested.hv_evmcs_vmptr = -1;
  	vmx->nested.hv_evmcs = NULL;
  }
  
@@ -1978,10 +1978,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
  	if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
  		return EVMPTRLD_DISABLED;
  
-	if (unlikely(!vmx->nested.hv_evmcs ||
-		     evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
-		if (!vmx->nested.hv_evmcs)
-			vmx->nested.current_vmptr = -1ull;
+	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
+		vmx->nested.current_vmptr = -1ull;
  
  		nested_release_evmcs(vcpu);
  
@@ -2052,7 +2050,7 @@ void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
  {
  	struct vcpu_vmx *vmx = to_vmx(vcpu);
  
-	if (vmx->nested.hv_evmcs) {
+	if (vmx->nested.hv_evmcs_vmptr != -1) {
  		copy_vmcs12_to_enlightened(vmx);
  		/* All fields are clean */
  		vmx->nested.hv_evmcs->hv_clean_fields |=
@@ -2204,7 +2202,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
  	u32 exec_control;
  	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
  
-	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
+	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs_vmptr != -1)
  		prepare_vmcs02_early_rare(vmx, vmcs12);
  
  	/*
@@ -2487,9 +2485,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
  			  enum vm_entry_failure_code *entry_failure_code)
  {
  	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
  
-	if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
+	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs_vmptr != -1) {
  		prepare_vmcs02_rare(vmx, vmcs12);
  		vmx->nested.dirty_vmcs12 = false;
  	}
@@ -3075,7 +3072,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
  	 * L2 was running), map it here to make sure vmcs12 changes are
  	 * properly reflected.
  	 */
-	if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) {
+	if (vmx->nested.enlightened_vmcs_enabled && vmx->nested.hv_evmcs_vmptr == -1) {
  		enum nested_evmptrld_status evmptrld_status =
  			nested_vmx_handle_enlightened_vmptrld(vcpu, false);
  
@@ -3419,7 +3416,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
  
  	load_vmcs12_host_state(vcpu, vmcs12);
  	vmcs12->vm_exit_reason = exit_reason.full;
-	if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
+	if (enable_shadow_vmcs || vmx->nested.hv_evmcs_vmptr != -1)
  		vmx->nested.need_vmcs12_to_shadow_sync = true;
  	return NVMX_VMENTRY_VMEXIT;
  }
@@ -3449,7 +3446,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
  		return nested_vmx_failInvalid(vcpu);
  	}
  
-	if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull))
+	if (CC(vmx->nested.hv_evmcs_vmptr == -1 && vmx->nested.current_vmptr == -1ull))
  		return nested_vmx_failInvalid(vcpu);
  
  	vmcs12 = get_vmcs12(vcpu);
@@ -3463,7 +3460,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
  	if (CC(vmcs12->hdr.shadow_vmcs))
  		return nested_vmx_failInvalid(vcpu);
  
-	if (vmx->nested.hv_evmcs) {
+	if (vmx->nested.hv_evmcs_vmptr != -1) {
  		copy_enlightened_to_vmcs12(vmx);
  		/* Enlightened VMCS doesn't have launch state */
  		vmcs12->launch_state = !launch;
@@ -4014,10 +4011,10 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  {
  	struct vcpu_vmx *vmx = to_vmx(vcpu);
  
-	if (vmx->nested.hv_evmcs)
+	if (vmx->nested.hv_evmcs_vmptr != -1)
  		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
  
-	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
+	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = (vmx->nested.hv_evmcs_vmptr == -1);
  
  	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
  	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
@@ -4514,7 +4511,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
  	}
  
  	if ((vm_exit_reason != -1) &&
-	    (enable_shadow_vmcs || vmx->nested.hv_evmcs))
+	    (enable_shadow_vmcs || vmx->nested.hv_evmcs_vmptr != -1))
  		vmx->nested.need_vmcs12_to_shadow_sync = true;
  
  	/* in case we halted in L2 */
@@ -5210,7 +5207,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
  		return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
  
  	/* Forbid normal VMPTRLD if Enlightened version was used */
-	if (vmx->nested.hv_evmcs)
+	if (vmx->nested.hv_evmcs_vmptr != -1)
  		return 1;
  
  	if (vmx->nested.current_vmptr != vmptr) {
@@ -5266,7 +5263,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
  	if (!nested_vmx_check_permission(vcpu))
  		return 1;
  
-	if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
+	if (unlikely(to_vmx(vcpu)->nested.hv_evmcs_vmptr != -1))
  		return 1;
  
  	if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
@@ -6038,7 +6035,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
  		if (vmx_has_valid_vmcs12(vcpu)) {
  			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
  
-			if (vmx->nested.hv_evmcs)
+			if (vmx->nested.hv_evmcs_vmptr != -1)
  				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
  
  			if (is_guest_mode(vcpu) &&
@@ -6094,7 +6091,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
  	} else  {
  		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
  		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
-			if (vmx->nested.hv_evmcs)
+			if (vmx->nested.hv_evmcs_vmptr != -1)
  				copy_enlightened_to_vmcs12(vmx);
  			else if (enable_shadow_vmcs)
  				copy_shadow_to_vmcs12(vmx);
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 184418baeb3c..55925be48973 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -63,7 +63,7 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
  	 * have vmcs12 if it is true.
  	 */
  	return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull ||
-		vmx->nested.hv_evmcs;
+		vmx->nested.hv_evmcs_vmptr != -1;
  }
  
  static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f2fd447eed45..33208aa4ac87 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6974,6 +6974,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
  
  	vmx->nested.posted_intr_nv = -1;
  	vmx->nested.current_vmptr = -1ull;
+	vmx->nested.hv_evmcs_vmptr = -1;
  
  	vcpu->arch.microcode_version = 0x100000000ULL;
  	vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;

Paolo
Vitaly Kuznetsov May 24, 2021, 2:09 p.m. UTC | #4
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 17/05/21 15:50, Vitaly Kuznetsov wrote:
>> Unlike regular set_current_vmptr(), nested_vmx_handle_enlightened_vmptrld()
>> can not be called directly from vmx_set_nested_state() as KVM may not have
>> all the information yet (e.g. HV_X64_MSR_VP_ASSIST_PAGE MSR may not be
>> restored yet). Enlightened VMCS is mapped later while getting nested state
>> pages. In the meantime, vmx->nested.hv_evmcs remains NULL and using it
>> for various checks is incorrect. In particular, if KVM_GET_NESTED_STATE is
>> called right after KVM_SET_NESTED_STATE, KVM_STATE_NESTED_EVMCS flag in the
>> resulting state will be unset (and such state will later fail to load).
>> 
>> Introduce nested_evmcs_is_used() and use 'is_guest_mode(vcpu) &&
>> vmx->nested.current_vmptr == -1ull' check to detect not-yet-mapped eVMCS
>> after restore.
>> 
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>
> Would it make sense to instead use hv_evmcs_ptr, making the unused
> value -1 instead of 0?

You mean we'll be using:

"hv_evmcs_ptr == 0" meaning "no evmcs" (like now)
"hv_evmcs_ptr == -1" meaing "evmcs not yet mapped" (and
nested_evmcs_is_used() will check for both '0' and '-1')
"hv_evmcs_ptr == anything else" - eVMCS mapped.

Right?

>
> I even had this untested patch already lying around as a cleanup I
> never bothered to submit:

...

>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index a19cfcb625da..dd3e4ddaaf26 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -187,7 +187,7 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
>   	 * failValid writes the error number to the current VMCS, which
>   	 * can't be done if there isn't a current VMCS.
>   	 */
> -	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
> +	if (vmx->nested.current_vmptr == -1ull && vmx->nested.hv_evmcs_vmptr == -1)

Or, if we decide to integrate this patch, it'll be:

"hv_evmcs_ptr == -1" meaning "no evmcs" (what is '0' now)
"hv_evmcs_ptr == 0" meaing "evmcs not yet mapped"

I'm not against the idea; I just hope we won't forget that we have two
'special' values for 'hv_evmcs_vmptr' and not just one.

>   		return nested_vmx_failInvalid(vcpu);
>   
>   	return nested_vmx_failValid(vcpu, vm_instruction_error);
> @@ -221,11 +221,11 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
>   
> -	if (!vmx->nested.hv_evmcs)
> +	if (vmx->nested.hv_evmcs_vmptr == -1)
>   		return;
>   
>   	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, NULL, true);
> -	vmx->nested.hv_evmcs_vmptr = 0;
> +	vmx->nested.hv_evmcs_vmptr = -1;
>   	vmx->nested.hv_evmcs = NULL;
>   }
>   
> @@ -1978,10 +1978,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
>   	if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
>   		return EVMPTRLD_DISABLED;
>   
> -	if (unlikely(!vmx->nested.hv_evmcs ||
> -		     evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
> -		if (!vmx->nested.hv_evmcs)
> -			vmx->nested.current_vmptr = -1ull;
> +	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
> +		vmx->nested.current_vmptr = -1ull;
>   
>   		nested_release_evmcs(vcpu);
>   
> @@ -2052,7 +2050,7 @@ void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
>   
> -	if (vmx->nested.hv_evmcs) {
> +	if (vmx->nested.hv_evmcs_vmptr != -1) {
>   		copy_vmcs12_to_enlightened(vmx);
>   		/* All fields are clean */
>   		vmx->nested.hv_evmcs->hv_clean_fields |=
> @@ -2204,7 +2202,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
>   	u32 exec_control;
>   	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
>   
> -	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
> +	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs_vmptr != -1)
>   		prepare_vmcs02_early_rare(vmx, vmcs12);
>   
>   	/*
> @@ -2487,9 +2485,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>   			  enum vm_entry_failure_code *entry_failure_code)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
> -	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
>   
> -	if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
> +	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs_vmptr != -1) {
>   		prepare_vmcs02_rare(vmx, vmcs12);
>   		vmx->nested.dirty_vmcs12 = false;
>   	}
> @@ -3075,7 +3072,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
>   	 * L2 was running), map it here to make sure vmcs12 changes are
>   	 * properly reflected.
>   	 */
> -	if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) {
> +	if (vmx->nested.enlightened_vmcs_enabled && vmx->nested.hv_evmcs_vmptr == -1) {
>   		enum nested_evmptrld_status evmptrld_status =
>   			nested_vmx_handle_enlightened_vmptrld(vcpu, false);
>   
> @@ -3419,7 +3416,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
>   
>   	load_vmcs12_host_state(vcpu, vmcs12);
>   	vmcs12->vm_exit_reason = exit_reason.full;
> -	if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
> +	if (enable_shadow_vmcs || vmx->nested.hv_evmcs_vmptr != -1)
>   		vmx->nested.need_vmcs12_to_shadow_sync = true;
>   	return NVMX_VMENTRY_VMEXIT;
>   }
> @@ -3449,7 +3446,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>   		return nested_vmx_failInvalid(vcpu);
>   	}
>   
> -	if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull))
> +	if (CC(vmx->nested.hv_evmcs_vmptr == -1 && vmx->nested.current_vmptr == -1ull))
>   		return nested_vmx_failInvalid(vcpu);
>   
>   	vmcs12 = get_vmcs12(vcpu);
> @@ -3463,7 +3460,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>   	if (CC(vmcs12->hdr.shadow_vmcs))
>   		return nested_vmx_failInvalid(vcpu);
>   
> -	if (vmx->nested.hv_evmcs) {
> +	if (vmx->nested.hv_evmcs_vmptr != -1) {
>   		copy_enlightened_to_vmcs12(vmx);
>   		/* Enlightened VMCS doesn't have launch state */
>   		vmcs12->launch_state = !launch;
> @@ -4014,10 +4011,10 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
>   
> -	if (vmx->nested.hv_evmcs)
> +	if (vmx->nested.hv_evmcs_vmptr != -1)
>   		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
>   
> -	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
> +	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = (vmx->nested.hv_evmcs_vmptr == -1);
>   
>   	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
>   	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
> @@ -4514,7 +4511,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
>   	}
>   
>   	if ((vm_exit_reason != -1) &&
> -	    (enable_shadow_vmcs || vmx->nested.hv_evmcs))
> +	    (enable_shadow_vmcs || vmx->nested.hv_evmcs_vmptr != -1))
>   		vmx->nested.need_vmcs12_to_shadow_sync = true;
>   
>   	/* in case we halted in L2 */
> @@ -5210,7 +5207,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
>   		return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
>   
>   	/* Forbid normal VMPTRLD if Enlightened version was used */
> -	if (vmx->nested.hv_evmcs)
> +	if (vmx->nested.hv_evmcs_vmptr != -1)
>   		return 1;
>   
>   	if (vmx->nested.current_vmptr != vmptr) {
> @@ -5266,7 +5263,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
>   	if (!nested_vmx_check_permission(vcpu))
>   		return 1;
>   
> -	if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
> +	if (unlikely(to_vmx(vcpu)->nested.hv_evmcs_vmptr != -1))
>   		return 1;
>   
>   	if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
> @@ -6038,7 +6035,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
>   		if (vmx_has_valid_vmcs12(vcpu)) {
>   			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
>   
> -			if (vmx->nested.hv_evmcs)
> +			if (vmx->nested.hv_evmcs_vmptr != -1)
>   				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
>   
>   			if (is_guest_mode(vcpu) &&
> @@ -6094,7 +6091,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
>   	} else  {
>   		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
>   		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
> -			if (vmx->nested.hv_evmcs)
> +			if (vmx->nested.hv_evmcs_vmptr != -1)
>   				copy_enlightened_to_vmcs12(vmx);
>   			else if (enable_shadow_vmcs)
>   				copy_shadow_to_vmcs12(vmx);
> diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
> index 184418baeb3c..55925be48973 100644
> --- a/arch/x86/kvm/vmx/nested.h
> +++ b/arch/x86/kvm/vmx/nested.h
> @@ -63,7 +63,7 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
>   	 * have vmcs12 if it is true.
>   	 */
>   	return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull ||
> -		vmx->nested.hv_evmcs;
> +		vmx->nested.hv_evmcs_vmptr != -1;
>   }
>   
>   static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index f2fd447eed45..33208aa4ac87 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -6974,6 +6974,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
>   
>   	vmx->nested.posted_intr_nv = -1;
>   	vmx->nested.current_vmptr = -1ull;
> +	vmx->nested.hv_evmcs_vmptr = -1;
>   
>   	vcpu->arch.microcode_version = 0x100000000ULL;
>   	vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
>
> Paolo
>
Paolo Bonzini May 24, 2021, 2:18 p.m. UTC | #5
On 24/05/21 16:09, Vitaly Kuznetsov wrote:
> You mean we'll be using:
> 
> "hv_evmcs_ptr == 0" meaning "no evmcs" (like now)
> "hv_evmcs_ptr == -1" meaing "evmcs not yet mapped" (and
> nested_evmcs_is_used() will check for both '0' and '-1')
> "hv_evmcs_ptr == anything else" - eVMCS mapped.

I was thinking of:

hv_evmcs_ptr == -1 meaning no evmcs

hv_evmcs == NULL meaning "evmcs not yet mapped" (I think)

hv_evmcs != NULL meaning "evmcs mapped".

As usual with my suggestions, I'm not sure if this makes sense :) but if 
it does, the code should be nicer.
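
That is, roughly (helper names made up, just to illustrate the resulting
checks):

static inline bool nested_evmcs_in_use(struct vcpu_vmx *vmx)
{
        /* eVMCS in use, possibly not mapped yet (e.g. after KVM_SET_NESTED_STATE) */
        return vmx->nested.hv_evmcs_vmptr != -1ull;
}

static inline bool nested_evmcs_is_mapped(struct vcpu_vmx *vmx)
{
        /* eVMCS in use and already mapped into host address space */
        return vmx->nested.hv_evmcs != NULL;
}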

Paolo
Vitaly Kuznetsov May 24, 2021, 2:37 p.m. UTC | #6
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 24/05/21 16:09, Vitaly Kuznetsov wrote:
>> You mean we'll be using:
>> 
>> "hv_evmcs_ptr == 0" meaning "no evmcs" (like now)
>> "hv_evmcs_ptr == -1" meaing "evmcs not yet mapped" (and
>> nested_evmcs_is_used() will check for both '0' and '-1')
>> "hv_evmcs_ptr == anything else" - eVMCS mapped.
>
> I was thinking of:
>
> hv_evmcs_ptr == -1 meaning no evmcs
>
> hv_evmcs == NULL meaning "evmcs not yet mapped" (I think)
>
> hv_evmcs != NULL meaning "evmcs mapped".
>
> As usual with my suggestions, I'm not sure if this makes sense :) but if 
> it does, the code should be nicer.

Ok, let me try this!
Maxim Levitsky May 26, 2021, 2:34 p.m. UTC | #7
On Mon, 2021-05-24 at 14:35 +0200, Vitaly Kuznetsov wrote:
> Maxim Levitsky <mlevitsk@redhat.com> writes:
> 
> > On Mon, 2021-05-17 at 15:50 +0200, Vitaly Kuznetsov wrote:
> > > Unlike regular set_current_vmptr(), nested_vmx_handle_enlightened_vmptrld()
> > > can not be called directly from vmx_set_nested_state() as KVM may not have
> > > all the information yet (e.g. HV_X64_MSR_VP_ASSIST_PAGE MSR may not be
> > > restored yet). Enlightened VMCS is mapped later while getting nested state
> > > pages. In the meantime, vmx->nested.hv_evmcs remains NULL and using it
> > > for various checks is incorrect. In particular, if KVM_GET_NESTED_STATE is
> > > called right after KVM_SET_NESTED_STATE, KVM_STATE_NESTED_EVMCS flag in the
> > > resulting state will be unset (and such state will later fail to load).
> > > 
> > > Introduce nested_evmcs_is_used() and use 'is_guest_mode(vcpu) &&
> > > vmx->nested.current_vmptr == -1ull' check to detect not-yet-mapped eVMCS
> > > after restore.
> > > 
> > > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> > > ---
> > >  arch/x86/kvm/vmx/nested.c | 31 ++++++++++++++++++++++++++-----
> > >  1 file changed, 26 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > > index 6058a65a6ede..3080e00c8f90 100644
> > > --- a/arch/x86/kvm/vmx/nested.c
> > > +++ b/arch/x86/kvm/vmx/nested.c
> > > @@ -141,6 +141,27 @@ static void init_vmcs_shadow_fields(void)
> > >  	max_shadow_read_write_fields = j;
> > >  }
> > >  
> > > +static inline bool nested_evmcs_is_used(struct vcpu_vmx *vmx)
> > > +{
> > > +	struct kvm_vcpu *vcpu = &vmx->vcpu;
> > > +
> > > +	if (vmx->nested.hv_evmcs)
> > > +		return true;
> > > +
> > > +	/*
> > > +	 * After KVM_SET_NESTED_STATE, enlightened VMCS is mapped during
> > > +	 * KVM_REQ_GET_NESTED_STATE_PAGES handling and until the request is
> > > +	 * processed vmx->nested.hv_evmcs is NULL. It is, however, possible to
> > > +	 * detect such state by checking 'nested.current_vmptr == -1ull' when
> > > +	 * vCPU is in guest mode, it is only possible with eVMCS.
> > > +	 */
> > > +	if (unlikely(vmx->nested.enlightened_vmcs_enabled && is_guest_mode(vcpu) &&
> > > +		     (vmx->nested.current_vmptr == -1ull)))
> > > +		return true;
> > > +
> > > +	return false;
> > > +}
> > 
> > I think that this is a valid way to solve the issue,
> > but it feels like there might be a better way.
> > I don't mind though to accept this patch as is.
> > 
> > So here are my 2 cents about this:
> > 
> > First of all after studying how evmcs works I take my words back
> > about needing to migrate its contents. 
> > 
> > It is indeed enough to migrate its physical address, 
> > or maybe even just a flag that evmcs is loaded
> > (and to my surprise we already do this - KVM_STATE_NESTED_EVMCS)
> > 
> > So how about just having a boolean flag that indicates that evmcs is in use, 
> > but doesn't imply that we know its address or that it is mapped 
> > to host address space, something like 'vmx->nested.enlightened_vmcs_loaded'
> > 
> > On migration that flag saved and restored as the KVM_STATE_NESTED_EVMCS,
> > otherwise it set when we load an evmcs and cleared when it is released.
> > 
> > Then as far as I can see we can use this flag in nested_evmcs_is_used
> > since all its callers don't touch evmcs, thus don't need it to be
> > mapped.
> > 
> > What do you think?



> > 
> 
> First, we need to be compatible with older KVMs which don't have the
> flag and this is problematic: currently, we always expect vmcs12 to
> carry valid contents. This is challenging.

All right, I understand this can be an issue!

If the userspace doesn't set KVM_STATE_NESTED_EVMCS
but has a valid eVMCS, as later indicated by enabling it in the HV
assist page, we can just use the logic that this patch uses, but use it
to set the vmx->nested.enlightened_vmcs_loaded flag or whatever
we decide to name it.
Later we can even deprecate and disable this behaviour with a new KVM cap.


BTW, I like Paolo's idea of putting this flag into the evmcs_gpa,
like this:

-1 - no evmcs
0 - evmcs enabled but its gpa not known
anything else - valid gpa.


Also as I said, I am not against this patch either, 
I am just thinking maybe we can make it a bit better.


> 
> Second, vCPU can be migrated in three different states:
> 1) While L2 was running ('true' nested state is in VMCS02)
> 2) While L1 was running ('true' nested state is in eVMCS)
> 3) Right after an exit from L2 to L1 was forced
> ('need_vmcs12_to_shadow_sync = true') ('true' nested state is in
> VMCS12).

Yes, and this was quite a difficult thing to understand
when I was trying to figure out how this code works.

Also you can add another interesting state:

4) Right after emulating vmlaunch/vmresume but before
the actual entry to the nested guest (aka nested_run_pending=true)



> 
> The current solution is to always use VMCS12 as a container to transfer
> the state and conceptually, it is at least easier to understand.
> 
> We can, indeed, transfer eVMCS (or VMCS12) in case 2) through guest
> memory and I even tried that but that was making the code more complex
> so eventually I gave up and decided to preserve the 'always use VMCS12
> as a container' status quo.


My only point of concern is that it feels wrong to update the eVMCS
when not doing a nested vmexit, because then the eVMCS is owned by the L1
hypervisor; at least we shouldn't touch the fields which aren't supposed
to be updated by us.


This is a rough code draft of what I had in mind (not tested).
To me this seems reasonable, but I do agree that there are
some complexity tradeoffs involved.

About the compatibility, it can be said that:


Case 1:
Both old and new kernels will send/receive an up to date vmcs12,
while evmcs is not up to date, and its contents aren't even defined
(since L2 runs).

Case 2:
Old kernel will send vmcs12, with the partial changes that L1 already
made to evmcs, while the latest state of evmcs with all changes stays
in the guest memory.

But these changes will be discarded on the receiving side,
since once L1 asks us to enter L2, we will reload all the state from eVMCS
(at least the state that is marked as dirty, which means it differs
from vmcs12 as it was on the last nested vmexit).

New kernel will always send the vmcs12 as it was on the last vmexit,
a slightly older version but even a more consistent one.

But this doesn't matter either as, just like in the case of the old kernel,
the vmcs12 will be updated from evmcs as soon as we do another L2 entry.

So while in this case we send a 'more stale' vmcs12, it doesn't
really matter, as it is stale anyway and will be reloaded from
evmcs.

Case 3:
Old kernel will send an up to date vmcs12 (since L1 didn't have a chance
to run anyway after the nested vmexit). The evmcs will not be up to date
in the guest memory, but a newer kernel can fix this by updating it
as you did in patch 6.

New kernel will send an up to date vmcs12 (same reason) and an up to date
evmcs, so in fact an unchanged target kernel will be able to migrate
from this state.

So in fact my suggestion would actually allow migrating to a kernel
without the fix applied.
This is even better than I thought.


This is a rough draft of the idea:


diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6058a65a6ede..98eb7526cae6 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -167,15 +167,22 @@ static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
 static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
 				u32 vm_instruction_error)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
 			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
 			    X86_EFLAGS_SF | X86_EFLAGS_OF))
 			| X86_EFLAGS_ZF);
 	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+
 	/*
 	 * We don't need to force a shadow sync because
 	 * VM_INSTRUCTION_ERROR is not shadowed
+	 * We do need to update the evmcs
 	 */
+
+	if (vmx->nested.hv_evmcs)
+		vmx->nested.hv_evmcs->vm_instruction_error = vm_instruction_error;
+
 	return kvm_skip_emulated_instruction(vcpu);
 }
 
@@ -1962,6 +1969,10 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
 
 	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
 
+	/* All fields are clean */
+	vmx->nested.hv_evmcs->hv_clean_fields |=
+		HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
 	return 0;
 }
 
@@ -2055,16 +2066,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (vmx->nested.hv_evmcs) {
-		copy_vmcs12_to_enlightened(vmx);
-		/* All fields are clean */
-		vmx->nested.hv_evmcs->hv_clean_fields |=
-			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
-	} else {
-		copy_vmcs12_to_shadow(vmx);
-	}
-
+	copy_vmcs12_to_shadow(vmx);
 	vmx->nested.need_vmcs12_to_shadow_sync = false;
 }
 
@@ -3437,8 +3439,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 
 	load_vmcs12_host_state(vcpu, vmcs12);
 	vmcs12->vm_exit_reason = exit_reason.full;
-	if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
+
+	if (enable_shadow_vmcs)
 		vmx->nested.need_vmcs12_to_shadow_sync = true;
+
+	if (vmx->nested.hv_evmcs)
+		copy_vmcs12_to_enlightened(vmx);
+
 	return NVMX_VMENTRY_VMEXIT;
 }
 
@@ -4531,10 +4538,12 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 		kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 	}
 
-	if ((vm_exit_reason != -1) &&
-	    (enable_shadow_vmcs || vmx->nested.hv_evmcs))
+	if ((vm_exit_reason != -1) && enable_shadow_vmcs)
 		vmx->nested.need_vmcs12_to_shadow_sync = true;
 
+	if (vmx->nested.hv_evmcs)
+		copy_vmcs12_to_enlightened(vmx);
+
 	/* in case we halted in L2 */
 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
@@ -6111,12 +6120,8 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
 	} else  {
 		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
-		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
-			if (vmx->nested.hv_evmcs)
-				copy_enlightened_to_vmcs12(vmx);
-			else if (enable_shadow_vmcs)
-				copy_shadow_to_vmcs12(vmx);
-		}
+		if (enable_shadow_vmcs && !vmx->nested.need_vmcs12_to_shadow_sync)
+			copy_shadow_to_vmcs12(vmx);
 	}
 
 	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);


Best regards,
	Maxim Levitsky

> 
> -- 
> Vitaly
>
Vitaly Kuznetsov May 27, 2021, 7:54 a.m. UTC | #8
Maxim Levitsky <mlevitsk@redhat.com> writes:

> On Mon, 2021-05-24 at 14:35 +0200, Vitaly Kuznetsov wrote:
>> Maxim Levitsky <mlevitsk@redhat.com> writes:
>> 
>> > On Mon, 2021-05-17 at 15:50 +0200, Vitaly Kuznetsov wrote:
>> > > Unlike regular set_current_vmptr(), nested_vmx_handle_enlightened_vmptrld()
>> > > can not be called directly from vmx_set_nested_state() as KVM may not have
>> > > all the information yet (e.g. HV_X64_MSR_VP_ASSIST_PAGE MSR may not be
>> > > restored yet). Enlightened VMCS is mapped later while getting nested state
>> > > pages. In the meantime, vmx->nested.hv_evmcs remains NULL and using it
>> > > for various checks is incorrect. In particular, if KVM_GET_NESTED_STATE is
>> > > called right after KVM_SET_NESTED_STATE, KVM_STATE_NESTED_EVMCS flag in the
>> > > resulting state will be unset (and such state will later fail to load).
>> > > 
>> > > Introduce nested_evmcs_is_used() and use 'is_guest_mode(vcpu) &&
>> > > vmx->nested.current_vmptr == -1ull' check to detect not-yet-mapped eVMCS
>> > > after restore.
>> > > 
>> > > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> > > ---
>> > >  arch/x86/kvm/vmx/nested.c | 31 ++++++++++++++++++++++++++-----
>> > >  1 file changed, 26 insertions(+), 5 deletions(-)
>> > > 
>> > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
>> > > index 6058a65a6ede..3080e00c8f90 100644
>> > > --- a/arch/x86/kvm/vmx/nested.c
>> > > +++ b/arch/x86/kvm/vmx/nested.c
>> > > @@ -141,6 +141,27 @@ static void init_vmcs_shadow_fields(void)
>> > >  	max_shadow_read_write_fields = j;
>> > >  }
>> > >  
>> > > +static inline bool nested_evmcs_is_used(struct vcpu_vmx *vmx)
>> > > +{
>> > > +	struct kvm_vcpu *vcpu = &vmx->vcpu;
>> > > +
>> > > +	if (vmx->nested.hv_evmcs)
>> > > +		return true;
>> > > +
>> > > +	/*
>> > > +	 * After KVM_SET_NESTED_STATE, enlightened VMCS is mapped during
>> > > +	 * KVM_REQ_GET_NESTED_STATE_PAGES handling and until the request is
>> > > +	 * processed vmx->nested.hv_evmcs is NULL. It is, however, possible to
>> > > +	 * detect such state by checking 'nested.current_vmptr == -1ull' when
>> > > +	 * vCPU is in guest mode, it is only possible with eVMCS.
>> > > +	 */
>> > > +	if (unlikely(vmx->nested.enlightened_vmcs_enabled && is_guest_mode(vcpu) &&
>> > > +		     (vmx->nested.current_vmptr == -1ull)))
>> > > +		return true;
>> > > +
>> > > +	return false;
>> > > +}
>> > 
>> > I think that this is a valid way to solve the issue,
>> > but it feels like there might be a better way.
>> > I don't mind though to accept this patch as is.
>> > 
>> > So here are my 2 cents about this:
>> > 
>> > First of all after studying how evmcs works I take my words back
>> > about needing to migrate its contents. 
>> > 
>> > It is indeed enough to migrate its physical address, 
>> > or maybe even just a flag that evmcs is loaded
>> > (and to my surprise we already do this - KVM_STATE_NESTED_EVMCS)
>> > 
>> > So how about just having a boolean flag that indicates that evmcs is in use, 
>> > but doesn't imply that we know its address or that it is mapped 
>> > to host address space, something like 'vmx->nested.enlightened_vmcs_loaded'
>> > 
>> > On migration that flag saved and restored as the KVM_STATE_NESTED_EVMCS,
>> > otherwise it set when we load an evmcs and cleared when it is released.
>> > 
>> > Then as far as I can see we can use this flag in nested_evmcs_is_used
>> > since all its callers don't touch evmcs, thus don't need it to be
>> > mapped.
>> > 
>> > What do you think?
>
>
>
>> > 
>> 
>> First, we need to be compatible with older KVMs which don't have the
>> flag and this is problematic: currently, we always expect vmcs12 to
>> carry valid contents. This is challenging.
>
> All right, I understand this can be an issue!
>
> If the userspace doesn't set the KVM_STATE_NESTED_EVMCS
> but has a valid EVMCS as later indicated enabling it in the HV
> assist page, we can just use the logic that this patch uses but use it 
> to set vmx->nested.enlightened_vmcs_loaded flag or whatever
> we decide to name it.
> Later we can even deprecate and disable this with a new KVM cap.
>
>
> BTW, I like Paolo's idea of putting this flag into the evmcs_gpa,
> like that
>
> -1 no evmcs
> 0 - evmcs enabled but its gpa not known
> anything else - valid gpa.
>
>
> Also as I said, I am not against this patch either, 
> I am just thinking maybe we can make it a bit better.
>

v3 implements a similar idea (I kept Paolo's 'Suggested-by' though :-):
we set hv_evmcs_vmptr to EVMPTR_MAP_PENDING after migration. I haven't
tried it yet but I think we can eventually drop
KVM_REQ_GET_NESTED_STATE_PAGES usage.
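
Roughly (EVMPTR_MAP_PENDING aside, the other names below are just my
assumption of what it may end up looking like):

#define EVMPTR_INVALID         (-1ULL)
#define EVMPTR_MAP_PENDING     (-2ULL)

static inline bool evmptr_is_valid(u64 evmptr)
{
        return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING;
}

/* vmx_set_nested_state(), when KVM_STATE_NESTED_EVMCS is set: */
vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
/* the real GPA is only discovered later, in nested_get_evmcs_page() */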

For now, this series is a bugfix (multiple bugfixes, actually) so I'd
like to get it in and not try to make everything perfect regarding eVMCS
:-) I'll certainly take a look at other possible improvements later.

>
>> 
>> Second, vCPU can be migrated in three different states:
>> 1) While L2 was running ('true' nested state is in VMCS02)
>> 2) While L1 was running ('true' nested state is in eVMCS)
>> 3) Right after an exit from L2 to L1 was forced
>> ('need_vmcs12_to_shadow_sync = true') ('true' nested state is in
>> VMCS12).
>
> Yes and this was quite difficult thing to understand
> when I was trying to figure out how this code works.
>
> Also you can add another intersting state:
>
> 4) Right after emulating vmlauch/vmresume but before
> the actual entry to the nested guest (aka nested_run_pending=true)
>

Honestly, I haven't taken a closer look at this state. Do you envision
specific issues? Can we actually try to serve KVM_GET_NESTED_STATE in
between setting 'nested_run_pending=true' and an actual attempt to enter
L2?

>
>
>> 
>> The current solution is to always use VMCS12 as a container to transfer
>> the state and conceptually, it is at least easier to understand.
>> 
>> We can, indeed, transfer eVMCS (or VMCS12) in case 2) through guest
>> memory and I even tried that but that was making the code more complex
>> so eventually I gave up and decided to preserve the 'always use VMCS12
>> as a container' status quo.
>
>
> My only point of concern is that it feels like it is wrong to update eVMCS
> when not doing a nested vmexit, because then the eVMCS is owned by the
> L1 hypervisor.

I see your concern and ideally we wouldn't have to touch it.

> At least not the fields which aren't supposed to be updated by us.
>
>
> This is a rough code draft of what I had in mind (not tested).
> To me this seems reasonable but I do agree that there is
> some complexety tradeoffs involved.
>
> About the compatibitly it can be said that:
>
>
> Case 1:
> Both old and new kernels will send/recive up to date vmcs12,
> while evcms is not up to date, and its contents aren't even defined
> (since L2 runs).
>
> Case 2:
> Old kernel will send vmcb12, with partial changes that L1 already
> made to evmcs, and latest state of evmcs with all changes
> in the guest memory.
>
> But these changes will be discarded on the receiving side, 
> since once L1 asks us to enter L2, we will reload all the state from eVMCS,
> (at least the state that is marked as dirty, which means differ
> from vmcs12 as it was on the last nested vmexit)
>
> New kernel will always send the vmcb12 as it was on the last vmexit,
> a bit older version but even a more consistent one.
>
> But this doesn't matter either as just like in case of the old kernel, 
> the vmcs12 will be updated from evmcs as soon as we do another L2 entry.
>
> So while in this case we send 'more stale' vmcb12, it doesn't
> really matter as it is stale anyway and will be reloaded from
> evmcs.
>
> Case 3:
> Old kernel will send up to date vmcb12 (since L1 didn't had a chance
> to run anyway after nested vmexit). The evmcs will not be up to date
> in the guest memory, but newer kernel can fix this by updating it
> as you did in patch 6.
>
> New kernel will send up to date vmcb12 (same reason) and up to date
> evmcs, so in fact an unchanged target kernel will be able to migrate
> from this state.
>
> So in fact my suggestion would allow to actually migrate to a kernel
> without the fix applied.
> This is even better than I thought.
>
>
> This is a rough draft of the idea:
>
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 6058a65a6ede..98eb7526cae6 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -167,15 +167,22 @@ static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
>  static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
>  				u32 vm_instruction_error)
>  {
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
>  			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
>  			    X86_EFLAGS_SF | X86_EFLAGS_OF))
>  			| X86_EFLAGS_ZF);
>  	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
> +
>  	/*
>  	 * We don't need to force a shadow sync because
>  	 * VM_INSTRUCTION_ERROR is not shadowed
> +	 * We do need to update the evmcs
>  	 */
> +
> +	if (vmx->nested.hv_evmcs)
> +		vmx->nested.hv_evmcs->vm_instruction_error = vm_instruction_error;
> +
>  	return kvm_skip_emulated_instruction(vcpu);
>  }
>  
> @@ -1962,6 +1969,10 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
>  
>  	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
>  
> +	/* All fields are clean */
> +	vmx->nested.hv_evmcs->hv_clean_fields |=
> +		HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
> +
>  	return 0;
>  }
>  
> @@ -2055,16 +2066,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
>  void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> -
> -	if (vmx->nested.hv_evmcs) {
> -		copy_vmcs12_to_enlightened(vmx);
> -		/* All fields are clean */
> -		vmx->nested.hv_evmcs->hv_clean_fields |=
> -			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
> -	} else {
> -		copy_vmcs12_to_shadow(vmx);
> -	}
> -
> +	copy_vmcs12_to_shadow(vmx);
>  	vmx->nested.need_vmcs12_to_shadow_sync = false;
>  }
>  
> @@ -3437,8 +3439,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
>  
>  	load_vmcs12_host_state(vcpu, vmcs12);
>  	vmcs12->vm_exit_reason = exit_reason.full;
> -	if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
> +
> +	if (enable_shadow_vmcs)
>  		vmx->nested.need_vmcs12_to_shadow_sync = true;
> +
> +	if (vmx->nested.hv_evmcs)
> +		copy_vmcs12_to_enlightened(vmx);
> +
>  	return NVMX_VMENTRY_VMEXIT;
>  }
>  
> @@ -4531,10 +4538,12 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
>  		kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
>  	}
>  
> -	if ((vm_exit_reason != -1) &&
> -	    (enable_shadow_vmcs || vmx->nested.hv_evmcs))
> +	if ((vm_exit_reason != -1) && enable_shadow_vmcs)
>  		vmx->nested.need_vmcs12_to_shadow_sync = true;
>  
> +	if (vmx->nested.hv_evmcs)
> +		copy_vmcs12_to_enlightened(vmx);
> +
>  	/* in case we halted in L2 */
>  	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>  
> @@ -6111,12 +6120,8 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
>  		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
>  	} else  {
>  		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
> -		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
> -			if (vmx->nested.hv_evmcs)
> -				copy_enlightened_to_vmcs12(vmx);
> -			else if (enable_shadow_vmcs)
> -				copy_shadow_to_vmcs12(vmx);
> -		}
> +		if (enable_shadow_vmcs && !vmx->nested.need_vmcs12_to_shadow_sync)
> +			copy_shadow_to_vmcs12(vmx);
>  	}
>  
>  	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
>
>

I don't see why this can't work; the only concern here is that,
conceptually, we'll be making eVMCS something different from shadow
VMCS. In any case, let's give this a try once this series, which fixes
real bugs, lands.
Maxim Levitsky May 27, 2021, 2:10 p.m. UTC | #9
On Thu, 2021-05-27 at 09:54 +0200, Vitaly Kuznetsov wrote:
> Maxim Levitsky <mlevitsk@redhat.com> writes:
> 
> > On Mon, 2021-05-24 at 14:35 +0200, Vitaly Kuznetsov wrote:
> > > Maxim Levitsky <mlevitsk@redhat.com> writes:
> > > 
> > > > On Mon, 2021-05-17 at 15:50 +0200, Vitaly Kuznetsov wrote:
> > > > > Unlike regular set_current_vmptr(), nested_vmx_handle_enlightened_vmptrld()
> > > > > can not be called directly from vmx_set_nested_state() as KVM may not have
> > > > > all the information yet (e.g. HV_X64_MSR_VP_ASSIST_PAGE MSR may not be
> > > > > restored yet). Enlightened VMCS is mapped later while getting nested state
> > > > > pages. In the meantime, vmx->nested.hv_evmcs remains NULL and using it
> > > > > for various checks is incorrect. In particular, if KVM_GET_NESTED_STATE is
> > > > > called right after KVM_SET_NESTED_STATE, KVM_STATE_NESTED_EVMCS flag in the
> > > > > resulting state will be unset (and such state will later fail to load).
> > > > > 
> > > > > Introduce nested_evmcs_is_used() and use 'is_guest_mode(vcpu) &&
> > > > > vmx->nested.current_vmptr == -1ull' check to detect not-yet-mapped eVMCS
> > > > > after restore.
> > > > > 
> > > > > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> > > > > ---
> > > > >  arch/x86/kvm/vmx/nested.c | 31 ++++++++++++++++++++++++++-----
> > > > >  1 file changed, 26 insertions(+), 5 deletions(-)
> > > > > 
> > > > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > > > > index 6058a65a6ede..3080e00c8f90 100644
> > > > > --- a/arch/x86/kvm/vmx/nested.c
> > > > > +++ b/arch/x86/kvm/vmx/nested.c
> > > > > @@ -141,6 +141,27 @@ static void init_vmcs_shadow_fields(void)
> > > > >  	max_shadow_read_write_fields = j;
> > > > >  }
> > > > >  
> > > > > +static inline bool nested_evmcs_is_used(struct vcpu_vmx *vmx)
> > > > > +{
> > > > > +	struct kvm_vcpu *vcpu = &vmx->vcpu;
> > > > > +
> > > > > +	if (vmx->nested.hv_evmcs)
> > > > > +		return true;
> > > > > +
> > > > > +	/*
> > > > > +	 * After KVM_SET_NESTED_STATE, enlightened VMCS is mapped during
> > > > > +	 * KVM_REQ_GET_NESTED_STATE_PAGES handling and until the request is
> > > > > +	 * processed vmx->nested.hv_evmcs is NULL. It is, however, possible to
> > > > > +	 * detect such state by checking 'nested.current_vmptr == -1ull' when
> > > > > +	 * vCPU is in guest mode, it is only possible with eVMCS.
> > > > > +	 */
> > > > > +	if (unlikely(vmx->nested.enlightened_vmcs_enabled && is_guest_mode(vcpu) &&
> > > > > +		     (vmx->nested.current_vmptr == -1ull)))
> > > > > +		return true;
> > > > > +
> > > > > +	return false;
> > > > > +}
> > > > 
> > > > I think that this is a valid way to solve the issue,
> > > > but it feels like there might be a better way.
> > > > I don't mind though to accept this patch as is.
> > > > 
> > > > So here are my 2 cents about this:
> > > > 
> > > > First of all after studying how evmcs works I take my words back
> > > > about needing to migrate its contents. 
> > > > 
> > > > It is indeed enough to migrate its physical address, 
> > > > or maybe even just a flag that evmcs is loaded
> > > > (and to my surprise we already do this - KVM_STATE_NESTED_EVMCS)
> > > > 
> > > > So how about just having a boolean flag that indicates that evmcs is in use, 
> > > > but doesn't imply that we know its address or that it is mapped 
> > > > to host address space, something like 'vmx->nested.enlightened_vmcs_loaded'
> > > > 
> > > > On migration that flag saved and restored as the KVM_STATE_NESTED_EVMCS,
> > > > otherwise it set when we load an evmcs and cleared when it is released.
> > > > 
> > > > Then as far as I can see we can use this flag in nested_evmcs_is_used
> > > > since all its callers don't touch evmcs, thus don't need it to be
> > > > mapped.
> > > > 
> > > > What do you think?
> > 
> > 
> > > 
> > > First, we need to be compatible with older KVMs which don't have the
> > > flag and this is problematic: currently, we always expect vmcs12 to
> > > carry valid contents. This is challenging.
> > 
> > All right, I understand this can be an issue!
> > 
> > If the userspace doesn't set the KVM_STATE_NESTED_EVMCS
> > but has a valid EVMCS as later indicated enabling it in the HV
> > assist page, we can just use the logic that this patch uses but use it 
> > to set vmx->nested.enlightened_vmcs_loaded flag or whatever
> > we decide to name it.
> > Later we can even deprecate and disable this with a new KVM cap.
> > 
> > 
> > BTW, I like Paolo's idea of putting this flag into the evmcs_gpa,
> > like that
> > 
> > -1 no evmcs
> > 0 - evmcs enabled but its gpa not known
> > anything else - valid gpa.
> > 
> > 
> > Also as I said, I am not against this patch either, 
> > I am just thinking maybe we can make it a bit better.
> > 
> 
> v3 implements a similar idea (I kept Paolo's 'Suggested-by' though :-):
> we set hv_evmcs_vmptr to EVMPTR_MAP_PENDING after migration. I haven't
> tried it yet, but I think we can eventually drop
> KVM_REQ_GET_NESTED_STATE_PAGES usage.
> 
> For now, this series is a bugfix (multiple bugfixes, actually), so I'd
> like to get it in and not try to make everything about eVMCS perfect
> :-) I'll certainly take a look at other possible improvements later.
100% agree with you.

> 
> > > Second, vCPU can be migrated in three different states:
> > > 1) While L2 was running ('true' nested state is in VMCS02)
> > > 2) While L1 was running ('true' nested state is in eVMCS)
> > > 3) Right after an exit from L2 to L1 was forced
> > > ('need_vmcs12_to_shadow_sync = true') ('true' nested state is in
> > > VMCS12).
> > 
> > Yes, and this was quite a difficult thing to understand
> > when I was trying to figure out how this code works.
> > 
> > Also, you can add another interesting state:
> > 
> > 4) Right after emulating vmlaunch/vmresume but before
> > the actual entry to the nested guest (aka nested_run_pending=true)
> > 
> 
> Honestly, I haven't taken a closer look at this state. Do you envision
> specific issues? Can we actually end up serving KVM_GET_NESTED_STATE in
> between setting 'nested_run_pending=true' and the actual attempt to enter
> L2?

Yes, I fixed a bug that was specific to this state on SVM.
 
The problem was this: L1 would do VMRUN with an injected event,
then we would migrate with nested_run_pending=1,
and after the migration we would forget to set nested_run_pending=1 again.

This would allow us to inject another event into the VM that had meanwhile
arrived from the local APIC, overwriting the event that L1 injected,
something that otherwise can't happen because nested_run_pending=1 doesn't
allow injecting events into the VM.
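
For reference, the save/restore path for this flag looks roughly like this (a simplified sketch based on the KVM_STATE_NESTED_RUN_PENDING nested-state flag, not the exact code):

	/* KVM_GET_NESTED_STATE: report that a nested entry is still pending */
	if (vmx->nested.nested_run_pending)
		kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;

	/* KVM_SET_NESTED_STATE: restore the flag so the destination keeps the
	 * "entry half-done" protection in place */
	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);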
 
As Paolo explained to me, the meaning of nested_run_pending is that
the nested VM entry is basically done in two steps: first we emulate
some stuff, switch vmcs/vmcb and such, and then we do the actual
VM entry to the guest. Only then is it safe to do a nested VM exit.
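
The protection itself is roughly of this shape (a simplified sketch of the check-nested-events path, not the exact KVM code):

static int vmx_check_nested_events_sketch(struct vcpu_vmx *vmx)
{
	/*
	 * The nested entry was emulated but the hardware VM entry has not
	 * happened yet: no new events may be injected and no nested VM exit
	 * may be synthesized until it does.
	 */
	if (vmx->nested.nested_run_pending)
		return -EBUSY;

	/* ... normal interrupt/exception injection and nested exit logic ... */
	return 0;
}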
 
However, IMHO it won't hurt to set nested_run_pending=1 even when
it is not strictly needed per the above; it would eliminate the issues
that we have with VM exits that happen before we have processed the
KVM_REQ_GET_NESTED_STATE_PAGES request.
 
Paolo, what do you think?

> 
> > 
> > > The current solution is to always use VMCS12 as a container to transfer
> > > the state and conceptually, it is at least easier to understand.
> > > 
> > > We can, indeed, transfer eVMCS (or VMCS12) in case 2) through guest
> > > memory and I even tried that but that was making the code more complex
> > > so eventually I gave up and decided to preserve the 'always use VMCS12
> > > as a container' status quo.
> > 
> > My only point of concern is that it feels like it is wrong to update eVMCS
> > when not doing a nested vmexit, because then the eVMCS is owned by the
> > L1 hypervisor.
> 
> I see your concern and ideally we wouldn't have to touch it.

Great!

> 
> > At least not the fields which aren't supposed to be updated by us.
> > 
> > 
> > This is a rough code draft of what I had in mind (not tested).
> > To me this seems reasonable, but I do agree that there are
> > some complexity tradeoffs involved.
> > 
> > About compatibility, it can be said that:
> > 
> > 
> > Case 1:
> > Both old and new kernels will send/receive an up-to-date vmcs12,
> > while the evmcs is not up to date, and its contents aren't even defined
> > (since L2 is running).
> > 
> > Case 2:
> > An old kernel will send vmcs12 with the partial changes that L1 already
> > made to the evmcs, while the latest state of the evmcs, with all the changes,
> > is in guest memory.
> > 
> > But these changes will be discarded on the receiving side,
> > since once L1 asks us to enter L2, we will reload all the state from the eVMCS
> > (at least the state that is marked as dirty, i.e. the state that differs
> > from vmcs12 as it was on the last nested vmexit).
> > 
> > A new kernel will always send vmcs12 as it was on the last vmexit;
> > a slightly older version, but a more consistent one.
> > 
> > But this doesn't matter either as, just like in the case of the old kernel,
> > the vmcs12 will be updated from the evmcs as soon as we do another L2 entry.
> > 
> > So while in this case we send a 'more stale' vmcs12, it doesn't
> > really matter as it is stale anyway and will be reloaded from
> > the evmcs.
> > 
> > Case 3:
> > An old kernel will send an up-to-date vmcs12 (since L1 didn't have a chance
> > to run anyway after the nested vmexit). The evmcs will not be up to date
> > in guest memory, but a newer kernel can fix this by updating it
> > as you did in patch 6.
> > 
> > A new kernel will send an up-to-date vmcs12 (same reason) and an up-to-date
> > evmcs, so in fact an unchanged target kernel will be able to migrate
> > from this state.
> > 
> > So in fact my suggestion would actually allow migrating to a kernel
> > without the fix applied.
> > This is even better than I thought.
> > 
> > 
> > This is a rough draft of the idea:
> > 
> > 
> > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > index 6058a65a6ede..98eb7526cae6 100644
> > --- a/arch/x86/kvm/vmx/nested.c
> > +++ b/arch/x86/kvm/vmx/nested.c
> > @@ -167,15 +167,22 @@ static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
> >  static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
> >  				u32 vm_instruction_error)
> >  {
> > +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> >  	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
> >  			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
> >  			    X86_EFLAGS_SF | X86_EFLAGS_OF))
> >  			| X86_EFLAGS_ZF);
> >  	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
> > +
> >  	/*
> >  	 * We don't need to force a shadow sync because
> >  	 * VM_INSTRUCTION_ERROR is not shadowed
> > +	 * We do need to update the evmcs
> >  	 */
> > +
> > +	if (vmx->nested.hv_evmcs)
> > +		vmx->nested.hv_evmcs->vm_instruction_error = vm_instruction_error;
> > +
> >  	return kvm_skip_emulated_instruction(vcpu);
> >  }
> >  
> > @@ -1962,6 +1969,10 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
> >  
> >  	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
> >  
> > +	/* All fields are clean */
> > +	vmx->nested.hv_evmcs->hv_clean_fields |=
> > +		HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
> > +
> >  	return 0;
> >  }
> >  
> > @@ -2055,16 +2066,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
> >  void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
> >  {
> >  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> > -
> > -	if (vmx->nested.hv_evmcs) {
> > -		copy_vmcs12_to_enlightened(vmx);
> > -		/* All fields are clean */
> > -		vmx->nested.hv_evmcs->hv_clean_fields |=
> > -			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
> > -	} else {
> > -		copy_vmcs12_to_shadow(vmx);
> > -	}
> > -
> > +	copy_vmcs12_to_shadow(vmx);
> >  	vmx->nested.need_vmcs12_to_shadow_sync = false;
> >  }
> >  
> > @@ -3437,8 +3439,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
> >  
> >  	load_vmcs12_host_state(vcpu, vmcs12);
> >  	vmcs12->vm_exit_reason = exit_reason.full;
> > -	if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
> > +
> > +	if (enable_shadow_vmcs)
> >  		vmx->nested.need_vmcs12_to_shadow_sync = true;
> > +
> > +	if (vmx->nested.hv_evmcs)
> > +		copy_vmcs12_to_enlightened(vmx);
> > +
> >  	return NVMX_VMENTRY_VMEXIT;
> >  }
> >  
> > @@ -4531,10 +4538,12 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
> >  		kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
> >  	}
> >  
> > -	if ((vm_exit_reason != -1) &&
> > -	    (enable_shadow_vmcs || vmx->nested.hv_evmcs))
> > +	if ((vm_exit_reason != -1) && enable_shadow_vmcs)
> >  		vmx->nested.need_vmcs12_to_shadow_sync = true;
> >  
> > +	if (vmx->nested.hv_evmcs)
> > +		copy_vmcs12_to_enlightened(vmx);
> > +
> >  	/* in case we halted in L2 */
> >  	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> >  
> > @@ -6111,12 +6120,8 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
> >  		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
> >  	} else  {
> >  		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
> > -		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
> > -			if (vmx->nested.hv_evmcs)
> > -				copy_enlightened_to_vmcs12(vmx);
> > -			else if (enable_shadow_vmcs)
> > -				copy_shadow_to_vmcs12(vmx);
> > -		}
> > +		if (enable_shadow_vmcs && !vmx->nested.need_vmcs12_to_shadow_sync)
> > +			copy_shadow_to_vmcs12(vmx);
> >  	}
> >  
> >  	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
> > 
> > 
> 
> I don't see why this can't work; the only concern here is that,
> conceptually, we'll be making eVMCS something different from shadow
> vmcs. In any case, let's give this a try once this series, which fixes real
> bugs, lands.

100% agree with you on this as well, including your concern about making the evmcs
code less consistent with its role as a form of 'shadow vmcs'.

It is a tradeoff, and I am not 100% sure which way is better either;
it's just that I put a lot of thought into it and wanted to share this
approach in case we want to do it this way.


Best regards,
	Maxim Levitsky


Patch

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6058a65a6ede..3080e00c8f90 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -141,6 +141,27 @@  static void init_vmcs_shadow_fields(void)
 	max_shadow_read_write_fields = j;
 }
 
+static inline bool nested_evmcs_is_used(struct vcpu_vmx *vmx)
+{
+	struct kvm_vcpu *vcpu = &vmx->vcpu;
+
+	if (vmx->nested.hv_evmcs)
+		return true;
+
+	/*
+	 * After KVM_SET_NESTED_STATE, enlightened VMCS is mapped during
+	 * KVM_REQ_GET_NESTED_STATE_PAGES handling and until the request is
+	 * processed vmx->nested.hv_evmcs is NULL. It is, however, possible to
+	 * detect such state by checking 'nested.current_vmptr == -1ull' when
+	 * vCPU is in guest mode, it is only possible with eVMCS.
+	 */
+	if (unlikely(vmx->nested.enlightened_vmcs_enabled && is_guest_mode(vcpu) &&
+		     (vmx->nested.current_vmptr == -1ull)))
+		return true;
+
+	return false;
+}
+
 /*
  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
  * set the success or error code of an emulated VMX instruction (as specified
@@ -187,7 +208,7 @@  static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
 	 * failValid writes the error number to the current VMCS, which
 	 * can't be done if there isn't a current VMCS.
 	 */
-	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
+	if (vmx->nested.current_vmptr == -1ull && !nested_evmcs_is_used(vmx))
 		return nested_vmx_failInvalid(vcpu);
 
 	return nested_vmx_failValid(vcpu, vm_instruction_error);
@@ -2208,7 +2229,7 @@  static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 	u32 exec_control;
 	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
 
-	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
+	if (vmx->nested.dirty_vmcs12 || nested_evmcs_is_used(vmx))
 		prepare_vmcs02_early_rare(vmx, vmcs12);
 
 	/*
@@ -3437,7 +3458,7 @@  enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 
 	load_vmcs12_host_state(vcpu, vmcs12);
 	vmcs12->vm_exit_reason = exit_reason.full;
-	if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
+	if (enable_shadow_vmcs || nested_evmcs_is_used(vmx))
 		vmx->nested.need_vmcs12_to_shadow_sync = true;
 	return NVMX_VMENTRY_VMEXIT;
 }
@@ -4032,7 +4053,7 @@  static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	if (vmx->nested.hv_evmcs)
+	if (nested_evmcs_is_used(vmx))
 		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
 
 	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
@@ -6056,7 +6077,7 @@  static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 		if (vmx_has_valid_vmcs12(vcpu)) {
 			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
 
-			if (vmx->nested.hv_evmcs)
+			if (nested_evmcs_is_used(vmx))
 				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
 
 			if (is_guest_mode(vcpu) &&