
[1/4] KVM: nVMX: Always make an attempt to map eVMCS after migration

Message ID 20210503150854.1144255-2-vkuznets@redhat.com (mailing list archive)
State New, archived
Series KVM: nVMX: Fix migration of nested guests when eVMCS is in use

Commit Message

Vitaly Kuznetsov May 3, 2021, 3:08 p.m. UTC
When enlightened VMCS is in use and nested state is migrated with
vmx_get_nested_state()/vmx_set_nested_state(), KVM can't map the eVMCS
page right away: the eVMCS GPA is not part of
'struct kvm_vmx_nested_state_hdr' and we can't read it from the VP
assist page because userspace may decide to restore
HV_X64_MSR_VP_ASSIST_PAGE after restoring nested state (and QEMU, for
example, does exactly that). To make sure the eVMCS is mapped,
vmx_set_nested_state() raises a KVM_REQ_GET_NESTED_STATE_PAGES request.
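
For illustration, the restore ordering that creates the problem looks
roughly like this from userspace's point of view (a sketch only, not
actual QEMU code; restore_vcpu() and its arguments are made up for the
example):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Illustrative sketch: nested state is restored before the MSRs, so at
 * KVM_SET_NESTED_STATE time KVM can't read the eVMCS GPA from the VP
 * assist page yet and has to defer mapping it until vcpu_run.
 */
static void restore_vcpu(int vcpu_fd, struct kvm_nested_state *nested,
			 struct kvm_msrs *msrs)
{
	/* 1) Nested state first: the eVMCS GPA is not part of the header. */
	ioctl(vcpu_fd, KVM_SET_NESTED_STATE, nested);

	/* 2) MSRs, including HV_X64_MSR_VP_ASSIST_PAGE, only come later. */
	ioctl(vcpu_fd, KVM_SET_MSRS, msrs);
}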

Commit f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES
on nested vmexit") added KVM_REQ_GET_NESTED_STATE_PAGES clearing to
nested_vmx_vmexit() to make sure MSR permission bitmap is not switched
when an immediate exit from L2 to L1 happens right after migration (caused
by a pending event, for example). Unfortunately, in the exact same
situation we still need to have eVMCS mapped so
nested_sync_vmcs12_to_shadow() reflects changes in VMCS12 to eVMCS.
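
In sequence form, the problematic flow is roughly (a summary of the
above, not new behaviour):

/*
 * 1. vmx_set_nested_state()            -> raises KVM_REQ_GET_NESTED_STATE_PAGES
 * 2. pending event forces an immediate -> nested_vmx_vmexit()
 *    L2->L1 exit before the first L2 run
 * 3. f2c7ef3ba955 clears the request   -> eVMCS is never mapped
 * 4. nested_sync_vmcs12_to_shadow()    -> can't reflect vmcs12 changes
 *                                         into the (unmapped) eVMCS
 */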

As a band-aid, restore nested_get_evmcs_page() when clearing
KVM_REQ_GET_NESTED_STATE_PAGES in nested_vmx_vmexit(). The 'fix' is far
from being ideal as we can't easily propagate possible failures and even if
we could, this is most likely already too late to do so. The whole
'KVM_REQ_GET_NESTED_STATE_PAGES' idea for mapping eVMCS after migration
seems to be fragile as we diverge too much from the 'native' path when
vmptr loading happens on vmx_set_nested_state().

Fixes: f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES on nested vmexit")
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

Comments

Maxim Levitsky May 5, 2021, 8:22 a.m. UTC | #1
On Mon, 2021-05-03 at 17:08 +0200, Vitaly Kuznetsov wrote:
> When enlightened VMCS is in use and nested state is migrated with
> vmx_get_nested_state()/vmx_set_nested_state(), KVM can't map the eVMCS
> page right away: the eVMCS GPA is not part of
> 'struct kvm_vmx_nested_state_hdr' and we can't read it from the VP
> assist page because userspace may decide to restore
> HV_X64_MSR_VP_ASSIST_PAGE after restoring nested state (and QEMU, for
> example, does exactly that). To make sure the eVMCS is mapped,
> vmx_set_nested_state() raises a KVM_REQ_GET_NESTED_STATE_PAGES request.
> 
> Commit f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES
> on nested vmexit") added KVM_REQ_GET_NESTED_STATE_PAGES clearing to
> nested_vmx_vmexit() to make sure MSR permission bitmap is not switched
> when an immediate exit from L2 to L1 happens right after migration (caused
> by a pending event, for example). Unfortunately, in the exact same
> situation we still need to have eVMCS mapped so
> nested_sync_vmcs12_to_shadow() reflects changes in VMCS12 to eVMCS.
> 
> As a band-aid, restore nested_get_evmcs_page() when clearing
> KVM_REQ_GET_NESTED_STATE_PAGES in nested_vmx_vmexit(). The 'fix' is far
> from being ideal as we can't easily propagate possible failures and even if
> we could, this is most likely already too late to do so. The whole
> 'KVM_REQ_GET_NESTED_STATE_PAGES' idea for mapping eVMCS after migration
> seems to be fragile as we diverge too much from the 'native' path when
> vmptr loading happens on vmx_set_nested_state().
> 
> Fixes: f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES on nested vmexit")
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> ---
>  arch/x86/kvm/vmx/nested.c | 29 +++++++++++++++++++----------
>  1 file changed, 19 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 1e069aac7410..2febb1dd68e8 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -3098,15 +3098,8 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
>  			nested_vmx_handle_enlightened_vmptrld(vcpu, false);
>  
>  		if (evmptrld_status == EVMPTRLD_VMFAIL ||
> -		    evmptrld_status == EVMPTRLD_ERROR) {
> -			pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
> -					     __func__);
> -			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
> -			vcpu->run->internal.suberror =
> -				KVM_INTERNAL_ERROR_EMULATION;
> -			vcpu->run->internal.ndata = 0;
> +		    evmptrld_status == EVMPTRLD_ERROR)
>  			return false;
> -		}
>  	}
>  
>  	return true;
> @@ -3194,8 +3187,16 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
>  
>  static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
>  {
> -	if (!nested_get_evmcs_page(vcpu))
> +	if (!nested_get_evmcs_page(vcpu)) {
> +		pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
> +				     __func__);
> +		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
> +		vcpu->run->internal.suberror =
> +			KVM_INTERNAL_ERROR_EMULATION;
> +		vcpu->run->internal.ndata = 0;
> +
>  		return false;
> +	}

Hi!

Any reason to move the debug prints out of nested_get_evmcs_page?


>  
>  	if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
>  		return false;
> @@ -4422,7 +4423,15 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
>  	/* trying to cancel vmlaunch/vmresume is a bug */
>  	WARN_ON_ONCE(vmx->nested.nested_run_pending);
>  
> -	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
> +	if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
> +		/*
> +		 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
> +		 * Enlightened VMCS after migration and we still need to
> +		 * do that when something is forcing L2->L1 exit prior to
> +		 * the first L2 run.
> +		 */
> +		(void)nested_get_evmcs_page(vcpu);
> +	}
Yes this is a band-aid, but it has to be done I agree.

>  
>  	/* Service the TLB flush request for L2 before switching to L1. */
>  	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))




I also tested this and it survives a bit better (used to crash instantly
after a single migration cycle, but the guest still crashes after around ~20 iterations of my 
regular nested migration test).

The blue screen shows that the stop code is HYPERVISOR ERROR and nothing else.

I tested both this patch alone and all 4 patches.

Without evmcs, the same VM with same host kernel and qemu survived an overnight
test and passed about 1800 migration iterations.
(my synthetic migration test doesn't yet work on Intel, I need to investigate why)

For reference this is the VM that you gave me to test, kvm/queue kernel,
with merged mainline in it,
and mostly latest qemu (updated about a week ago or so)

qemu: 3791642c8d60029adf9b00bcb4e34d7d8a1aea4d
kernel: 9f242010c3b46e63bc62f08fff42cef992d3801b and
        then merge v5.12 from mainline.

Best regards,
	Maxim Levitsky
Vitaly Kuznetsov May 5, 2021, 8:39 a.m. UTC | #2
Maxim Levitsky <mlevitsk@redhat.com> writes:

> On Mon, 2021-05-03 at 17:08 +0200, Vitaly Kuznetsov wrote:
>> When enlightened VMCS is in use and nested state is migrated with
>> vmx_get_nested_state()/vmx_set_nested_state(), KVM can't map the eVMCS
>> page right away: the eVMCS GPA is not part of
>> 'struct kvm_vmx_nested_state_hdr' and we can't read it from the VP
>> assist page because userspace may decide to restore
>> HV_X64_MSR_VP_ASSIST_PAGE after restoring nested state (and QEMU, for
>> example, does exactly that). To make sure the eVMCS is mapped,
>> vmx_set_nested_state() raises a KVM_REQ_GET_NESTED_STATE_PAGES request.
>> 
>> Commit f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES
>> on nested vmexit") added KVM_REQ_GET_NESTED_STATE_PAGES clearing to
>> nested_vmx_vmexit() to make sure MSR permission bitmap is not switched
>> when an immediate exit from L2 to L1 happens right after migration (caused
>> by a pending event, for example). Unfortunately, in the exact same
>> situation we still need to have eVMCS mapped so
>> nested_sync_vmcs12_to_shadow() reflects changes in VMCS12 to eVMCS.
>> 
>> As a band-aid, restore nested_get_evmcs_page() when clearing
>> KVM_REQ_GET_NESTED_STATE_PAGES in nested_vmx_vmexit(). The 'fix' is far
>> from being ideal as we can't easily propagate possible failures and even if
>> we could, this is most likely already too late to do so. The whole
>> 'KVM_REQ_GET_NESTED_STATE_PAGES' idea for mapping eVMCS after migration
>> seems to be fragile as we diverge too much from the 'native' path when
>> vmptr loading happens on vmx_set_nested_state().
>> 
>> Fixes: f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES on nested vmexit")
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> ---
>>  arch/x86/kvm/vmx/nested.c | 29 +++++++++++++++++++----------
>>  1 file changed, 19 insertions(+), 10 deletions(-)
>> 
>> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
>> index 1e069aac7410..2febb1dd68e8 100644
>> --- a/arch/x86/kvm/vmx/nested.c
>> +++ b/arch/x86/kvm/vmx/nested.c
>> @@ -3098,15 +3098,8 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
>>  			nested_vmx_handle_enlightened_vmptrld(vcpu, false);
>>  
>>  		if (evmptrld_status == EVMPTRLD_VMFAIL ||
>> -		    evmptrld_status == EVMPTRLD_ERROR) {
>> -			pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
>> -					     __func__);
>> -			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
>> -			vcpu->run->internal.suberror =
>> -				KVM_INTERNAL_ERROR_EMULATION;
>> -			vcpu->run->internal.ndata = 0;
>> +		    evmptrld_status == EVMPTRLD_ERROR)
>>  			return false;
>> -		}
>>  	}
>>  
>>  	return true;
>> @@ -3194,8 +3187,16 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
>>  
>>  static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
>>  {
>> -	if (!nested_get_evmcs_page(vcpu))
>> +	if (!nested_get_evmcs_page(vcpu)) {
>> +		pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
>> +				     __func__);
>> +		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
>> +		vcpu->run->internal.suberror =
>> +			KVM_INTERNAL_ERROR_EMULATION;
>> +		vcpu->run->internal.ndata = 0;
>> +
>>  		return false;
>> +	}
>
> Hi!
>
> Any reason to move the debug prints out of nested_get_evmcs_page?
>

Debug print could've probably stayed or could've been dropped
completely -- I don't really believe it's going to help
anyone. Debugging such issues without instrumentation/tracing seems to
be hard-to-impossible...

>
>>  
>>  	if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
>>  		return false;
>> @@ -4422,7 +4423,15 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
>>  	/* trying to cancel vmlaunch/vmresume is a bug */
>>  	WARN_ON_ONCE(vmx->nested.nested_run_pending);
>>  
>> -	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
>> +	if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
>> +		/*
>> +		 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
>> +		 * Enlightened VMCS after migration and we still need to
>> +		 * do that when something is forcing L2->L1 exit prior to
>> +		 * the first L2 run.
>> +		 */
>> +		(void)nested_get_evmcs_page(vcpu);
>> +	}
> Yes this is a band-aid, but it has to be done I agree.
>

To restore the status quo, yes.

>>  
>>  	/* Service the TLB flush request for L2 before switching to L1. */
>>  	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
>
>
>
>
> I also tested this and it survives a bit better (used to crash instantly
> after a single migration cycle, but the guest still crashes after around ~20 iterations of my 
> regular nested migration test).
>
> The blue screen shows that the stop code is HYPERVISOR ERROR and nothing else.
>
> I tested both this patch alone and all 4 patches.
>
> Without evmcs, the same VM with same host kernel and qemu survived an overnight
> test and passed about 1800 migration iterations.
> (my synthetic migration test doesn't yet work on Intel, I need to investigate why)
>

It would be great to compare on Intel to be 100% sure the issue is eVMCS
related; Hyper-V may be behaving quite differently on AMD.

> For reference this is the VM that you gave me to test, kvm/queue kernel,
> with merged mainline in it,
> and mostly latest qemu (updated about a week ago or so)
>
> qemu: 3791642c8d60029adf9b00bcb4e34d7d8a1aea4d
> kernel: 9f242010c3b46e63bc62f08fff42cef992d3801b and
>         then merge v5.12 from mainline.

Thanks for testing! I'll try to come up with a selftest for this issue,
maybe it'll help us discover other issues as well.
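
Roughly, I'm thinking of a save/restore loop modeled on the existing
tools/testing/selftests/kvm/x86_64/evmcs_test.c. A minimal sketch (the
helper names and signatures below are assumed from the current selftest
library and may need adjusting) could look like:

#include <fcntl.h>
#include <stdlib.h>

#include "kvm_util.h"
#include "processor.h"
#include "evmcs.h"

#define VCPU_ID 0

/*
 * Sketch only: repeatedly save and restore nested state with eVMCS
 * enabled so that KVM_REQ_GET_NESTED_STATE_PAGES is raised (and
 * possibly cancelled by an immediate L2->L1 exit) on every iteration.
 * vcpu_enable_evmcs() stands for the KVM_CAP_HYPERV_ENLIGHTENED_VMCS
 * enable helper as used locally in evmcs_test.c.
 */
static void save_restore_loop(void *guest_code, int iterations)
{
	struct kvm_x86_state *state;
	struct kvm_vm *vm = vm_create_default(VCPU_ID, 0, guest_code);
	int i;

	vcpu_set_hv_cpuid(vm, VCPU_ID);
	vcpu_enable_evmcs(vm, VCPU_ID);

	for (i = 0; i < iterations; i++) {
		/* Run L1/L2 a bit, then "migrate": save all state,
		 * recreate the vCPU and load the state back.
		 */
		vcpu_run(vm, VCPU_ID);

		state = vcpu_save_state(vm, VCPU_ID);
		kvm_vm_release(vm);
		kvm_vm_restart(vm, O_RDWR);
		vm_vcpu_add(vm, VCPU_ID);
		vcpu_set_hv_cpuid(vm, VCPU_ID);
		vcpu_enable_evmcs(vm, VCPU_ID);
		vcpu_load_state(vm, VCPU_ID, state);
		free(state);
	}

	kvm_vm_free(vm);
}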
Maxim Levitsky May 5, 2021, 9:17 a.m. UTC | #3
On Wed, 2021-05-05 at 10:39 +0200, Vitaly Kuznetsov wrote:
> Maxim Levitsky <mlevitsk@redhat.com> writes:
> 
> > On Mon, 2021-05-03 at 17:08 +0200, Vitaly Kuznetsov wrote:
> > > When enlightened VMCS is in use and nested state is migrated with
> > > vmx_get_nested_state()/vmx_set_nested_state(), KVM can't map the eVMCS
> > > page right away: the eVMCS GPA is not part of
> > > 'struct kvm_vmx_nested_state_hdr' and we can't read it from the VP
> > > assist page because userspace may decide to restore
> > > HV_X64_MSR_VP_ASSIST_PAGE after restoring nested state (and QEMU, for
> > > example, does exactly that). To make sure the eVMCS is mapped,
> > > vmx_set_nested_state() raises a KVM_REQ_GET_NESTED_STATE_PAGES request.
> > > 
> > > Commit f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES
> > > on nested vmexit") added KVM_REQ_GET_NESTED_STATE_PAGES clearing to
> > > nested_vmx_vmexit() to make sure MSR permission bitmap is not switched
> > > when an immediate exit from L2 to L1 happens right after migration (caused
> > > by a pending event, for example). Unfortunately, in the exact same
> > > situation we still need to have eVMCS mapped so
> > > nested_sync_vmcs12_to_shadow() reflects changes in VMCS12 to eVMCS.
> > > 
> > > As a band-aid, restore nested_get_evmcs_page() when clearing
> > > KVM_REQ_GET_NESTED_STATE_PAGES in nested_vmx_vmexit(). The 'fix' is far
> > > from being ideal as we can't easily propagate possible failures and even if
> > > we could, this is most likely already too late to do so. The whole
> > > 'KVM_REQ_GET_NESTED_STATE_PAGES' idea for mapping eVMCS after migration
> > > seems to be fragile as we diverge too much from the 'native' path when
> > > vmptr loading happens on vmx_set_nested_state().
> > > 
> > > Fixes: f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES on nested vmexit")
> > > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> > > ---
> > >  arch/x86/kvm/vmx/nested.c | 29 +++++++++++++++++++----------
> > >  1 file changed, 19 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > > index 1e069aac7410..2febb1dd68e8 100644
> > > --- a/arch/x86/kvm/vmx/nested.c
> > > +++ b/arch/x86/kvm/vmx/nested.c
> > > @@ -3098,15 +3098,8 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
> > >  			nested_vmx_handle_enlightened_vmptrld(vcpu, false);
> > >  
> > >  		if (evmptrld_status == EVMPTRLD_VMFAIL ||
> > > -		    evmptrld_status == EVMPTRLD_ERROR) {
> > > -			pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
> > > -					     __func__);
> > > -			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
> > > -			vcpu->run->internal.suberror =
> > > -				KVM_INTERNAL_ERROR_EMULATION;
> > > -			vcpu->run->internal.ndata = 0;
> > > +		    evmptrld_status == EVMPTRLD_ERROR)
> > >  			return false;
> > > -		}
> > >  	}
> > >  
> > >  	return true;
> > > @@ -3194,8 +3187,16 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
> > >  
> > >  static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
> > >  {
> > > -	if (!nested_get_evmcs_page(vcpu))
> > > +	if (!nested_get_evmcs_page(vcpu)) {
> > > +		pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
> > > +				     __func__);
> > > +		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
> > > +		vcpu->run->internal.suberror =
> > > +			KVM_INTERNAL_ERROR_EMULATION;
> > > +		vcpu->run->internal.ndata = 0;
> > > +
> > >  		return false;
> > > +	}
> > 
> > Hi!
> > 
> > Any reason to move the debug prints out of nested_get_evmcs_page?
> > 
> 
> Debug print could've probably stayed or could've been dropped
> completely -- I don't really believe it's going to help
> anyone. Debugging such issues without instrumentation/tracing seems to
> be hard-to-impossible...
> 
> > >  
> > >  	if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
> > >  		return false;
> > > @@ -4422,7 +4423,15 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
> > >  	/* trying to cancel vmlaunch/vmresume is a bug */
> > >  	WARN_ON_ONCE(vmx->nested.nested_run_pending);
> > >  
> > > -	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
> > > +	if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
> > > +		/*
> > > +		 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
> > > +		 * Enlightened VMCS after migration and we still need to
> > > +		 * do that when something is forcing L2->L1 exit prior to
> > > +		 * the first L2 run.
> > > +		 */
> > > +		(void)nested_get_evmcs_page(vcpu);
> > > +	}
> > Yes this is a band-aid, but it has to be done I agree.
> > 
> 
> To restore the status quo, yes.
> 
> > >  
> > >  	/* Service the TLB flush request for L2 before switching to L1. */
> > >  	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
> > 
> > 
> > 
> > I also tested this and it survives a bit better (used to crash instantly
> > after a single migration cycle, but the guest still crashes after around ~20 iterations of my 
> > regular nested migration test).
> > 
> > The blue screen shows that the stop code is HYPERVISOR ERROR and nothing else.
> > 
> > I tested both this patch alone and all 4 patches.
> > 
> > Without evmcs, the same VM with same host kernel and qemu survived an overnight
> > test and passed about 1800 migration iterations.
> > (my synthetic migration test doesn't yet work on Intel, I need to investigate why)
> > 
> 
> It would be great to compare on Intel to be 100% sure the issue is eVMCS
>> related; Hyper-V may be behaving quite differently on AMD.
Hi!

I tested this on my Intel machine with and without eVMCS, without changing
any other parameters, running the same VM from a snapshot.

As I said, without eVMCS the test survived an overnight stress run of ~1800 migrations.
With eVMCS, it fails pretty much on the first try.
With these patches, it fails after about 20 iterations.

Best regards,
	Maxim Levitsky

> 
> > For reference this is the VM that you gave me to test, kvm/queue kernel,
> > with merged mainline in it,
> > and mostly latest qemu (updated about a week ago or so)
> > 
> > qemu: 3791642c8d60029adf9b00bcb4e34d7d8a1aea4d
> > kernel: 9f242010c3b46e63bc62f08fff42cef992d3801b and
> >         then merge v5.12 from mainline.
> 
> Thanks for testing! I'll try to come up with a selftest for this issue,
> maybe it'll help us discover other issues as well.
>
Vitaly Kuznetsov May 5, 2021, 9:23 a.m. UTC | #4
Maxim Levitsky <mlevitsk@redhat.com> writes:

> On Wed, 2021-05-05 at 10:39 +0200, Vitaly Kuznetsov wrote:
>> Maxim Levitsky <mlevitsk@redhat.com> writes:
>> 
>> > On Mon, 2021-05-03 at 17:08 +0200, Vitaly Kuznetsov wrote:
>> > > When enlightened VMCS is in use and nested state is migrated with
>> > > vmx_get_nested_state()/vmx_set_nested_state(), KVM can't map the eVMCS
>> > > page right away: the eVMCS GPA is not part of
>> > > 'struct kvm_vmx_nested_state_hdr' and we can't read it from the VP
>> > > assist page because userspace may decide to restore
>> > > HV_X64_MSR_VP_ASSIST_PAGE after restoring nested state (and QEMU, for
>> > > example, does exactly that). To make sure the eVMCS is mapped,
>> > > vmx_set_nested_state() raises a KVM_REQ_GET_NESTED_STATE_PAGES request.
>> > > 
>> > > Commit f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES
>> > > on nested vmexit") added KVM_REQ_GET_NESTED_STATE_PAGES clearing to
>> > > nested_vmx_vmexit() to make sure MSR permission bitmap is not switched
>> > > when an immediate exit from L2 to L1 happens right after migration (caused
>> > > by a pending event, for example). Unfortunately, in the exact same
>> > > situation we still need to have eVMCS mapped so
>> > > nested_sync_vmcs12_to_shadow() reflects changes in VMCS12 to eVMCS.
>> > > 
>> > > As a band-aid, restore nested_get_evmcs_page() when clearing
>> > > KVM_REQ_GET_NESTED_STATE_PAGES in nested_vmx_vmexit(). The 'fix' is far
>> > > from being ideal as we can't easily propagate possible failures and even if
>> > > we could, this is most likely already too late to do so. The whole
>> > > 'KVM_REQ_GET_NESTED_STATE_PAGES' idea for mapping eVMCS after migration
>> > > seems to be fragile as we diverge too much from the 'native' path when
>> > > vmptr loading happens on vmx_set_nested_state().
>> > > 
>> > > Fixes: f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES on nested vmexit")
>> > > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> > > ---
>> > >  arch/x86/kvm/vmx/nested.c | 29 +++++++++++++++++++----------
>> > >  1 file changed, 19 insertions(+), 10 deletions(-)
>> > > 
>> > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
>> > > index 1e069aac7410..2febb1dd68e8 100644
>> > > --- a/arch/x86/kvm/vmx/nested.c
>> > > +++ b/arch/x86/kvm/vmx/nested.c
>> > > @@ -3098,15 +3098,8 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
>> > >  			nested_vmx_handle_enlightened_vmptrld(vcpu, false);
>> > >  
>> > >  		if (evmptrld_status == EVMPTRLD_VMFAIL ||
>> > > -		    evmptrld_status == EVMPTRLD_ERROR) {
>> > > -			pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
>> > > -					     __func__);
>> > > -			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
>> > > -			vcpu->run->internal.suberror =
>> > > -				KVM_INTERNAL_ERROR_EMULATION;
>> > > -			vcpu->run->internal.ndata = 0;
>> > > +		    evmptrld_status == EVMPTRLD_ERROR)
>> > >  			return false;
>> > > -		}
>> > >  	}
>> > >  
>> > >  	return true;
>> > > @@ -3194,8 +3187,16 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
>> > >  
>> > >  static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
>> > >  {
>> > > -	if (!nested_get_evmcs_page(vcpu))
>> > > +	if (!nested_get_evmcs_page(vcpu)) {
>> > > +		pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
>> > > +				     __func__);
>> > > +		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
>> > > +		vcpu->run->internal.suberror =
>> > > +			KVM_INTERNAL_ERROR_EMULATION;
>> > > +		vcpu->run->internal.ndata = 0;
>> > > +
>> > >  		return false;
>> > > +	}
>> > 
>> > Hi!
>> > 
>> > Any reason to move the debug prints out of nested_get_evmcs_page?
>> > 
>> 
>> Debug print could've probably stayed or could've been dropped
>> completely -- I don't really believe it's going to help
>> anyone. Debugging such issues without instrumentation/tracing seems to
>> be hard-to-impossible...
>> 
>> > >  
>> > >  	if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
>> > >  		return false;
>> > > @@ -4422,7 +4423,15 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
>> > >  	/* trying to cancel vmlaunch/vmresume is a bug */
>> > >  	WARN_ON_ONCE(vmx->nested.nested_run_pending);
>> > >  
>> > > -	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
>> > > +	if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
>> > > +		/*
>> > > +		 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
>> > > +		 * Enlightened VMCS after migration and we still need to
>> > > +		 * do that when something is forcing L2->L1 exit prior to
>> > > +		 * the first L2 run.
>> > > +		 */
>> > > +		(void)nested_get_evmcs_page(vcpu);
>> > > +	}
>> > Yes this is a band-aid, but it has to be done I agree.
>> > 
>> 
>> To restore the status quo, yes.
>> 
>> > >  
>> > >  	/* Service the TLB flush request for L2 before switching to L1. */
>> > >  	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
>> > 
>> > 
>> > 
>> > I also tested this and it survives a bit better (used to crash instantly
>> > after a single migration cycle, but the guest still crashes after around ~20 iterations of my 
>> > regular nested migration test).
>> > 
>> > The blue screen shows that the stop code is HYPERVISOR ERROR and nothing else.
>> > 
>> > I tested both this patch alone and all 4 patches.
>> > 
>> > Without evmcs, the same VM with same host kernel and qemu survived an overnight
>> > test and passed about 1800 migration iterations.
>> > (my synthetic migration test doesn't yet work on Intel, I need to investigate why)
>> > 
>> 
>> It would be great to compare on Intel to be 100% sure the issue is eVMCS
>> related; Hyper-V may be behaving quite differently on AMD.
> Hi!
>
> I tested this on my Intel machine with and without eVMCS, without changing
> any other parameters, running the same VM from a snapshot.
>
> As I said, without eVMCS the test survived an overnight stress run of ~1800 migrations.
> With eVMCS, it fails pretty much on the first try.
> With these patches, it fails after about 20 iterations.
>

Ah, sorry, misunderstood your 'synthetic migration test doesn't yet work
on Intel' :-)

Patch

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1e069aac7410..2febb1dd68e8 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3098,15 +3098,8 @@  static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
 			nested_vmx_handle_enlightened_vmptrld(vcpu, false);
 
 		if (evmptrld_status == EVMPTRLD_VMFAIL ||
-		    evmptrld_status == EVMPTRLD_ERROR) {
-			pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
-					     __func__);
-			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-			vcpu->run->internal.suberror =
-				KVM_INTERNAL_ERROR_EMULATION;
-			vcpu->run->internal.ndata = 0;
+		    evmptrld_status == EVMPTRLD_ERROR)
 			return false;
-		}
 	}
 
 	return true;
@@ -3194,8 +3187,16 @@  static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 
 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
 {
-	if (!nested_get_evmcs_page(vcpu))
+	if (!nested_get_evmcs_page(vcpu)) {
+		pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
+				     __func__);
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror =
+			KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
+
 		return false;
+	}
 
 	if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
 		return false;
@@ -4422,7 +4423,15 @@  void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 	/* trying to cancel vmlaunch/vmresume is a bug */
 	WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
-	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+	if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+		/*
+		 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
+		 * Enlightened VMCS after migration and we still need to
+		 * do that when something is forcing L2->L1 exit prior to
+		 * the first L2 run.
+		 */
+		(void)nested_get_evmcs_page(vcpu);
+	}
 
 	/* Service the TLB flush request for L2 before switching to L1. */
 	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))