
[v4,2/2] KVM: nSVM: implement ondemand allocation of the nested state

Message ID 20200917101048.739691-3-mlevitsk@redhat.com (mailing list archive)
State New, archived
Series KVM: nSVM: ondemand nested state allocation

Commit Message

Maxim Levitsky Sept. 17, 2020, 10:10 a.m. UTC
This way we don't waste memory on VMs which don't use
nested virtualization, even if it is available to them.

If allocation of the nested state fails (which should happen
only when the host is about to OOM anyway), use the new
KVM_REQ_OUT_OF_MEMORY request to shut down the guest.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 42 ++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.c    | 54 ++++++++++++++++++++++-----------------
 arch/x86/kvm/svm/svm.h    |  7 +++++
 3 files changed, 79 insertions(+), 24 deletions(-)

Comments

Sean Christopherson Sept. 17, 2020, 4:29 p.m. UTC | #1
On Thu, Sep 17, 2020 at 01:10:48PM +0300, Maxim Levitsky wrote:
> This way we don't waste memory on VMs which don't use
> nested virtualization, even if it is available to them.
> 
> If allocation of the nested state fails (which should happen
> only when the host is about to OOM anyway), use the new
> KVM_REQ_OUT_OF_MEMORY request to shut down the guest.
> 
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  arch/x86/kvm/svm/nested.c | 42 ++++++++++++++++++++++++++++++
>  arch/x86/kvm/svm/svm.c    | 54 ++++++++++++++++++++++-----------------
>  arch/x86/kvm/svm/svm.h    |  7 +++++
>  3 files changed, 79 insertions(+), 24 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index 09417f5197410..fe119da2ef836 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -467,6 +467,9 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
>  
>  	vmcb12 = map.hva;
>  
> +	if (WARN_ON(!svm->nested.initialized))
> +		return 1;
> +
>  	if (!nested_vmcb_checks(svm, vmcb12)) {
>  		vmcb12->control.exit_code    = SVM_EXIT_ERR;
>  		vmcb12->control.exit_code_hi = 0;
> @@ -684,6 +687,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>  	return 0;
>  }
>  
> +int svm_allocate_nested(struct vcpu_svm *svm)
> +{
> +	struct page *hsave_page;
> +
> +	if (svm->nested.initialized)
> +		return 0;
> +
> +	hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> +	if (!hsave_page)
> +		goto error;

goto is unnecessary, just do

		return -ENOMEM;

> +
> +	svm->nested.hsave = page_address(hsave_page);
> +
> +	svm->nested.msrpm = svm_vcpu_init_msrpm();
> +	if (!svm->nested.msrpm)
> +		goto err_free_hsave;
> +
> +	svm->nested.initialized = true;
> +	return 0;
> +err_free_hsave:
> +	__free_page(hsave_page);
> +error:
> +	return 1;

As above, -ENOMEM would be preferable.

> +}
> +
> +void svm_free_nested(struct vcpu_svm *svm)
> +{
> +	if (!svm->nested.initialized)
> +		return;
> +
> +	svm_vcpu_free_msrpm(svm->nested.msrpm);
> +	svm->nested.msrpm = NULL;
> +
> +	__free_page(virt_to_page(svm->nested.hsave));
> +	svm->nested.hsave = NULL;
> +
> +	svm->nested.initialized = false;
> +}
> +
>  /*
>   * Forcibly leave nested mode in order to be able to reset the VCPU later on.
>   */
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 3da5b2f1b4a19..57ea4407dcf09 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -266,6 +266,7 @@ static int get_max_npt_level(void)
>  void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> +	u64 old_efer = vcpu->arch.efer;
>  	vcpu->arch.efer = efer;
>  
>  	if (!npt_enabled) {
> @@ -276,9 +277,26 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  			efer &= ~EFER_LME;
>  	}
>  
> -	if (!(efer & EFER_SVME)) {
> -		svm_leave_nested(svm);
> -		svm_set_gif(svm, true);
> +	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
> +		if (!(efer & EFER_SVME)) {
> +			svm_leave_nested(svm);
> +			svm_set_gif(svm, true);
> +
> +			/*
> +			 * Free the nested state unless we are in SMM, in which
> +			 * case the exit from SVM mode is only for the duration of the SMI
> +			 * handler
> +			 */
> +			if (!is_smm(&svm->vcpu))
> +				svm_free_nested(svm);
> +
> +		} else {
> +			if (svm_allocate_nested(svm)) {
> +				vcpu->arch.efer = old_efer;
> +				kvm_make_request(KVM_REQ_OUT_OF_MEMORY, vcpu);

I really dislike KVM_REQ_OUT_OF_MEMORY.  It's redundant with -ENOMEM and
creates a huge discrepancy with respect to existing code, e.g. nVMX returns
-ENOMEM in a similar situation.

The deferred error handling creates other issues, e.g. vcpu->arch.efer is
unwound but the guest's RIP is not.

One thought for handling this without opening a can of worms would be to do:

	r = kvm_x86_ops.set_efer(vcpu, efer);
	if (r) {
		WARN_ON(r > 0);
		return r;
	}

I.e. go with the original approach, but only for returning errors that will
go all the way out to userspace.

> +				return;
> +			}
> +		}
>  	}
>  
>  	svm->vmcb->save.efer = efer | EFER_SVME;
Paolo Bonzini Sept. 19, 2020, 3:09 p.m. UTC | #2
On 17/09/20 18:29, Sean Christopherson wrote:
>> +				vcpu->arch.efer = old_efer;
>> +				kvm_make_request(KVM_REQ_OUT_OF_MEMORY, vcpu);
> I really dislike KVM_REQ_OUT_OF_MEMORY.  It's redundant with -ENOMEM and
> creates a huge discrepancy with respect to existing code, e.g. nVMX returns
> -ENOMEM in a similar situation.

Maxim, your previous version was adding some error handling to
kvm_x86_ops.set_efer.  I don't remember what the issue was; did you have
any problems propagating all the errors up to KVM_SET_SREGS (easy),
kvm_set_msr (harder) etc.?

Paolo
Sean Christopherson Sept. 20, 2020, 4:16 p.m. UTC | #3
On Sat, Sep 19, 2020 at 05:09:09PM +0200, Paolo Bonzini wrote:
> On 17/09/20 18:29, Sean Christopherson wrote:
> >> +				vcpu->arch.efer = old_efer;
> >> +				kvm_make_request(KVM_REQ_OUT_OF_MEMORY, vcpu);
> > I really dislike KVM_REQ_OUT_OF_MEMORY.  It's redundant with -ENOMEM and
> > creates a huge discrepancy with respect to existing code, e.g. nVMX returns
> > -ENOMEM in a similar situation.
> 
> Maxim, your previous version was adding some error handling to
> kvm_x86_ops.set_efer.  I don't remember what the issue was; did you have
> any problems propagating all the errors up to KVM_SET_SREGS (easy),
> kvm_set_msr (harder) etc.?

I objected to letting .set_efer() return a fault.  A relatively minor issue is
the code in vmx_set_efer() that handles lack of EFER because technically KVM
can emulate EFER.SCE+SYSCALL without supporting EFER in hardware.  Returning
success/'0' would avoid that particular issue.  My primary concern is that I'd
prefer not to add another case where KVM can potentially ignore a fault
indicated by a helper, a la vmx_set_cr4().

To that end, I'd be ok with adding error handling to .set_efer() if KVM
enforces, via WARN in one of the .set_efer() call sites, that SVM/VMX can only
return negative error codes, i.e. let SVM handle the -ENOMEM case but disallow
fault injection.  It doesn't actually change anything, but it'd give me a warm
fuzzy feeling.
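
Concretely, something like this at the x86.c call site (untested sketch, assuming
.set_efer() is changed to return an int):

	r = kvm_x86_ops.set_efer(vcpu, efer);
	if (r) {
		/*
		 * SVM/VMX may only report a negative error code, e.g.
		 * -ENOMEM; a positive return would mean fault injection,
		 * which is disallowed from .set_efer().
		 */
		WARN_ON_ONCE(r > 0);
		return r;
	}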
Paolo Bonzini Sept. 20, 2020, 4:42 p.m. UTC | #4
On 20/09/20 18:16, Sean Christopherson wrote:
>> Maxim, your previous version was adding some error handling to
>> kvm_x86_ops.set_efer.  I don't remember what the issue was; did you have
>> any problems propagating all the errors up to KVM_SET_SREGS (easy),
>> kvm_set_msr (harder) etc.?
> I objected to letting .set_efer() return a fault.

So did I, and that's why we get KVM_REQ_OUT_OF_MEMORY.  But it was more
of an "it's ugly and it ought not to fail" thing than something I could
pinpoint.

It looks like we agree, but still we have to choose the lesser evil?

Paolo

> A relatively minor issue is
> the code in vmx_set_efer() that handles lack of EFER because technically KVM
> can emulate EFER.SCE+SYSCALL without supporting EFER in hardware.  Returning
> success/'0' would avoid that particular issue.  My primary concern is that I'd
> prefer not to add another case where KVM can potentially ignore a fault
> indicated by a helper, a la vmx_set_cr4().
Maxim Levitsky Sept. 21, 2020, 7:53 a.m. UTC | #5
On Sun, 2020-09-20 at 18:42 +0200, Paolo Bonzini wrote:
> On 20/09/20 18:16, Sean Christopherson wrote:
> > > Maxim, your previous version was adding some error handling to
> > > kvm_x86_ops.set_efer.  I don't remember what the issue was; did you have
> > > any problems propagating all the errors up to KVM_SET_SREGS (easy),
> > > kvm_set_msr (harder) etc.?
> > I objected to letting .set_efer() return a fault.
> 
> So did I, and that's why we get KVM_REQ_OUT_OF_MEMORY.  But it was more
> of an "it's ugly and it ought not to fail" thing than something I could
> pinpoint.
> 
> It looks like we agree, but still we have to choose the lesser evil?
> 
> Paolo
> 
> > A relatively minor issue is
> > the code in vmx_set_efer() that handles lack of EFER because technically KVM
> > can emulate EFER.SCE+SYSCALL without supporting EFER in hardware.  Returning
> > success/'0' would avoid that particular issue.  My primary concern is that I'd
> > prefer not to add another case where KVM can potentially ignore a fault
> > indicated by a helper, a la vmx_set_cr4().

The thing is that kvm_emulate_wrmsr injects #GP when kvm_set_msr returns any non-zero
value, and returns 1, which means keep going if I understand correctly (0 is a
userspace exit, a negative value would be a return to userspace with an error).

So the question is whether we have other wrmsr handlers which return a negative value
and would be affected by changing kvm_emulate_wrmsr to pass through the error value.
I am checking the code now.

I do agree now that this is the *correct* solution to this problem.

Best regards,
	Maxim Levitsky
Maxim Levitsky Sept. 21, 2020, 8:57 a.m. UTC | #6
On Mon, 2020-09-21 at 10:53 +0300, Maxim Levitsky wrote:
> On Sun, 2020-09-20 at 18:42 +0200, Paolo Bonzini wrote:
> > On 20/09/20 18:16, Sean Christopherson wrote:
> > > > Maxim, your previous version was adding some error handling to
> > > > kvm_x86_ops.set_efer.  I don't remember what the issue was; did you have
> > > > any problems propagating all the errors up to KVM_SET_SREGS (easy),
> > > > kvm_set_msr (harder) etc.?
> > > I objected to letting .set_efer() return a fault.
> > 
> > So did I, and that's why we get KVM_REQ_OUT_OF_MEMORY.  But it was more
> > of an "it's ugly and it ought not to fail" thing than something I could
> > pinpoint.
> > 
> > It looks like we agree, but still we have to choose the lesser evil?
> > 
> > Paolo
> > 
> > > A relatively minor issue is
> > > the code in vmx_set_efer() that handles lack of EFER because technically KVM
> > > can emulate EFER.SCE+SYSCALL without supporting EFER in hardware.  Returning
> > > success/'0' would avoid that particular issue.  My primary concern is that I'd
> > > prefer not to add another case where KVM can potentially ignore a fault
> > > indicated by a helper, a la vmx_set_cr4().
> 
> The thing is that kvm_emulate_wrmsr injects #GP when kvm_set_msr returns any non-zero
> value, and returns 1, which means keep going if I understand correctly (0 is a
> userspace exit, a negative value would be a return to userspace with an error).
> 
> So the question is whether we have other wrmsr handlers which return a negative value
> and would be affected by changing kvm_emulate_wrmsr to pass through the error value.
> I am checking the code now.
> 
> I do agree now that this is the *correct* solution to this problem.
> 
> Best regards,
> 	Maxim Levitsky


So these are the results of my analysis:

WRMSR-called functions that return a negative value (I could have missed something,
but I double-checked the wrmsr code in both SVM and VMX, and in the common x86 code):

vmx_set_vmx_msr - this is only called from userspace (msr_info->host_initiated == true),
so this can be left as is

xen_hvm_config - this code should probably return 1 in some cases, but in one case
it legitimately does a memory allocation like I do, and an allocation failure should
probably kill the guest as well (but I can keep it as is if we are afraid that the
new behavior will not be backward compatible).

What do you think about this (only compile-tested, since I don't have any Xen setups):

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 36e963dc1da61..66a57c5b14dfd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2695,24 +2695,21 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
        u32 page_num = data & ~PAGE_MASK;
        u64 page_addr = data & PAGE_MASK;
        u8 *page;
-       int r;
 
-       r = -E2BIG;
        if (page_num >= blob_size)
-               goto out;
-       r = -ENOMEM;
+               return 1;
+
        page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
-       if (IS_ERR(page)) {
-               r = PTR_ERR(page);
-               goto out;
+       if (IS_ERR(page))
+               return PTR_ERR(page);
+
+       if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
+               kfree(page);
+               return 1;
        }
-       if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
-               goto out_free;
-       r = 0;
-out_free:
-       kfree(page);
-out:
-       return r;
+       /* the blob page was only a bounce buffer; free it on success too */
+       kfree(page);
+       return 0;
 }


The MSR write itself can be reached from the guest through two functions:
kvm_emulate_wrmsr, which is called on wrmsr interception from both VMX and SVM, and
em_wrmsr, which is called in the unlikely case that the emulator needs to emulate a
wrmsr.

Both should be changed to inject #GP only on a positive return value, and to pass the
error through otherwise.
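
A rough sketch of that change to kvm_emulate_wrmsr (modeled on its current shape in
x86.c, so treat the details as illustrative rather than a final patch):

int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
{
	u32 ecx = kvm_rcx_read(vcpu);
	u64 data = kvm_read_edx_eax(vcpu);
	int r = kvm_set_msr(vcpu, ecx, data);

	/* a negative value (e.g. -ENOMEM) now propagates to userspace */
	if (r < 0)
		return r;

	/* a positive value still means a guest-visible #GP */
	if (r > 0) {
		trace_kvm_msr_write_ex(ecx, data);
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	trace_kvm_msr_write(ecx, data);
	return kvm_skip_emulated_instruction(vcpu);
}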

Sounds reasonable? If you agree, I'll post the patches implementing this.

Best regards,
	Maxim Levitsky
Maxim Levitsky Sept. 21, 2020, 1:23 p.m. UTC | #7
On Thu, 2020-09-17 at 09:29 -0700, Sean Christopherson wrote:
> On Thu, Sep 17, 2020 at 01:10:48PM +0300, Maxim Levitsky wrote:
> > This way we don't waste memory on VMs which don't use
> > nested virtualization, even if it is available to them.
> > 
> > If allocation of the nested state fails (which should happen
> > only when the host is about to OOM anyway), use the new
> > KVM_REQ_OUT_OF_MEMORY request to shut down the guest.
> > 
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > ---
> >  arch/x86/kvm/svm/nested.c | 42 ++++++++++++++++++++++++++++++
> >  arch/x86/kvm/svm/svm.c    | 54 ++++++++++++++++++++++-----------------
> >  arch/x86/kvm/svm/svm.h    |  7 +++++
> >  3 files changed, 79 insertions(+), 24 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > index 09417f5197410..fe119da2ef836 100644
> > --- a/arch/x86/kvm/svm/nested.c
> > +++ b/arch/x86/kvm/svm/nested.c
> > @@ -467,6 +467,9 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
> >  
> >  	vmcb12 = map.hva;
> >  
> > +	if (WARN_ON(!svm->nested.initialized))
> > +		return 1;
> > +
> >  	if (!nested_vmcb_checks(svm, vmcb12)) {
> >  		vmcb12->control.exit_code    = SVM_EXIT_ERR;
> >  		vmcb12->control.exit_code_hi = 0;
> > @@ -684,6 +687,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> >  	return 0;
> >  }
> >  
> > +int svm_allocate_nested(struct vcpu_svm *svm)
> > +{
> > +	struct page *hsave_page;
> > +
> > +	if (svm->nested.initialized)
> > +		return 0;
> > +
> > +	hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> > +	if (!hsave_page)
> > +		goto error;
> 
> goto is unnecessary, just do
> 
> 		return -ENOMEM;

To be honest, which way is better is a philosophical question,
but I don't mind changing this.

> 
> > +
> > +	svm->nested.hsave = page_address(hsave_page);
> > +
> > +	svm->nested.msrpm = svm_vcpu_init_msrpm();
> > +	if (!svm->nested.msrpm)
> > +		goto err_free_hsave;
> > +
> > +	svm->nested.initialized = true;
> > +	return 0;
> > +err_free_hsave:
> > +	__free_page(hsave_page);
> > +error:
> > +	return 1;
> 
> As above, -ENOMEM would be preferable.
After the changes to return negative values from msr writes,
this indeed makes sense and is done now.
> 
> > +}
> > +
> > +void svm_free_nested(struct vcpu_svm *svm)
> > +{
> > +	if (!svm->nested.initialized)
> > +		return;
> > +
> > +	svm_vcpu_free_msrpm(svm->nested.msrpm);
> > +	svm->nested.msrpm = NULL;
> > +
> > +	__free_page(virt_to_page(svm->nested.hsave));
> > +	svm->nested.hsave = NULL;
> > +
> > +	svm->nested.initialized = false;
> > +}
> > +
> >  /*
> >   * Forcibly leave nested mode in order to be able to reset the VCPU later on.
> >   */
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index 3da5b2f1b4a19..57ea4407dcf09 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -266,6 +266,7 @@ static int get_max_npt_level(void)
> >  void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
> >  {
> >  	struct vcpu_svm *svm = to_svm(vcpu);
> > +	u64 old_efer = vcpu->arch.efer;
> >  	vcpu->arch.efer = efer;
> >  
> >  	if (!npt_enabled) {
> > @@ -276,9 +277,26 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
> >  			efer &= ~EFER_LME;
> >  	}
> >  
> > -	if (!(efer & EFER_SVME)) {
> > -		svm_leave_nested(svm);
> > -		svm_set_gif(svm, true);
> > +	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
> > +		if (!(efer & EFER_SVME)) {
> > +			svm_leave_nested(svm);
> > +			svm_set_gif(svm, true);
> > +
> > +			/*
> > +			 * Free the nested state unless we are in SMM, in which
> > +			 * case the exit from SVM mode is only for the duration of the SMI
> > +			 * handler
> > +			 */
> > +			if (!is_smm(&svm->vcpu))
> > +				svm_free_nested(svm);
> > +
> > +		} else {
> > +			if (svm_allocate_nested(svm)) {
> > +				vcpu->arch.efer = old_efer;
> > +				kvm_make_request(KVM_REQ_OUT_OF_MEMORY, vcpu);
> 
> I really dislike KVM_REQ_OUT_OF_MEMORY.  It's redundant with -ENOMEM and
> creates a huge discrepancy with respect to existing code, e.g. nVMX returns
> -ENOMEM in a similar situation.
> 
> The deferred error handling creates other issues, e.g. vcpu->arch.efer is
> unwound but the guest's RIP is not.
> 
> One thought for handling this without opening a can of worms would be to do:
> 
> 	r = kvm_x86_ops.set_efer(vcpu, efer);
> 	if (r) {
> 		WARN_ON(r > 0);
> 		return r;
> 	}
> 
> I.e. go with the original approach, but only for returning errors that will
> go all the way out to userspace.

Done as explained in the other reply.

> 
> > +				return;
> > +			}
> > +		}
> >  	}
> >  
> >  	svm->vmcb->save.efer = efer | EFER_SVME;


Thanks for the review,
	Best regards,
		Maxim Levitsky

Patch

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 09417f5197410..fe119da2ef836 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -467,6 +467,9 @@  int nested_svm_vmrun(struct vcpu_svm *svm)
 
 	vmcb12 = map.hva;
 
+	if (WARN_ON(!svm->nested.initialized))
+		return 1;
+
 	if (!nested_vmcb_checks(svm, vmcb12)) {
 		vmcb12->control.exit_code    = SVM_EXIT_ERR;
 		vmcb12->control.exit_code_hi = 0;
@@ -684,6 +687,45 @@  int nested_svm_vmexit(struct vcpu_svm *svm)
 	return 0;
 }
 
+int svm_allocate_nested(struct vcpu_svm *svm)
+{
+	struct page *hsave_page;
+
+	if (svm->nested.initialized)
+		return 0;
+
+	hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!hsave_page)
+		goto error;
+
+	svm->nested.hsave = page_address(hsave_page);
+
+	svm->nested.msrpm = svm_vcpu_init_msrpm();
+	if (!svm->nested.msrpm)
+		goto err_free_hsave;
+
+	svm->nested.initialized = true;
+	return 0;
+err_free_hsave:
+	__free_page(hsave_page);
+error:
+	return 1;
+}
+
+void svm_free_nested(struct vcpu_svm *svm)
+{
+	if (!svm->nested.initialized)
+		return;
+
+	svm_vcpu_free_msrpm(svm->nested.msrpm);
+	svm->nested.msrpm = NULL;
+
+	__free_page(virt_to_page(svm->nested.hsave));
+	svm->nested.hsave = NULL;
+
+	svm->nested.initialized = false;
+}
+
 /*
  * Forcibly leave nested mode in order to be able to reset the VCPU later on.
  */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 3da5b2f1b4a19..57ea4407dcf09 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -266,6 +266,7 @@  static int get_max_npt_level(void)
 void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
+	u64 old_efer = vcpu->arch.efer;
 	vcpu->arch.efer = efer;
 
 	if (!npt_enabled) {
@@ -276,9 +277,26 @@  void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 			efer &= ~EFER_LME;
 	}
 
-	if (!(efer & EFER_SVME)) {
-		svm_leave_nested(svm);
-		svm_set_gif(svm, true);
+	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
+		if (!(efer & EFER_SVME)) {
+			svm_leave_nested(svm);
+			svm_set_gif(svm, true);
+
+			/*
+			 * Free the nested state unless we are in SMM, in which
+			 * case the exit from SVM mode is only for the duration of the SMI
+			 * handler
+			 */
+			if (!is_smm(&svm->vcpu))
+				svm_free_nested(svm);
+
+		} else {
+			if (svm_allocate_nested(svm)) {
+				vcpu->arch.efer = old_efer;
+				kvm_make_request(KVM_REQ_OUT_OF_MEMORY, vcpu);
+				return;
+			}
+		}
 	}
 
 	svm->vmcb->save.efer = efer | EFER_SVME;
@@ -609,7 +627,7 @@  static void set_msr_interception(u32 *msrpm, unsigned msr,
 	msrpm[offset] = tmp;
 }
 
-static u32 *svm_vcpu_init_msrpm(void)
+u32 *svm_vcpu_init_msrpm(void)
 {
 	int i;
 	u32 *msrpm;
@@ -629,7 +647,7 @@  static u32 *svm_vcpu_init_msrpm(void)
 	return msrpm;
 }
 
-static void svm_vcpu_free_msrpm(u32 *msrpm)
+void svm_vcpu_free_msrpm(u32 *msrpm)
 {
 	__free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
 }
@@ -1203,7 +1221,6 @@  static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm;
 	struct page *vmcb_page;
-	struct page *hsave_page;
 	int err;
 
 	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
@@ -1214,13 +1231,9 @@  static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 	if (!vmcb_page)
 		goto out;
 
-	hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-	if (!hsave_page)
-		goto error_free_vmcb_page;
-
 	err = avic_init_vcpu(svm);
 	if (err)
-		goto error_free_hsave_page;
+		goto out;
 
 	/* We initialize this flag to true to make sure that the is_running
 	 * bit would be set the first time the vcpu is loaded.
@@ -1228,15 +1241,9 @@  static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 	if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
 		svm->avic_is_running = true;
 
-	svm->nested.hsave = page_address(hsave_page);
-
 	svm->msrpm = svm_vcpu_init_msrpm();
 	if (!svm->msrpm)
-		goto error_free_hsave_page;
-
-	svm->nested.msrpm = svm_vcpu_init_msrpm();
-	if (!svm->nested.msrpm)
-		goto error_free_msrpm;
+		goto error_free_vmcb_page;
 
 	svm->vmcb = page_address(vmcb_page);
 	svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
@@ -1248,10 +1255,6 @@  static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 
 	return 0;
 
-error_free_msrpm:
-	svm_vcpu_free_msrpm(svm->msrpm);
-error_free_hsave_page:
-	__free_page(hsave_page);
 error_free_vmcb_page:
 	__free_page(vmcb_page);
 out:
@@ -1277,10 +1280,10 @@  static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 	 */
 	svm_clear_current_vmcb(svm->vmcb);
 
+	svm_free_nested(svm);
+
 	__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
 	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
-	__free_page(virt_to_page(svm->nested.hsave));
-	__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
 }
 
 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -3963,6 +3966,9 @@  static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
 					 gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
 				return 1;
 
+			if (svm_allocate_nested(svm))
+				return 1;
+
 			ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
 			kvm_vcpu_unmap(&svm->vcpu, &map, true);
 		}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 45496775f0db2..3bee16e1f2e73 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -96,6 +96,8 @@  struct svm_nested_state {
 
 	/* cache for control fields of the guest */
 	struct vmcb_control_area ctl;
+
+	bool initialized;
 };
 
 struct vcpu_svm {
@@ -338,6 +340,9 @@  static inline bool gif_set(struct vcpu_svm *svm)
 #define MSR_INVALID				0xffffffffU
 
 u32 svm_msrpm_offset(u32 msr);
+u32 *svm_vcpu_init_msrpm(void);
+void svm_vcpu_free_msrpm(u32 *msrpm);
+
 void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
@@ -379,6 +384,8 @@  static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
 int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 			 struct vmcb *nested_vmcb);
 void svm_leave_nested(struct vcpu_svm *svm);
+void svm_free_nested(struct vcpu_svm *svm);
+int svm_allocate_nested(struct vcpu_svm *svm);
 int nested_svm_vmrun(struct vcpu_svm *svm);
 void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
 int nested_svm_vmexit(struct vcpu_svm *svm);