diff mbox

[v2] kvm: nVMX: Introduce KVM_CAP_STATE

Message ID 1523263049-31993-1-git-send-email-karahmed@amazon.de (mailing list archive)
State New, archived
Headers show

Commit Message

KarimAllah Ahmed April 9, 2018, 8:37 a.m. UTC
From: Jim Mattson <jmattson@google.com>

For nested virtualization L0 KVM is managing a bit of state for L2 guests,
this state can not be captured through the currently available IOCTLs. In
fact the state captured through all of these IOCTLs is usually a mix of L1
and L2 state. It is also dependent on whether the L2 guest was running at
the moment when the process was interrupted to save its state.

With this capability, there are two new vcpu ioctls: KVM_GET_VMX_STATE and
KVM_SET_VMX_STATE. These can be used for saving and restoring a VM that is
in VMX operation.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Jim Mattson <jmattson@google.com>
[karahmed@ - rename structs and functions and make them ready for AMD and
             address previous comments.
           - rebase & a bit of refactoring.
           - Merge 7/8 and 8/8 into one patch.
           - Force a VMExit from L2 after reading the kvm_state to avoid
             mixed state between L1 and L2 on resurrecting the instance. ]
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
---
v1 -> v2:
- rename structs and functions and make them ready for AMD and address
  previous comments.
- rebase & a bit of refactoring.
- Merge 7/8 and 8/8 into one patch.
- Force a VMExit from L2 after reading the kvm_state to avoid mixed state
  between L1 and L2 on resurrecting the instance.
---
 Documentation/virtual/kvm/api.txt |  46 ++++++++++
 arch/x86/include/asm/kvm_host.h   |   7 ++
 arch/x86/include/uapi/asm/kvm.h   |  38 ++++++++
 arch/x86/kvm/vmx.c                | 189 +++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c                |  21 +++++
 include/uapi/linux/kvm.h          |   5 +
 6 files changed, 302 insertions(+), 4 deletions(-)

Comments

David Hildenbrand April 9, 2018, 11:26 a.m. UTC | #1
On 09.04.2018 10:37, KarimAllah Ahmed wrote:
> From: Jim Mattson <jmattson@google.com>
> 
> For nested virtualization L0 KVM is managing a bit of state for L2 guests,
> this state can not be captured through the currently available IOCTLs. In
> fact the state captured through all of these IOCTLs is usually a mix of L1
> and L2 state. It is also dependent on whether the L2 guest was running at
> the moment when the process was interrupted to save its state.
> 
> With this capability, there are two new vcpu ioctls: KVM_GET_VMX_STATE and
> KVM_SET_VMX_STATE. These can be used for saving and restoring a VM that is
> in VMX operation.
> 

Very nice work!

>  
> +static int get_vmcs_cache(struct kvm_vcpu *vcpu,
> +			  struct kvm_state __user *user_kvm_state)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +
> +	/*
> +	 * When running L2, the authoritative vmcs12 state is in the
> +	 * vmcs02. When running L1, the authoritative vmcs12 state is
> +	 * in the shadow vmcs linked to vmcs01, unless
> +	 * sync_shadow_vmcs is set, in which case, the authoritative
> +	 * vmcs12 state is in the vmcs12 already.
> +	 */
> +	if (is_guest_mode(vcpu))
> +		sync_vmcs12(vcpu, vmcs12);
> +	else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
> +		copy_shadow_to_vmcs12(vmx);
> +
> +	if (copy_to_user(user_kvm_state->data, vmcs12, sizeof(*vmcs12)))
> +		return -EFAULT;
> +
> +	/*
> +	 * Force a nested exit that guarantees that any state capture
> +	 * afterwards by any IOCTLs (MSRs, etc) will not capture a mix of L1
> +	 * and L2 state.
> +	 *

I totally understand why this is nice, I am worried about the
implications. Let's assume migration fails and we want to continue
running the guest on the source. We would now have a "bad" state.

How is this to be handled (e.g. is a SET_STATE necessary?)? I think this
implication should be documented for KVM_GET_STATE.

> +	 * One example where that would lead to an issue is the TSC DEADLINE
> +	 * MSR vs the guest TSC. If the L2 guest is running, the guest TSC will
> +	 * be the L2 TSC while the TSC deadline MSR will contain the L1 TSC
> +	 * deadline MSR. That would lead to a very large (and wrong) "expire"
> +	 * diff when LAPIC is initialized during instance restore (i.e. the
> +	 * instance will appear to have hanged!).
> +	 */
> +	if (is_guest_mode(vcpu))
> +		nested_vmx_vmexit(vcpu, -1, 0, 0);
> +
> +	return 0;
> +}
> +
> +static int get_vmx_state(struct kvm_vcpu *vcpu,
> +			 struct kvm_state __user *user_kvm_state)
> +{
> +	u32 user_data_size;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct kvm_state kvm_state = {
> +		.flags = 0,
> +		.format = 0,
> +		.size = sizeof(kvm_state),
> +		.vmx.vmxon_pa = -1ull,
> +		.vmx.vmcs_pa = -1ull,
> +	};
> +
> +	if (copy_from_user(&user_data_size, &user_kvm_state->size,
> +			   sizeof(user_data_size)))
> +		return -EFAULT;
> +
> +	if (nested_vmx_allowed(vcpu) && vmx->nested.vmxon) {
> +		kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
> +		kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
> +
> +		if (vmx->nested.current_vmptr != -1ull)
> +			kvm_state.size += VMCS12_SIZE;
> +
> +		if (is_guest_mode(vcpu)) {
> +			kvm_state.flags |= KVM_STATE_GUEST_MODE;
> +
> +			if (vmx->nested.nested_run_pending)
> +				kvm_state.flags |= KVM_STATE_RUN_PENDING;
> +		}
> +	}
> +
> +	if (user_data_size < kvm_state.size) {
> +		if (copy_to_user(&user_kvm_state->size, &kvm_state.size,
> +				 sizeof(kvm_state.size)))
> +			return -EFAULT;
> +		return -E2BIG;
> +	}
> +
> +	if (copy_to_user(user_kvm_state, &kvm_state, sizeof(kvm_state)))
> +		return -EFAULT;
> +
> +	if (vmx->nested.current_vmptr == -1ull)
> +		return 0;
> +
> +	return get_vmcs_cache(vcpu, user_kvm_state);
> +}
> +
> +static int set_vmcs_cache(struct kvm_vcpu *vcpu,
> +			  struct kvm_state __user *user_kvm_state,
> +			  struct kvm_state *kvm_state)
> +
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +	u32 exit_qual;
> +	int ret;
> +
> +	if ((kvm_state->size < (sizeof(*vmcs12) + sizeof(*kvm_state))) ||
> +	    kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
> +	    !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
> +		return -EINVAL;
> +
> +	if (copy_from_user(vmcs12, user_kvm_state->data, sizeof(*vmcs12)))
> +		return -EFAULT;
> +
> +	if (vmcs12->revision_id != VMCS12_REVISION)
> +		return -EINVAL;
> +
> +	set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
> +
> +	if (!(kvm_state->flags & KVM_STATE_GUEST_MODE))
> +		return 0;
> +
> +	if (check_vmentry_prereqs(vcpu, vmcs12) ||
> +	    check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
> +		return -EINVAL;
> +
> +	ret = enter_vmx_non_root_mode(vcpu, true);
> +	if (ret)
> +		return ret;
> +
> +	/*
> +	 * This request will result in a call to
> +	 * nested_get_vmcs12_pages before the next VM-entry.
> +	 */
> +	kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);

Can you elaborate (+document) why this is needed instead of trying to
get the page right away?

Thanks!
Jim Mattson April 9, 2018, 6:30 p.m. UTC | #2
On Mon, Apr 9, 2018 at 1:37 AM, KarimAllah Ahmed <karahmed@amazon.de> wrote:
> From: Jim Mattson <jmattson@google.com>
>
> For nested virtualization L0 KVM is managing a bit of state for L2 guests,
> this state can not be captured through the currently available IOCTLs. In
> fact the state captured through all of these IOCTLs is usually a mix of L1
> and L2 state. It is also dependent on whether the L2 guest was running at
> the moment when the process was interrupted to save its state.
>
> With this capability, there are two new vcpu ioctls: KVM_GET_VMX_STATE and
> KVM_SET_VMX_STATE. These can be used for saving and restoring a VM that is
> in VMX operation.
>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Radim Krčmář <rkrcmar@redhat.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: H. Peter Anvin <hpa@zytor.com>
> Cc: x86@kernel.org
> Cc: kvm@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Jim Mattson <jmattson@google.com>
> [karahmed@ - rename structs and functions and make them ready for AMD and
>              address previous comments.
>            - rebase & a bit of refactoring.
>            - Merge 7/8 and 8/8 into one patch.
>            - Force a VMExit from L2 after reading the kvm_state to avoid
>              mixed state between L1 and L2 on resurrecting the instance. ]
> Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>

First, let me say "thank you" for picking this up!

> ---
> v1 -> v2:
> - rename structs and functions and make them ready for AMD and address
>   previous comments.
> - rebase & a bit of refactoring.
> - Merge 7/8 and 8/8 into one patch.
> - Force a VMExit from L2 after reading the kvm_state to avoid mixed state
>   between L1 and L2 on resurrecting the instance.
> ---
>  Documentation/virtual/kvm/api.txt |  46 ++++++++++
>  arch/x86/include/asm/kvm_host.h   |   7 ++
>  arch/x86/include/uapi/asm/kvm.h   |  38 ++++++++
>  arch/x86/kvm/vmx.c                | 189 +++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c                |  21 +++++
>  include/uapi/linux/kvm.h          |   5 +
>  6 files changed, 302 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index d6b3ff5..3ed56df 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -3516,6 +3516,52 @@ Returns: 0 on success; -1 on error
>  This ioctl can be used to unregister the guest memory region registered
>  with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above.
>
> +4.112 KVM_GET_STATE

More specifically, KVM_GET_NESTED_STATE?

> +
> +Capability: KVM_CAP_STATE

KVM_CAP_NESTED_STATE?

> +Architectures: x86
> +Type: vcpu ioctl
> +Parameters: struct kvm_state (in/out)
> +Returns: 0 on success, -1 on error
> +Errors:
> +  E2BIG:     the data size exceeds the value of 'size' specified by
> +             the user (the size required will be written into size).
> +
> +struct kvm_state {
> +       __u16 flags;
> +       __u16 format;
> +       __u32 size;
> +       union {
> +               struct kvm_vmx_state vmx;
> +               struct kvm_svm_state svm;
> +               __u8 pad[120];
> +       };
> +       __u8 data[0];
> +};
> +
> +This ioctl copies the vcpu's kvm_state struct from the kernel to userspace.
> +
> +4.113 KVM_SET_STATE

KVM_SET_NESTED_STATE?

> +
> +Capability: KVM_CAP_STATE

KVM_CAP_NESTED_STATE?

> +Architectures: x86
> +Type: vcpu ioctl
> +Parameters: struct kvm_state (in)
> +Returns: 0 on success, -1 on error
> +
> +struct kvm_state {
> +       __u16 flags;
> +       __u16 format;
> +       __u32 size;
> +       union {
> +               struct kvm_vmx_state vmx;
> +               struct kvm_svm_state svm;
> +               __u8 pad[120];
> +       };
> +       __u8 data[0];
> +};
> +
> +This copies the vcpu's kvm_state struct from userspace to the kernel.
>
>  5. The kvm_run structure
>  ------------------------
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index fad4d46..902db9e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -73,6 +73,7 @@
>  #define KVM_REQ_HV_RESET               KVM_ARCH_REQ(20)
>  #define KVM_REQ_HV_EXIT                        KVM_ARCH_REQ(21)
>  #define KVM_REQ_HV_STIMER              KVM_ARCH_REQ(22)
> +#define KVM_REQ_GET_VMCS12_PAGES       KVM_ARCH_REQ(23)
>
>  #define CR0_RESERVED_BITS                                               \
>         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
> @@ -1090,6 +1091,12 @@ struct kvm_x86_ops {
>
>         void (*setup_mce)(struct kvm_vcpu *vcpu);
>
> +       int (*get_state)(struct kvm_vcpu *vcpu,

get_nested_state

> +                        struct kvm_state __user *user_kvm_state);
> +       int (*set_state)(struct kvm_vcpu *vcpu,

set_nested_state

> +                        struct kvm_state __user *user_kvm_state);
> +       void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
> +
>         int (*smi_allowed)(struct kvm_vcpu *vcpu);
>         int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
>         int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index f3a9604..1d1cd26 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -361,4 +361,42 @@ struct kvm_sync_regs {
>  #define KVM_X86_QUIRK_LINT0_REENABLED  (1 << 0)
>  #define KVM_X86_QUIRK_CD_NW_CLEARED    (1 << 1)
>
> +#define KVM_STATE_GUEST_MODE   0x00000001
> +#define KVM_STATE_RUN_PENDING  0x00000002
> +#define KVM_STATE_GIF          0x00000004
> +
> +struct kvm_vmx_state {
> +       __u64 vmxon_pa;
> +       __u64 vmcs_pa;
> +};
> +
> +struct kvm_svm_state {
> +       __u64 hsave_pa;
> +       __u64 vmcb_pa;
> +};
> +
> +/* for KVM_CAP_STATE */
> +struct kvm_state {
> +       /* KVM_STATE_* flags */
> +       __u16 flags;
> +
> +       /* 0 for VMX, 1 for SVM.  */
> +       __u16 format;
> +
> +       /* 128 for SVM, 128 + VMCS size for VMX.  */
> +       __u32 size;
> +
> +       union {
> +               /* VMXON, VMCS */
> +               struct kvm_vmx_state vmx;
> +               /* HSAVE_PA, VMCB */
> +               struct kvm_svm_state svm;
> +
> +               /* Pad the union to 120 bytes.  */
> +               __u8 pad[120];
> +       };
> +
> +       __u8 data[0];
> +};
> +
>  #endif /* _ASM_X86_KVM_H */
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 14655df..4d830f7 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -10056,10 +10056,10 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
>  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>                                                  struct vmcs12 *vmcs12);
>
> -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
> -                                       struct vmcs12 *vmcs12)
> +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
>  {
>         struct vcpu_vmx *vmx = to_vmx(vcpu);
> +       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
>
>         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
>                 if (vmcs12->apic_access_addr != vmx->nested.apic_access_mapping.gfn << PAGE_SHIFT) {
> @@ -11101,8 +11101,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
>                 return 1;
>         }
>
> -       nested_get_vmcs12_pages(vcpu, vmcs12);
> -
>         msr_entry_idx = nested_vmx_load_msr(vcpu,
>                                             vmcs12->vm_entry_msr_load_addr,
>                                             vmcs12->vm_entry_msr_load_count);
> @@ -11200,6 +11198,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>         if (ret)
>                 return ret;
>
> +       nested_get_vmcs12_pages(vcpu);
> +
>         /*
>          * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
>          * by event injection, halt vcpu.
> @@ -12259,6 +12259,183 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
>         return 0;
>  }
>
> +static int get_vmcs_cache(struct kvm_vcpu *vcpu,
> +                         struct kvm_state __user *user_kvm_state)
> +{
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +
> +       /*
> +        * When running L2, the authoritative vmcs12 state is in the
> +        * vmcs02. When running L1, the authoritative vmcs12 state is
> +        * in the shadow vmcs linked to vmcs01, unless
> +        * sync_shadow_vmcs is set, in which case, the authoritative
> +        * vmcs12 state is in the vmcs12 already.
> +        */
> +       if (is_guest_mode(vcpu))
> +               sync_vmcs12(vcpu, vmcs12);
> +       else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
> +               copy_shadow_to_vmcs12(vmx);
> +
> +       if (copy_to_user(user_kvm_state->data, vmcs12, sizeof(*vmcs12)))
> +               return -EFAULT;
> +
> +       /*
> +        * Force a nested exit that guarantees that any state capture
> +        * afterwards by any IOCTLs (MSRs, etc) will not capture a mix of L1
> +        * and L2 state.
> +        *
> +        * One example where that would lead to an issue is the TSC DEADLINE
> +        * MSR vs the guest TSC. If the L2 guest is running, the guest TSC will
> +        * be the L2 TSC while the TSC deadline MSR will contain the L1 TSC
> +        * deadline MSR. That would lead to a very large (and wrong) "expire"
> +        * diff when LAPIC is initialized during instance restore (i.e. the
> +        * instance will appear to have hanged!).
> +        */
> +       if (is_guest_mode(vcpu))
> +               nested_vmx_vmexit(vcpu, -1, 0, 0);

Injecting a fake VM-exit on restore is as bad as injecting a fake
VM-exit on save, and I don't think this is a good approach.

> +
> +       return 0;
> +}
> +
> +static int get_vmx_state(struct kvm_vcpu *vcpu,
> +                        struct kvm_state __user *user_kvm_state)
> +{
> +       u32 user_data_size;
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +       struct kvm_state kvm_state = {
> +               .flags = 0,
> +               .format = 0,
> +               .size = sizeof(kvm_state),
> +               .vmx.vmxon_pa = -1ull,
> +               .vmx.vmcs_pa = -1ull,
> +       };
> +
> +       if (copy_from_user(&user_data_size, &user_kvm_state->size,
> +                          sizeof(user_data_size)))
> +               return -EFAULT;
> +
> +       if (nested_vmx_allowed(vcpu) && vmx->nested.vmxon) {
> +               kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
> +               kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
> +
> +               if (vmx->nested.current_vmptr != -1ull)
> +                       kvm_state.size += VMCS12_SIZE;
> +
> +               if (is_guest_mode(vcpu)) {
> +                       kvm_state.flags |= KVM_STATE_GUEST_MODE;
> +
> +                       if (vmx->nested.nested_run_pending)
> +                               kvm_state.flags |= KVM_STATE_RUN_PENDING;

IIRC, when I initially posted this set of changes, I neglected to
include the one that set nested_run_pending before prepare_vmcs02(),
and so this bit isn't actually tracked correctly for save/restore at
the moment.

> +               }
> +       }
> +
> +       if (user_data_size < kvm_state.size) {
> +               if (copy_to_user(&user_kvm_state->size, &kvm_state.size,
> +                                sizeof(kvm_state.size)))
> +                       return -EFAULT;
> +               return -E2BIG;
> +       }
> +
> +       if (copy_to_user(user_kvm_state, &kvm_state, sizeof(kvm_state)))
> +               return -EFAULT;
> +
> +       if (vmx->nested.current_vmptr == -1ull)
> +               return 0;
> +
> +       return get_vmcs_cache(vcpu, user_kvm_state);
> +}
> +
> +static int set_vmcs_cache(struct kvm_vcpu *vcpu,
> +                         struct kvm_state __user *user_kvm_state,
> +                         struct kvm_state *kvm_state)
> +
> +{
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +       u32 exit_qual;
> +       int ret;
> +
> +       if ((kvm_state->size < (sizeof(*vmcs12) + sizeof(*kvm_state))) ||
> +           kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
> +           !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
> +               return -EINVAL;
> +
> +       if (copy_from_user(vmcs12, user_kvm_state->data, sizeof(*vmcs12)))
> +               return -EFAULT;
> +
> +       if (vmcs12->revision_id != VMCS12_REVISION)
> +               return -EINVAL;
> +
> +       set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
> +
> +       if (!(kvm_state->flags & KVM_STATE_GUEST_MODE))
> +               return 0;
> +
> +       if (check_vmentry_prereqs(vcpu, vmcs12) ||
> +           check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
> +               return -EINVAL;
> +
> +       ret = enter_vmx_non_root_mode(vcpu, true);
> +       if (ret)
> +               return ret;
> +
> +       /*
> +        * This request will result in a call to
> +        * nested_get_vmcs12_pages before the next VM-entry.
> +        */
> +       kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
> +
> +       vmx->nested.nested_run_pending = 1;
> +
> +       return 0;
> +}
> +
> +static int set_vmx_state(struct kvm_vcpu *vcpu,
> +                        struct kvm_state __user *user_kvm_state)
> +{
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +       struct kvm_state kvm_state;
> +       int ret;
> +
> +       if (copy_from_user(&kvm_state, user_kvm_state, sizeof(kvm_state)))
> +               return -EFAULT;
> +
> +       if (kvm_state.size < sizeof(kvm_state))
> +               return -EINVAL;
> +
> +       if (kvm_state.format != 0)
> +               return -EINVAL;
> +
> +       if (kvm_state.flags &
> +           ~(KVM_STATE_RUN_PENDING | KVM_STATE_GUEST_MODE))
> +               return -EINVAL;
> +
> +       if (!nested_vmx_allowed(vcpu))
> +               return kvm_state.vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
> +
> +       vmx_leave_nested(vcpu);
> +
> +       vmx->nested.nested_run_pending =
> +               !!(kvm_state.flags & KVM_STATE_RUN_PENDING);
> +
> +       if (kvm_state.vmx.vmxon_pa == -1ull)
> +               return 0;
> +
> +       if (!page_address_valid(vcpu, kvm_state.vmx.vmxon_pa))
> +               return -EINVAL;
> +
> +       vmx->nested.vmxon_ptr = kvm_state.vmx.vmxon_pa;
> +       ret = enter_vmx_operation(vcpu);
> +       if (ret)
> +               return ret;
> +
> +       if (kvm_state.vmx.vmcs_pa == -1ull)
> +               return 0;
> +
> +       return set_vmcs_cache(vcpu, user_kvm_state, &kvm_state);
> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>         .cpu_has_kvm_support = cpu_has_kvm_support,
>         .disabled_by_bios = vmx_disabled_by_bios,
> @@ -12387,6 +12564,10 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>
>         .setup_mce = vmx_setup_mce,
>
> +       .get_state = get_vmx_state,
> +       .set_state = set_vmx_state,
> +       .get_vmcs12_pages = nested_get_vmcs12_pages,
> +
>         .smi_allowed = vmx_smi_allowed,
>         .pre_enter_smm = vmx_pre_enter_smm,
>         .pre_leave_smm = vmx_pre_leave_smm,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 963cdb9..1ab7cc5 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2873,6 +2873,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>         case KVM_CAP_X2APIC_API:
>                 r = KVM_X2APIC_API_VALID_FLAGS;
>                 break;
> +       case KVM_CAP_STATE:
> +               r = !!kvm_x86_ops->get_state;
> +               break;
>         default:
>                 r = 0;
>                 break;
> @@ -3892,6 +3895,22 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>                 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
>                 break;
>         }
> +       case KVM_GET_STATE: {
> +               struct kvm_state __user *user_kvm_state = argp;
> +
> +               r = -EINVAL;
> +               if (kvm_x86_ops->get_state)
> +                       r = kvm_x86_ops->get_state(vcpu, user_kvm_state);
> +               break;
> +       }
> +       case KVM_SET_STATE: {
> +               struct kvm_state __user *user_kvm_state = argp;
> +
> +               r = -EINVAL;
> +               if (kvm_x86_ops->set_state)
> +                       r = kvm_x86_ops->set_state(vcpu, user_kvm_state);
> +               break;
> +       }
>         default:
>                 r = -EINVAL;
>         }
> @@ -7051,6 +7070,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>         bool req_immediate_exit = false;
>
>         if (kvm_request_pending(vcpu)) {
> +               if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
> +                       kvm_x86_ops->get_vmcs12_pages(vcpu);
>                 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
>                         kvm_mmu_unload(vcpu);
>                 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 4e1d7f5..4c170ff 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -949,6 +949,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_PPC_GET_CPU_CHAR 151
>  #define KVM_CAP_S390_BPB 152
>  #define KVM_CAP_GET_MSR_FEATURES 153
> +#define KVM_CAP_STATE 154
>
>  #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -1380,6 +1381,10 @@ struct kvm_s390_ucas_mapping {
>  /* Memory Encryption Commands */
>  #define KVM_MEMORY_ENCRYPT_OP      _IOWR(KVMIO, 0xba, unsigned long)
>
> +/* Available with KVM_CAP_STATE */
> +#define KVM_GET_STATE         _IOWR(KVMIO, 0xbb, struct kvm_vmx_state)
> +#define KVM_SET_STATE         _IOW(KVMIO,  0xbc, struct kvm_vmx_state)
> +
>  struct kvm_enc_region {
>         __u64 addr;
>         __u64 size;
> --
> 2.7.4
>
Jim Mattson April 9, 2018, 7:24 p.m. UTC | #3
On Mon, Apr 9, 2018 at 1:37 AM, KarimAllah Ahmed <karahmed@amazon.de> wrote:

> +       /*
> +        * Force a nested exit that guarantees that any state capture
> +        * afterwards by any IOCTLs (MSRs, etc) will not capture a mix of L1
> +        * and L2 state.
> +        *
> +        * One example where that would lead to an issue is the TSC DEADLINE
> +        * MSR vs the guest TSC. If the L2 guest is running, the guest TSC will
> +        * be the L2 TSC while the TSC deadline MSR will contain the L1 TSC
> +        * deadline MSR. That would lead to a very large (and wrong) "expire"
> +        * diff when LAPIC is initialized during instance restore (i.e. the
> +        * instance will appear to have hanged!).
> +        */

This sounds like a bug in the virtualization of IA32_TSC_DEADLINE.
Without involving save/restore, what happens if L2 sets
IA32_TSC_DEADLINE (and L1 permits it via the MSR permission bitmap)?
The IA32_TSC_DEADLINE MSR is always specified with respect to L1's
time domain.
KarimAllah Ahmed April 10, 2018, 7:11 a.m. UTC | #4
On Mon, 2018-04-09 at 12:24 -0700, Jim Mattson wrote:
> On Mon, Apr 9, 2018 at 1:37 AM, KarimAllah Ahmed <karahmed@amazon.de> wrote:

> 

> > 

> > +       /*

> > +        * Force a nested exit that guarantees that any state capture

> > +        * afterwards by any IOCTLs (MSRs, etc) will not capture a mix of L1

> > +        * and L2 state.

> > +        *

> > +        * One example where that would lead to an issue is the TSC DEADLINE

> > +        * MSR vs the guest TSC. If the L2 guest is running, the guest TSC will

> > +        * be the L2 TSC while the TSC deadline MSR will contain the L1 TSC

> > +        * deadline MSR. That would lead to a very large (and wrong) "expire"

> > +        * diff when LAPIC is initialized during instance restore (i.e. the

> > +        * instance will appear to have hanged!).

> > +        */

> 

> This sounds like a bug in the virtualization of IA32_TSC_DEADLINE.

> Without involving save/restore, what happens if L2 sets

> IA32_TSC_DEADLINE (and L1 permits it via the MSR permission bitmap)?

> The IA32_TSC_DEADLINE MSR is always specified with respect to L1's

> time domain.


That makes sense! Let me look into that!

Thanks!

> 

Amazon Development Center Germany GmbH
Berlin - Dresden - Aachen
main office: Krausenstr. 38, 10117 Berlin
Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger
Ust-ID: DE289237879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B
KarimAllah Ahmed April 10, 2018, 7:37 a.m. UTC | #5
On Mon, 2018-04-09 at 13:26 +0200, David Hildenbrand wrote:
> On 09.04.2018 10:37, KarimAllah Ahmed wrote:

> > 

> > From: Jim Mattson <jmattson@google.com>

> > 

> > For nested virtualization L0 KVM is managing a bit of state for L2 guests,

> > this state can not be captured through the currently available IOCTLs. In

> > fact the state captured through all of these IOCTLs is usually a mix of L1

> > and L2 state. It is also dependent on whether the L2 guest was running at

> > the moment when the process was interrupted to save its state.

> > 

> > With this capability, there are two new vcpu ioctls: KVM_GET_VMX_STATE and

> > KVM_SET_VMX_STATE. These can be used for saving and restoring a VM that is

> > in VMX operation.

> > 

> 

> Very nice work!

> 

> > 

> >  

> > +static int get_vmcs_cache(struct kvm_vcpu *vcpu,

> > +			  struct kvm_state __user *user_kvm_state)

> > +{

> > +	struct vcpu_vmx *vmx = to_vmx(vcpu);

> > +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

> > +

> > +	/*

> > +	 * When running L2, the authoritative vmcs12 state is in the

> > +	 * vmcs02. When running L1, the authoritative vmcs12 state is

> > +	 * in the shadow vmcs linked to vmcs01, unless

> > +	 * sync_shadow_vmcs is set, in which case, the authoritative

> > +	 * vmcs12 state is in the vmcs12 already.

> > +	 */

> > +	if (is_guest_mode(vcpu))

> > +		sync_vmcs12(vcpu, vmcs12);

> > +	else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)

> > +		copy_shadow_to_vmcs12(vmx);

> > +

> > +	if (copy_to_user(user_kvm_state->data, vmcs12, sizeof(*vmcs12)))

> > +		return -EFAULT;

> > +

> > +	/*

> > +	 * Force a nested exit that guarantees that any state capture

> > +	 * afterwards by any IOCTLs (MSRs, etc) will not capture a mix of L1

> > +	 * and L2 state.

> > +	 *

> 

> I totally understand why this is nice, I am worried about the

> implications. Let's assume migration fails and we want to continue

> running the guest on the source. We would now have a "bad" state.

> 

> How is this to be handled (e.g. is a SET_STATE necessary?)? I think this

> implication should be documented for KVM_GET_STATE.


Yup, a SET_STATE will be needed. That being said, I guess I will do
what Jim mentioned and just fix the issue outlined here and then I can
remove this VMExit.

> > 

> > +	 * One example where that would lead to an issue is the TSC DEADLINE

> > +	 * MSR vs the guest TSC. If the L2 guest is running, the guest TSC will

> > +	 * be the L2 TSC while the TSC deadline MSR will contain the L1 TSC

> > +	 * deadline MSR. That would lead to a very large (and wrong) "expire"

> > +	 * diff when LAPIC is initialized during instance restore (i.e. the

> > +	 * instance will appear to have hanged!).

> > +	 */

> > +	if (is_guest_mode(vcpu))

> > +		nested_vmx_vmexit(vcpu, -1, 0, 0);

> > +

> > +	return 0;

> > +}

> > +

> > +static int get_vmx_state(struct kvm_vcpu *vcpu,

> > +			 struct kvm_state __user *user_kvm_state)

> > +{

> > +	u32 user_data_size;

> > +	struct vcpu_vmx *vmx = to_vmx(vcpu);

> > +	struct kvm_state kvm_state = {

> > +		.flags = 0,

> > +		.format = 0,

> > +		.size = sizeof(kvm_state),

> > +		.vmx.vmxon_pa = -1ull,

> > +		.vmx.vmcs_pa = -1ull,

> > +	};

> > +

> > +	if (copy_from_user(&user_data_size, &user_kvm_state->size,

> > +			   sizeof(user_data_size)))

> > +		return -EFAULT;

> > +

> > +	if (nested_vmx_allowed(vcpu) && vmx->nested.vmxon) {

> > +		kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;

> > +		kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;

> > +

> > +		if (vmx->nested.current_vmptr != -1ull)

> > +			kvm_state.size += VMCS12_SIZE;

> > +

> > +		if (is_guest_mode(vcpu)) {

> > +			kvm_state.flags |= KVM_STATE_GUEST_MODE;

> > +

> > +			if (vmx->nested.nested_run_pending)

> > +				kvm_state.flags |= KVM_STATE_RUN_PENDING;

> > +		}

> > +	}

> > +

> > +	if (user_data_size < kvm_state.size) {

> > +		if (copy_to_user(&user_kvm_state->size, &kvm_state.size,

> > +				 sizeof(kvm_state.size)))

> > +			return -EFAULT;

> > +		return -E2BIG;

> > +	}

> > +

> > +	if (copy_to_user(user_kvm_state, &kvm_state, sizeof(kvm_state)))

> > +		return -EFAULT;

> > +

> > +	if (vmx->nested.current_vmptr == -1ull)

> > +		return 0;

> > +

> > +	return get_vmcs_cache(vcpu, user_kvm_state);

> > +}

> > +

> > +static int set_vmcs_cache(struct kvm_vcpu *vcpu,

> > +			  struct kvm_state __user *user_kvm_state,

> > +			  struct kvm_state *kvm_state)

> > +

> > +{

> > +	struct vcpu_vmx *vmx = to_vmx(vcpu);

> > +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

> > +	u32 exit_qual;

> > +	int ret;

> > +

> > +	if ((kvm_state->size < (sizeof(*vmcs12) + sizeof(*kvm_state))) ||

> > +	    kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||

> > +	    !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))

> > +		return -EINVAL;

> > +

> > +	if (copy_from_user(vmcs12, user_kvm_state->data, sizeof(*vmcs12)))

> > +		return -EFAULT;

> > +

> > +	if (vmcs12->revision_id != VMCS12_REVISION)

> > +		return -EINVAL;

> > +

> > +	set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);

> > +

> > +	if (!(kvm_state->flags & KVM_STATE_GUEST_MODE))

> > +		return 0;

> > +

> > +	if (check_vmentry_prereqs(vcpu, vmcs12) ||

> > +	    check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))

> > +		return -EINVAL;

> > +

> > +	ret = enter_vmx_non_root_mode(vcpu, true);

> > +	if (ret)

> > +		return ret;

> > +

> > +	/*

> > +	 * This request will result in a call to

> > +	 * nested_get_vmcs12_pages before the next VM-entry.

> > +	 */

> > +	kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);

> 

> Can you elaborate (+document) why this is needed instead of trying to

> get the page right away?


Because at this point, the MMU is not initialized to point at the
right entities yet and "get pages" would need to read data from the
guest (i.e. we will need to perform gpa to hpa translation).

Ack! will update the comment in v3.

Thank you!

> 

> Thanks!

> 

Amazon Development Center Germany GmbH
Berlin - Dresden - Aachen
main office: Krausenstr. 38, 10117 Berlin
Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger
Ust-ID: DE289237879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index d6b3ff5..3ed56df 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3516,6 +3516,52 @@  Returns: 0 on success; -1 on error
 This ioctl can be used to unregister the guest memory region registered
 with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above.
 
+4.112 KVM_GET_STATE
+
+Capability: KVM_CAP_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_state (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  E2BIG:     the data size exceeds the value of 'size' specified by
+             the user (the size required will be written into size).
+
+struct kvm_state {
+	__u16 flags;
+	__u16 format;
+	__u32 size;
+	union {
+		struct kvm_vmx_state vmx;
+		struct kvm_svm_state svm;
+		__u8 pad[120];
+	};
+	__u8 data[0];
+};
+
+This ioctl copies the vcpu's kvm_state struct from the kernel to userspace.
+
+4.113 KVM_SET_STATE
+
+Capability: KVM_CAP_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_state (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_state {
+	__u16 flags;
+	__u16 format;
+	__u32 size;
+	union {
+		struct kvm_vmx_state vmx;
+		struct kvm_svm_state svm;
+		__u8 pad[120];
+	};
+	__u8 data[0];
+};
+
+This copies the vcpu's kvm_state struct from userspace to the kernel.
 
 5. The kvm_run structure
 ------------------------
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fad4d46..902db9e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -73,6 +73,7 @@ 
 #define KVM_REQ_HV_RESET		KVM_ARCH_REQ(20)
 #define KVM_REQ_HV_EXIT			KVM_ARCH_REQ(21)
 #define KVM_REQ_HV_STIMER		KVM_ARCH_REQ(22)
+#define KVM_REQ_GET_VMCS12_PAGES	KVM_ARCH_REQ(23)
 
 #define CR0_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -1090,6 +1091,12 @@  struct kvm_x86_ops {
 
 	void (*setup_mce)(struct kvm_vcpu *vcpu);
 
+	int (*get_state)(struct kvm_vcpu *vcpu,
+			 struct kvm_state __user *user_kvm_state);
+	int (*set_state)(struct kvm_vcpu *vcpu,
+			 struct kvm_state __user *user_kvm_state);
+	void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+
 	int (*smi_allowed)(struct kvm_vcpu *vcpu);
 	int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
 	int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index f3a9604..1d1cd26 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -361,4 +361,42 @@  struct kvm_sync_regs {
 #define KVM_X86_QUIRK_LINT0_REENABLED	(1 << 0)
 #define KVM_X86_QUIRK_CD_NW_CLEARED	(1 << 1)
 
+#define KVM_STATE_GUEST_MODE	0x00000001
+#define KVM_STATE_RUN_PENDING	0x00000002
+#define KVM_STATE_GIF		0x00000004
+
+struct kvm_vmx_state {
+	__u64 vmxon_pa;
+	__u64 vmcs_pa;
+};
+
+struct kvm_svm_state {
+	__u64 hsave_pa;
+	__u64 vmcb_pa;
+};
+
+/* for KVM_CAP_STATE */
+struct kvm_state {
+	/* KVM_STATE_* flags */
+	__u16 flags;
+
+	/* 0 for VMX, 1 for SVM.  */
+	__u16 format;
+
+	/* 128 for SVM, 128 + VMCS size for VMX.  */
+	__u32 size;
+
+	union {
+		/* VMXON, VMCS */
+		struct kvm_vmx_state vmx;
+		/* HSAVE_PA, VMCB */
+		struct kvm_svm_state svm;
+
+		/* Pad the union to 120 bytes.  */
+		__u8 pad[120];
+	};
+
+	__u8 data[0];
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 14655df..4d830f7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10056,10 +10056,10 @@  static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 						 struct vmcs12 *vmcs12);
 
-static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
-					struct vmcs12 *vmcs12)
+static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
 		if (vmcs12->apic_access_addr != vmx->nested.apic_access_mapping.gfn << PAGE_SHIFT) {
@@ -11101,8 +11101,6 @@  static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 		return 1;
 	}
 
-	nested_get_vmcs12_pages(vcpu, vmcs12);
-
 	msr_entry_idx = nested_vmx_load_msr(vcpu,
 					    vmcs12->vm_entry_msr_load_addr,
 					    vmcs12->vm_entry_msr_load_count);
@@ -11200,6 +11198,8 @@  static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	if (ret)
 		return ret;
 
+	nested_get_vmcs12_pages(vcpu);
+
 	/*
 	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
 	 * by event injection, halt vcpu.
@@ -12259,6 +12259,183 @@  static int enable_smi_window(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int get_vmcs_cache(struct kvm_vcpu *vcpu,
+			  struct kvm_state __user *user_kvm_state)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	/*
+	 * When running L2, the authoritative vmcs12 state is in the
+	 * vmcs02. When running L1, the authoritative vmcs12 state is
+	 * in the shadow vmcs linked to vmcs01, unless
+	 * sync_shadow_vmcs is set, in which case, the authoritative
+	 * vmcs12 state is in the vmcs12 already.
+	 */
+	if (is_guest_mode(vcpu))
+		sync_vmcs12(vcpu, vmcs12);
+	else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
+		copy_shadow_to_vmcs12(vmx);
+
+	if (copy_to_user(user_kvm_state->data, vmcs12, sizeof(*vmcs12)))
+		return -EFAULT;
+
+	/*
+	 * Force a nested exit that guarantees that any state capture
+	 * afterwards by any IOCTLs (MSRs, etc) will not capture a mix of L1
+	 * and L2 state.
+	 *
+	 * One example where that would lead to an issue is the TSC DEADLINE
+	 * MSR vs the guest TSC. If the L2 guest is running, the guest TSC will
+	 * be the L2 TSC while the TSC deadline MSR will contain the L1 TSC
+	 * deadline MSR. That would lead to a very large (and wrong) "expire"
+	 * diff when LAPIC is initialized during instance restore (i.e. the
+	 * instance will appear to have hung!).
+	 */
+	if (is_guest_mode(vcpu))
+		nested_vmx_vmexit(vcpu, -1, 0, 0);
+
+	return 0;
+}
+
+static int get_vmx_state(struct kvm_vcpu *vcpu,
+			 struct kvm_state __user *user_kvm_state)
+{
+	u32 user_data_size;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_state kvm_state = {
+		.flags = 0,
+		.format = 0,
+		.size = sizeof(kvm_state),
+		.vmx.vmxon_pa = -1ull,
+		.vmx.vmcs_pa = -1ull,
+	};
+
+	if (copy_from_user(&user_data_size, &user_kvm_state->size,
+			   sizeof(user_data_size)))
+		return -EFAULT;
+
+	if (nested_vmx_allowed(vcpu) && vmx->nested.vmxon) {
+		kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
+		kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
+
+		if (vmx->nested.current_vmptr != -1ull)
+			kvm_state.size += VMCS12_SIZE;
+
+		if (is_guest_mode(vcpu)) {
+			kvm_state.flags |= KVM_STATE_GUEST_MODE;
+
+			if (vmx->nested.nested_run_pending)
+				kvm_state.flags |= KVM_STATE_RUN_PENDING;
+		}
+	}
+
+	if (user_data_size < kvm_state.size) {
+		if (copy_to_user(&user_kvm_state->size, &kvm_state.size,
+				 sizeof(kvm_state.size)))
+			return -EFAULT;
+		return -E2BIG;
+	}
+
+	if (copy_to_user(user_kvm_state, &kvm_state, sizeof(kvm_state)))
+		return -EFAULT;
+
+	if (vmx->nested.current_vmptr == -1ull)
+		return 0;
+
+	return get_vmcs_cache(vcpu, user_kvm_state);
+}
+
+static int set_vmcs_cache(struct kvm_vcpu *vcpu,
+			  struct kvm_state __user *user_kvm_state,
+			  struct kvm_state *kvm_state)
+
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	u32 exit_qual;
+	int ret;
+
+	if ((kvm_state->size < (sizeof(*vmcs12) + sizeof(*kvm_state))) ||
+	    kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
+	    !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
+		return -EINVAL;
+
+	if (copy_from_user(vmcs12, user_kvm_state->data, sizeof(*vmcs12)))
+		return -EFAULT;
+
+	if (vmcs12->revision_id != VMCS12_REVISION)
+		return -EINVAL;
+
+	set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
+
+	if (!(kvm_state->flags & KVM_STATE_GUEST_MODE))
+		return 0;
+
+	if (check_vmentry_prereqs(vcpu, vmcs12) ||
+	    check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+		return -EINVAL;
+
+	ret = enter_vmx_non_root_mode(vcpu, true);
+	if (ret)
+		return ret;
+
+	/*
+	 * This request will result in a call to
+	 * nested_get_vmcs12_pages before the next VM-entry.
+	 */
+	kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+
+	vmx->nested.nested_run_pending = 1;
+
+	return 0;
+}
+
+static int set_vmx_state(struct kvm_vcpu *vcpu,
+			 struct kvm_state __user *user_kvm_state)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_state kvm_state;
+	int ret;
+
+	if (copy_from_user(&kvm_state, user_kvm_state, sizeof(kvm_state)))
+		return -EFAULT;
+
+	if (kvm_state.size < sizeof(kvm_state))
+		return -EINVAL;
+
+	if (kvm_state.format != 0)
+		return -EINVAL;
+
+	if (kvm_state.flags &
+	    ~(KVM_STATE_RUN_PENDING | KVM_STATE_GUEST_MODE))
+		return -EINVAL;
+
+	if (!nested_vmx_allowed(vcpu))
+		return kvm_state.vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
+
+	vmx_leave_nested(vcpu);
+
+	vmx->nested.nested_run_pending =
+		!!(kvm_state.flags & KVM_STATE_RUN_PENDING);
+
+	if (kvm_state.vmx.vmxon_pa == -1ull)
+		return 0;
+
+	if (!page_address_valid(vcpu, kvm_state.vmx.vmxon_pa))
+		return -EINVAL;
+
+	vmx->nested.vmxon_ptr = kvm_state.vmx.vmxon_pa;
+	ret = enter_vmx_operation(vcpu);
+	if (ret)
+		return ret;
+
+	if (kvm_state.vmx.vmcs_pa == -1ull)
+		return 0;
+
+	return set_vmcs_cache(vcpu, user_kvm_state, &kvm_state);
+}
+
 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -12387,6 +12564,10 @@  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
 	.setup_mce = vmx_setup_mce,
 
+	.get_state = get_vmx_state,
+	.set_state = set_vmx_state,
+	.get_vmcs12_pages = nested_get_vmcs12_pages,
+
 	.smi_allowed = vmx_smi_allowed,
 	.pre_enter_smm = vmx_pre_enter_smm,
 	.pre_leave_smm = vmx_pre_leave_smm,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 963cdb9..1ab7cc5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2873,6 +2873,9 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_X2APIC_API:
 		r = KVM_X2APIC_API_VALID_FLAGS;
 		break;
+	case KVM_CAP_STATE:
+		r = !!kvm_x86_ops->get_state;
+		break;
 	default:
 		r = 0;
 		break;
@@ -3892,6 +3895,22 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
 		break;
 	}
+	case KVM_GET_STATE: {
+		struct kvm_state __user *user_kvm_state = argp;
+
+		r = -EINVAL;
+		if (kvm_x86_ops->get_state)
+			r = kvm_x86_ops->get_state(vcpu, user_kvm_state);
+		break;
+	}
+	case KVM_SET_STATE: {
+		struct kvm_state __user *user_kvm_state = argp;
+
+		r = -EINVAL;
+		if (kvm_x86_ops->set_state)
+			r = kvm_x86_ops->set_state(vcpu, user_kvm_state);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
@@ -7051,6 +7070,8 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	bool req_immediate_exit = false;
 
 	if (kvm_request_pending(vcpu)) {
+		if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
+			kvm_x86_ops->get_vmcs12_pages(vcpu);
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4e1d7f5..4c170ff 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -949,6 +949,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_GET_CPU_CHAR 151
 #define KVM_CAP_S390_BPB 152
 #define KVM_CAP_GET_MSR_FEATURES 153
+#define KVM_CAP_STATE 154
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1380,6 +1381,10 @@  struct kvm_s390_ucas_mapping {
 /* Memory Encryption Commands */
 #define KVM_MEMORY_ENCRYPT_OP      _IOWR(KVMIO, 0xba, unsigned long)
 
+/* Available with KVM_CAP_STATE; the argument is a struct kvm_state */
+#define KVM_GET_STATE         _IOWR(KVMIO, 0xbb, struct kvm_state)
+#define KVM_SET_STATE         _IOW(KVMIO,  0xbc, struct kvm_state)
+
 struct kvm_enc_region {
 	__u64 addr;
 	__u64 size;