KVM: nVMX: support restore of VMX capability MSRs

Message ID 1472840903-7168-1-git-send-email-dmatlack@google.com (mailing list archive)
State New, archived

Commit Message

David Matlack Sept. 2, 2016, 6:28 p.m. UTC
The VMX capability MSRs advertise the set of features the KVM virtual
CPU can support. This set of features varies across different host CPUs
and KVM versions. This patch aims to address both sources of
differences, allowing VMs to be migrated across CPUs and KVM versions
without guest-visible changes to these MSRs. Note that cross-KVM-
version migration is only supported from this point forward.

In order to support cross CPU migration, the VMX capability MSRs that
depend on host CPU features (all of the VM-Execution Control capability
MSRs, IA32_VMX_MISC, and IA32_VMX_EPT_VPID_CAP) must be audited on
restore to check that the set of features advertised on restore is
a subset of what KVM and the CPU support.
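
As a worked illustration of this audit (made-up bit patterns, not real
control bits): for the control MSRs, the low 32 bits advertise must-be-1
settings and the high 32 bits advertise may-be-1 settings, so the subset
check runs in opposite directions for the two halves:

#include <assert.h>
#include <stdint.h>

static int subset(uint64_t superset, uint64_t sub, uint64_t mask)
{
	/* Within mask, every bit set in sub is also set in superset. */
	return ((superset | sub) & mask) == (superset & mask);
}

static int restorable(uint64_t supported, uint64_t data)
{
	return subset(data, supported, 0xffffffffULL) &&  /* must-be-1 bits kept */
	       subset(supported, data, ~0xffffffffULL);   /* no new may-be-1 bits */
}

int main(void)
{
	uint64_t supported = (0xffULL << 32) | 0x11;

	assert(restorable(supported, (0x1fULL << 32) | 0x11));   /* fewer features: ok */
	assert(!restorable(supported, (0x1ffULL << 32) | 0x11)); /* unknown feature */
	assert(!restorable(supported, (0xffULL << 32) | 0x01));  /* must-be-1 bit lost */
	return 0;
}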

MSRs that do not depend on host features (MSR_IA32_VMX_BASIC and the
cr0/cr4 fixed0/1 MSRs), and instead are entirely dependent on KVM
software support, can only be restored if they match the default value
in KVM. Any future changes to these MSRs will need to be backward
compatible with the existing value of these MSRs and will need to
update the checks done in vmx_set_vmx_msr().

The VMX capability MSRs are read-only, so they do not need to be on
the default MSR save/restore lists. The userspace hypervisor can set
the values of these MSRs or read them from KVM at VCPU creation time,
and restore the same value after every save/restore.
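
For illustration, a minimal userspace sketch of that flow (not part of
this patch; it assumes the standard KVM_GET_MSRS/KVM_SET_MSRS vcpu
ioctls, and the helper name is made up):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

#define MSR_IA32_VMX_BASIC	0x480

/* kvm_msrs ends in a flexible array; give it room for one entry. */
struct one_msr {
	struct kvm_msrs hdr;
	struct kvm_msr_entry entry;
};

static int vmx_msr_rw(int vcpu_fd, unsigned long req, __u32 index,
		      __u64 *data)
{
	struct one_msr m;

	memset(&m, 0, sizeof(m));
	m.hdr.nmsrs = 1;
	m.entry.index = index;
	m.entry.data = *data;

	/* Both ioctls return the number of MSRs processed, 1 on success. */
	if (ioctl(vcpu_fd, req, &m) != 1)
		return -1;

	*data = m.entry.data;
	return 0;
}

Read the value once with vmx_msr_rw(vcpu_fd, KVM_GET_MSRS,
MSR_IA32_VMX_BASIC, &val) at vCPU creation, carry it in the migration
stream, and write it back via KVM_SET_MSRS on the destination before the
first KVM_RUN; the write fails if the destination cannot honor the
advertised features.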

Signed-off-by: David Matlack <dmatlack@google.com>
---
 arch/x86/include/asm/vmx.h |  21 +++++
 arch/x86/kvm/vmx.c         | 231 ++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 230 insertions(+), 22 deletions(-)

Comments

David Matlack Sept. 7, 2016, 10:27 p.m. UTC | #1
On Fri, Sep 2, 2016 at 11:28 AM, David Matlack <dmatlack@google.com> wrote:
> The VMX capability MSRs advertise the set of features the KVM virtual
> CPU can support. This set of features varies across different host CPUs
> and KVM versions. This patch aims to address both sources of
> differences, allowing VMs to be migrated across CPUs and KVM versions
> without guest-visible changes to these MSRs. Note that cross-KVM-
> version migration is only supported from this point forward.
>
> In order to support cross CPU migration, the VMX capability MSRs that
> depend on host CPU features (all of the VM-Execution Control capability
> MSRs, IA32_VMX_MISC, and IA32_VMX_EPT_VPID_CAP) must be audited on
> restore to check that the set of features advertised on restore is
> a subset of what KVM and the CPU support.
>
> MSRs that do not depend on host features (MSR_IA32_VMX_BASIC and the
> cr0/cr4 fixed0/1 MSRs), and instead are entirely dependent on KVM
> software support, can only be restored if they match the default value
> in KVM. Any future changes to these MSRs will need to be backward
> compatible with the existing value of these MSRs and will need to
> update the checks done in vmx_set_vmx_msr().
>
> The VMX capability MSRs are read-only, so they do not need to be on
> the default MSR save/restore lists. The userspace hypervisor can set
> the values of these MSRs or read them from KVM at VCPU creation time,
> and restore the same value after every save/restore.
>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  arch/x86/include/asm/vmx.h |  21 +++++
>  arch/x86/kvm/vmx.c         | 231 ++++++++++++++++++++++++++++++++++++++++-----
>  2 files changed, 230 insertions(+), 22 deletions(-)
>
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index a002b07..e57f389 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -25,6 +25,7 @@
>  #define VMX_H
>
>
> +#include <linux/bitops.h>
>  #include <linux/types.h>
>  #include <uapi/asm/vmx.h>
>
> @@ -110,6 +111,26 @@
>  #define VMX_MISC_SAVE_EFER_LMA                 0x00000020
>  #define VMX_MISC_ACTIVITY_HLT                  0x00000040
>
> +static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc)
> +{
> +       return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
> +}
> +
> +static inline int vmx_misc_cr3_count(u64 vmx_misc)
> +{
> +       return (vmx_misc & GENMASK_ULL(24, 16)) >> 16;
> +}
> +
> +static inline int vmx_misc_max_msr(u64 vmx_misc)
> +{
> +       return (vmx_misc & GENMASK_ULL(27, 25)) >> 25;
> +}
> +
> +static inline int vmx_misc_mseg_revid(u64 vmx_misc)
> +{
> +       return (vmx_misc & GENMASK_ULL(63, 32)) >> 32;
> +}
> +
>  /* VMCS Encodings */
>  enum vmcs_field {
>         VIRTUAL_PROCESSOR_ID            = 0x00000000,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 5cede40..1e88cc8 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -464,6 +464,12 @@ struct nested_vmx {
>         u32 nested_vmx_misc_high;
>         u32 nested_vmx_ept_caps;
>         u32 nested_vmx_vpid_caps;
> +       u64 nested_vmx_basic;
> +       u64 nested_vmx_cr0_fixed0;
> +       u64 nested_vmx_cr0_fixed1;
> +       u64 nested_vmx_cr4_fixed0;
> +       u64 nested_vmx_cr4_fixed1;
> +       u64 nested_vmx_vmcs_enum;
>  };
>
>  #define POSTED_INTR_ON  0
> @@ -2846,6 +2852,33 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>                 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
>                 VMX_MISC_ACTIVITY_HLT;
>         vmx->nested.nested_vmx_misc_high = 0;
> +
> +       /*
> +        * This MSR reports some information about VMX support. We
> +        * should return information about the VMX we emulate for the
> +        * guest, and the VMCS structure we give it - not about the
> +        * VMX support of the underlying hardware.
> +        */
> +       vmx->nested.nested_vmx_basic =
> +               VMCS12_REVISION |
> +               VMX_BASIC_TRUE_CTLS |
> +               ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
> +               (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

I'll need to send out a v2 to account for "KVM: nVMX: expose INS/OUTS
information support"
(https://git.kernel.org/cgit/virt/kvm/kvm.git/commit/?h=next&id=9ac7e3e815060efdc86b6d12448200e3c3597e01).
It will be a relatively small modification: restore of
MSR_IA32_VMX_BASIC will need to support the 0-setting of
VMX_BASIC_INOUT.
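
For concreteness, a hedged sketch of what that relaxation might look
like, assuming VMX_BASIC_INOUT is bit 54 of IA32_VMX_BASIC and reusing
is_bitwise_subset() from this patch (v2 may well structure it
differently):

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 vmx_basic_inout = BIT_ULL(54);	/* VMX_BASIC_INOUT */

	/* The INS/OUTS-info bit may only move toward 0... */
	if (!is_bitwise_subset(vmx->nested.nested_vmx_basic, data,
			       vmx_basic_inout))
		return -EINVAL;

	/* ...and every other bit must match KVM's default exactly. */
	if ((data & ~vmx_basic_inout) !=
	    (vmx->nested.nested_vmx_basic & ~vmx_basic_inout))
		return -EINVAL;

	vmx->nested.nested_vmx_basic = data;
	return 0;
}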

Also, as an FYI to reviewers, I will be on vacation the next two
weeks, so expect high latency from me until late September. Thanks!

> +
> +       /*
> +        * These MSRs specify bits which the guest must keep fixed (on or off)
> +        * while L1 is in VMXON mode (in L1's root mode, or running an L2).
> +        * We picked the standard core2 setting.
> +        */
> +#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
> +#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
> +       vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON;
> +       vmx->nested.nested_vmx_cr0_fixed1 = -1ULL;
> +       vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON;
> +       vmx->nested.nested_vmx_cr4_fixed1 = -1ULL;
> +
> +       /* highest index: VMX_PREEMPTION_TIMER_VALUE */
> +       vmx->nested.nested_vmx_vmcs_enum = 0x2e;
>  }
>
>  static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
> @@ -2861,6 +2894,171 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
>         return low | ((u64)high << 32);
>  }
>
> +static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
> +{
> +       superset &= mask;
> +       subset &= mask;
> +
> +       return (superset | subset) == superset;
> +}
> +
> +static int
> +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
> +{
> +       u64 supported;
> +       u32 *lowp, *highp;
> +
> +       switch (msr_index) {
> +       case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
> +       case MSR_IA32_VMX_PINBASED_CTLS:
> +               lowp = &vmx->nested.nested_vmx_pinbased_ctls_low;
> +               highp = &vmx->nested.nested_vmx_pinbased_ctls_high;
> +               break;
> +       case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
> +               lowp = &vmx->nested.nested_vmx_true_procbased_ctls_low;
> +               highp = &vmx->nested.nested_vmx_procbased_ctls_high;
> +               break;
> +       case MSR_IA32_VMX_PROCBASED_CTLS:
> +               lowp = &vmx->nested.nested_vmx_procbased_ctls_low;
> +               highp = &vmx->nested.nested_vmx_procbased_ctls_high;
> +               break;
> +       case MSR_IA32_VMX_TRUE_EXIT_CTLS:
> +               lowp = &vmx->nested.nested_vmx_true_exit_ctls_low;
> +               highp = &vmx->nested.nested_vmx_exit_ctls_high;
> +               break;
> +       case MSR_IA32_VMX_EXIT_CTLS:
> +               lowp = &vmx->nested.nested_vmx_exit_ctls_low;
> +               highp = &vmx->nested.nested_vmx_exit_ctls_high;
> +               break;
> +       case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
> +               lowp = &vmx->nested.nested_vmx_true_entry_ctls_low;
> +               highp = &vmx->nested.nested_vmx_entry_ctls_high;
> +               break;
> +       case MSR_IA32_VMX_ENTRY_CTLS:
> +               lowp = &vmx->nested.nested_vmx_entry_ctls_low;
> +               highp = &vmx->nested.nested_vmx_entry_ctls_high;
> +               break;
> +       case MSR_IA32_VMX_PROCBASED_CTLS2:
> +               lowp = &vmx->nested.nested_vmx_secondary_ctls_low;
> +               highp = &vmx->nested.nested_vmx_secondary_ctls_high;
> +               break;
> +       default:
> +               BUG();
> +       }
> +
> +       supported = vmx_control_msr(*lowp, *highp);
> +
> +       /* Check must-be-1 bits are still 1. */
> +       if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
> +               return -EINVAL;
> +
> +       /* Check must-be-0 bits are still 0. */
> +       if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
> +               return -EINVAL;
> +
> +       *lowp = data;
> +       *highp = data >> 32;
> +       return 0;
> +}
> +
> +static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
> +{
> +       const u64 feature_and_reserved_bits =
> +               /* feature */
> +               BIT(5) | GENMASK(8, 6) | BIT(15) | BIT(28) | BIT(29) |
> +               /* reserved */
> +               GENMASK(14, 9) | GENMASK(31, 30);
> +       u64 vmx_misc;
> +
> +       vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low,
> +                                  vmx->nested.nested_vmx_misc_high);
> +
> +       if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
> +               return -EINVAL;
> +
> +       if ((vmx->nested.nested_vmx_pinbased_ctls_high &
> +            PIN_BASED_VMX_PREEMPTION_TIMER) &&
> +           vmx_misc_preemption_timer_rate(data) !=
> +           vmx_misc_preemption_timer_rate(vmx_misc))
> +               return -EINVAL;
> +
> +       if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
> +               return -EINVAL;
> +
> +       if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
> +               return -EINVAL;
> +
> +       if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
> +               return -EINVAL;
> +
> +       vmx->nested.nested_vmx_misc_low = data;
> +       vmx->nested.nested_vmx_misc_high = data >> 32;
> +       return 0;
> +}
> +
> +static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
> +{
> +       u64 vmx_ept_vpid_cap;
> +
> +       vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps,
> +                                          vmx->nested.nested_vmx_vpid_caps);
> +
> +       /* Every bit is either reserved or a feature bit. */
> +       if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
> +               return -EINVAL;
> +
> +       vmx->nested.nested_vmx_ept_caps = data;
> +       vmx->nested.nested_vmx_vpid_caps = data >> 32;
> +       return 0;
> +}
> +
> +/*
> + * Called when userspace is restoring VMX MSRs.
> + *
> + * Returns 0 on success, non-0 otherwise.
> + */
> +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
> +{
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +       switch (msr_index) {
> +       case MSR_IA32_VMX_BASIC:
> +               return vmx->nested.nested_vmx_basic != data;
> +       case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
> +       case MSR_IA32_VMX_PINBASED_CTLS:
> +       case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
> +       case MSR_IA32_VMX_PROCBASED_CTLS:
> +       case MSR_IA32_VMX_TRUE_EXIT_CTLS:
> +       case MSR_IA32_VMX_EXIT_CTLS:
> +       case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
> +       case MSR_IA32_VMX_ENTRY_CTLS:
> +       case MSR_IA32_VMX_PROCBASED_CTLS2:
> +               return vmx_restore_control_msr(vmx, msr_index, data);
> +       case MSR_IA32_VMX_MISC:
> +               return vmx_restore_vmx_misc(vmx, data);
> +       case MSR_IA32_VMX_CR0_FIXED0:
> +               return vmx->nested.nested_vmx_cr0_fixed0 != data;
> +       case MSR_IA32_VMX_CR0_FIXED1:
> +               return vmx->nested.nested_vmx_cr0_fixed1 != data;
> +       case MSR_IA32_VMX_CR4_FIXED0:
> +               return vmx->nested.nested_vmx_cr4_fixed0 != data;
> +       case MSR_IA32_VMX_CR4_FIXED1:
> +               return vmx->nested.nested_vmx_cr4_fixed1 != data;
> +       case MSR_IA32_VMX_EPT_VPID_CAP:
> +               return vmx_restore_vmx_ept_vpid_cap(vmx, data);
> +       case MSR_IA32_VMX_VMCS_ENUM:
> +               return vmx->nested.nested_vmx_vmcs_enum != data;
> +       default:
> +               /*
> +                * The rest of the VMX capability MSRs do not support restore.
> +                */
> +               return -EINVAL;
> +       }
> +
> +       return 0;
> +}
> +
>  /* Returns 0 on success, non-0 otherwise. */
>  static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
>  {
> @@ -2868,15 +3066,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
>
>         switch (msr_index) {
>         case MSR_IA32_VMX_BASIC:
> -               /*
> -                * This MSR reports some information about VMX support. We
> -                * should return information about the VMX we emulate for the
> -                * guest, and the VMCS structure we give it - not about the
> -                * VMX support of the underlying hardware.
> -                */
> -               *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
> -                          ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
> -                          (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
> +               *pdata = vmx->nested.nested_vmx_basic;
>                 break;
>         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
>         case MSR_IA32_VMX_PINBASED_CTLS:
> @@ -2919,27 +3109,20 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
>                         vmx->nested.nested_vmx_misc_low,
>                         vmx->nested.nested_vmx_misc_high);
>                 break;
> -       /*
> -        * These MSRs specify bits which the guest must keep fixed (on or off)
> -        * while L1 is in VMXON mode (in L1's root mode, or running an L2).
> -        * We picked the standard core2 setting.
> -        */
> -#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
> -#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
>         case MSR_IA32_VMX_CR0_FIXED0:
> -               *pdata = VMXON_CR0_ALWAYSON;
> +               *pdata = vmx->nested.nested_vmx_cr0_fixed0;
>                 break;
>         case MSR_IA32_VMX_CR0_FIXED1:
> -               *pdata = -1ULL;
> +               *pdata = vmx->nested.nested_vmx_cr0_fixed1;
>                 break;
>         case MSR_IA32_VMX_CR4_FIXED0:
> -               *pdata = VMXON_CR4_ALWAYSON;
> +               *pdata = vmx->nested.nested_vmx_cr4_fixed0;
>                 break;
>         case MSR_IA32_VMX_CR4_FIXED1:
> -               *pdata = -1ULL;
> +               *pdata = vmx->nested.nested_vmx_cr4_fixed1;
>                 break;
>         case MSR_IA32_VMX_VMCS_ENUM:
> -               *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
> +               *pdata = vmx->nested.nested_vmx_vmcs_enum;
>                 break;
>         case MSR_IA32_VMX_PROCBASED_CTLS2:
>                 *pdata = vmx_control_msr(
> @@ -3122,7 +3305,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                         vmx_leave_nested(vcpu);
>                 break;
>         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
> -               return 1; /* they are read-only */
> +               if (!msr_info->host_initiated)
> +                       return 1; /* they are read-only */
> +               if (!nested_vmx_allowed(vcpu))
> +                       return 1;
> +               return vmx_set_vmx_msr(vcpu, msr_index, data);
>         case MSR_IA32_XSS:
>                 if (!vmx_xsaves_supported())
>                         return 1;
> --
> 2.8.0.rc3.226.g39d4020
>
Paolo Bonzini Nov. 4, 2016, 2:50 p.m. UTC | #2
On 02/09/2016 20:28, David Matlack wrote:
> +	case MSR_IA32_VMX_CR4_FIXED0:
> +		return vmx->nested.nested_vmx_cr4_fixed0 != data;
> +	case MSR_IA32_VMX_CR4_FIXED1:
> +		return vmx->nested.nested_vmx_cr4_fixed1 != data;

Processors will have different fixed0/fixed1 as they grow support for
new CR4 bits.  The same is true for CR0, because IIRC the capability
MSRs changed when Intel introduced unrestricted guest support.

Userspace should set fixed0/fixed1 based on other execution controls
(e.g. unrestricted guest) or CPUID (e.g. PKU, FSGSBASE, etc.), similar
to how QEMU is doing it for CPUID leaf 0Dh (XSAVE).

So I think we should accept the case where L1's fixed0/fixed1 are more
constrained than L0's.  Then handle_set_cr{0,4} can check that L1's
fixed0/fixed1 are respected.

Paolo
David Matlack Nov. 4, 2016, 5:06 p.m. UTC | #3
On Fri, Nov 4, 2016 at 7:50 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 02/09/2016 20:28, David Matlack wrote:
>> +     case MSR_IA32_VMX_CR4_FIXED0:
>> +             return vmx->nested.nested_vmx_cr4_fixed0 != data;
>> +     case MSR_IA32_VMX_CR4_FIXED1:
>> +             return vmx->nested.nested_vmx_cr4_fixed1 != data;
>
> Processors will have different fixed0/fixed1 as they grow support for
> new CR4 bits.  The same is true for CR0, because IIRC the capability
> MSRs changed when Intel introduced unrestricted guest support.
>
> Userspace should set fixed0/fixed1 based on other execution controls
> (e.g. unrestricted guest) or CPUID (e.g. PKU, FSGSBASE, etc.), similar
> to how QEMU is doing it for CPUID leaf 0Dh (XSAVE).
>
> So I think we should accept the case where L1's fixed0/fixed1 are more
> constrained than L0's.  Then handle_set_cr{0,4} can check that L1's
> fixed0/fixed1 are respected.

Agreed, I'll include that in v2.
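
A hedged sketch of that direction, reusing the nested_vmx_cr0_fixed0/1
fields from this patch (the function name is made up, and the CR4 case
would be symmetric):

static bool nested_cr0_valid(struct vcpu_vmx *vmx, unsigned long val)
{
	u64 fixed0 = vmx->nested.nested_vmx_cr0_fixed0;
	u64 fixed1 = vmx->nested.nested_vmx_cr0_fixed1;

	/* fixed0: bits L1 must keep set; fixed1: bits L1 may set. */
	return ((val & fixed0) == fixed0) && !(val & ~fixed1);
}

Restore would then accept any fixed0 that is a superset of KVM's default
(more must-be-1 bits) and any fixed1 that is a subset (fewer may-be-1
bits), with handle_set_cr0() enforcing the stored values on guest CR
writes.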

>
> Paolo

Patch

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index a002b07..e57f389 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -25,6 +25,7 @@ 
 #define VMX_H
 
 
+#include <linux/bitops.h>
 #include <linux/types.h>
 #include <uapi/asm/vmx.h>
 
@@ -110,6 +111,26 @@ 
 #define VMX_MISC_SAVE_EFER_LMA			0x00000020
 #define VMX_MISC_ACTIVITY_HLT			0x00000040
 
+static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc)
+{
+	return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+}
+
+static inline int vmx_misc_cr3_count(u64 vmx_misc)
+{
+	return (vmx_misc & GENMASK_ULL(24, 16)) >> 16;
+}
+
+static inline int vmx_misc_max_msr(u64 vmx_misc)
+{
+	return (vmx_misc & GENMASK_ULL(27, 25)) >> 25;
+}
+
+static inline int vmx_misc_mseg_revid(u64 vmx_misc)
+{
+	return (vmx_misc & GENMASK_ULL(63, 32)) >> 32;
+}
+
 /* VMCS Encodings */
 enum vmcs_field {
 	VIRTUAL_PROCESSOR_ID            = 0x00000000,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5cede40..1e88cc8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -464,6 +464,12 @@  struct nested_vmx {
 	u32 nested_vmx_misc_high;
 	u32 nested_vmx_ept_caps;
 	u32 nested_vmx_vpid_caps;
+	u64 nested_vmx_basic;
+	u64 nested_vmx_cr0_fixed0;
+	u64 nested_vmx_cr0_fixed1;
+	u64 nested_vmx_cr4_fixed0;
+	u64 nested_vmx_cr4_fixed1;
+	u64 nested_vmx_vmcs_enum;
 };
 
 #define POSTED_INTR_ON  0
@@ -2846,6 +2852,33 @@  static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
 		VMX_MISC_ACTIVITY_HLT;
 	vmx->nested.nested_vmx_misc_high = 0;
+
+	/*
+	 * This MSR reports some information about VMX support. We
+	 * should return information about the VMX we emulate for the
+	 * guest, and the VMCS structure we give it - not about the
+	 * VMX support of the underlying hardware.
+	 */
+	vmx->nested.nested_vmx_basic =
+		VMCS12_REVISION |
+		VMX_BASIC_TRUE_CTLS |
+		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
+		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+
+	/*
+	 * These MSRs specify bits which the guest must keep fixed (on or off)
+	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+	 * We picked the standard core2 setting.
+	 */
+#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
+	vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON;
+	vmx->nested.nested_vmx_cr0_fixed1 = -1ULL;
+	vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON;
+	vmx->nested.nested_vmx_cr4_fixed1 = -1ULL;
+
+	/* highest index: VMX_PREEMPTION_TIMER_VALUE */
+	vmx->nested.nested_vmx_vmcs_enum = 0x2e;
 }
 
 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2861,6 +2894,171 @@  static inline u64 vmx_control_msr(u32 low, u32 high)
 	return low | ((u64)high << 32);
 }
 
+static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
+{
+	superset &= mask;
+	subset &= mask;
+
+	return (superset | subset) == superset;
+}
+
+static int
+vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+	u64 supported;
+	u32 *lowp, *highp;
+
+	switch (msr_index) {
+	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+	case MSR_IA32_VMX_PINBASED_CTLS:
+		lowp = &vmx->nested.nested_vmx_pinbased_ctls_low;
+		highp = &vmx->nested.nested_vmx_pinbased_ctls_high;
+		break;
+	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+		lowp = &vmx->nested.nested_vmx_true_procbased_ctls_low;
+		highp = &vmx->nested.nested_vmx_procbased_ctls_high;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+		lowp = &vmx->nested.nested_vmx_procbased_ctls_low;
+		highp = &vmx->nested.nested_vmx_procbased_ctls_high;
+		break;
+	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+		lowp = &vmx->nested.nested_vmx_true_exit_ctls_low;
+		highp = &vmx->nested.nested_vmx_exit_ctls_high;
+		break;
+	case MSR_IA32_VMX_EXIT_CTLS:
+		lowp = &vmx->nested.nested_vmx_exit_ctls_low;
+		highp = &vmx->nested.nested_vmx_exit_ctls_high;
+		break;
+	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+		lowp = &vmx->nested.nested_vmx_true_entry_ctls_low;
+		highp = &vmx->nested.nested_vmx_entry_ctls_high;
+		break;
+	case MSR_IA32_VMX_ENTRY_CTLS:
+		lowp = &vmx->nested.nested_vmx_entry_ctls_low;
+		highp = &vmx->nested.nested_vmx_entry_ctls_high;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		lowp = &vmx->nested.nested_vmx_secondary_ctls_low;
+		highp = &vmx->nested.nested_vmx_secondary_ctls_high;
+		break;
+	default:
+		BUG();
+	}
+
+	supported = vmx_control_msr(*lowp, *highp);
+
+	/* Check must-be-1 bits are still 1. */
+	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
+		return -EINVAL;
+
+	/* Check must-be-0 bits are still 0. */
+	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
+		return -EINVAL;
+
+	*lowp = data;
+	*highp = data >> 32;
+	return 0;
+}
+
+static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
+{
+	const u64 feature_and_reserved_bits =
+		/* feature */
+		BIT(5) | GENMASK(8, 6) | BIT(15) | BIT(28) | BIT(29) |
+		/* reserved */
+		GENMASK(14, 9) | GENMASK(31, 30);
+	u64 vmx_misc;
+
+	vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low,
+				   vmx->nested.nested_vmx_misc_high);
+
+	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
+		return -EINVAL;
+
+	if ((vmx->nested.nested_vmx_pinbased_ctls_high &
+	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
+	    vmx_misc_preemption_timer_rate(data) !=
+	    vmx_misc_preemption_timer_rate(vmx_misc))
+		return -EINVAL;
+
+	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
+		return -EINVAL;
+
+	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
+		return -EINVAL;
+
+	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
+		return -EINVAL;
+
+	vmx->nested.nested_vmx_misc_low = data;
+	vmx->nested.nested_vmx_misc_high = data >> 32;
+	return 0;
+}
+
+static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
+{
+	u64 vmx_ept_vpid_cap;
+
+	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps,
+					   vmx->nested.nested_vmx_vpid_caps);
+
+	/* Every bit is either reserved or a feature bit. */
+	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
+		return -EINVAL;
+
+	vmx->nested.nested_vmx_ept_caps = data;
+	vmx->nested.nested_vmx_vpid_caps = data >> 32;
+	return 0;
+}
+
+/*
+ * Called when userspace is restoring VMX MSRs.
+ *
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	switch (msr_index) {
+	case MSR_IA32_VMX_BASIC:
+		return vmx->nested.nested_vmx_basic != data;
+	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+	case MSR_IA32_VMX_PINBASED_CTLS:
+	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+	case MSR_IA32_VMX_EXIT_CTLS:
+	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+	case MSR_IA32_VMX_ENTRY_CTLS:
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		return vmx_restore_control_msr(vmx, msr_index, data);
+	case MSR_IA32_VMX_MISC:
+		return vmx_restore_vmx_misc(vmx, data);
+	case MSR_IA32_VMX_CR0_FIXED0:
+		return vmx->nested.nested_vmx_cr0_fixed0 != data;
+	case MSR_IA32_VMX_CR0_FIXED1:
+		return vmx->nested.nested_vmx_cr0_fixed1 != data;
+	case MSR_IA32_VMX_CR4_FIXED0:
+		return vmx->nested.nested_vmx_cr4_fixed0 != data;
+	case MSR_IA32_VMX_CR4_FIXED1:
+		return vmx->nested.nested_vmx_cr4_fixed1 != data;
+	case MSR_IA32_VMX_EPT_VPID_CAP:
+		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
+	case MSR_IA32_VMX_VMCS_ENUM:
+		return vmx->nested.nested_vmx_vmcs_enum != data;
+	default:
+		/*
+		 * The rest of the VMX capability MSRs do not support restore.
+		 */
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /* Returns 0 on success, non-0 otherwise. */
 static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 {
@@ -2868,15 +3066,7 @@  static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 
 	switch (msr_index) {
 	case MSR_IA32_VMX_BASIC:
-		/*
-		 * This MSR reports some information about VMX support. We
-		 * should return information about the VMX we emulate for the
-		 * guest, and the VMCS structure we give it - not about the
-		 * VMX support of the underlying hardware.
-		 */
-		*pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
-			   ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
-			   (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+		*pdata = vmx->nested.nested_vmx_basic;
 		break;
 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
 	case MSR_IA32_VMX_PINBASED_CTLS:
@@ -2919,27 +3109,20 @@  static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 			vmx->nested.nested_vmx_misc_low,
 			vmx->nested.nested_vmx_misc_high);
 		break;
-	/*
-	 * These MSRs specify bits which the guest must keep fixed (on or off)
-	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
-	 * We picked the standard core2 setting.
-	 */
-#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
-#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
 	case MSR_IA32_VMX_CR0_FIXED0:
-		*pdata = VMXON_CR0_ALWAYSON;
+		*pdata = vmx->nested.nested_vmx_cr0_fixed0;
 		break;
 	case MSR_IA32_VMX_CR0_FIXED1:
-		*pdata = -1ULL;
+		*pdata = vmx->nested.nested_vmx_cr0_fixed1;
 		break;
 	case MSR_IA32_VMX_CR4_FIXED0:
-		*pdata = VMXON_CR4_ALWAYSON;
+		*pdata = vmx->nested.nested_vmx_cr4_fixed0;
 		break;
 	case MSR_IA32_VMX_CR4_FIXED1:
-		*pdata = -1ULL;
+		*pdata = vmx->nested.nested_vmx_cr4_fixed1;
 		break;
 	case MSR_IA32_VMX_VMCS_ENUM:
-		*pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
+		*pdata = vmx->nested.nested_vmx_vmcs_enum;
 		break;
 	case MSR_IA32_VMX_PROCBASED_CTLS2:
 		*pdata = vmx_control_msr(
@@ -3122,7 +3305,11 @@  static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			vmx_leave_nested(vcpu);
 		break;
 	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
-		return 1; /* they are read-only */
+		if (!msr_info->host_initiated)
+			return 1; /* they are read-only */
+		if (!nested_vmx_allowed(vcpu))
+			return 1;
+		return vmx_set_vmx_msr(vcpu, msr_index, data);
 	case MSR_IA32_XSS:
 		if (!vmx_xsaves_supported())
 			return 1;