
[v6,05/13] KVM: nVMX: implement enlightened VMPTRLD and VMCLEAR

Message ID 20181016165011.6607-6-vkuznets@redhat.com (mailing list archive)
State New, archived
Series: KVM: nVMX: Enlightened VMCS for Hyper-V on KVM

Commit Message

Vitaly Kuznetsov Oct. 16, 2018, 4:50 p.m. UTC
Per Hyper-V TLFS 5.0b:

"The L1 hypervisor may choose to use enlightened VMCSs by writing 1 to
the corresponding field in the VP assist page (see section 7.8.7).
Another field in the VP assist page controls the currently active
enlightened VMCS. Each enlightened VMCS is exactly one page (4 KB) in
size and must be initially zeroed. No VMPTRLD instruction must be
executed to make an enlightened VMCS active or current.

After the L1 hypervisor performs a VM entry with an enlightened VMCS,
the VMCS is considered active on the processor. An enlightened VMCS
can only be active on a single processor at the same time. The L1
hypervisor can execute a VMCLEAR instruction to transition an
enlightened VMCS from the active to the non-active state. Any VMREAD
or VMWRITE instructions while an enlightened VMCS is active is
unsupported and can result in unexpected behavior."

Keep the Enlightened VMCS structure for the current L2 guest permanently
mapped from struct nested_vmx instead of mapping it on every access.
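
For reference, the L1-side flow the TLFS describes looks roughly like this
(an illustrative sketch only: the page allocation helper is made up, but the
VP assist page field names match the hv_vp_assist_page layout used by this
series):

	/* L1: opt in to enlightened VMCS via the VP assist page. */
	struct hv_enlightened_vmcs *evmcs = alloc_zeroed_4k_page();

	evmcs->revision_id = VMCS12_REVISION;	/* checked by this patch */
	vp_assist_page->enlighten_vmentry = 1;
	/* Making an eVMCS current takes no VMPTRLD, just this write: */
	vp_assist_page->current_nested_vmcs = __pa(evmcs);

	/* Fill in eVMCS fields; a subsequent VMLAUNCH makes it active. */
	/* VMCLEAR on its PA transitions it back to non-active. */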

Suggested-by: Ladi Prosek <lprosek@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 arch/x86/kvm/vmx.c | 115 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 108 insertions(+), 7 deletions(-)

Comments

Jim Mattson Dec. 12, 2018, 11:19 p.m. UTC | #1
On Tue, Oct 16, 2018 at 9:50 AM Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
> [...]
>
> +               vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);

Are you sure that directly mapping guest memory isn't going to lead to
time-of-check vs. time-of-use bugs? This is a very hard programming
model to get right.
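
For instance, any pair of reads from the mapped guest page is a potential
double fetch. Paraphrasing two spots in this patch:

	/* nested_vmx_handle_enlightened_vmptrld(): time of check */
	if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)
		return 0;

	/* ... another vCPU can rewrite the page at any point here ... */

	/* copy_enlightened_to_vmcs12(): time of use, may see a new value */
	vmcs12->hdr.revision_id = evmcs->revision_id;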
Vitaly Kuznetsov Dec. 13, 2018, 10:26 a.m. UTC | #2
Jim Mattson <jmattson@google.com> writes:

> On Tue, Oct 16, 2018 at 9:50 AM Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>>
>> +
>> +               vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
>
> Are you sure that directly mapping guest memory isn't going to lead to
> time-of-check vs. time-of-use bugs? This is a very hard programming
> model to get right.

The basic assumption here is that an Enlightened VMCS (just like a normal
or shadow VMCS) is only being accessed by one L1 vCPU, and when we access
it from KVM that vCPU is not running. Yes, the L1 guest can screw itself up
by breaking this assumption, but honestly I don't see how this is different
from a normal VMCS: we can always break things by writing to the page from
a different vCPU.

The Enlightened VMCS is (mostly) not used directly: we copy it to vmcs12
and then back before entry. The only field we always access directly is
hv_clean_fields. We could, of course, copy it to vmcs12 too, but I failed
to find a reason to do so: the L1 guest is in control of the field; it can
always write junk there and the L2 guest will likely get broken.
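
Schematically, hv_clean_fields gates which field groups have to be re-read
from the eVMCS (a simplified sketch, not the exact code from this series):

	u32 clean = vmx->nested.hv_evmcs->hv_clean_fields;

	if (!(clean & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC)) {
		/* Group is dirty: re-read it from guest memory. */
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
	}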

I remember having map/copy/unmap sequences for the eVMCS on entry/exit in
some early version of this series, but they were just slowing things down,
so I switched to having it permanently mapped. In case you see (potential)
grave bugs with this we can of course reconsider.

Thanks,

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index aebd008ccccb..cfb44acd4291 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -20,6 +20,7 @@
 #include "mmu.h"
 #include "cpuid.h"
 #include "lapic.h"
+#include "hyperv.h"
 
 #include <linux/kvm_host.h>
 #include <linux/module.h>
@@ -889,6 +890,8 @@ struct nested_vmx {
 		bool guest_mode;
 	} smm;
 
+	gpa_t hv_evmcs_vmptr;
+	struct page *hv_evmcs_page;
 	struct hv_enlightened_vmcs *hv_evmcs;
 };
 
@@ -8111,11 +8114,13 @@ static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
 static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
 				u32 vm_instruction_error)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
 	/*
 	 * failValid writes the error number to the current VMCS, which
 	 * can't be done if there isn't a current VMCS.
 	 */
-	if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
+	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
 		return nested_vmx_failInvalid(vcpu);
 
 	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
@@ -8441,6 +8446,20 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
 	vmcs_write64(VMCS_LINK_POINTER, -1ull);
 }
 
+static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.hv_evmcs)
+		return;
+
+	kunmap(vmx->nested.hv_evmcs_page);
+	kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
+	vmx->nested.hv_evmcs_vmptr = -1ull;
+	vmx->nested.hv_evmcs_page = NULL;
+	vmx->nested.hv_evmcs = NULL;
+}
+
 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8509,6 +8528,8 @@ static void free_nested(struct kvm_vcpu *vcpu)
 
 	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
 
+	nested_release_evmcs(vcpu);
+
 	free_loaded_vmcs(&vmx->nested.vmcs02);
 }
 
@@ -8542,12 +8563,18 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 		return nested_vmx_failValid(vcpu,
 			VMXERR_VMCLEAR_VMXON_POINTER);
 
-	if (vmptr == vmx->nested.current_vmptr)
-		nested_release_vmcs12(vcpu);
+	if (vmx->nested.hv_evmcs_page) {
+		if (vmptr == vmx->nested.hv_evmcs_vmptr)
+			nested_release_evmcs(vcpu);
+	} else {
+		if (vmptr == vmx->nested.current_vmptr)
+			nested_release_vmcs12(vcpu);
 
-	kvm_vcpu_write_guest(vcpu,
-			vmptr + offsetof(struct vmcs12, launch_state),
-			&zero, sizeof(zero));
+		kvm_vcpu_write_guest(vcpu,
+				     vmptr + offsetof(struct vmcs12,
+						      launch_state),
+				     &zero, sizeof(zero));
+	}
 
 	return nested_vmx_succeed(vcpu);
 }
@@ -8637,6 +8664,8 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
 	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
 	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
 
+	vmcs12->hdr.revision_id = evmcs->revision_id;
+
 	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
 	vmcs12->tpr_threshold = evmcs->tpr_threshold;
 	vmcs12->guest_rip = evmcs->guest_rip;
@@ -9268,6 +9297,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 		return nested_vmx_failValid(vcpu,
 			VMXERR_VMPTRLD_VMXON_POINTER);
 
+	/* Forbid normal VMPTRLD if Enlightened version was used */
+	if (vmx->nested.hv_evmcs)
+		return 1;
+
 	if (vmx->nested.current_vmptr != vmptr) {
 		struct vmcs12 *new_vmcs12;
 		struct page *page;
@@ -9301,6 +9334,68 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 	return nested_vmx_succeed(vcpu);
 }
 
+/*
+ * This is an equivalent of the nested hypervisor executing the vmptrld
+ * instruction.
+ */
+static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct hv_vp_assist_page assist_page;
+
+	if (likely(!vmx->nested.enlightened_vmcs_enabled))
+		return 1;
+
+	if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
+		return 1;
+
+	if (unlikely(!assist_page.enlighten_vmentry))
+		return 1;
+
+	if (unlikely(assist_page.current_nested_vmcs !=
+		     vmx->nested.hv_evmcs_vmptr)) {
+
+		if (!vmx->nested.hv_evmcs)
+			vmx->nested.current_vmptr = -1ull;
+
+		nested_release_evmcs(vcpu);
+
+		vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
+			vcpu, assist_page.current_nested_vmcs);
+
+		if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
+			return 0;
+
+		vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
+
+		if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) {
+			nested_release_evmcs(vcpu);
+			return 0;
+		}
+
+		vmx->nested.dirty_vmcs12 = true;
+		/*
+		 * As we keep L2 state for one guest only 'hv_clean_fields' mask
+		 * can't be used when we switch between them. Reset it here for
+		 * simplicity.
+		 */
+		vmx->nested.hv_evmcs->hv_clean_fields &=
+			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+		vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
+
+		/*
+		 * Unlike normal vmcs12, enlightened vmcs12 is not fully
+		 * reloaded from guest's memory (read only fields, fields not
+		 * present in struct hv_enlightened_vmcs, ...). Make sure there
+		 * are no leftovers.
+		 */
+		memset(vmx->nested.cached_vmcs12, 0,
+		       sizeof(*vmx->nested.cached_vmcs12));
+
+	}
+	return 1;
+}
+
 /* Emulate the VMPTRST instruction */
 static int handle_vmptrst(struct kvm_vcpu *vcpu)
 {
@@ -9313,6 +9408,9 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
+	if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
+		return 1;
+
 	if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
 		return 1;
 	/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
@@ -13314,7 +13412,10 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
-	if (vmx->nested.current_vmptr == -1ull)
+	if (!nested_vmx_handle_enlightened_vmptrld(vcpu))
+		return 1;
+
+	if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
 		return nested_vmx_failInvalid(vcpu);
 
 	vmcs12 = get_vmcs12(vcpu);