diff mbox

[RFC,6/6] x86/kvm: use enlightened VMCS when running on Hyper-V

Message ID 20180115173105.31845-7-vkuznets@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Vitaly Kuznetsov Jan. 15, 2018, 5:31 p.m. UTC
Early prototype.

When running nested KVM on Hyper-V it's possible to use the so-called
'Enlightened VMCS' and do normal memory reads/writes instead of
executing VMWRITE/VMREAD instructions. Tests show that this speeds up
a tight CPUID loop almost 3 times:

Before:
./cpuid_tight
20459

After:
./cpuid_tight
7698

checkpatch.pl errors/warnings and 32-bit brokenness are known issues.

Main RFC questions I have are:
- Do we want to have this per L2 VM or per L1 host?
- How can we achieve zero overhead for non-Hyper-V deployments? Use static
  keys? But this will only work if we decide to do eVMCS per host.
- Can we do better than a big switch in evmcs_read()/evmcs_write()? And
  preferably avoid the 'case' defines, which checkpatch.pl hates.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 arch/x86/kvm/vmx.c | 595 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 593 insertions(+), 2 deletions(-)
diff mbox

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index efff9d035543..dfdfd15c3d60 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -51,6 +51,7 @@ 
 #include <asm/irq_remapping.h>
 #include <asm/mmu_context.h>
 #include <asm/intel_pt.h>
+#include <asm/mshyperv.h>
 
 #include "trace.h"
 #include "pmu.h"
@@ -198,6 +199,9 @@  extern const ulong vmx_return;
 
 #define NR_AUTOLOAD_MSRS 8
 
+static bool __read_mostly enlightened_vmcs = true;
+module_param(enlightened_vmcs, bool, 0444);
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -1498,11 +1502,22 @@  static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
 	loaded_vmcs->launched = 0;
 }
 
+static inline void vmcs_load_enlightened(u64 phys_addr)
+{
+	int cpu = smp_processor_id();	/* index into hv_vp_assist_page[] below */
+
+	hv_vp_assist_page[cpu]->current_nested_vmcs = phys_addr;	/* publish the VMCS physical address in the assist page */
+	hv_vp_assist_page[cpu]->enlighten_vmentry = 1;	/* NOTE(review): presumably tells Hyper-V to use the eVMCS on VM entry - confirm against the TLFS */
+}
+
 static void vmcs_load(struct vmcs *vmcs)
 {
 	u64 phys_addr = __pa(vmcs);
 	u8 error;
 
+	if (enlightened_vmcs)
+		return vmcs_load_enlightened(phys_addr);
+
 	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
 			: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 			: "cc", "memory");
@@ -1620,6 +1635,514 @@  static inline void ept_sync_context(u64 eptp)
 		ept_sync_global();
 }
 
+/*
+ *  Enlightened VMCSv1 doesn't support these:
+ *	POSTED_INTR_NV                  = 0x00000002,
+ *	GUEST_INTR_STATUS               = 0x00000810,
+ *	GUEST_PML_INDEX			= 0x00000812,
+ *	IO_BITMAP_A_HIGH                = 0x00002001,
+ *	IO_BITMAP_B_HIGH                = 0x00002003,
+ *	MSR_BITMAP_HIGH                 = 0x00002005,
+ *	VM_EXIT_MSR_STORE_ADDR_HIGH     = 0x00002007,
+ *	VM_EXIT_MSR_LOAD_ADDR_HIGH      = 0x00002009,
+ *	VM_ENTRY_MSR_LOAD_ADDR_HIGH     = 0x0000200b,
+ *	PML_ADDRESS			= 0x0000200e,
+ *	PML_ADDRESS_HIGH		= 0x0000200f,
+ *	TSC_OFFSET_HIGH                 = 0x00002011,
+ *	VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
+ *	APIC_ACCESS_ADDR		= 0x00002014,
+ *	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
+ *	POSTED_INTR_DESC_ADDR           = 0x00002016,
+ *	POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
+ *	VM_FUNCTION_CONTROL             = 0x00002018,
+ *	VM_FUNCTION_CONTROL_HIGH        = 0x00002019,
+ *	EPT_POINTER_HIGH                = 0x0000201b,
+ *	EOI_EXIT_BITMAP0                = 0x0000201c,
+ *	EOI_EXIT_BITMAP0_HIGH           = 0x0000201d,
+ *	EOI_EXIT_BITMAP1                = 0x0000201e,
+ *	EOI_EXIT_BITMAP1_HIGH           = 0x0000201f,
+ *	EOI_EXIT_BITMAP2                = 0x00002020,
+ *	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
+ *	EOI_EXIT_BITMAP3                = 0x00002022,
+ *	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
+ *	EPTP_LIST_ADDRESS               = 0x00002024,
+ *	EPTP_LIST_ADDRESS_HIGH          = 0x00002025,
+ *	VMREAD_BITMAP                   = 0x00002026,
+ *	VMWRITE_BITMAP                  = 0x00002028,
+ *	XSS_EXIT_BITMAP_HIGH            = 0x0000202D,
+ *	TSC_MULTIPLIER                  = 0x00002032,
+ *	TSC_MULTIPLIER_HIGH             = 0x00002033,
+ *	GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
+ *	VMCS_LINK_POINTER_HIGH          = 0x00002801,
+ *	GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
+ *	GUEST_IA32_PAT_HIGH		= 0x00002805,
+ *	GUEST_IA32_EFER_HIGH		= 0x00002807,
+ *	GUEST_IA32_PERF_GLOBAL_CTRL	= 0x00002808,
+ *	GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
+ *	GUEST_PDPTR0_HIGH               = 0x0000280b,
+ *	GUEST_PDPTR1_HIGH               = 0x0000280d,
+ *	GUEST_PDPTR2_HIGH               = 0x0000280f,
+ *	GUEST_PDPTR3_HIGH               = 0x00002811,
+ *	GUEST_BNDCFGS_HIGH              = 0x00002813,
+ *	GUEST_IA32_RTIT_CTL		= 0x00002814,
+ *	GUEST_IA32_RTIT_CTL_HIGH	= 0x00002815,
+ *	HOST_IA32_PAT_HIGH		= 0x00002c01,
+ *	HOST_IA32_EFER_HIGH		= 0x00002c03,
+ *	HOST_IA32_PERF_GLOBAL_CTRL	= 0x00002c04,
+ *	HOST_IA32_PERF_GLOBAL_CTRL_HIGH	= 0x00002c05,
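+ *	VM_EXIT_MSR_STORE_COUNT         = 0x0000400e, (NOTE: but handled in evmcs_read()/evmcs_write() below)
+ *	VM_EXIT_MSR_LOAD_COUNT          = 0x00004010, (NOTE: but handled in evmcs_read()/evmcs_write() below)
+ *	VM_ENTRY_MSR_LOAD_COUNT         = 0x00004014, (NOTE: but handled in evmcs_read()/evmcs_write() below)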
+ *	PLE_GAP                         = 0x00004020,
+ *	PLE_WINDOW                      = 0x00004022,
+ *	VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
+ */
+
+#define evmcs_write_field(field, efield, mask)		/* expands to one complete switch arm; use only inside a switch */\
+	case field:					/* 'field' is the case label to match */\
+	evmcs->efield = value;				/* relies on 'evmcs' and 'value' being in scope at the expansion site */\
+	evmcs->hv_clean_fields &= ~mask;		/* clear the bits of 'mask' in hv_clean_fields */\
+	break;
+
+#define evmcs_read_field(field, efield)			/* expands to one switch arm that returns; use only inside a switch */\
+	case field:					/* 'field' is the case label to match */\
+	return evmcs->efield;				/* relies on 'evmcs' being in scope at the expansion site */\
+
+static void evmcs_write(unsigned long field, u64 value)
+{
+	int cpu = smp_processor_id();
+	struct hv_enlightened_vmcs *evmcs =
+		__va(hv_vp_assist_page[cpu]->current_nested_vmcs);
+
+	switch (field) {
+		/* 64 bit fields */
+		evmcs_write_field(GUEST_RIP, guest_rip,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+		evmcs_write_field(GUEST_RSP, guest_rsp,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC);
+		evmcs_write_field(GUEST_RFLAGS, guest_rflags,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC);
+		evmcs_write_field(HOST_IA32_PAT, host_ia32_pat,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(HOST_IA32_EFER, host_ia32_efer,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(HOST_CR0, host_cr0,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(HOST_CR3, host_cr3,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(HOST_CR4, host_cr4,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(HOST_IA32_SYSENTER_ESP,
+				  host_ia32_sysenter_esp,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(HOST_IA32_SYSENTER_EIP,
+				  host_ia32_sysenter_eip,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(HOST_RIP, host_rip,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(IO_BITMAP_A, io_bitmap_a,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP);
+		evmcs_write_field(IO_BITMAP_B, io_bitmap_b,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP);
+		evmcs_write_field(MSR_BITMAP, msr_bitmap,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP);
+		evmcs_write_field(GUEST_ES_BASE, guest_es_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_CS_BASE, guest_cs_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_SS_BASE, guest_ss_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_DS_BASE, guest_ds_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_FS_BASE, guest_fs_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_GS_BASE, guest_gs_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_LDTR_BASE, guest_ldtr_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_TR_BASE, guest_tr_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_GDTR_BASE, guest_gdtr_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_IDTR_BASE, guest_idtr_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(TSC_OFFSET, tsc_offset,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2);
+		evmcs_write_field(VIRTUAL_APIC_PAGE_ADDR,
+				  virtual_apic_page_addr,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2);
+		evmcs_write_field(VMCS_LINK_POINTER, vmcs_link_pointer,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		evmcs_write_field(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		evmcs_write_field(GUEST_IA32_PAT, guest_ia32_pat,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		evmcs_write_field(GUEST_IA32_EFER, guest_ia32_efer,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		evmcs_write_field(GUEST_PDPTR0, guest_pdptr0,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		evmcs_write_field(GUEST_PDPTR1, guest_pdptr1,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		evmcs_write_field(GUEST_PDPTR2, guest_pdptr2,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		evmcs_write_field(GUEST_PDPTR3, guest_pdptr3,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		evmcs_write_field(GUEST_PENDING_DBG_EXCEPTIONS,
+				  guest_pending_dbg_exceptions,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		evmcs_write_field(GUEST_SYSENTER_ESP, guest_sysenter_esp,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		evmcs_write_field(GUEST_SYSENTER_EIP, guest_sysenter_eip,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		evmcs_write_field(CR0_GUEST_HOST_MASK, cr0_guest_host_mask,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+		evmcs_write_field(CR4_GUEST_HOST_MASK, cr4_guest_host_mask,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+		evmcs_write_field(CR0_READ_SHADOW, cr0_read_shadow,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+		evmcs_write_field(CR4_READ_SHADOW, cr4_read_shadow,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+		evmcs_write_field(GUEST_CR0, guest_cr0,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+		evmcs_write_field(GUEST_CR3, guest_cr3,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+		evmcs_write_field(GUEST_CR4, guest_cr4,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+		evmcs_write_field(GUEST_DR7, guest_dr7,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+		evmcs_write_field(HOST_FS_BASE, host_fs_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER);
+		evmcs_write_field(HOST_GS_BASE, host_gs_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER);
+		evmcs_write_field(HOST_TR_BASE, host_tr_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER);
+		evmcs_write_field(HOST_GDTR_BASE, host_gdtr_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER);
+		evmcs_write_field(HOST_IDTR_BASE, host_idtr_base,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER);
+		evmcs_write_field(HOST_RSP, host_rsp,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER);
+		evmcs_write_field(EPT_POINTER, ept_pointer,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT);
+		evmcs_write_field(GUEST_BNDCFGS, guest_bndcfgs,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		evmcs_write_field(XSS_EXIT_BITMAP, xss_exit_bitmap,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2);
+		/* no mask defined in the spec */
+		evmcs_write_field(VM_EXIT_MSR_STORE_ADDR,
+				  vm_exit_msr_store_addr, 0xffff);
+		evmcs_write_field(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
+				  0xffff);
+		evmcs_write_field(VM_ENTRY_MSR_LOAD_ADDR,
+				  vm_entry_msr_load_addr, 0xffff);
+		evmcs_write_field(CR3_TARGET_VALUE0, cr3_target_value0, 0xffff);
+		evmcs_write_field(CR3_TARGET_VALUE1, cr3_target_value1, 0xffff);
+		evmcs_write_field(CR3_TARGET_VALUE2, cr3_target_value2, 0xffff);
+		evmcs_write_field(CR3_TARGET_VALUE3, cr3_target_value3, 0xffff);
+
+		/* 32 bit fields */
+		evmcs_write_field(TPR_THRESHOLD, tpr_threshold,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+		evmcs_write_field(GUEST_INTERRUPTIBILITY_INFO,
+				  guest_interruptibility_info,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC);
+		evmcs_write_field(CPU_BASED_VM_EXEC_CONTROL,
+				  cpu_based_vm_exec_control,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC);
+		evmcs_write_field(EXCEPTION_BITMAP, exception_bitmap,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN);
+		evmcs_write_field(VM_ENTRY_CONTROLS, vm_entry_controls,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY);
+		evmcs_write_field(VM_ENTRY_INTR_INFO_FIELD,
+				  vm_entry_intr_info_field,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT);
+		evmcs_write_field(VM_ENTRY_EXCEPTION_ERROR_CODE,
+				  vm_entry_exception_error_code,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT);
+		evmcs_write_field(VM_ENTRY_INSTRUCTION_LEN,
+				  vm_entry_instruction_len,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT);
+		evmcs_write_field(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(PIN_BASED_VM_EXEC_CONTROL,
+				  pin_based_vm_exec_control,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1);
+		evmcs_write_field(VM_EXIT_CONTROLS, vm_exit_controls,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1);
+		evmcs_write_field(SECONDARY_VM_EXEC_CONTROL,
+				  secondary_vm_exec_control,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1);
+		evmcs_write_field(GUEST_ES_LIMIT, guest_es_limit,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_CS_LIMIT, guest_cs_limit,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_SS_LIMIT, guest_ss_limit,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_DS_LIMIT, guest_ds_limit,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_FS_LIMIT, guest_fs_limit,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_GS_LIMIT, guest_gs_limit,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_LDTR_LIMIT, guest_ldtr_limit,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_TR_LIMIT, guest_tr_limit,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_GDTR_LIMIT, guest_gdtr_limit,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_IDTR_LIMIT, guest_idtr_limit,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_ES_AR_BYTES, guest_es_ar_bytes,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_CS_AR_BYTES, guest_cs_ar_bytes,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_SS_AR_BYTES, guest_ss_ar_bytes,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_DS_AR_BYTES, guest_ds_ar_bytes,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_FS_AR_BYTES, guest_fs_ar_bytes,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_GS_AR_BYTES, guest_gs_ar_bytes,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_TR_AR_BYTES, guest_tr_ar_bytes,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_ACTIVITY_STATE, guest_activity_state,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		evmcs_write_field(GUEST_SYSENTER_CS, guest_sysenter_cs,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+		/* no mask defined in the spec */
+		evmcs_write_field(PAGE_FAULT_ERROR_CODE_MASK,
+				  page_fault_error_code_mask, 0xffff);
+		evmcs_write_field(PAGE_FAULT_ERROR_CODE_MATCH,
+				  page_fault_error_code_match, 0xffff);
+		evmcs_write_field(CR3_TARGET_COUNT, cr3_target_count,
+				  0xffff);
+		evmcs_write_field(VM_EXIT_MSR_STORE_COUNT,
+				  vm_exit_msr_store_count, 0xffff);
+		evmcs_write_field(VM_EXIT_MSR_LOAD_COUNT,
+				  vm_exit_msr_load_count, 0xffff);
+		evmcs_write_field(VM_ENTRY_MSR_LOAD_COUNT,
+				  vm_entry_msr_load_count, 0xffff);
+
+		/* 16 bit fields */
+		evmcs_write_field(HOST_ES_SELECTOR, host_es_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(HOST_CS_SELECTOR, host_cs_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(HOST_SS_SELECTOR, host_ss_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(HOST_DS_SELECTOR, host_ds_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(HOST_FS_SELECTOR, host_fs_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(HOST_GS_SELECTOR, host_gs_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(HOST_TR_SELECTOR, host_tr_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+		evmcs_write_field(GUEST_ES_SELECTOR, guest_es_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_CS_SELECTOR, guest_cs_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_SS_SELECTOR, guest_ss_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_DS_SELECTOR, guest_ds_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_FS_SELECTOR, guest_fs_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_GS_SELECTOR, guest_gs_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_LDTR_SELECTOR, guest_ldtr_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(GUEST_TR_SELECTOR, guest_tr_selector,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+		evmcs_write_field(VIRTUAL_PROCESSOR_ID, virtual_processor_id,
+				  HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT);
+	default:
+		pr_err("VMX: no EVMCS support write:0x%lx\n", field);
+	}
+}
+
+static u64 evmcs_read(unsigned long field)
+{
+	int cpu = smp_processor_id();
+	struct hv_enlightened_vmcs *evmcs =
+		__va(hv_vp_assist_page[cpu]->current_nested_vmcs);
+
+	switch (field) {
+		/* 64 bit fields */
+		evmcs_read_field(GUEST_RIP, guest_rip);
+		evmcs_read_field(GUEST_RSP, guest_rsp);
+		evmcs_read_field(GUEST_RFLAGS, guest_rflags);
+		evmcs_read_field(HOST_IA32_PAT, host_ia32_pat);
+		evmcs_read_field(HOST_IA32_EFER, host_ia32_efer);
+		evmcs_read_field(HOST_CR0, host_cr0);
+		evmcs_read_field(HOST_CR3, host_cr3);
+		evmcs_read_field(HOST_CR4, host_cr4);
+		evmcs_read_field(HOST_IA32_SYSENTER_ESP,
+				  host_ia32_sysenter_esp);
+		evmcs_read_field(HOST_IA32_SYSENTER_EIP,
+				  host_ia32_sysenter_eip);
+		evmcs_read_field(HOST_RIP, host_rip);
+		evmcs_read_field(IO_BITMAP_A, io_bitmap_a);
+		evmcs_read_field(IO_BITMAP_B, io_bitmap_b);
+		evmcs_read_field(MSR_BITMAP, msr_bitmap);
+		evmcs_read_field(GUEST_ES_BASE, guest_es_base);
+		evmcs_read_field(GUEST_CS_BASE, guest_cs_base);
+		evmcs_read_field(GUEST_SS_BASE, guest_ss_base);
+		evmcs_read_field(GUEST_DS_BASE, guest_ds_base);
+		evmcs_read_field(GUEST_FS_BASE, guest_fs_base);
+		evmcs_read_field(GUEST_GS_BASE, guest_gs_base);
+		evmcs_read_field(GUEST_LDTR_BASE, guest_ldtr_base);
+		evmcs_read_field(GUEST_TR_BASE, guest_tr_base);
+		evmcs_read_field(GUEST_GDTR_BASE, guest_gdtr_base);
+		evmcs_read_field(GUEST_IDTR_BASE, guest_idtr_base);
+		evmcs_read_field(TSC_OFFSET, tsc_offset);
+		evmcs_read_field(VIRTUAL_APIC_PAGE_ADDR,
+				 virtual_apic_page_addr);
+		evmcs_read_field(VMCS_LINK_POINTER, vmcs_link_pointer);
+		evmcs_read_field(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl);
+		evmcs_read_field(GUEST_IA32_PAT, guest_ia32_pat);
+		evmcs_read_field(GUEST_IA32_EFER, guest_ia32_efer);
+		evmcs_read_field(GUEST_PDPTR0, guest_pdptr0);
+		evmcs_read_field(GUEST_PDPTR1, guest_pdptr1);
+		evmcs_read_field(GUEST_PDPTR2, guest_pdptr2);
+		evmcs_read_field(GUEST_PDPTR3, guest_pdptr3);
+		evmcs_read_field(GUEST_PENDING_DBG_EXCEPTIONS,
+				  guest_pending_dbg_exceptions);
+		evmcs_read_field(GUEST_SYSENTER_ESP, guest_sysenter_esp);
+		evmcs_read_field(GUEST_SYSENTER_EIP, guest_sysenter_eip);
+		evmcs_read_field(CR0_GUEST_HOST_MASK, cr0_guest_host_mask);
+		evmcs_read_field(CR4_GUEST_HOST_MASK, cr4_guest_host_mask);
+		evmcs_read_field(CR0_READ_SHADOW, cr0_read_shadow);
+		evmcs_read_field(CR4_READ_SHADOW, cr4_read_shadow);
+		evmcs_read_field(GUEST_CR0, guest_cr0);
+		evmcs_read_field(GUEST_CR3, guest_cr3);
+		evmcs_read_field(GUEST_CR4, guest_cr4);
+		evmcs_read_field(GUEST_DR7, guest_dr7);
+		evmcs_read_field(HOST_FS_BASE, host_fs_base);
+		evmcs_read_field(HOST_GS_BASE, host_gs_base);
+		evmcs_read_field(HOST_TR_BASE, host_tr_base);
+		evmcs_read_field(HOST_GDTR_BASE, host_gdtr_base);
+		evmcs_read_field(HOST_IDTR_BASE, host_idtr_base);
+		evmcs_read_field(HOST_RSP, host_rsp);
+		evmcs_read_field(EPT_POINTER, ept_pointer);
+		evmcs_read_field(GUEST_BNDCFGS, guest_bndcfgs);
+		evmcs_read_field(XSS_EXIT_BITMAP, xss_exit_bitmap);
+		evmcs_read_field(GUEST_PHYSICAL_ADDRESS,
+				 guest_physical_address);
+		evmcs_read_field(EXIT_QUALIFICATION, exit_qualification);
+		/*
+		 * Not implemented in KVM:
+		 * evmcs_read_field(0x00006402, exit_io_instruction_ecx);
+		 * evmcs_read_field(0x00006404, exit_io_instruction_esi);
+		 * evmcs_read_field(0x00006406, exit_io_instruction_edi);
+		 * evmcs_read_field(0x00006408, exit_io_instruction_eip);
+		 */
+		evmcs_read_field(GUEST_LINEAR_ADDRESS, guest_linear_address);
+
+		/* no mask defined in the spec */
+		evmcs_read_field(VM_EXIT_MSR_STORE_ADDR,
+				 vm_exit_msr_store_addr);
+		evmcs_read_field(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr);
+		evmcs_read_field(VM_ENTRY_MSR_LOAD_ADDR,
+				 vm_entry_msr_load_addr);
+		evmcs_read_field(CR3_TARGET_VALUE0, cr3_target_value0);
+		evmcs_read_field(CR3_TARGET_VALUE1, cr3_target_value1);
+		evmcs_read_field(CR3_TARGET_VALUE2, cr3_target_value2);
+		evmcs_read_field(CR3_TARGET_VALUE3, cr3_target_value3);
+
+		/* 32 bit fields */
+		evmcs_read_field(TPR_THRESHOLD, tpr_threshold);
+		evmcs_read_field(GUEST_INTERRUPTIBILITY_INFO,
+				  guest_interruptibility_info);
+		evmcs_read_field(CPU_BASED_VM_EXEC_CONTROL,
+				  cpu_based_vm_exec_control);
+		evmcs_read_field(EXCEPTION_BITMAP, exception_bitmap);
+		evmcs_read_field(VM_ENTRY_CONTROLS, vm_entry_controls);
+		evmcs_read_field(VM_ENTRY_INTR_INFO_FIELD,
+				  vm_entry_intr_info_field);
+		evmcs_read_field(VM_ENTRY_EXCEPTION_ERROR_CODE,
+				  vm_entry_exception_error_code);
+		evmcs_read_field(VM_ENTRY_INSTRUCTION_LEN,
+				  vm_entry_instruction_len);
+		evmcs_read_field(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs);
+		evmcs_read_field(PIN_BASED_VM_EXEC_CONTROL,
+				  pin_based_vm_exec_control);
+		evmcs_read_field(VM_EXIT_CONTROLS, vm_exit_controls);
+		evmcs_read_field(SECONDARY_VM_EXEC_CONTROL,
+				  secondary_vm_exec_control);
+		evmcs_read_field(GUEST_ES_LIMIT, guest_es_limit);
+		evmcs_read_field(GUEST_CS_LIMIT, guest_cs_limit);
+		evmcs_read_field(GUEST_SS_LIMIT, guest_ss_limit);
+		evmcs_read_field(GUEST_DS_LIMIT, guest_ds_limit);
+		evmcs_read_field(GUEST_FS_LIMIT, guest_fs_limit);
+		evmcs_read_field(GUEST_GS_LIMIT, guest_gs_limit);
+		evmcs_read_field(GUEST_LDTR_LIMIT, guest_ldtr_limit);
+		evmcs_read_field(GUEST_TR_LIMIT, guest_tr_limit);
+		evmcs_read_field(GUEST_GDTR_LIMIT, guest_gdtr_limit);
+		evmcs_read_field(GUEST_IDTR_LIMIT, guest_idtr_limit);
+		evmcs_read_field(GUEST_ES_AR_BYTES, guest_es_ar_bytes);
+		evmcs_read_field(GUEST_CS_AR_BYTES, guest_cs_ar_bytes);
+		evmcs_read_field(GUEST_SS_AR_BYTES, guest_ss_ar_bytes);
+		evmcs_read_field(GUEST_DS_AR_BYTES, guest_ds_ar_bytes);
+		evmcs_read_field(GUEST_FS_AR_BYTES, guest_fs_ar_bytes);
+		evmcs_read_field(GUEST_GS_AR_BYTES, guest_gs_ar_bytes);
+		evmcs_read_field(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes);
+		evmcs_read_field(GUEST_TR_AR_BYTES, guest_tr_ar_bytes);
+		evmcs_read_field(GUEST_ACTIVITY_STATE, guest_activity_state);
+		evmcs_read_field(GUEST_SYSENTER_CS, guest_sysenter_cs);
+		evmcs_read_field(VM_INSTRUCTION_ERROR, vm_instruction_error);
+		evmcs_read_field(VM_EXIT_REASON, vm_exit_reason);
+		evmcs_read_field(VM_EXIT_INTR_INFO, vm_exit_intr_info);
+		evmcs_read_field(VM_EXIT_INTR_ERROR_CODE,
+				 vm_exit_intr_error_code);
+		evmcs_read_field(IDT_VECTORING_INFO_FIELD,
+				 idt_vectoring_info_field);
+		evmcs_read_field(IDT_VECTORING_ERROR_CODE,
+				 idt_vectoring_error_code);
+		evmcs_read_field(VM_EXIT_INSTRUCTION_LEN,
+				 vm_exit_instruction_len);
+		evmcs_read_field(VMX_INSTRUCTION_INFO, vmx_instruction_info);
+		/* no mask defined in the spec */
+		evmcs_read_field(PAGE_FAULT_ERROR_CODE_MASK,
+				 page_fault_error_code_mask);
+		evmcs_read_field(PAGE_FAULT_ERROR_CODE_MATCH,
+				 page_fault_error_code_match);
+		evmcs_read_field(CR3_TARGET_COUNT, cr3_target_count);
+		evmcs_read_field(VM_EXIT_MSR_STORE_COUNT,
+				 vm_exit_msr_store_count);
+		evmcs_read_field(VM_EXIT_MSR_LOAD_COUNT,
+				 vm_exit_msr_load_count);
+		evmcs_read_field(VM_ENTRY_MSR_LOAD_COUNT,
+				 vm_entry_msr_load_count);
+
+		/* 16 bit fields */
+		evmcs_read_field(HOST_ES_SELECTOR, host_es_selector);
+		evmcs_read_field(HOST_CS_SELECTOR, host_cs_selector);
+		evmcs_read_field(HOST_SS_SELECTOR, host_ss_selector);
+		evmcs_read_field(HOST_DS_SELECTOR, host_ds_selector);
+		evmcs_read_field(HOST_FS_SELECTOR, host_fs_selector);
+		evmcs_read_field(HOST_GS_SELECTOR, host_gs_selector);
+		evmcs_read_field(HOST_TR_SELECTOR, host_tr_selector);
+		evmcs_read_field(GUEST_ES_SELECTOR, guest_es_selector);
+		evmcs_read_field(GUEST_CS_SELECTOR, guest_cs_selector);
+		evmcs_read_field(GUEST_SS_SELECTOR, guest_ss_selector);
+		evmcs_read_field(GUEST_DS_SELECTOR, guest_ds_selector);
+		evmcs_read_field(GUEST_FS_SELECTOR, guest_fs_selector);
+		evmcs_read_field(GUEST_GS_SELECTOR, guest_gs_selector);
+		evmcs_read_field(GUEST_LDTR_SELECTOR, guest_ldtr_selector);
+		evmcs_read_field(GUEST_TR_SELECTOR, guest_tr_selector);
+		evmcs_read_field(VIRTUAL_PROCESSOR_ID, virtual_processor_id);
+
+	default:
+		pr_err("VMX: no EVMCS support read:0x%lx\n", field);
+	}
+
+	return 0;
+}
+
 static __always_inline void vmcs_check16(unsigned long field)
 {
         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
@@ -1676,18 +2199,24 @@  static __always_inline unsigned long __vmcs_readl(unsigned long field)
 static __always_inline u16 vmcs_read16(unsigned long field)
 {
 	vmcs_check16(field);
+	if (enlightened_vmcs)
+		return evmcs_read(field);
 	return __vmcs_readl(field);
 }
 
 static __always_inline u32 vmcs_read32(unsigned long field)
 {
 	vmcs_check32(field);
+	if (enlightened_vmcs)
+		return evmcs_read(field);
 	return __vmcs_readl(field);
 }
 
 static __always_inline u64 vmcs_read64(unsigned long field)
 {
 	vmcs_check64(field);
+	if (enlightened_vmcs)
+		return evmcs_read(field);
 #ifdef CONFIG_X86_64
 	return __vmcs_readl(field);
 #else
@@ -1698,6 +2227,8 @@  static __always_inline u64 vmcs_read64(unsigned long field)
 static __always_inline unsigned long vmcs_readl(unsigned long field)
 {
 	vmcs_checkl(field);
+	if (enlightened_vmcs)
+		return evmcs_read(field);
 	return __vmcs_readl(field);
 }
 
@@ -1721,18 +2252,27 @@  static __always_inline void __vmcs_writel(unsigned long field, unsigned long val
 static __always_inline void vmcs_write16(unsigned long field, u16 value)
 {
 	vmcs_check16(field);
+	if (enlightened_vmcs)
+		return evmcs_write(field, value);
+
 	__vmcs_writel(field, value);
 }
 
 static __always_inline void vmcs_write32(unsigned long field, u32 value)
 {
 	vmcs_check32(field);
+	if (enlightened_vmcs)
+		return evmcs_write(field, value);
+
 	__vmcs_writel(field, value);
 }
 
 static __always_inline void vmcs_write64(unsigned long field, u64 value)
 {
 	vmcs_check64(field);
+	if (enlightened_vmcs)
+		return evmcs_write(field, value);
+
 	__vmcs_writel(field, value);
 #ifndef CONFIG_X86_64
 	asm volatile ("");
@@ -1743,6 +2283,9 @@  static __always_inline void vmcs_write64(unsigned long field, u64 value)
 static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
 {
 	vmcs_checkl(field);
+	if (enlightened_vmcs)
+		return evmcs_write(field, value);
+
 	__vmcs_writel(field, value);
 }
 
@@ -1750,6 +2293,9 @@  static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
 {
         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
 			 "vmcs_clear_bits does not support 64-bit fields");
+	if (enlightened_vmcs)
+		return evmcs_write(field, evmcs_read(field) & ~mask);
+
 	__vmcs_writel(field, __vmcs_readl(field) & ~mask);
 }
 
@@ -1757,6 +2303,9 @@  static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
 {
         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
 			 "vmcs_set_bits does not support 64-bit fields");
+	if (enlightened_vmcs)
+		return evmcs_write(field, evmcs_read(field) | mask);
+
 	__vmcs_writel(field, __vmcs_readl(field) | mask);
 }
 
@@ -3891,7 +4440,11 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	vmcs_conf->size = vmx_msr_high & 0x1fff;
 	vmcs_conf->order = get_order(vmcs_conf->size);
 	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
-	vmcs_conf->revision_id = vmx_msr_low;
+
+	if (enlightened_vmcs)
+		vmcs_conf->revision_id = ms_hyperv.nested_features & 0xff;
+	else
+		vmcs_conf->revision_id = vmx_msr_low;
 
 	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
 	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
@@ -9520,6 +10073,7 @@  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long cr3, cr4;
+	struct hv_enlightened_vmcs *evmcs = NULL;
 
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!enable_vnmi &&
@@ -9581,6 +10135,17 @@  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vmx_arm_hv_timer(vcpu);
 
 	vmx->__launched = vmx->loaded_vmcs->launched;
+
+	if (enlightened_vmcs) {
+		int cpu = smp_processor_id();
+
+		evmcs = __va(hv_vp_assist_page[cpu]->current_nested_vmcs);
+
+		/* Crude hack: put RSP-32 into the enlightened VMCS host_rsp field */
+		asm volatile ("mov %%rsp, (%%rax); sub $32, (%%rax)" : :
+			      "a"(&evmcs->host_rsp));
+		vmx->host_rsp = evmcs->host_rsp;
+	}
 	asm(
 		/* Store host registers */
 		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -9686,6 +10251,10 @@  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 	      );
 
+	/* All fields are CLEAN */
+	if (evmcs)
+		evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
 	if (have_spec_ctrl) {
 		rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
 		if (vmx->spec_ctrl)
@@ -12463,7 +13032,29 @@  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
 static int __init vmx_init(void)
 {
-	int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+	int r;
+
+#ifdef CONFIG_HYPERVISOR_GUEST
+	if (enlightened_vmcs &&
+	    ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
+		int cpu;
+
+		/* check that we have assist pages on all CPUs */
+		for_each_online_cpu(cpu) {
+			if (!hv_vp_assist_page[cpu]) {
+				enlightened_vmcs = false;
+				break;
+			}
+		}
+
+		if (enlightened_vmcs)
+			pr_info("VMX: using Hyper-V Enlightened VMCS\n");
+	} else {
+		enlightened_vmcs = false;
+	}
+#endif
+
+	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
                      __alignof__(struct vcpu_vmx), THIS_MODULE);
 	if (r)
 		return r;