
[v6,10/15] nEPT: Nested INVEPT

Message ID 1375366117-9014-11-git-send-email-gleb@redhat.com (mailing list archive)
State New, archived

Commit Message

Gleb Natapov Aug. 1, 2013, 2:08 p.m. UTC
From: Nadav Har'El <nyh@il.ibm.com>

If we let L1 use EPT, we should probably also support the INVEPT instruction.

In our current nested EPT implementation, when L1 changes its EPT table
for L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in
the course of this modification already calls INVEPT. But if the last
level of the shadow page table is unsync, not all of L1's changes to
EPT12 are intercepted, which means the roots need to be synced when L1
calls INVEPT. Global INVEPT should not be different, since the roots
are synced by kvm_mmu_load() each time EPTP02 changes.
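
For reference, the 128-bit memory operand that the new handler reads is
the INVEPT descriptor an L1 hypervisor passes to the instruction: the
first quadword is the EPTP, the second is reserved for the global and
context extents. Roughly, the L1 side looks like this (a sketch based on
the SDM encoding, not code from this patch; l1_invept() is a
hypothetical helper):

	static inline void l1_invept(unsigned long type, u64 eptp)
	{
		struct {
			u64 eptp, reserved;
		} operand = { eptp, 0 };

		/* invept (%rax), %rcx -- emitted as raw bytes because
		 * older assemblers do not know the invept mnemonic */
		asm volatile (".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
			      : : "a" (&operand), "c" (type)
			      : "cc", "memory");
	}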

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/include/asm/vmx.h      |    2 ++
 arch/x86/include/uapi/asm/vmx.h |    1 +
 arch/x86/kvm/mmu.c              |    2 ++
 arch/x86/kvm/vmx.c              |   67 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 72 insertions(+)

Comments

Xiao Guangrong Aug. 2, 2013, 8:06 a.m. UTC | #1
On 08/01/2013 10:08 PM, Gleb Natapov wrote:

> +/* Emulate the INVEPT instruction */
> +static int handle_invept(struct kvm_vcpu *vcpu)
> +{
> +	u32 vmx_instruction_info;
> +	bool ok;
> +	unsigned long type;
> +	gva_t gva;
> +	struct x86_exception e;
> +	struct {
> +		u64 eptp, gpa;
> +	} operand;
> +
> +	if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
> +	    !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		return 1;
> +	}
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		return 1;
> +	}
> +
> +	/* According to the Intel VMX instruction reference, the memory
> +	 * operand is read even if it isn't needed (e.g., for type==global)
> +	 */
> +	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> +	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
> +			vmx_instruction_info, &gva))
> +		return 1;
> +	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
> +				sizeof(operand), &e)) {
> +		kvm_inject_page_fault(vcpu, &e);
> +		return 1;
> +	}
> +
> +	type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
> +
> +	switch (type) {
> +	case VMX_EPT_EXTENT_GLOBAL:
> +	case VMX_EPT_EXTENT_CONTEXT:
> +		ok = !!(nested_vmx_ept_caps &
> +				(1UL << (type + VMX_EPT_EXTENT_SHIFT)));
> +		break;
> +	default:
> +		ok = false;
> +	}
> +
> +	if (ok) {
> +		kvm_mmu_sync_roots(vcpu);
> +		kvm_mmu_flush_tlb(vcpu);
> +		nested_vmx_succeed(vcpu);
> +	}
> +	else
> +		nested_vmx_failValid(vcpu,
> +				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
> +
> +	skip_emulated_instruction(vcpu);
> +	return 1;

Can this code be changed to:

	switch (type) {
	case VMX_EPT_EXTENT_GLOBAL:
	case VMX_EPT_EXTENT_CONTEXT:
		if (nested_vmx_ept_caps &
				(1UL << (type + VMX_EPT_EXTENT_SHIFT) {
			kvm_mmu_sync_roots(vcpu);
			kvm_mmu_flush_tlb(vcpu);
			nested_vmx_succeed(vcpu);
			break;
		}
	default:
		nested_vmx_failValid(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
	}
?
That's clearer, I think.
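
(As an aside, both versions rely on the extent type encodings lining up
with the bit positions in the EPT/VPID capability MSR. A hypothetical
pair of compile-time assertions, using the definitions added in the
vmx.h hunk below, shows why the shift works:

	BUILD_BUG_ON((1UL << (VMX_EPT_EXTENT_CONTEXT + VMX_EPT_EXTENT_SHIFT)) !=
		     VMX_EPT_EXTENT_CONTEXT_BIT);
	BUILD_BUG_ON((1UL << (VMX_EPT_EXTENT_GLOBAL + VMX_EPT_EXTENT_SHIFT)) !=
		     VMX_EPT_EXTENT_GLOBAL_BIT);

i.e. CONTEXT=1 and GLOBAL=2 shifted by 24 give bits 25 and 26.)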

Also, we can sync the shadow page table only if the current eptp is the required
eptp, that means:

if (type == GLOBAL || operand.eptp == nested_ept_get_cr3(vcpu)) {
	kvm_mmu_sync_roots(vcpu);
	......
}
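
Combining the two points, the tail of handle_invept() might end up
looking roughly like this (a sketch only; nested_ept_get_cr3(vcpu) is
assumed to return the EPTP12 that L2 currently runs with):

	switch (type) {
	case VMX_EPT_EXTENT_GLOBAL:
	case VMX_EPT_EXTENT_CONTEXT:
		if (nested_vmx_ept_caps &
				(1UL << (type + VMX_EPT_EXTENT_SHIFT))) {
			/*
			 * A context invalidation can only affect the
			 * shadow (EPT02) roots if it targets the EPTP
			 * that L2 is currently running with.
			 */
			if (type == VMX_EPT_EXTENT_GLOBAL ||
			    operand.eptp == nested_ept_get_cr3(vcpu)) {
				kvm_mmu_sync_roots(vcpu);
				kvm_mmu_flush_tlb(vcpu);
			}
			nested_vmx_succeed(vcpu);
			break;
		}
		/* unsupported extent type: fall through */
	default:
		nested_vmx_failValid(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
	}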

Gleb Natapov Aug. 2, 2013, 10 a.m. UTC | #2
On Fri, Aug 02, 2013 at 04:06:00PM +0800, Xiao Guangrong wrote:
> On 08/01/2013 10:08 PM, Gleb Natapov wrote:
> 
> > +/* Emulate the INVEPT instruction */
> > +static int handle_invept(struct kvm_vcpu *vcpu)
> > +{
> > +	u32 vmx_instruction_info;
> > +	bool ok;
> > +	unsigned long type;
> > +	gva_t gva;
> > +	struct x86_exception e;
> > +	struct {
> > +		u64 eptp, gpa;
> > +	} operand;
> > +
> > +	if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
> > +	    !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
> > +		kvm_queue_exception(vcpu, UD_VECTOR);
> > +		return 1;
> > +	}
> > +
> > +	if (!nested_vmx_check_permission(vcpu))
> > +		return 1;
> > +
> > +	if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
> > +		kvm_queue_exception(vcpu, UD_VECTOR);
> > +		return 1;
> > +	}
> > +
> > +	/* According to the Intel VMX instruction reference, the memory
> > +	 * operand is read even if it isn't needed (e.g., for type==global)
> > +	 */
> > +	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> > +	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
> > +			vmx_instruction_info, &gva))
> > +		return 1;
> > +	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
> > +				sizeof(operand), &e)) {
> > +		kvm_inject_page_fault(vcpu, &e);
> > +		return 1;
> > +	}
> > +
> > +	type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
> > +
> > +	switch (type) {
> > +	case VMX_EPT_EXTENT_GLOBAL:
> > +	case VMX_EPT_EXTENT_CONTEXT:
> > +		ok = !!(nested_vmx_ept_caps &
> > +				(1UL << (type + VMX_EPT_EXTENT_SHIFT)));
> > +		break;
> > +	default:
> > +		ok = false;
> > +	}
> > +
> > +	if (ok) {
> > +		kvm_mmu_sync_roots(vcpu);
> > +		kvm_mmu_flush_tlb(vcpu);
> > +		nested_vmx_succeed(vcpu);
> > +	}
> > +	else
> > +		nested_vmx_failValid(vcpu,
> > +				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
> > +
> > +	skip_emulated_instruction(vcpu);
> > +	return 1;
> 
> Can this code be changed to:
> 
> 	switch (type) {
> 	case VMX_EPT_EXTENT_GLOBAL:
> 	case VMX_EPT_EXTENT_CONTEXT:
> 		if (nested_vmx_ept_caps &
> 				(1UL << (type + VMX_EPT_EXTENT_SHIFT) {
> 			kvm_mmu_sync_roots(vcpu);
> 			kvm_mmu_flush_tlb(vcpu);
> 			nested_vmx_succeed(vcpu);
> 			break;
> 		}
> 	default:
> 		nested_vmx_failValid(vcpu,
> 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
> 	}
> ?
> That's clearer, I think.
> 
> Also, we can sync the shadow page table only if the current eptp is the required
> eptp, that means:
> 
> if (type == GLOBAL || operand.eptp == nested_ept_get_cr3(vcpu)) {
> 	kvm_mmu_sync_roots(vcpu);
> 	......
> }
Good point. Will do.

--
			Gleb.

Patch

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f3e01a2..966502d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -387,6 +387,7 @@  enum vmcs_field {
 #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR		0
 #define VMX_EPT_EXTENT_CONTEXT			1
 #define VMX_EPT_EXTENT_GLOBAL			2
+#define VMX_EPT_EXTENT_SHIFT			24
 
 #define VMX_EPT_EXECUTE_ONLY_BIT		(1ull)
 #define VMX_EPT_PAGE_WALK_4_BIT			(1ull << 6)
@@ -394,6 +395,7 @@  enum vmcs_field {
 #define VMX_EPTP_WB_BIT				(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
+#define VMX_EPT_INVEPT_BIT			(1ull << 20)
 #define VMX_EPT_AD_BIT				    (1ull << 21)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d651082..7a34e8f 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@ 
 #define EXIT_REASON_EOI_INDUCED         45
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
+#define EXIT_REASON_INVEPT              50
 #define EXIT_REASON_PREEMPTION_TIMER    52
 #define EXIT_REASON_WBINVD              54
 #define EXIT_REASON_XSETBV              55
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a512ecf..a1f8f7b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3182,6 +3182,7 @@  void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 	mmu_sync_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
 				  u32 access, struct x86_exception *exception)
@@ -3451,6 +3452,7 @@  void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 	++vcpu->stat.tlb_flush;
 	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4e98764..a4d9385 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2156,6 +2156,7 @@  static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
 static u32 nested_vmx_misc_low, nested_vmx_misc_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
 	/*
@@ -6270,6 +6271,70 @@  static int handle_vmptrst(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+	u32 vmx_instruction_info;
+	bool ok;
+	unsigned long type;
+	gva_t gva;
+	struct x86_exception e;
+	struct {
+		u64 eptp, gpa;
+	} operand;
+
+	if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+	    !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	/* According to the Intel VMX instruction reference, the memory
+	 * operand is read even if it isn't needed (e.g., for type==global)
+	 */
+	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+			vmx_instruction_info, &gva))
+		return 1;
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+				sizeof(operand), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+	switch (type) {
+	case VMX_EPT_EXTENT_GLOBAL:
+	case VMX_EPT_EXTENT_CONTEXT:
+		ok = !!(nested_vmx_ept_caps &
+				(1UL << (type + VMX_EPT_EXTENT_SHIFT)));
+		break;
+	default:
+		ok = false;
+	}
+
+	if (ok) {
+		kvm_mmu_sync_roots(vcpu);
+		kvm_mmu_flush_tlb(vcpu);
+		nested_vmx_succeed(vcpu);
+	}
+	else
+		nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -6314,6 +6379,7 @@  static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
 	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_invalid_op,
 	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+	[EXIT_REASON_INVEPT]                  = handle_invept,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -6540,6 +6606,7 @@  static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
 	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
 	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+	case EXIT_REASON_INVEPT:
 		/*
 		 * VMX instructions trap unconditionally. This allows L1 to
 		 * emulate them for its L2 guest, i.e., allows 3-level nesting!