[4/4] x86: kvm: mmu: use ept a/d in vmcs02 iff used in vmcs12

Message ID 5bdf824538481ee2308354165e8ebe7b59b7b8a1.1498868316.git.pfeiner@google.com (mailing list archive)
State New, archived

Commit Message

Peter Feiner July 1, 2017, 12:26 a.m. UTC
EPT A/D was enabled in the vmcs02 EPTP regardless of the vmcs12's EPTP
value. The problem is that enabling A/D changes the behavior of L2's
x86 page table walks as seen by L1. With A/D enabled, x86 page table
walks are always treated as EPT writes, because the processor may have
to set accessed/dirty bits in the guest paging-structure entries during
the walk.

The EPT A/D emulation patch (XXX) tried to work around this problem
by clearing the write bit in the exit qualification for EPT violations
triggered by page walks. However, that fixup introduced the opposite
bug: page-table walks that actually set x86 A/D bits were *missing*
the write bit in the exit qualification.

This patch fixes the problem by disabling EPT A/D in the shadow MMU
when EPT A/D is disabled in vmcs12's EPTP.

Signed-off-by: Peter Feiner <pfeiner@google.com>
---
 arch/x86/kvm/mmu.c |  1 +
 arch/x86/kvm/vmx.c | 40 ++++++++++++++++++----------------------
 2 files changed, 19 insertions(+), 22 deletions(-)
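
In outline, the fix has two pieces; here is a simplified sketch (the
full diff is under "Patch" below). First, the vmcs02 EPTP advertises
A/D only when the vCPU is outside guest mode or the vmcs12 EPTP itself
enables A/D:

	static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
	{
		u64 eptp = VMX_EPT_DEFAULT_MT |
			   VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;

		/* A/D only if L1 asked for it (or we're running L1 itself). */
		if (enable_ept_ad_bits &&
		    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
			eptp |= VMX_EPT_AD_ENABLE_BIT;

		return eptp | (root_hpa & PAGE_MASK);
	}

Second, the shadow EPT MMU records the same choice, so shadow page
table entries are built without A/D bits when L1 has them disabled:

	/* in kvm_init_shadow_ept_mmu() */
	context->base_role.ad_disabled = !accessed_dirty;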

Comments

Paolo Bonzini July 1, 2017, 5:29 a.m. UTC | #1
> @@ -8358,7 +8349,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>  	 * mode as if vcpus is in root mode, the PML buffer must has been
>  	 * flushed already.
>  	 */
> -	if (enable_pml)
> +	if (enable_pml && !is_guest_mode(vcpu))
>  		vmx_flush_pml_buffer(vcpu);
>  
>  	/* If guest state is invalid, start emulating */

I don't understand this.  You need to flush the PML buffer if
L2 is running with EPT A/D bits enabled, don't you? Apart from
this it seems sane, I only have to look at patch 3 more carefully.

Thanks,

Paolo
Peter Feiner July 1, 2017, 7:29 a.m. UTC | #2
On Fri, Jun 30, 2017 at 10:29 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>> @@ -8358,7 +8349,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>>        * mode as if vcpus is in root mode, the PML buffer must has been
>>        * flushed already.
>>        */
>> -     if (enable_pml)
>> +     if (enable_pml && !is_guest_mode(vcpu))
>>               vmx_flush_pml_buffer(vcpu);
>>
>>       /* If guest state is invalid, start emulating */
>
> I don't understand this.  You need to flush the PML buffer if
> L2 is running with EPT A/D bits enabled, don't you? Apart from
> this it seems sane, I only have to look at patch 3 more carefully.

You're right: this is busted. I wrote these patches before you
implemented EPT A/D nesting (i.e., PML was moot for guest mode).

I think the patch hunk can go away entirely actually. As long as PML
is enabled, it's ok to flush the buffer. The interesting case is when
the vcpu is in guest mode with EPT A/D disabled. In this case, L0's
PML isn't filled while L2 runs because EPT A/D is disabled in the
vmcs02 (thanks to this patch), so there's nothing in the buffer!
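
Concretely, that just means keeping the original, unconditional flush;
a sketch with the reasoning spelled out:

	/*
	 * Flushing is always safe: if L2 ran with EPT A/D enabled in
	 * the vmcs02, the buffer has to be collected anyway; if it ran
	 * with A/D disabled (as this patch arranges when L1 leaves A/D
	 * off), nothing was logged and the flush is a no-op.
	 */
	if (enable_pml)
		vmx_flush_pml_buffer(vcpu);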

What's troubling is that there's no test case covering L0's use of PML +
nesting. Stress testing live migrations of L1 hypervisors (and
implicitly their L2 guests) is one way of doing it, but it's pretty
clumsy. A tightly coupled L0 userspace, L1 guest, and L2 guest would be
the way to go, because you could just coordinate ioctls with guest
memory accesses.
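
For illustration only, the L0-userspace half of such a test could look
roughly like the hypothetical helper below; the memslot/bitmap setup
and the L1/L2 guest code that actually dirties a known page are
omitted:

	#include <linux/kvm.h>
	#include <sys/ioctl.h>
	#include <stdint.h>
	#include <string.h>

	/*
	 * Hypothetical sketch: after telling the guest (an L1 hypervisor
	 * running an L2 that writes a known page) to perform the access,
	 * L0 userspace pulls the dirty log for the slot and checks that
	 * the page was reported dirty.
	 */
	static int page_reported_dirty(int vm_fd, uint32_t slot,
				       uint64_t *bitmap, size_t bitmap_bytes,
				       uint64_t page_index)
	{
		struct kvm_dirty_log log = {
			.slot = slot,
			.dirty_bitmap = bitmap,
		};

		memset(bitmap, 0, bitmap_bytes);
		if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0)
			return -1;

		return !!(bitmap[page_index / 64] & (1ull << (page_index % 64)));
	}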
Paolo Bonzini July 1, 2017, 7:59 a.m. UTC | #3
On 01/07/2017 09:29, Peter Feiner wrote:
> You're right: this is busted. I wrote these patches before you
> implemented EPT A/D nesting (i.e., PML was moot for guest mode).
> 
> I think the patch hunk can go away entirely actually. As long as PML
> is enabled, it's ok to flush the buffer. The interesting case is when
> the vcpu is in guest mode with EPT A/D disabled. In this case, L0's
> PML isn't filled while L2 runs because EPT A/D is disabled in the
> vmcs02 (thanks to this patch), so there's nothing in the buffer!

That was my thought too.

> What's troubling is that there's no test case covering L0's use of PML +
> nesting. Stress testing live migrations of L1 hypervisors (and
> implicitly their L2 guests) is one way of doing it, but it's pretty
> clumsy. A tightly coupled L0 userspace, L1 guest, and L2 guest would be
> the way to go, because you could just coordinate ioctls with guest
> memory accesses.

Indeed.  We need to do api/-style testing of nested virt.

Paolo

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e814fb6248ac..d483f81699fe 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4415,6 +4415,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 	context->root_level = context->shadow_root_level;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
+	context->base_role.ad_disabled = !accessed_dirty;
 
 	update_permission_bitmask(vcpu, context, true);
 	update_pkru_bitmask(vcpu, context, true);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e59b01a1d431..58137690d57d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -910,8 +910,9 @@ static void nested_release_page_clean(struct page *page)
 	kvm_release_page_clean(page);
 }
 
+static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
-static u64 construct_eptp(unsigned long root_hpa);
+static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
 static bool vmx_xsaves_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@ -4013,7 +4014,7 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
 	if (enable_ept) {
 		if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 			return;
-		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
+		ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
 	} else {
 		vpid_sync_context(vpid);
 	}
@@ -4188,14 +4189,15 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	vmx->emulation_required = emulation_required(vcpu);
 }
 
-static u64 construct_eptp(unsigned long root_hpa)
+static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
 {
 	u64 eptp;
 
 	/* TODO write the value reading from MSR */
 	eptp = VMX_EPT_DEFAULT_MT |
 		VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
-	if (enable_ept_ad_bits)
+	if (enable_ept_ad_bits &&
+	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
 		eptp |= VMX_EPT_AD_ENABLE_BIT;
 	eptp |= (root_hpa & PAGE_MASK);
 
@@ -4209,7 +4211,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 	guest_cr3 = cr3;
 	if (enable_ept) {
-		eptp = construct_eptp(cr3);
+		eptp = construct_eptp(vcpu, cr3);
 		vmcs_write64(EPT_POINTER, eptp);
 		if (is_paging(vcpu) || is_guest_mode(vcpu))
 			guest_cr3 = kvm_read_cr3(vcpu);
@@ -6214,17 +6216,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
-	if (is_guest_mode(vcpu)
-	    && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) {
-		/*
-		 * Fix up exit_qualification according to whether guest
-		 * page table accesses are reads or writes.
-		 */
-		u64 eptp = nested_ept_get_cr3(vcpu);
-		if (!(eptp & VMX_EPT_AD_ENABLE_BIT))
-			exit_qualification &= ~EPT_VIOLATION_ACC_WRITE;
-	}
-
 	/*
 	 * EPT violation happened while executing iret from NMI,
 	 * "blocked by NMI" bit has to be set before next VM entry.
@@ -6447,7 +6438,7 @@ void vmx_enable_tdp(void)
 		enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
 		0ull, VMX_EPT_EXECUTABLE_MASK,
 		cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
-		enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK);
+		VMX_EPT_RWX_MASK);
 
 	ept_set_mmio_spte_mask();
 	kvm_enable_tdp();
@@ -8358,7 +8349,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	 * mode as if vcpus is in root mode, the PML buffer must has been
 	 * flushed already.
 	 */
-	if (enable_pml)
+	if (enable_pml && !is_guest_mode(vcpu))
 		vmx_flush_pml_buffer(vcpu);
 
 	/* If guest state is invalid, start emulating */
@@ -9379,6 +9370,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
 	vmcs12->guest_physical_address = fault->address;
 }
 
+static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
+{
+	return nested_ept_get_cr3(vcpu) & VMX_EPT_AD_ENABLE_BIT;
+}
+
 /* Callbacks for nested_ept_init_mmu_context: */
 
 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
@@ -9389,18 +9385,18 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
 
 static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
-	u64 eptp;
+	bool wants_ad;
 
 	WARN_ON(mmu_is_nested(vcpu));
-	eptp = nested_ept_get_cr3(vcpu);
-	if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits)
+	wants_ad = nested_ept_ad_enabled(vcpu);
+	if (wants_ad && !enable_ept_ad_bits)
 		return 1;
 
 	kvm_mmu_unload(vcpu);
 	kvm_init_shadow_ept_mmu(vcpu,
 			to_vmx(vcpu)->nested.nested_vmx_ept_caps &
 			VMX_EPT_EXECUTE_ONLY_BIT,
-			eptp & VMX_EPT_AD_ENABLE_BIT);
+			wants_ad);
 	vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
 	vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
 	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;