KVM/nVMX: Stop mapping the "APIC-access address" page into the kernel
diff mbox series

Message ID 1543845551-4403-1-git-send-email-karahmed@amazon.de
State New
Headers show
Series
  • KVM/nVMX: Stop mapping the "APIC-access address" page into the kernel
Related show

Commit Message

KarimAllah Ahmed Dec. 3, 2018, 1:59 p.m. UTC
The "APIC-access address" is simply a token that the hypervisor puts into
the PFN of a 4K EPTE (or PTE if using shadow paging) that triggers APIC
virtualization whenever a page walk terminates with that PFN. This address
has to be a legal address (i.e.  within the physical address supported by
the CPU), but it need not have WB memory behind it. In fact, it need not
have anything at all behind it. When bit 31 ("activate secondary controls")
of the primary processor-based VM-execution controls is set and bit 0
("virtualize APIC accesses") of the secondary processor-based VM-execution
controls is set, the PFN recorded in the VMCS "APIC-access address" field
will never be touched. (Instead, the access triggers APIC virtualization,
which may access the PFN recorded in the "Virtual-APIC address" field of
the VMCS.)

So stop mapping the "APIC-access address" page into the kernel and even
drop the requirements to have a valid page backing it. Instead, just use
some token that:

1) Not one of the valid guest pages.
2) Within the physical address supported by the CPU.

Suggested-by: Jim Mattson <jmattson@google.com>
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
---

Thanks Jim for the commit message :)
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c              | 10 ++++++
 arch/x86/kvm/vmx.c              | 71 ++++++++++++++++++-----------------------
 3 files changed, 42 insertions(+), 40 deletions(-)

Comments

KarimAllah Ahmed Dec. 3, 2018, 4:59 p.m. UTC | #1
On Mon, 2018-12-03 at 14:59 +0100, KarimAllah Ahmed wrote:
> The "APIC-access address" is simply a token that the hypervisor puts into
> the PFN of a 4K EPTE (or PTE if using shadow paging) that triggers APIC
> virtualization whenever a page walk terminates with that PFN. This address
> has to be a legal address (i.e.  within the physical address supported by
> the CPU), but it need not have WB memory behind it. In fact, it need not
> have anything at all behind it. When bit 31 ("activate secondary controls")
> of the primary processor-based VM-execution controls is set and bit 0
> ("virtualize APIC accesses") of the secondary processor-based VM-execution
> controls is set, the PFN recorded in the VMCS "APIC-access address" field
> will never be touched. (Instead, the access triggers APIC virtualization,
> which may access the PFN recorded in the "Virtual-APIC address" field of
> the VMCS.)
> 
> So stop mapping the "APIC-access address" page into the kernel and even
> drop the requirements to have a valid page backing it. Instead, just use
> some token that:
> 
> 1) Not one of the valid guest pages.
> 2) Within the physical address supported by the CPU.
> 
> Suggested-by: Jim Mattson <jmattson@google.com>
> Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
> ---
> 
> Thanks Jim for the commit message :)
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/mmu.c              | 10 ++++++
>  arch/x86/kvm/vmx.c              | 71 ++++++++++++++++++-----------------------
>  3 files changed, 42 insertions(+), 40 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index fbda5a9..7e50196 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1077,6 +1077,7 @@ struct kvm_x86_ops {
>  	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
>  	void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
>  	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
> +	bool (*nested_apic_access_addr)(struct kvm_vcpu *vcpu, gpa_t gpa, hpa_t *hpa);
>  	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
>  	int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 7c03c0f..ae46a8d 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3962,9 +3962,19 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>  			 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
>  {
> +	hpa_t hpa;
>  	struct kvm_memory_slot *slot;
>  	bool async;
>  
> +	if (is_guest_mode(vcpu) &&
> +	    kvm_x86_ops->nested_apic_access_addr &&
> +	    kvm_x86_ops->nested_apic_access_addr(vcpu, gfn_to_gpa(gfn), &hpa)) {
> +		*pfn = hpa >> PAGE_SHIFT;
> +		if (writable)
> +			*writable = true;
> +		return false;
> +	}

Now thinking further about this, I actually still need to validate that the L12 
EPT for this gfn actually contains the apic_access address. To ensure that I 
only fixup the fault when the L1 hypervisor sets up both VMCS L12 APIC_ACCESS 
and L12 EPT to contain the same address.

Will fix and send v2.

> +
>  	/*
>  	 * Don't expose private memslots to L2.
>  	 */
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 83a614f..340cf56 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -864,7 +864,6 @@ struct nested_vmx {
>  	 * Guest pages referred to in the vmcs02 with host-physical
>  	 * pointers, so we must keep them pinned while L2 runs.
>  	 */
> -	struct page *apic_access_page;
>  	struct kvm_host_map virtual_apic_map;
>  	struct kvm_host_map pi_desc_map;
>  	struct kvm_host_map msr_bitmap_map;
> @@ -8512,10 +8511,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
>  	kfree(vmx->nested.cached_vmcs12);
>  	kfree(vmx->nested.cached_shadow_vmcs12);
>  	/* Unpin physical memory we referred to in the vmcs02 */
> -	if (vmx->nested.apic_access_page) {
> -		kvm_release_page_dirty(vmx->nested.apic_access_page);
> -		vmx->nested.apic_access_page = NULL;
> -	}
>  	kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
>  	kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
>  	vmx->nested.pi_desc = NULL;
> @@ -11901,41 +11896,27 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
>  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>  						 struct vmcs12 *vmcs12);
>  
> +static hpa_t vmx_apic_access_addr(void)
> +{
> +	/*
> +	 * The physical address choosen here has to:
> +	 * 1) Never be an address that could be assigned to a guest.
> +	 * 2) Within the maximum physical limits of the CPU.
> +	 *
> +	 * So our choice below is completely random, but at least it follows
> +	 * these two rules.
> +	 */
> +	return __pa_symbol(_text) & PAGE_MASK;
> +}
> +
>  static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
>  {
>  	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	struct kvm_host_map *map;
> -	struct page *page;
> -	u64 hpa;
>  
> -	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
> -		/*
> -		 * Translate L1 physical address to host physical
> -		 * address for vmcs02. Keep the page pinned, so this
> -		 * physical address remains valid. We keep a reference
> -		 * to it so we can release it later.
> -		 */
> -		if (vmx->nested.apic_access_page) { /* shouldn't happen */
> -			kvm_release_page_dirty(vmx->nested.apic_access_page);
> -			vmx->nested.apic_access_page = NULL;
> -		}
> -		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
> -		/*
> -		 * If translation failed, no matter: This feature asks
> -		 * to exit when accessing the given address, and if it
> -		 * can never be accessed, this feature won't do
> -		 * anything anyway.
> -		 */
> -		if (!is_error_page(page)) {
> -			vmx->nested.apic_access_page = page;
> -			hpa = page_to_phys(vmx->nested.apic_access_page);
> -			vmcs_write64(APIC_ACCESS_ADDR, hpa);
> -		} else {
> -			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
> -					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
> -		}
> -	}
> +	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
> +		vmcs_write64(APIC_ACCESS_ADDR, vmx_apic_access_addr());
>  
>  	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
>  		map = &vmx->nested.virtual_apic_map;
> @@ -14196,12 +14177,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
>  	/* This is needed for same reason as it was needed in prepare_vmcs02 */
>  	vmx->host_rsp = 0;
>  
> -	/* Unpin physical memory we referred to in vmcs02 */
> -	if (vmx->nested.apic_access_page) {
> -		kvm_release_page_dirty(vmx->nested.apic_access_page);
> -		vmx->nested.apic_access_page = NULL;
> -	}
> -
>  	kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
>  	kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
>  	vmx->nested.pi_desc = NULL;
> @@ -14949,6 +14924,21 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
>  	return 0;
>  }
>  
> +static bool vmx_nested_apic_access_addr(struct kvm_vcpu *vcpu,
> +					gpa_t gpa, hpa_t *hpa)
> +{
> +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +
> +	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
> +	    page_address_valid(vcpu, vmcs12->apic_access_addr) &&
> +	    (gpa_to_gfn(gpa) == gpa_to_gfn(vmcs12->apic_access_addr))) {
> +		*hpa = vmx_apic_access_addr();
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.cpu_has_kvm_support = cpu_has_kvm_support,
>  	.disabled_by_bios = vmx_disabled_by_bios,
> @@ -15022,6 +15012,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.update_cr8_intercept = update_cr8_intercept,
>  	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
>  	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
> +	.nested_apic_access_addr = vmx_nested_apic_access_addr,
>  	.get_enable_apicv = vmx_get_enable_apicv,
>  	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
>  	.load_eoi_exitmap = vmx_load_eoi_exitmap,



Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrer: Christian Schlaeger, Ralf Herbrich
Ust-ID: DE 289 237 879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B

Patch
diff mbox series

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fbda5a9..7e50196 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1077,6 +1077,7 @@  struct kvm_x86_ops {
 	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 	void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
 	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
+	bool (*nested_apic_access_addr)(struct kvm_vcpu *vcpu, gpa_t gpa, hpa_t *hpa);
 	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
 	int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7c03c0f..ae46a8d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3962,9 +3962,19 @@  bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
 {
+	hpa_t hpa;
 	struct kvm_memory_slot *slot;
 	bool async;
 
+	if (is_guest_mode(vcpu) &&
+	    kvm_x86_ops->nested_apic_access_addr &&
+	    kvm_x86_ops->nested_apic_access_addr(vcpu, gfn_to_gpa(gfn), &hpa)) {
+		*pfn = hpa >> PAGE_SHIFT;
+		if (writable)
+			*writable = true;
+		return false;
+	}
+
 	/*
 	 * Don't expose private memslots to L2.
 	 */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 83a614f..340cf56 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -864,7 +864,6 @@  struct nested_vmx {
 	 * Guest pages referred to in the vmcs02 with host-physical
 	 * pointers, so we must keep them pinned while L2 runs.
 	 */
-	struct page *apic_access_page;
 	struct kvm_host_map virtual_apic_map;
 	struct kvm_host_map pi_desc_map;
 	struct kvm_host_map msr_bitmap_map;
@@ -8512,10 +8511,6 @@  static void free_nested(struct kvm_vcpu *vcpu)
 	kfree(vmx->nested.cached_vmcs12);
 	kfree(vmx->nested.cached_shadow_vmcs12);
 	/* Unpin physical memory we referred to in the vmcs02 */
-	if (vmx->nested.apic_access_page) {
-		kvm_release_page_dirty(vmx->nested.apic_access_page);
-		vmx->nested.apic_access_page = NULL;
-	}
 	kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
 	kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
 	vmx->nested.pi_desc = NULL;
@@ -11901,41 +11896,27 @@  static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 						 struct vmcs12 *vmcs12);
 
+static hpa_t vmx_apic_access_addr(void)
+{
+	/*
+	 * The physical address choosen here has to:
+	 * 1) Never be an address that could be assigned to a guest.
+	 * 2) Within the maximum physical limits of the CPU.
+	 *
+	 * So our choice below is completely random, but at least it follows
+	 * these two rules.
+	 */
+	return __pa_symbol(_text) & PAGE_MASK;
+}
+
 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct kvm_host_map *map;
-	struct page *page;
-	u64 hpa;
 
-	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-		/*
-		 * Translate L1 physical address to host physical
-		 * address for vmcs02. Keep the page pinned, so this
-		 * physical address remains valid. We keep a reference
-		 * to it so we can release it later.
-		 */
-		if (vmx->nested.apic_access_page) { /* shouldn't happen */
-			kvm_release_page_dirty(vmx->nested.apic_access_page);
-			vmx->nested.apic_access_page = NULL;
-		}
-		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
-		/*
-		 * If translation failed, no matter: This feature asks
-		 * to exit when accessing the given address, and if it
-		 * can never be accessed, this feature won't do
-		 * anything anyway.
-		 */
-		if (!is_error_page(page)) {
-			vmx->nested.apic_access_page = page;
-			hpa = page_to_phys(vmx->nested.apic_access_page);
-			vmcs_write64(APIC_ACCESS_ADDR, hpa);
-		} else {
-			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
-					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
-		}
-	}
+	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+		vmcs_write64(APIC_ACCESS_ADDR, vmx_apic_access_addr());
 
 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
 		map = &vmx->nested.virtual_apic_map;
@@ -14196,12 +14177,6 @@  static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	/* This is needed for same reason as it was needed in prepare_vmcs02 */
 	vmx->host_rsp = 0;
 
-	/* Unpin physical memory we referred to in vmcs02 */
-	if (vmx->nested.apic_access_page) {
-		kvm_release_page_dirty(vmx->nested.apic_access_page);
-		vmx->nested.apic_access_page = NULL;
-	}
-
 	kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
 	kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
 	vmx->nested.pi_desc = NULL;
@@ -14949,6 +14924,21 @@  static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static bool vmx_nested_apic_access_addr(struct kvm_vcpu *vcpu,
+					gpa_t gpa, hpa_t *hpa)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+	    page_address_valid(vcpu, vmcs12->apic_access_addr) &&
+	    (gpa_to_gfn(gpa) == gpa_to_gfn(vmcs12->apic_access_addr))) {
+		*hpa = vmx_apic_access_addr();
+		return true;
+	}
+
+	return false;
+}
+
 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -15022,6 +15012,7 @@  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.update_cr8_intercept = update_cr8_intercept,
 	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
 	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+	.nested_apic_access_addr = vmx_nested_apic_access_addr,
 	.get_enable_apicv = vmx_get_enable_apicv,
 	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
 	.load_eoi_exitmap = vmx_load_eoi_exitmap,