[v1,1/3] kvm: svm: Add support for additional SVM NPF error codes

Message ID 147916173923.16347.14406683681257344312.stgit@brijesh-build-machine (mailing list archive)
State New, archived

Commit Message

Brijesh Singh Nov. 14, 2016, 10:15 p.m. UTC
From: Tom Lendacky <thomas.lendacky@amd.com>

AMD hardware adds two additional bits to aid in nested page fault handling.

Bit 32 - NPF occurred while translating the guest's final physical address
Bit 33 - NPF occurred while translating the guest page tables
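
For illustration, a minimal sketch of how these bits could be decoded
from the 64-bit error code. The mask names are the ones introduced by
this patch; the helper itself is hypothetical:

    /* Hypothetical helper: report which translation stage faulted,
     * based on the new AMD NPF error code bits.
     */
    static const char *npf_fault_stage(u64 error_code)
    {
            if (error_code & PFERR_GUEST_FINAL_MASK)
                    return "final guest physical address";
            if (error_code & PFERR_GUEST_PAGE_MASK)
                    return "guest page table walk";
            return "legacy #PF error code (bits 32/33 clear)";
    }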

The guest page tables fault indicator can be used as an aid for nested
virtualization. Using V0 for the host, V1 for the first level guest and
V2 for the second level guest, when both V1 and V2 are using nested paging
there are currently a number of unnecessary instruction emulations. When
V2 is launched, shadow paging is used in V1 for the nested tables of V2. As
a result, KVM marks these pages as RO in the host nested page tables. When
V2 exits and we resume V1, these pages are still marked RO.

Every nested walk for a guest page table is treated as a user-level write
access and this causes a lot of NPFs because the V1 page tables are marked
RO in the V0 nested tables. While executing V1, when these NPFs occur, KVM
sees a write to a read-only page, emulates the V1 instruction, and unprotects
the page (marking it RW). This patch looks for cases where we get an NPF due
to a guest page table walk where the page was marked RO. It immediately
unprotects the page and resumes the guest, leading to far fewer instruction
emulations when nested virtualization is used.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/include/asm/kvm_host.h |   11 ++++++++++-
 arch/x86/kvm/mmu.c              |   20 ++++++++++++++++++--
 arch/x86/kvm/svm.c              |    2 +-
 3 files changed, 29 insertions(+), 4 deletions(-)
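
As a rough sketch of the resulting SVM-side flow, assuming (per the APM)
that EXITINFO1 carries the #NPF error code; the real pf_interception()
also handles async page faults and forwards the instruction bytes, both
of which are omitted here:

    /* Simplified sketch, not the actual patched function: pass the
     * full 64-bit error code through to the common MMU fault handler.
     */
    static int pf_interception_sketch(struct vcpu_svm *svm)
    {
            u64 fault_address = svm->vmcb->control.exit_info_2;
            u64 error_code = svm->vmcb->control.exit_info_1;

            return kvm_mmu_page_fault(&svm->vcpu, fault_address,
                                      error_code, NULL, 0);
    }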



Comments

Paolo Bonzini Nov. 21, 2016, 3:12 p.m. UTC | #1
On 14/11/2016 23:15, Brijesh Singh wrote:
> From: Tom Lendacky <thomas.lendacky@amd.com>
> 
> AMD hardware adds two additional bits to aid in nested page fault handling.
> 
> Bit 32 - NPF occurred while translating the guest's final physical address
> Bit 33 - NPF occurred while translating the guest page tables

I have two questions out of curiosity, and to better understand the
differences between Intel and AMD:

1) are the two bits mutually exclusive, and is one bit always set?

2) what bit is set if the processor is reading the PDPTEs of a 32-bit
PAE guest?

Thanks,

Paolo

Tom Lendacky Nov. 22, 2016, 10:15 p.m. UTC | #2
On 11/21/2016 9:12 AM, Paolo Bonzini wrote:
> 
> 
> On 14/11/2016 23:15, Brijesh Singh wrote:
>> From: Tom Lendacky <thomas.lendacky@amd.com>
>>
>> AMD hardware adds two additional bits to aid in nested page fault handling.
>>
>> Bit 32 - NPF occurred while translating the guest's final physical address
>> Bit 33 - NPF occurred while translating the guest page tables
> 
> I have two questions out of curiosity, and to better understand the
> differences between Intel and AMD:

I talked with some folks about these questions and here's what we
determined:

> 
> 1) are the two bits mutually exclusive, and is one bit always set?

The two bits are mutually exclusive: the processor encounters the fault
either while translating the final gPA or while translating a guest
page table; there's no way for it to be both.
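
If one wanted to encode that invariant, a hypothetical sanity check
might look like this (placement illustrative, not part of the patch):

    /* The two translation-stage bits should never be set together
     * in a single #NPF error code.
     */
    WARN_ON_ONCE((error_code & PFERR_GUEST_FINAL_MASK) &&
                 (error_code & PFERR_GUEST_PAGE_MASK));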

> 
> 2) what bit is set if the processor is reading the PDPTEs of a 32-bit
> PAE guest?

I believe that bit 33 will be set.  The PDPEs are considered guest
tables and are read during a guest table walk (see APM vol. 2 section
15.25.10).  Note that this is slightly different from the bare-metal
behavior of legacy PAE mode as the APM describes it. I'll try to test
this and verify it.

Thanks,
Tom

Paolo Bonzini Nov. 22, 2016, 10:29 p.m. UTC | #3
On 22/11/2016 23:15, Tom Lendacky wrote:
> > 2) what bit is set if the processor is reading the PDPTEs of a 32-bit
> > PAE guest?
>
> I believe that bit 33 will be set.  The PDPE's are considered guest
> tables and are read during a guest table walk (see APM vol2 section
> 15.25.10).  Note that this is slightly different than the bare-metal
> behavior of legacy PAE mode as APM describes. I'll try to test this
> and verify it.

No big deal; indeed it's a bit different from Intel, which caches the
four PDPEs, but it's enough to know that bit 33 will be set.

Paolo

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bdde807..da07e17 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -191,6 +191,8 @@  enum {
 #define PFERR_RSVD_BIT 3
 #define PFERR_FETCH_BIT 4
 #define PFERR_PK_BIT 5
+#define PFERR_GUEST_FINAL_BIT 32
+#define PFERR_GUEST_PAGE_BIT 33
 
 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
 #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
@@ -198,6 +200,13 @@  enum {
 #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
 #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
 #define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
+#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
+
+#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK |	\
+				 PFERR_USER_MASK |		\
+				 PFERR_WRITE_MASK |		\
+				 PFERR_PRESENT_MASK)
 
 /* apic attention bits */
 #define KVM_APIC_CHECK_VAPIC	0
@@ -1203,7 +1212,7 @@  void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
 		       void *insn, int insn_len);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d9c7e98..f633d29 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4508,7 +4508,7 @@  static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
 		       void *insn, int insn_len)
 {
 	int r, emulation_type = EMULTYPE_RETRY;
@@ -4527,12 +4527,28 @@  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 			return r;
 	}
 
-	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
+	r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
+				      false);
 	if (r < 0)
 		return r;
 	if (!r)
 		return 1;
 
+	/*
+	 * Before emulating the instruction, check if the error code
+	 * was due to an RO violation while translating the guest page.
+	 * This can occur when using nested virtualization with nested
+	 * paging in both guests. If true, we simply unprotect the page
+	 * and resume the guest.
+	 *
+	 * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used
+	 *       in PFERR_NESTED_GUEST_PAGE)
+	 */
+	if (error_code == PFERR_NESTED_GUEST_PAGE) {
+		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
+		return 1;
+	}
+
 	if (mmio_info_in_cache(vcpu, cr2, direct))
 		emulation_type = 0;
 emulate:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8ca1eca..4e462bb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2074,7 +2074,7 @@  static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 static int pf_interception(struct vcpu_svm *svm)
 {
 	u64 fault_address = svm->vmcb->control.exit_info_2;
-	u32 error_code;
+	u64 error_code;
 	int r = 1;
 
 	switch (svm->apf_reason) {