| Message ID | 147916173923.16347.14406683681257344312.stgit@brijesh-build-machine (mailing list archive) |
|---|---|
| State | New, archived |
On 14/11/2016 23:15, Brijesh Singh wrote:
> From: Tom Lendacky <thomas.lendacky@amd.com>
>
> AMD hardware adds two additional bits to aid in nested page fault handling.
>
> Bit 32 - NPF occurred while translating the guest's final physical address
> Bit 33 - NPF occurred while translating the guest page tables

I have two questions out of curiosity, and to better understand the
differences between Intel and AMD:

1) are the two bits mutually exclusive, and is one bit always set?

2) what bit is set if the processor is reading the PDPTEs of a 32-bit
PAE guest?

Thanks,

Paolo

> The guest page tables fault indicator can be used as an aid for nested
> virtualization. Using V0 for the host, V1 for the first level guest and
> V2 for the second level guest, when both V1 and V2 are using nested paging
> there are currently a number of unnecessary instruction emulations. When
> V2 is launched shadow paging is used in V1 for the nested tables of V2. As
> a result, KVM marks these pages as RO in the host nested page tables. When
> V2 exits and we resume V1, these pages are still marked RO.
>
> Every nested walk for a guest page table is treated as a user-level write
> access and this causes a lot of NPFs because the V1 page tables are marked
> RO in the V0 nested tables. While executing V1, when these NPFs occur KVM
> sees a write to a read-only page, emulates the V1 instruction and unprotects
> the page (marking it RW). This patch looks for cases where we get a NPF due
> to a guest page table walk where the page was marked RO. It immediately
> unprotects the page and resumes the guest, leading to far fewer instruction
> emulations when nested virtualization is used.
>
> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> Reviewed-by: Borislav Petkov <bp@suse.de>
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> ---
>  arch/x86/include/asm/kvm_host.h | 11 ++++++++++-
>  arch/x86/kvm/mmu.c              | 20 ++++++++++++++++++--
>  arch/x86/kvm/svm.c              |  2 +-
>  3 files changed, 29 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index bdde807..da07e17 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -191,6 +191,8 @@ enum {
>  #define PFERR_RSVD_BIT 3
>  #define PFERR_FETCH_BIT 4
>  #define PFERR_PK_BIT 5
> +#define PFERR_GUEST_FINAL_BIT 32
> +#define PFERR_GUEST_PAGE_BIT 33
>
>  #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
>  #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
> @@ -198,6 +200,13 @@
>  #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
>  #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
>  #define PFERR_PK_MASK (1U << PFERR_PK_BIT)
> +#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
> +#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
> +
> +#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
> +				 PFERR_USER_MASK |	 \
> +				 PFERR_WRITE_MASK |	 \
> +				 PFERR_PRESENT_MASK)
>
>  /* apic attention bits */
>  #define KVM_APIC_CHECK_VAPIC 0
> @@ -1203,7 +1212,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
>
>  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
>
> -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
> +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
>  		       void *insn, int insn_len);
>  void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
>  void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index d9c7e98..f633d29 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -4508,7 +4508,7 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
>  	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
>  }
>
> -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
> +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
>  		       void *insn, int insn_len)
>  {
>  	int r, emulation_type = EMULTYPE_RETRY;
> @@ -4527,12 +4527,28 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
>  		return r;
>  	}
>
> -	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
> +	r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
> +				      false);
>  	if (r < 0)
>  		return r;
>  	if (!r)
>  		return 1;
>
> +	/*
> +	 * Before emulating the instruction, check if the error code
> +	 * was due to a RO violation while translating the guest page.
> +	 * This can occur when using nested virtualization with nested
> +	 * paging in both guests. If true, we simply unprotect the page
> +	 * and resume the guest.
> +	 *
> +	 * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used
> +	 * in PFERR_NESTED_GUEST_PAGE)
> +	 */
> +	if (error_code == PFERR_NESTED_GUEST_PAGE) {
> +		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
> +		return 1;
> +	}
> +
>  	if (mmio_info_in_cache(vcpu, cr2, direct))
>  		emulation_type = 0;
>  emulate:
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 8ca1eca..4e462bb 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -2074,7 +2074,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
>  static int pf_interception(struct vcpu_svm *svm)
>  {
>  	u64 fault_address = svm->vmcb->control.exit_info_2;
> -	u32 error_code;
> +	u64 error_code;
>  	int r = 1;
>
>  	switch (svm->apf_reason) {
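Editorial aside: the patch quoted above widens the error code to u64 so that bits 32 and 33 survive the trip from the SVM exit handler into kvm_mmu_page_fault(), while the MMU's per-mode page_fault callback still receives only the architectural low 32 bits. Below is a compile-and-run C sketch of that split; it is an illustration, not kernel code, and lower_32_bits() is reimplemented locally (in the kernel it comes from the kernel headers). The example error-code value is hypothetical.

```c
/*
 * Illustration only: splitting a 64-bit SVM NPF error code (EXITINFO1)
 * into the architectural low 32 bits plus the AMD-specific high bits.
 */
#include <stdint.h>
#include <stdio.h>

#define PFERR_GUEST_FINAL_BIT 32	/* fault on the final gPA */
#define PFERR_GUEST_PAGE_BIT  33	/* fault walking guest page tables */

/* Local stand-in for the kernel's lower_32_bits() helper. */
static uint32_t lower_32_bits(uint64_t v)
{
	return (uint32_t)v;
}

int main(void)
{
	/* Hypothetical EXITINFO1: present + write + user fault (0x7)
	 * taken during a guest page table walk (bit 33 set). */
	uint64_t error_code = (1ULL << PFERR_GUEST_PAGE_BIT) | 0x7;

	printf("architectural bits for mmu.page_fault(): %#x\n",
	       (unsigned)lower_32_bits(error_code));
	printf("guest page table walk fault: %s\n",
	       (error_code >> PFERR_GUEST_PAGE_BIT) & 1 ? "yes" : "no");
	return 0;
}
```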
On 11/21/2016 9:12 AM, Paolo Bonzini wrote:
>
> On 14/11/2016 23:15, Brijesh Singh wrote:
>> From: Tom Lendacky <thomas.lendacky@amd.com>
>>
>> AMD hardware adds two additional bits to aid in nested page fault handling.
>>
>> Bit 32 - NPF occurred while translating the guest's final physical address
>> Bit 33 - NPF occurred while translating the guest page tables
>
> I have two questions out of curiosity, and to better understand the
> differences between Intel and AMD:

I talked with some folks about these questions and here's what we
determined:

> 1) are the two bits mutually exclusive, and is one bit always set?

The two bits are mutually exclusive - either the processor encounters the
fault while translating the final gPA or while translating a guest page
table; there's no way for it to be both.

> 2) what bit is set if the processor is reading the PDPTEs of a 32-bit
> PAE guest?

I believe that bit 33 will be set. The PDPEs are considered guest tables
and are read during a guest table walk (see APM vol. 2, section 15.25.10).
Note that this is slightly different from the bare-metal behavior of
legacy PAE mode as the APM describes. I'll try to test this and verify it.

Thanks,
Tom
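To make Tom's answers concrete, here is a tiny hedged sketch of a decoder built on the rule he states: bits 32 and 33 are mutually exclusive, and PAE PDPTE reads report as a guest table walk. This is standalone C for illustration; the enum and function names are invented, not KVM's.

```c
/* Illustrative decode of the two AMD NPF error-code bits. */
#include <stdint.h>
#include <stdio.h>

enum npf_stage {
	NPF_FINAL_GPA,		/* bit 32: fault on the final guest physical address */
	NPF_GUEST_TABLE,	/* bit 33: fault during a guest page table walk,
				 * including 32-bit PAE PDPTE reads per the APM */
	NPF_UNKNOWN,		/* neither bit set */
};

static enum npf_stage classify_npf(uint64_t error_code)
{
	/* The bits are mutually exclusive, so the test order does not matter. */
	if (error_code & (1ULL << 33))
		return NPF_GUEST_TABLE;
	if (error_code & (1ULL << 32))
		return NPF_FINAL_GPA;
	return NPF_UNKNOWN;
}

int main(void)
{
	/* Prints 1 (NPF_GUEST_TABLE) for a guest-table-walk fault. */
	printf("%d\n", classify_npf(1ULL << 33));
	return 0;
}
```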
On 22/11/2016 23:15, Tom Lendacky wrote:
>> 2) what bit is set if the processor is reading the PDPTEs of a 32-bit
>> PAE guest?
>
> I believe that bit 33 will be set. The PDPEs are considered guest tables
> and are read during a guest table walk (see APM vol. 2, section
> 15.25.10). Note that this is slightly different from the bare-metal
> behavior of legacy PAE mode as the APM describes. I'll try to test this
> and verify it.

No big deal, indeed it's a bit different from Intel, which caches the four
PDPTEs, but it's enough to know that bit 33 will be set.

Paolo
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bdde807..da07e17 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -191,6 +191,8 @@ enum {
 #define PFERR_RSVD_BIT 3
 #define PFERR_FETCH_BIT 4
 #define PFERR_PK_BIT 5
+#define PFERR_GUEST_FINAL_BIT 32
+#define PFERR_GUEST_PAGE_BIT 33
 
 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
 #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
@@ -198,6 +200,13 @@
 #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
 #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
 #define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
+#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
+
+#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
+				 PFERR_USER_MASK |	 \
+				 PFERR_WRITE_MASK |	 \
+				 PFERR_PRESENT_MASK)
 
 /* apic attention bits */
 #define KVM_APIC_CHECK_VAPIC 0
@@ -1203,7 +1212,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
 		       void *insn, int insn_len);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d9c7e98..f633d29 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4508,7 +4508,7 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
 		       void *insn, int insn_len)
 {
 	int r, emulation_type = EMULTYPE_RETRY;
@@ -4527,12 +4527,28 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 		return r;
 	}
 
-	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
+	r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
+				      false);
 	if (r < 0)
 		return r;
 	if (!r)
 		return 1;
 
+	/*
+	 * Before emulating the instruction, check if the error code
+	 * was due to a RO violation while translating the guest page.
+	 * This can occur when using nested virtualization with nested
+	 * paging in both guests. If true, we simply unprotect the page
+	 * and resume the guest.
+	 *
+	 * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used
+	 * in PFERR_NESTED_GUEST_PAGE)
+	 */
+	if (error_code == PFERR_NESTED_GUEST_PAGE) {
+		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
+		return 1;
+	}
+
 	if (mmio_info_in_cache(vcpu, cr2, direct))
 		emulation_type = 0;
 emulate:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8ca1eca..4e462bb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2074,7 +2074,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 static int pf_interception(struct vcpu_svm *svm)
 {
 	u64 fault_address = svm->vmcb->control.exit_info_2;
-	u32 error_code;
+	u64 error_code;
 	int r = 1;
 
 	switch (svm->apf_reason) {
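One detail of the patch worth calling out: the new check compares the full 64-bit error code for exact equality with PFERR_NESTED_GUEST_PAGE rather than testing bit 33 in isolation, so only the precise present + user + write + guest-page-walk combination skips emulation. The following is a minimal compile-and-run sketch of that control flow, not KVM's real code; gpa_to_gfn() mirrors the kernel's meaning (gpa >> PAGE_SHIFT) and unprotect_gfn() is a hypothetical stand-in for kvm_mmu_unprotect_page().

```c
/* Standalone sketch of the unprotect fast path added by the patch. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PFERR_PRESENT_MASK	(1ULL << 0)
#define PFERR_WRITE_MASK	(1ULL << 1)
#define PFERR_USER_MASK		(1ULL << 2)
#define PFERR_GUEST_PAGE_MASK	(1ULL << 33)

#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
				 PFERR_USER_MASK |	 \
				 PFERR_WRITE_MASK |	 \
				 PFERR_PRESENT_MASK)

static uint64_t gpa_to_gfn(uint64_t gpa)
{
	return gpa >> 12;	/* 4 KiB page frame number */
}

static void unprotect_gfn(uint64_t gfn)
{
	/* Hypothetical stand-in for kvm_mmu_unprotect_page(vcpu->kvm, gfn). */
	printf("unprotecting gfn 0x%llx\n", (unsigned long long)gfn);
}

/* Returns true if emulation can be skipped and the guest resumed. */
static bool npf_fast_unprotect(uint64_t gpa, uint64_t error_code)
{
	/*
	 * Exact equality, not a bit test: only a present, user-level write
	 * fault taken while walking guest page tables qualifies.  Any other
	 * combination falls through to instruction emulation.
	 */
	if (error_code != PFERR_NESTED_GUEST_PAGE)
		return false;

	unprotect_gfn(gpa_to_gfn(gpa));
	return true;
}

int main(void)
{
	uint64_t cr2 = 0x12345000;	/* hypothetical faulting gPA */

	/* Fast path taken: the exact nested-guest-page combination. */
	npf_fast_unprotect(cr2, PFERR_NESTED_GUEST_PAGE);
	/* Not taken: bit 33 set, but a read fault (no write bit). */
	npf_fast_unprotect(cr2, PFERR_GUEST_PAGE_MASK | PFERR_PRESENT_MASK);
	return 0;
}
```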