Message ID | 20210808192658.2923641-2-wei.huang2@amd.com (mailing list archive)
---|---
State | New, archived
Series | SVM 5-level page table support
On Sun, Aug 08, 2021 at 02:26:56PM -0500, Wei Huang wrote:
> AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.

Sorry, but why? NPT is not indexed by HVA.

> To prevent kvm_mmu_get_tdp_level() from incorrectly changing NPT level
> on behalf of CPUs, add a new parameter in kvm_configure_mmu() to force
> a fixed TDP level.
>
> Signed-off-by: Wei Huang <wei.huang2@amd.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  5 ++---
>  arch/x86/kvm/mmu/mmu.c          | 10 ++++++++--
>  arch/x86/kvm/svm/svm.c          |  4 +++-
>  arch/x86/kvm/vmx/vmx.c          |  3 ++-
>  4 files changed, 15 insertions(+), 7 deletions(-)
>
> [...]
>
> @@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
>  bool tdp_enabled = false;
>
>  static int max_huge_page_level __read_mostly;
> +static int tdp_root_level __read_mostly;

I think this is a broken design - meaning KVM can only use 5-level or
4-level NPT for all VMs.

B.R.
Yu
On 8/8/21 10:58 PM, Yu Zhang wrote:
> On Sun, Aug 08, 2021 at 02:26:56PM -0500, Wei Huang wrote:
>> AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.
>
> Sorry, but why? NPT is not indexed by HVA.

NPT is not indexed by HVA - it is always indexed by GPA. What I meant is that
the NPT page table level has to be the same as the host OS page table's: if a
5-level page table is enabled in the host OS (CR4.LA57=1), the guest NPT has
to be 5-level too.

>> [...]
>>  static int max_huge_page_level __read_mostly;
>> +static int tdp_root_level __read_mostly;
>
> I think this is a broken design - meaning KVM can only use 5-level or
> 4-level NPT for all VMs.

Broken normally means non-functional or buggy, which doesn't apply here. A
good TLB design should be able to offset the potential overhead of a 5-level
page table for most cases.

> B.R.
> Yu
On Sun, Aug 08, 2021 at 11:11:40PM -0500, Wei Huang wrote:
> On 8/8/21 10:58 PM, Yu Zhang wrote:
> > On Sun, Aug 08, 2021 at 02:26:56PM -0500, Wei Huang wrote:
> > > AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.
> >
> > Sorry, but why? NPT is not indexed by HVA.
>
> NPT is not indexed by HVA - it is always indexed by GPA. What I meant is that
> the NPT page table level has to be the same as the host OS page table's: if a
> 5-level page table is enabled in the host OS (CR4.LA57=1), the guest NPT has
> to be 5-level too.

I know what you meant. But may I ask why?

B.R.
Yu
On 8/8/21 11:27 PM, Yu Zhang wrote:
> On Sun, Aug 08, 2021 at 11:11:40PM -0500, Wei Huang wrote:
>> On 8/8/21 10:58 PM, Yu Zhang wrote:
>>> On Sun, Aug 08, 2021 at 02:26:56PM -0500, Wei Huang wrote:
>>>> AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.
>>>
>>> Sorry, but why? NPT is not indexed by HVA.
>>
>> NPT is not indexed by HVA - it is always indexed by GPA. What I meant is that
>> the NPT page table level has to be the same as the host OS page table's: if a
>> 5-level page table is enabled in the host OS (CR4.LA57=1), the guest NPT has
>> to be 5-level too.
>
> I know what you meant. But may I ask why?

I don't have a good answer for it. From what I know, VMCB doesn't have a
field to indicate guest page table level. As a result, hardware relies on
host CR4 to infer NPT level.

> B.R.
> Yu
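To illustrate the dependency Wei describes, the NPT root level on the SVM side has
to follow the host paging mode rather than any guest property. A minimal sketch of
what a helper like the get_max_npt_level() used in svm_hardware_setup() could look
like, assuming pgtable_l5_enabled() reports whether the host runs with CR4.LA57=1
(this is an illustration, not necessarily the exact code in this series):

/*
 * Illustrative sketch: pick the NPT root level from the host paging
 * mode, since hardware walks the NPT according to host CR4.LA57.
 * Assumes pgtable_l5_enabled() is true when the host uses 5-level
 * page tables.
 */
static __init int get_max_npt_level(void)
{
#ifdef CONFIG_X86_64
	return pgtable_l5_enabled() ? 5 : 4;
#else
	return PT32E_ROOT_LEVEL;
#endif
}

With a helper like this, the SVM change in the patch below passes the host-mandated
level as both the forced root level and the maximum root level, while VMX keeps
passing 0 to leave the level flexible.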
On Sun, Aug 08, 2021 at 11:33:44PM -0500, Wei Huang wrote:
> On 8/8/21 11:27 PM, Yu Zhang wrote:
> > On Sun, Aug 08, 2021 at 11:11:40PM -0500, Wei Huang wrote:
> > > NPT is not indexed by HVA - it is always indexed by GPA. What I meant is that
> > > the NPT page table level has to be the same as the host OS page table's: if a
> > > 5-level page table is enabled in the host OS (CR4.LA57=1), the guest NPT has
> > > to be 5-level too.
> >
> > I know what you meant. But may I ask why?
>
> I don't have a good answer for it. From what I know, VMCB doesn't have a
> field to indicate guest page table level. As a result, hardware relies on
> host CR4 to infer NPT level.

I guess you mean not even in the N_CR3 field of VMCB?

Then it's not a broken design - it's a limitation of SVM. :)

B.R.
Yu
On Mon, Aug 09, 2021, Yu Zhang wrote:
> On Sun, Aug 08, 2021 at 11:33:44PM -0500, Wei Huang wrote:
> > I don't have a good answer for it. From what I know, VMCB doesn't have a
> > field to indicate guest page table level. As a result, hardware relies on
> > host CR4 to infer NPT level.
>
> I guess you mean not even in the N_CR3 field of VMCB?

Correct, nCR3 is basically a pure representation of a regular CR3.

> Then it's not a broken design - it's a limitation of SVM. :)

That's just a polite way of saying it's a broken design ;-)

Joking aside, NPT opted for a semblance of backwards compatibility at the cost of
having to carry all the baggage that comes with a legacy design. Keeping the core
functionality from IA32 paging presumably minimizes design and hardware costs, and
required minimal enabling in hypervisors. The downside is that it's less flexible
than EPT and has a few warts, e.g. shadowing NPT is gross because the host can't
easily mirror L1's desired paging mode.
On Mon, Aug 9, 2021 at 8:30 AM Sean Christopherson <seanjc@google.com> wrote:
> On Mon, Aug 09, 2021, Yu Zhang wrote:
> > I guess you mean not even in the N_CR3 field of VMCB?
>
> Correct, nCR3 is basically a pure representation of a regular CR3.
>
> > Then it's not a broken design - it's a limitation of SVM. :)
>
> That's just a polite way of saying it's a broken design ;-)

Doesn't this break legacy type 2 hypervisors that don't know anything
about 5-level NPT and don't have any control over whether or not the
host uses 5-level paging?

> Joking aside, NPT opted for a semblance of backwards compatibility at the cost of
> having to carry all the baggage that comes with a legacy design. Keeping the core
> functionality from IA32 paging presumably minimizes design and hardware costs, and
> required minimal enabling in hypervisors. The downside is that it's less flexible
> than EPT and has a few warts, e.g. shadowing NPT is gross because the host can't
> easily mirror L1's desired paging mode.
On Mon, Aug 09, 2021 at 03:30:08PM +0000, Sean Christopherson wrote:
> Correct, nCR3 is basically a pure representation of a regular CR3.
>
> > Then it's not a broken design - it's a limitation of SVM. :)
>
> That's just a polite way of saying it's a broken design ;-)
>
> Joking aside, NPT opted for a semblance of backwards compatibility at the cost of
> having to carry all the baggage that comes with a legacy design. Keeping the core
> functionality from IA32 paging presumably minimizes design and hardware costs, and
> required minimal enabling in hypervisors. The downside is that it's less flexible
> than EPT and has a few warts, e.g. shadowing NPT is gross because the host can't
> easily mirror L1's desired paging mode.

Thanks for your explanation, Sean. Everything has a cost, and now it's time to
pay the price. :)

As to the max level, it is calculated in kvm_init(). Though I do not see any
chance that the host paging mode will change after kvm_init(), or any case
where Linux uses different paging levels on different pCPUs, I am wondering:
should we do something, e.g. document this as an open issue?

About "host can't easily mirror L1's desired paging mode", could you please
elaborate? Thanks!

B.R.
Yu
On 09/08/21 23:49, Jim Mattson wrote:
> Doesn't this break legacy type 2 hypervisors that don't know anything
> about 5-level NPT and don't have any control over whether or not the
> host uses 5-level paging?

Yes, where "legacy" probably means "all released versions of all of them",
including KVM. Host support for LA57 was merged in 4.13, while KVM started
supporting 5-level page tables in EPT in 4.14 and even then just returned
PT64_ROOT_LEVEL (i.e. 4) for the maximum NPT level.

So all Linux versions up to 5.13, which has "KVM: x86: Prevent KVM SVM from
loading on kernels with 5-level paging", will break horribly. Better backport
that patch to stable...

Paolo
On 10/08/21 09:40, Yu Zhang wrote:
> About "host can't easily mirror L1's desired paging mode", could you please
> elaborate? Thanks!

Shadow page tables in KVM will always have 3 levels on 32-bit machines and
4/5 levels on 64-bit machines. L1 instead might have any number of levels
from 2 to 5 (though of course not more than the host has).

Therefore, when shadowing 32-bit NPT page tables, KVM has to add extra fixed
levels on top of those that it's shadowing. See mmu_alloc_direct_roots for
the code.

Paolo
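As a rough illustration of the point above (not the actual mmu_alloc_direct_roots()
code), the shadow root level is bounded from below by the level KVM's own MMU must
use on the host, so a 2- or 3-level guest NPT gets shadowed by a table with extra
fixed top levels; the helper name and parameters here are made up for the example:

/*
 * Illustration only: the shadow NPT root level can never drop below
 * the level KVM's MMU requires on this host (3 on 32-bit hosts, 4 or
 * 5 on 64-bit hosts).  If L1 uses fewer levels, the difference is
 * made up with extra fixed upper levels on top of the shadowed ones.
 */
static int example_shadow_npt_root_level(int guest_npt_level,
					 int host_root_level)
{
	return guest_npt_level < host_root_level ? host_root_level
						 : guest_npt_level;
}

Shadow EPT avoids this because it never has fewer than 4 levels and can simply
match a 4-level L1 EPT, as noted later in the thread.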
On Tue, Aug 10, 2021 at 11:25:27AM +0200, Paolo Bonzini wrote:
> On 10/08/21 09:40, Yu Zhang wrote:
> > About "host can't easily mirror L1's desired paging mode", could you please
> > elaborate? Thanks!
>
> Shadow page tables in KVM will always have 3 levels on 32-bit machines and
> 4/5 levels on 64-bit machines. L1 instead might have any number of levels
> from 2 to 5 (though of course not more than the host has).

Thanks Paolo. I guess it's because, unlike EPT, which has either 4 or 5 levels,
NPT's level can range from 2 to 5, depending on the host paging mode...

> Therefore, when shadowing 32-bit NPT page tables, KVM has to add extra fixed
> levels on top of those that it's shadowing. See mmu_alloc_direct_roots for
> the code.

So when shadowing NPTs (which can be 2/3 levels, depending on the paging mode
in L1), and if L0 Linux is running in 4/5-level mode, extra levels of paging
structures are needed in the shadow NPT. But shadow EPT does not have such an
annoyance. Is the above understanding correct?

B.R.
Yu
On 10/08/21 13:00, Yu Zhang wrote:
> I guess it's because, unlike EPT, which has either 4 or 5 levels, NPT's
> level can range from 2 to 5, depending on the host paging mode...

Yes, on Linux that will be one of 3/4/5 based on host paging mode, and it
will apply to all N_CR3...

> But shadow EPT does not have such an annoyance. Is the above understanding
> correct?

... Right, because shadow EPT cannot have less than 4 levels, and it can
always use 4 levels if that's what L1 uses.

Paolo
On Tue, Aug 10, 2021 at 02:47:27PM +0200, Paolo Bonzini wrote:
> On 10/08/21 13:00, Yu Zhang wrote:
> > I guess it's because, unlike EPT, which has either 4 or 5 levels, NPT's
> > level can range from 2 to 5, depending on the host paging mode...
>
> Yes, on Linux that will be one of 3/4/5 based on host paging mode, and it
> will apply to all N_CR3...
>
> > But shadow EPT does not have such an annoyance. Is the above understanding
> > correct?
>
> ... Right, because shadow EPT cannot have less than 4 levels, and it can
> always use 4 levels if that's what L1 uses.

Interesting. :) Thanks a lot for the explanation!

B.R.
Yu
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 974cbfb1eefe..6d16f75cc8da 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -723,7 +723,6 @@ struct kvm_vcpu_arch {
 
 	u64 reserved_gpa_bits;
 	int maxphyaddr;
-	int max_tdp_level;
 
 	/* emulate context */
 
@@ -1747,8 +1746,8 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
 
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
-		       int tdp_huge_page_level);
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+		       int tdp_max_root_level, int tdp_huge_page_level);
 
 static inline u16 kvm_read_ldt(void)
 {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 66f7f5bc3482..c11ee4531f6d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
 bool tdp_enabled = false;
 
 static int max_huge_page_level __read_mostly;
+static int tdp_root_level __read_mostly;
 static int max_tdp_level __read_mostly;
 
 enum {
@@ -4562,6 +4563,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
 
 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
 {
+	/* tdp_root_level is architecture forced level, use it if nonzero */
+	if (tdp_root_level)
+		return tdp_root_level;
+
 	/* Use 5-level TDP if and only if it's useful/necessary. */
 	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
 		return 4;
@@ -5253,10 +5258,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 	 */
 }
 
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
-		       int tdp_huge_page_level)
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+		       int tdp_max_root_level, int tdp_huge_page_level)
 {
 	tdp_enabled = enable_tdp;
+	tdp_root_level = tdp_forced_root_level;
 	max_tdp_level = tdp_max_root_level;
 
 	/*
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e8ccab50ebf6..f361d466e18e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1015,7 +1015,9 @@ static __init int svm_hardware_setup(void)
 	if (!boot_cpu_has(X86_FEATURE_NPT))
 		npt_enabled = false;
 
-	kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
+	/* Force VM NPT level equal to the host's max NPT level */
+	kvm_configure_mmu(npt_enabled, get_max_npt_level(),
+			  get_max_npt_level(), PG_LEVEL_1G);
 	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
 
 	/* Note, SEV setup consumes npt_enabled. */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 927a552393b9..034e1397c7d5 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7803,7 +7803,8 @@ static __init int hardware_setup(void)
 		ept_lpage_level = PG_LEVEL_2M;
 	else
 		ept_lpage_level = PG_LEVEL_4K;
-	kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
+	kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
+			  ept_lpage_level);
 
 	/*
 	 * Only enable PML when hardware supports PML feature, and both EPT
AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set. To prevent
kvm_mmu_get_tdp_level() from incorrectly changing NPT level on behalf of CPUs,
add a new parameter in kvm_configure_mmu() to force a fixed TDP level.

Signed-off-by: Wei Huang <wei.huang2@amd.com>
---
 arch/x86/include/asm/kvm_host.h |  5 ++---
 arch/x86/kvm/mmu/mmu.c          | 10 ++++++++--
 arch/x86/kvm/svm/svm.c          |  4 +++-
 arch/x86/kvm/vmx/vmx.c          |  3 ++-
 4 files changed, 15 insertions(+), 7 deletions(-)