Message ID | 20200201185218.24473-59-sean.j.christopherson@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: x86: Introduce KVM cpu caps | expand |
Sean Christopherson <sean.j.christopherson@intel.com> writes: > Configure the max page level during hardware setup to avoid a retpoline > in the page fault handler. Drop ->get_lpage_level() as the page fault > handler was the last user. > > No functional change intended. > > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> > --- > arch/x86/include/asm/kvm_host.h | 3 +-- > arch/x86/kvm/mmu/mmu.c | 13 +++++++++++-- > arch/x86/kvm/svm.c | 9 +-------- > arch/x86/kvm/vmx/vmx.c | 24 +++++++++++------------- > 4 files changed, 24 insertions(+), 25 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 1a13a53bbaeb..4165d3ef11e4 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -1142,7 +1142,6 @@ struct kvm_x86_ops { > int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); > int (*get_tdp_level)(struct kvm_vcpu *vcpu); > u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); > - int (*get_lpage_level)(void); > > void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); > > @@ -1494,7 +1493,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); > void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); > void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); > > -void kvm_configure_mmu(bool enable_tdp); > +void kvm_configure_mmu(bool enable_tdp, int tdp_page_level); > > static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, > struct x86_exception *exception) > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index 08c80c7c88d4..1aedb71e7a20 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -86,6 +86,8 @@ __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); > */ > bool tdp_enabled = false; > > +static int max_page_level __read_mostly; > + > enum { > AUDIT_PRE_PAGE_FAULT, > AUDIT_POST_PAGE_FAULT, > @@ -3280,7 +3282,7 @@ 
static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, > if (!slot) > return PT_PAGE_TABLE_LEVEL; > > - max_level = min(max_level, kvm_x86_ops->get_lpage_level()); > + max_level = min(max_level, max_page_level); > for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { > linfo = lpage_info_slot(gfn, slot, max_level); > if (!linfo->disallow_lpage) > @@ -5541,9 +5543,16 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) > } > EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); > > -void kvm_configure_mmu(bool enable_tdp) > +void kvm_configure_mmu(bool enable_tdp, int tdp_page_level) > { > tdp_enabled = enable_tdp; > + > + if (tdp_enabled) > + max_page_level = tdp_page_level; > + else if (boot_cpu_has(X86_FEATURE_GBPAGES)) > + max_page_level = PT_PDPE_LEVEL; > + else > + max_page_level = PT_DIRECTORY_LEVEL; > } > EXPORT_SYMBOL_GPL(kvm_configure_mmu); > > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > index 19dc74ae1efb..76c24b3491f6 100644 > --- a/arch/x86/kvm/svm.c > +++ b/arch/x86/kvm/svm.c > @@ -1443,7 +1443,7 @@ static __init int svm_hardware_setup(void) > if (npt_enabled && !npt) > npt_enabled = false; > > - kvm_configure_mmu(npt_enabled); > + kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL); > pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); > > if (nrips) { > @@ -6064,11 +6064,6 @@ static void svm_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) > } > } > > -static int svm_get_lpage_level(void) > -{ > - return PT_PDPE_LEVEL; > -} I've probably missed something, but before the change get_lpage_level() on AMD always returned PT_PDPE_LEVEL, whereas after the change, when NPT is disabled, we set max_page_level to either PT_PDPE_LEVEL (when boot_cpu_has(X86_FEATURE_GBPAGES)) or PT_DIRECTORY_LEVEL (otherwise). This sounds like a functional change, unless we think that boot_cpu_has(X86_FEATURE_GBPAGES) is always true on AMD. 
> - > static bool svm_has_wbinvd_exit(void) > { > return true; > @@ -7424,8 +7419,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { > > .get_exit_info = svm_get_exit_info, > > - .get_lpage_level = svm_get_lpage_level, > - > .cpuid_update = svm_cpuid_update, > > .set_supported_cpuid = svm_set_supported_cpuid, > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c > index 59206c22b5e1..3ad24ca692a6 100644 > --- a/arch/x86/kvm/vmx/vmx.c > +++ b/arch/x86/kvm/vmx/vmx.c > @@ -6889,15 +6889,6 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) > return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; > } > > -static int vmx_get_lpage_level(void) > -{ > - if (enable_ept && !cpu_has_vmx_ept_1g_page()) > - return PT_DIRECTORY_LEVEL; > - else > - /* For shadow and EPT supported 1GB page */ > - return PT_PDPE_LEVEL; > -} > - > static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) > { > /* > @@ -7584,7 +7575,7 @@ static __init int hardware_setup(void) > { > unsigned long host_bndcfgs; > struct desc_ptr dt; > - int r, i; > + int r, i, ept_lpage_level; > > rdmsrl_safe(MSR_EFER, &host_efer); > > @@ -7677,7 +7668,16 @@ static __init int hardware_setup(void) > > if (enable_ept) > vmx_enable_tdp(); > - kvm_configure_mmu(enable_ept); > + > + if (!enable_ept) > + ept_lpage_level = 0; > + else if (cpu_has_vmx_ept_1g_page()) > + ept_lpage_level = PT_PDPE_LEVEL; > + else if (cpu_has_vmx_ept_2m_page()) > + ept_lpage_level = PT_DIRECTORY_LEVEL; > + else > + ept_lpage_level = PT_PAGE_TABLE_LEVEL; > + kvm_configure_mmu(enable_ept, ept_lpage_level); > > /* > * Only enable PML when hardware supports PML feature, and both EPT > @@ -7855,8 +7855,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { > > .get_exit_info = vmx_get_exit_info, > > - .get_lpage_level = vmx_get_lpage_level, > - > .cpuid_update = vmx_cpuid_update, > .set_supported_cpuid = vmx_set_supported_cpuid,
On Tue, Feb 25, 2020 at 03:43:36PM +0100, Vitaly Kuznetsov wrote: > Sean Christopherson <sean.j.christopherson@intel.com> writes: > > > Configure the max page level during hardware setup to avoid a retpoline > > in the page fault handler. Drop ->get_lpage_level() as the page fault > > handler was the last user. > > @@ -6064,11 +6064,6 @@ static void svm_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) > > } > > } > > > > -static int svm_get_lpage_level(void) > > -{ > > - return PT_PDPE_LEVEL; > > -} > > I've probably missed something but before the change, get_lpage_level() > on AMD was always returning PT_PDPE_LEVEL, but after the change and when > NPT is disabled, we set max_page_level to either PT_PDPE_LEVEL (when > boot_cpu_has(X86_FEATURE_GBPAGES)) or PT_DIRECTORY_LEVEL > (otherwise). This sounds like a change) unless we think that > boot_cpu_has(X86_FEATURE_GBPAGES) is always true on AMD. It looks like a functional change, but isn't. kvm_mmu_hugepage_adjust() caps the page size used by KVM's MMU at the minimum of ->get_lpage_level() and the host's mapping level. Barring an egregious bug in the kernel MMU, the host page tables will max out at PT_DIRECTORY_LEVEL (2MB) unless boot_cpu_has(X86_FEATURE_GBPAGES) is true. In other words, this is effectively a "documentation" change. I'll figure out a way to explain this in the changelog... max_level = min(max_level, kvm_x86_ops->get_lpage_level()); for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) break; } if (max_level == PT_PAGE_TABLE_LEVEL) return PT_PAGE_TABLE_LEVEL; level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); if (level == PT_PAGE_TABLE_LEVEL) return level; level = min(level, max_level); <---------
Sean Christopherson <sean.j.christopherson@intel.com> writes: > On Tue, Feb 25, 2020 at 03:43:36PM +0100, Vitaly Kuznetsov wrote: >> Sean Christopherson <sean.j.christopherson@intel.com> writes: >> >> > Configure the max page level during hardware setup to avoid a retpoline >> > in the page fault handler. Drop ->get_lpage_level() as the page fault >> > handler was the last user. >> > @@ -6064,11 +6064,6 @@ static void svm_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) >> > } >> > } >> > >> > -static int svm_get_lpage_level(void) >> > -{ >> > - return PT_PDPE_LEVEL; >> > -} >> >> I've probably missed something but before the change, get_lpage_level() >> on AMD was always returning PT_PDPE_LEVEL, but after the change and when >> NPT is disabled, we set max_page_level to either PT_PDPE_LEVEL (when >> boot_cpu_has(X86_FEATURE_GBPAGES)) or PT_DIRECTORY_LEVEL >> (otherwise). This sounds like a change) unless we think that >> boot_cpu_has(X86_FEATURE_GBPAGES) is always true on AMD. > > It looks like a functional change, but isn't. kvm_mmu_hugepage_adjust() > caps the page size used by KVM's MMU at the minimum of ->get_lpage_level() > and the host's mapping level. Barring an egregious bug in the kernel MMU, > the host page tables will max out at PT_DIRECTORY_LEVEL (2mb) unless > boot_cpu_has(X86_FEATURE_GBPAGES) is true. > > In other words, this is effectively a "documentation" change. I'll figure > out a way to explain this in the changelog... 
> > max_level = min(max_level, kvm_x86_ops->get_lpage_level()); > for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { > linfo = lpage_info_slot(gfn, slot, max_level); > if (!linfo->disallow_lpage) > break; > } > > if (max_level == PT_PAGE_TABLE_LEVEL) > return PT_PAGE_TABLE_LEVEL; > > level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); > if (level == PT_PAGE_TABLE_LEVEL) > return level; > > level = min(level, max_level); <--------- Ok, I see (I believe): Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com> It would've helped me a bit if kvm_configure_mmu() were written the following way: void kvm_configure_mmu(bool enable_tdp, int tdp_page_level) { tdp_enabled = enable_tdp; if (boot_cpu_has(X86_FEATURE_GBPAGES)) max_page_level = PT_PDPE_LEVEL; else max_page_level = PT_DIRECTORY_LEVEL; if (tdp_enabled) max_page_level = min(tdp_page_level, max_page_level); } (we can't have cpu_has_vmx_ept_1g_page() and not boot_cpu_has(X86_FEATURE_GBPAGES), right?) But this is certainly just a personal preference, so feel free to ignore it.
On Wed, Feb 26, 2020 at 03:55:55PM +0100, Vitaly Kuznetsov wrote: > Sean Christopherson <sean.j.christopherson@intel.com> writes: > > > On Tue, Feb 25, 2020 at 03:43:36PM +0100, Vitaly Kuznetsov wrote: > >> Sean Christopherson <sean.j.christopherson@intel.com> writes: > >> > >> > Configure the max page level during hardware setup to avoid a retpoline > >> > in the page fault handler. Drop ->get_lpage_level() as the page fault > >> > handler was the last user. > >> > @@ -6064,11 +6064,6 @@ static void svm_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) > >> > } > >> > } > >> > > >> > -static int svm_get_lpage_level(void) > >> > -{ > >> > - return PT_PDPE_LEVEL; > >> > -} > >> > >> I've probably missed something but before the change, get_lpage_level() > >> on AMD was always returning PT_PDPE_LEVEL, but after the change and when > >> NPT is disabled, we set max_page_level to either PT_PDPE_LEVEL (when > >> boot_cpu_has(X86_FEATURE_GBPAGES)) or PT_DIRECTORY_LEVEL > >> (otherwise). This sounds like a change) unless we think that > >> boot_cpu_has(X86_FEATURE_GBPAGES) is always true on AMD. > > > > It looks like a functional change, but isn't. kvm_mmu_hugepage_adjust() > > caps the page size used by KVM's MMU at the minimum of ->get_lpage_level() > > and the host's mapping level. Barring an egregious bug in the kernel MMU, > > the host page tables will max out at PT_DIRECTORY_LEVEL (2mb) unless > > boot_cpu_has(X86_FEATURE_GBPAGES) is true. > > > > In other words, this is effectively a "documentation" change. I'll figure > > out a way to explain this in the changelog... 
> > > > max_level = min(max_level, kvm_x86_ops->get_lpage_level()); > > for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { > > linfo = lpage_info_slot(gfn, slot, max_level); > > if (!linfo->disallow_lpage) > > break; > > } > > > > if (max_level == PT_PAGE_TABLE_LEVEL) > > return PT_PAGE_TABLE_LEVEL; > > > > level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); > > if (level == PT_PAGE_TABLE_LEVEL) > > return level; > > > > level = min(level, max_level); <--------- > > Ok, I see (I believe): > > Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com> > > It would've helped me a bit if kvm_configure_mmu() was written the > following way: > > void kvm_configure_mmu(bool enable_tdp, int tdp_page_level) > { > tdp_enabled = enable_tdp; > > if (boot_cpu_has(X86_FEATURE_GBPAGES)) > max_page_level = PT_PDPE_LEVEL; > else > max_page_level = PT_DIRECTORY_LEVEL; > > if (tdp_enabled) > max_page_level = min(tdp_page_level, max_page_level); > } > > (we can't have cpu_has_vmx_ept_1g_page() and not > boot_cpu_has(X86_FEATURE_GBPAGES), right?) Wrong, because VMX. It could even occur on a real system if the user disables the feature via kernel param, e.g. "clearcpuid=58". In the end it won't actually change anything because KVM caps its page size at the kernel page size (as above). Well, unless someone is running a custom kernel that does funky things. > But this is certainly just a personal preference, feel free to ignore) I'm on the fence. Part of me likes having max_page_level reflect what KVM is capable of, irrespective of the kernel.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1a13a53bbaeb..4165d3ef11e4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1142,7 +1142,6 @@ struct kvm_x86_ops { int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); int (*get_tdp_level)(struct kvm_vcpu *vcpu); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - int (*get_lpage_level)(void); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -1494,7 +1493,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); -void kvm_configure_mmu(bool enable_tdp); +void kvm_configure_mmu(bool enable_tdp, int tdp_page_level); static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 08c80c7c88d4..1aedb71e7a20 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -86,6 +86,8 @@ __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); */ bool tdp_enabled = false; +static int max_page_level __read_mostly; + enum { AUDIT_PRE_PAGE_FAULT, AUDIT_POST_PAGE_FAULT, @@ -3280,7 +3282,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PT_PAGE_TABLE_LEVEL; - max_level = min(max_level, kvm_x86_ops->get_lpage_level()); + max_level = min(max_level, max_page_level); for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) @@ -5541,9 +5543,16 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); -void kvm_configure_mmu(bool enable_tdp) +void kvm_configure_mmu(bool enable_tdp, int tdp_page_level) { tdp_enabled = enable_tdp; + + if (tdp_enabled) + 
max_page_level = tdp_page_level; + else if (boot_cpu_has(X86_FEATURE_GBPAGES)) + max_page_level = PT_PDPE_LEVEL; + else + max_page_level = PT_DIRECTORY_LEVEL; } EXPORT_SYMBOL_GPL(kvm_configure_mmu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 19dc74ae1efb..76c24b3491f6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1443,7 +1443,7 @@ static __init int svm_hardware_setup(void) if (npt_enabled && !npt) npt_enabled = false; - kvm_configure_mmu(npt_enabled); + kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); if (nrips) { @@ -6064,11 +6064,6 @@ static void svm_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) } } -static int svm_get_lpage_level(void) -{ - return PT_PDPE_LEVEL; -} - static bool svm_has_wbinvd_exit(void) { return true; @@ -7424,8 +7419,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_exit_info = svm_get_exit_info, - .get_lpage_level = svm_get_lpage_level, - .cpuid_update = svm_cpuid_update, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 59206c22b5e1..3ad24ca692a6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6889,15 +6889,6 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; } -static int vmx_get_lpage_level(void) -{ - if (enable_ept && !cpu_has_vmx_ept_1g_page()) - return PT_DIRECTORY_LEVEL; - else - /* For shadow and EPT supported 1GB page */ - return PT_PDPE_LEVEL; -} - static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) { /* @@ -7584,7 +7575,7 @@ static __init int hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; - int r, i; + int r, i, ept_lpage_level; rdmsrl_safe(MSR_EFER, &host_efer); @@ -7677,7 +7668,16 @@ static __init int hardware_setup(void) if (enable_ept) vmx_enable_tdp(); - kvm_configure_mmu(enable_ept); + + if (!enable_ept) + 
ept_lpage_level = 0; + else if (cpu_has_vmx_ept_1g_page()) + ept_lpage_level = PT_PDPE_LEVEL; + else if (cpu_has_vmx_ept_2m_page()) + ept_lpage_level = PT_DIRECTORY_LEVEL; + else + ept_lpage_level = PT_PAGE_TABLE_LEVEL; + kvm_configure_mmu(enable_ept, ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT @@ -7855,8 +7855,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_exit_info = vmx_get_exit_info, - .get_lpage_level = vmx_get_lpage_level, - .cpuid_update = vmx_cpuid_update, .set_supported_cpuid = vmx_set_supported_cpuid,
Configure the max page level during hardware setup to avoid a retpoline in the page fault handler. Drop ->get_lpage_level() as the page fault handler was the last user. No functional change intended. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/mmu/mmu.c | 13 +++++++++++-- arch/x86/kvm/svm.c | 9 +-------- arch/x86/kvm/vmx/vmx.c | 24 +++++++++++------------- 4 files changed, 24 insertions(+), 25 deletions(-)