diff mbox series

[v2,1/3] KVM: x86: Allow CPU to force vendor-specific TDP level

Message ID 20210808192658.2923641-2-wei.huang2@amd.com (mailing list archive)
State New, archived
Headers show
Series SVM 5-level page table support | expand

Commit Message

Wei Huang Aug. 8, 2021, 7:26 p.m. UTC
AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.
To prevent kvm_mmu_get_tdp_level() from incorrectly changing NPT level
on behalf of CPUs, add a new parameter in kvm_configure_mmu() to force
a fixed TDP level.

Signed-off-by: Wei Huang <wei.huang2@amd.com>
---
 arch/x86/include/asm/kvm_host.h |  5 ++---
 arch/x86/kvm/mmu/mmu.c          | 10 ++++++++--
 arch/x86/kvm/svm/svm.c          |  4 +++-
 arch/x86/kvm/vmx/vmx.c          |  3 ++-
 4 files changed, 15 insertions(+), 7 deletions(-)

Comments

Yu Zhang Aug. 9, 2021, 3:58 a.m. UTC | #1
On Sun, Aug 08, 2021 at 02:26:56PM -0500, Wei Huang wrote:
> AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.

Sorry, but why? NPT is not indexed by HVA. 

> To prevent kvm_mmu_get_tdp_level() from incorrectly changing NPT level
> on behalf of CPUs, add a new parameter in kvm_configure_mmu() to force
> a fixed TDP level.
> 
> Signed-off-by: Wei Huang <wei.huang2@amd.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  5 ++---
>  arch/x86/kvm/mmu/mmu.c          | 10 ++++++++--
>  arch/x86/kvm/svm/svm.c          |  4 +++-
>  arch/x86/kvm/vmx/vmx.c          |  3 ++-
>  4 files changed, 15 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 974cbfb1eefe..6d16f75cc8da 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -723,7 +723,6 @@ struct kvm_vcpu_arch {
>  
>  	u64 reserved_gpa_bits;
>  	int maxphyaddr;
> -	int max_tdp_level;
>  
>  	/* emulate context */
>  
> @@ -1747,8 +1746,8 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>  void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
>  void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
>  
> -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
> -		       int tdp_huge_page_level);
> +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
> +		       int tdp_max_root_level, int tdp_huge_page_level);
>  
>  static inline u16 kvm_read_ldt(void)
>  {
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 66f7f5bc3482..c11ee4531f6d 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
>  bool tdp_enabled = false;
>  
>  static int max_huge_page_level __read_mostly;
> +static int tdp_root_level __read_mostly;

I think this is a broken design - meaning KVM can only use 5-level or
4-level NPT for all VMs.

B.R.
Yu

>  static int max_tdp_level __read_mostly;
>  
>  enum {
> @@ -4562,6 +4563,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
>  
>  static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
>  {
> +	/* tdp_root_level is architecture forced level, use it if nonzero */
> +	if (tdp_root_level)
> +		return tdp_root_level;
> +
>  	/* Use 5-level TDP if and only if it's useful/necessary. */
>  	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
>  		return 4;
> @@ -5253,10 +5258,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
>  	 */
>  }
>  
> -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
> -		       int tdp_huge_page_level)
> +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
> +		       int tdp_max_root_level, int tdp_huge_page_level)
>  {
>  	tdp_enabled = enable_tdp;
> +	tdp_root_level = tdp_forced_root_level;
>  	max_tdp_level = tdp_max_root_level;
>  
>  	/*
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index e8ccab50ebf6..f361d466e18e 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1015,7 +1015,9 @@ static __init int svm_hardware_setup(void)
>  	if (!boot_cpu_has(X86_FEATURE_NPT))
>  		npt_enabled = false;
>  
> -	kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
> +	/* Force VM NPT level equal to the host's max NPT level */
> +	kvm_configure_mmu(npt_enabled, get_max_npt_level(),
> +			  get_max_npt_level(), PG_LEVEL_1G);
>  	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
>  
>  	/* Note, SEV setup consumes npt_enabled. */
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 927a552393b9..034e1397c7d5 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -7803,7 +7803,8 @@ static __init int hardware_setup(void)
>  		ept_lpage_level = PG_LEVEL_2M;
>  	else
>  		ept_lpage_level = PG_LEVEL_4K;
> -	kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
> +	kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
> +			  ept_lpage_level);
>  
>  	/*
>  	 * Only enable PML when hardware supports PML feature, and both EPT
> -- 
> 2.31.1
>
Wei Huang Aug. 9, 2021, 4:11 a.m. UTC | #2
On 8/8/21 10:58 PM, Yu Zhang wrote:
> On Sun, Aug 08, 2021 at 02:26:56PM -0500, Wei Huang wrote:
>> AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.
> 
> Sorry, but why? NPT is not indexed by HVA.

NPT is not indexed by HVA - it is always indexed by GPA. What I meant is 
NPT page table level has to be the same as the host OS page table: if 
5-level page table is enabled in host OS (CR4.LA57=1), guest NPT has to 
be 5-level too.

> 
>> To prevent kvm_mmu_get_tdp_level() from incorrectly changing NPT level
>> on behalf of CPUs, add a new parameter in kvm_configure_mmu() to force
>> a fixed TDP level.
>>
>> Signed-off-by: Wei Huang <wei.huang2@amd.com>
>> ---
>>   arch/x86/include/asm/kvm_host.h |  5 ++---
>>   arch/x86/kvm/mmu/mmu.c          | 10 ++++++++--
>>   arch/x86/kvm/svm/svm.c          |  4 +++-
>>   arch/x86/kvm/vmx/vmx.c          |  3 ++-
>>   4 files changed, 15 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 974cbfb1eefe..6d16f75cc8da 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -723,7 +723,6 @@ struct kvm_vcpu_arch {
>>   
>>   	u64 reserved_gpa_bits;
>>   	int maxphyaddr;
>> -	int max_tdp_level;
>>   
>>   	/* emulate context */
>>   
>> @@ -1747,8 +1746,8 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>>   void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
>>   void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
>>   
>> -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
>> -		       int tdp_huge_page_level);
>> +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
>> +		       int tdp_max_root_level, int tdp_huge_page_level);
>>   
>>   static inline u16 kvm_read_ldt(void)
>>   {
>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>> index 66f7f5bc3482..c11ee4531f6d 100644
>> --- a/arch/x86/kvm/mmu/mmu.c
>> +++ b/arch/x86/kvm/mmu/mmu.c
>> @@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
>>   bool tdp_enabled = false;
>>   
>>   static int max_huge_page_level __read_mostly;
>> +static int tdp_root_level __read_mostly;
> 
> I think this is a broken design - meaning KVM can only use 5-level or
> 4-level NPT for all VMs.

Broken normally means non-functional or buggy, which doesn't apply here. 
A good TLB design should be able to offset the potential overhead of 
5-level page table for most cases.

> 
> B.R.
> Yu
> 
>>   static int max_tdp_level __read_mostly;
>>   
>>   enum {
>> @@ -4562,6 +4563,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
>>   
>>   static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
>>   {
>> +	/* tdp_root_level is architecture forced level, use it if nonzero */
>> +	if (tdp_root_level)
>> +		return tdp_root_level;
>> +
>>   	/* Use 5-level TDP if and only if it's useful/necessary. */
>>   	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
>>   		return 4;
>> @@ -5253,10 +5258,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
>>   	 */
>>   }
>>   
>> -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
>> -		       int tdp_huge_page_level)
>> +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
>> +		       int tdp_max_root_level, int tdp_huge_page_level)
>>   {
>>   	tdp_enabled = enable_tdp;
>> +	tdp_root_level = tdp_forced_root_level;
>>   	max_tdp_level = tdp_max_root_level;
>>   
>>   	/*
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index e8ccab50ebf6..f361d466e18e 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -1015,7 +1015,9 @@ static __init int svm_hardware_setup(void)
>>   	if (!boot_cpu_has(X86_FEATURE_NPT))
>>   		npt_enabled = false;
>>   
>> -	kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
>> +	/* Force VM NPT level equal to the host's max NPT level */
>> +	kvm_configure_mmu(npt_enabled, get_max_npt_level(),
>> +			  get_max_npt_level(), PG_LEVEL_1G);
>>   	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
>>   
>>   	/* Note, SEV setup consumes npt_enabled. */
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 927a552393b9..034e1397c7d5 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -7803,7 +7803,8 @@ static __init int hardware_setup(void)
>>   		ept_lpage_level = PG_LEVEL_2M;
>>   	else
>>   		ept_lpage_level = PG_LEVEL_4K;
>> -	kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
>> +	kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
>> +			  ept_lpage_level);
>>   
>>   	/*
>>   	 * Only enable PML when hardware supports PML feature, and both EPT
>> -- 
>> 2.31.1
>>
Yu Zhang Aug. 9, 2021, 4:27 a.m. UTC | #3
On Sun, Aug 08, 2021 at 11:11:40PM -0500, Wei Huang wrote:
> 
> 
> On 8/8/21 10:58 PM, Yu Zhang wrote:
> > On Sun, Aug 08, 2021 at 02:26:56PM -0500, Wei Huang wrote:
> > > AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.
> > 
> > Sorry, but why? NPT is not indexed by HVA.
> 
> NPT is not indexed by HVA - it is always indexed by GPA. What I meant is NPT
> page table level has to be the same as the host OS page table: if 5-level
> page table is enabled in host OS (CR4.LA57=1), guest NPT has to 5-level too.

I know what you meant. But may I ask why? 

B.R.
Yu

> 
> > 
> > > To prevent kvm_mmu_get_tdp_level() from incorrectly changing NPT level
> > > on behalf of CPUs, add a new parameter in kvm_configure_mmu() to force
> > > a fixed TDP level.
> > > 
> > > Signed-off-by: Wei Huang <wei.huang2@amd.com>
> > > ---
> > >   arch/x86/include/asm/kvm_host.h |  5 ++---
> > >   arch/x86/kvm/mmu/mmu.c          | 10 ++++++++--
> > >   arch/x86/kvm/svm/svm.c          |  4 +++-
> > >   arch/x86/kvm/vmx/vmx.c          |  3 ++-
> > >   4 files changed, 15 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index 974cbfb1eefe..6d16f75cc8da 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -723,7 +723,6 @@ struct kvm_vcpu_arch {
> > >   	u64 reserved_gpa_bits;
> > >   	int maxphyaddr;
> > > -	int max_tdp_level;
> > >   	/* emulate context */
> > > @@ -1747,8 +1746,8 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> > >   void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
> > >   void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
> > > -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
> > > -		       int tdp_huge_page_level);
> > > +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
> > > +		       int tdp_max_root_level, int tdp_huge_page_level);
> > >   static inline u16 kvm_read_ldt(void)
> > >   {
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 66f7f5bc3482..c11ee4531f6d 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
> > >   bool tdp_enabled = false;
> > >   static int max_huge_page_level __read_mostly;
> > > +static int tdp_root_level __read_mostly;
> > 
> > I think this is a broken design - meaning KVM can only use 5-level or
> > 4-level NPT for all VMs.
> 
> Broken normally means non-functional or buggy, which doesn't apply here. A
> good TLB design should be able to offset the potential overhead of 5-level
> page table for most cases.
> 
> > 
> > B.R.
> > Yu
> > 
> > >   static int max_tdp_level __read_mostly;
> > >   enum {
> > > @@ -4562,6 +4563,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
> > >   static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
> > >   {
> > > +	/* tdp_root_level is architecture forced level, use it if nonzero */
> > > +	if (tdp_root_level)
> > > +		return tdp_root_level;
> > > +
> > >   	/* Use 5-level TDP if and only if it's useful/necessary. */
> > >   	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
> > >   		return 4;
> > > @@ -5253,10 +5258,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
> > >   	 */
> > >   }
> > > -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
> > > -		       int tdp_huge_page_level)
> > > +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
> > > +		       int tdp_max_root_level, int tdp_huge_page_level)
> > >   {
> > >   	tdp_enabled = enable_tdp;
> > > +	tdp_root_level = tdp_forced_root_level;
> > >   	max_tdp_level = tdp_max_root_level;
> > >   	/*
> > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > > index e8ccab50ebf6..f361d466e18e 100644
> > > --- a/arch/x86/kvm/svm/svm.c
> > > +++ b/arch/x86/kvm/svm/svm.c
> > > @@ -1015,7 +1015,9 @@ static __init int svm_hardware_setup(void)
> > >   	if (!boot_cpu_has(X86_FEATURE_NPT))
> > >   		npt_enabled = false;
> > > -	kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
> > > +	/* Force VM NPT level equal to the host's max NPT level */
> > > +	kvm_configure_mmu(npt_enabled, get_max_npt_level(),
> > > +			  get_max_npt_level(), PG_LEVEL_1G);
> > >   	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
> > >   	/* Note, SEV setup consumes npt_enabled. */
> > > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > > index 927a552393b9..034e1397c7d5 100644
> > > --- a/arch/x86/kvm/vmx/vmx.c
> > > +++ b/arch/x86/kvm/vmx/vmx.c
> > > @@ -7803,7 +7803,8 @@ static __init int hardware_setup(void)
> > >   		ept_lpage_level = PG_LEVEL_2M;
> > >   	else
> > >   		ept_lpage_level = PG_LEVEL_4K;
> > > -	kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
> > > +	kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
> > > +			  ept_lpage_level);
> > >   	/*
> > >   	 * Only enable PML when hardware supports PML feature, and both EPT
> > > -- 
> > > 2.31.1
> > >
Wei Huang Aug. 9, 2021, 4:33 a.m. UTC | #4
On 8/8/21 11:27 PM, Yu Zhang wrote:
> On Sun, Aug 08, 2021 at 11:11:40PM -0500, Wei Huang wrote:
>>
>>
>> On 8/8/21 10:58 PM, Yu Zhang wrote:
>>> On Sun, Aug 08, 2021 at 02:26:56PM -0500, Wei Huang wrote:
>>>> AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.
>>>
>>> Sorry, but why? NPT is not indexed by HVA.
>>
>> NPT is not indexed by HVA - it is always indexed by GPA. What I meant is NPT
>> page table level has to be the same as the host OS page table: if 5-level
>> page table is enabled in host OS (CR4.LA57=1), guest NPT has to 5-level too.
> 
> I know what you meant. But may I ask why?

I don't have a good answer for it. From what I know, VMCB doesn't have a 
field to indicate guest page table level. As a result, hardware relies 
on host CR4 to infer NPT level.

> 
> B.R.
> Yu
> 
>>
Yu Zhang Aug. 9, 2021, 6:42 a.m. UTC | #5
On Sun, Aug 08, 2021 at 11:33:44PM -0500, Wei Huang wrote:
> 
> 
> On 8/8/21 11:27 PM, Yu Zhang wrote:
> > On Sun, Aug 08, 2021 at 11:11:40PM -0500, Wei Huang wrote:
> > > 
> > > 
> > > On 8/8/21 10:58 PM, Yu Zhang wrote:
> > > > On Sun, Aug 08, 2021 at 02:26:56PM -0500, Wei Huang wrote:
> > > > > AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.
> > > > 
> > > > Sorry, but why? NPT is not indexed by HVA.
> > > 
> > > NPT is not indexed by HVA - it is always indexed by GPA. What I meant is NPT
> > > page table level has to be the same as the host OS page table: if 5-level
> > > page table is enabled in host OS (CR4.LA57=1), guest NPT has to 5-level too.
> > 
> > I know what you meant. But may I ask why?
> 
> I don't have a good answer for it. From what I know, VMCB doesn't have a
> field to indicate guest page table level. As a result, hardware relies on
> host CR4 to infer NPT level.

I guess you mean not even in the N_CR3 field of VMCB? 

Then it's not a broken design - it's a limitation of SVM. :)

B.R.
Yu
Sean Christopherson Aug. 9, 2021, 3:30 p.m. UTC | #6
On Mon, Aug 09, 2021, Yu Zhang wrote:
> On Sun, Aug 08, 2021 at 11:33:44PM -0500, Wei Huang wrote:
> > 
> > On 8/8/21 11:27 PM, Yu Zhang wrote:
> > > On Sun, Aug 08, 2021 at 11:11:40PM -0500, Wei Huang wrote:
> > > > 
> > > > 
> > > > On 8/8/21 10:58 PM, Yu Zhang wrote:
> > > > > On Sun, Aug 08, 2021 at 02:26:56PM -0500, Wei Huang wrote:
> > > > > > AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.
> > > > > 
> > > > > Sorry, but why? NPT is not indexed by HVA.
> > > > 
> > > > NPT is not indexed by HVA - it is always indexed by GPA. What I meant is NPT
> > > > page table level has to be the same as the host OS page table: if 5-level
> > > > page table is enabled in host OS (CR4.LA57=1), guest NPT has to 5-level too.
> > > 
> > > I know what you meant. But may I ask why?
> > 
> > I don't have a good answer for it. From what I know, VMCB doesn't have a
> > field to indicate guest page table level. As a result, hardware relies on
> > host CR4 to infer NPT level.
> 
> I guess you mean not even in the N_CR3 field of VMCB? 

Correct, nCR3 is a basically a pure representation of a regular CR3.

> Then it's not a broken design - it's a limitation of SVM. :)

That's just a polite way of saying it's a broken design ;-)

Joking aside, NPT opted for a semblance of backwards compatibility at the cost of
having to carry all the baggage that comes with a legacy design.  Keeping the core
functionality from IA32 paging presumably minimizes design and hardware costs, and
required minimal enabling in hypervisors.  The downside is that it's less flexible
than EPT and has a few warts, e.g. shadowing NPT is gross because the host can't
easily mirror L1's desired paging mode.
Jim Mattson Aug. 9, 2021, 9:49 p.m. UTC | #7
On Mon, Aug 9, 2021 at 8:30 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Mon, Aug 09, 2021, Yu Zhang wrote:
> > On Sun, Aug 08, 2021 at 11:33:44PM -0500, Wei Huang wrote:
> > >
> > > On 8/8/21 11:27 PM, Yu Zhang wrote:
> > > > On Sun, Aug 08, 2021 at 11:11:40PM -0500, Wei Huang wrote:
> > > > >
> > > > >
> > > > > On 8/8/21 10:58 PM, Yu Zhang wrote:
> > > > > > On Sun, Aug 08, 2021 at 02:26:56PM -0500, Wei Huang wrote:
> > > > > > > AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.
> > > > > >
> > > > > > Sorry, but why? NPT is not indexed by HVA.
> > > > >
> > > > > NPT is not indexed by HVA - it is always indexed by GPA. What I meant is NPT
> > > > > page table level has to be the same as the host OS page table: if 5-level
> > > > > page table is enabled in host OS (CR4.LA57=1), guest NPT has to 5-level too.
> > > >
> > > > I know what you meant. But may I ask why?
> > >
> > > I don't have a good answer for it. From what I know, VMCB doesn't have a
> > > field to indicate guest page table level. As a result, hardware relies on
> > > host CR4 to infer NPT level.
> >
> > I guess you mean not even in the N_CR3 field of VMCB?
>
> Correct, nCR3 is a basically a pure representation of a regular CR3.
>
> > Then it's not a broken design - it's a limitation of SVM. :)
>
> That's just a polite way of saying it's a broken design ;-)

Doesn't this break legacy type 2 hypervisors that don't know anything
about 5-level NPT and don't have any control over whether or not the
host uses 5-level paging?

> Joking aside, NPT opted for a semblance of backwards compatibility at the cost of
> having to carry all the baggage that comes with a legacy design.  Keeping the core
> functionality from IA32 paging presumably miminizes design and hardware costs, and
> required minimal enabling in hypervisors.  The downside is that it's less flexible
> than EPT and has a few warts, e.g. shadowing NPT is gross because the host can't
> easily mirror L1's desired paging mode.
Yu Zhang Aug. 10, 2021, 7:40 a.m. UTC | #8
On Mon, Aug 09, 2021 at 03:30:08PM +0000, Sean Christopherson wrote:
> On Mon, Aug 09, 2021, Yu Zhang wrote:
> > On Sun, Aug 08, 2021 at 11:33:44PM -0500, Wei Huang wrote:
> > > 
> > > On 8/8/21 11:27 PM, Yu Zhang wrote:
> > > > On Sun, Aug 08, 2021 at 11:11:40PM -0500, Wei Huang wrote:
> > > > > 
> > > > > 
> > > > > On 8/8/21 10:58 PM, Yu Zhang wrote:
> > > > > > On Sun, Aug 08, 2021 at 02:26:56PM -0500, Wei Huang wrote:
> > > > > > > AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.
> > > > > > 
> > > > > > Sorry, but why? NPT is not indexed by HVA.
> > > > > 
> > > > > NPT is not indexed by HVA - it is always indexed by GPA. What I meant is NPT
> > > > > page table level has to be the same as the host OS page table: if 5-level
> > > > > page table is enabled in host OS (CR4.LA57=1), guest NPT has to 5-level too.
> > > > 
> > > > I know what you meant. But may I ask why?
> > > 
> > > I don't have a good answer for it. From what I know, VMCB doesn't have a
> > > field to indicate guest page table level. As a result, hardware relies on
> > > host CR4 to infer NPT level.
> > 
> > I guess you mean not even in the N_CR3 field of VMCB? 
> 
> Correct, nCR3 is a basically a pure representation of a regular CR3.
> 
> > Then it's not a broken design - it's a limitation of SVM. :)
> 
> That's just a polite way of saying it's a broken design ;-)
> 
> Joking aside, NPT opted for a semblance of backwards compatibility at the cost of
> having to carry all the baggage that comes with a legacy design.  Keeping the core
> functionality from IA32 paging presumably miminizes design and hardware costs, and
> required minimal enabling in hypervisors.  The downside is that it's less flexible
> than EPT and has a few warts, e.g. shadowing NPT is gross because the host can't
> easily mirror L1's desired paging mode.

Thanks for your explanation, Sean. Everything has a cost, and now it's time to pay
the price. :)

As to the max level, it is calculated in kvm_init(). Though I do not see any chance
that host paging mode will be changed after kvm_init(), or any case that Linux uses
different paging levels in different pCPUs, I am wondering, should we do something,
e.g., to document this as an open?

About "host can't easily mirror L1's desired paging mode", could you please elaborate?
Thanks!

B.R.
Yu
Paolo Bonzini Aug. 10, 2021, 9:23 a.m. UTC | #9
On 09/08/21 23:49, Jim Mattson wrote:
> Doesn't this break legacy type 2 hypervisors that don't know anything
> about 5-level NPT and don't have any control over whether or not the
> host uses 5-level paging?

Yes, where "legacy" probably means "all released versions of all of 
them", including KVM.  Host support for LA57 was merged in 4.13, while 
KVM started supporting 5-level page tables in EPT in 4.14 and even then 
just returned PT64_ROOT_LEVEL (i.e. 4) for the maximum NPT level.

So all Linux versions up to 5.13, which has "KVM: x86: Prevent KVM SVM 
from loading on kernels with 5-level paging", will break horribly. 
Better backport that patch to stable...

Paolo
Paolo Bonzini Aug. 10, 2021, 9:25 a.m. UTC | #10
On 10/08/21 09:40, Yu Zhang wrote:
> About "host can't easily mirror L1's desired paging mode", could you please elaborate?
> Thanks!

Shadow page tables in KVM will always have 3 levels on 32-bit machines 
and 4/5 levels on 64-bit machines.  L1 instead might have any number of 
levels from 2 to 5 (though of course not more than the host has).

Therefore, when shadowing 32-bit NPT page tables, KVM has to add extra 
fixed levels on top of those that it's shadowing.  See 
mmu_alloc_direct_roots for the code.

Paolo
Yu Zhang Aug. 10, 2021, 11 a.m. UTC | #11
On Tue, Aug 10, 2021 at 11:25:27AM +0200, Paolo Bonzini wrote:
> On 10/08/21 09:40, Yu Zhang wrote:
> > About "host can't easily mirror L1's desired paging mode", could you please elaborate?
> > Thanks!
> 
> Shadow pgae tables in KVM will always have 3 levels on 32-bit machines and
> 4/5 levels on 64-bit machines.  L1 instead might have any number of levels
> from 2 to 5 (though of course not more than the host has).

Thanks Paolo.

I guess it's because, unlike EPT which are with either 4 or 5 levels, NPT's
level can range from 2 to 5, depending on the host paging mode...

> 
> Therefore, when shadowing 32-bit NPT page tables, KVM has to add extra fixed
> levels on top of those that it's shadowing.  See mmu_alloc_direct_roots for
> the code.

So when shadowing NPTs (can be 2/3 levels, depending on the paging mode in L1),
and if L0 Linux is running in 4/5 level mode, extra levels of paging structures
are needed in the shadow NPT.

But shadow EPT does not have such annoyance. Is above understanding correct?

B.R.
Yu
Paolo Bonzini Aug. 10, 2021, 12:47 p.m. UTC | #12
On 10/08/21 13:00, Yu Zhang wrote:
> I guess it's because, unlike EPT which are with either 4 or 5 levels, NPT's
> level can range from 2 to 5, depending on the host paging mode...

Yes, on Linux that will be one of 3/4/5 based on host paging mode, and 
it will apply to all N_CR3...

> But shadow EPT does not have such annoyance. Is above understanding correct?

... Right, because shadow EPT cannot have less than 4 levels, and it can 
always use 4 levels if that's what L1 uses.

Paolo
Yu Zhang Aug. 10, 2021, 2:37 p.m. UTC | #13
On Tue, Aug 10, 2021 at 02:47:27PM +0200, Paolo Bonzini wrote:
> On 10/08/21 13:00, Yu Zhang wrote:
> > I guess it's because, unlike EPT which are with either 4 or 5 levels, NPT's
> > level can range from 2 to 5, depending on the host paging mode...
> 
> Yes, on Linux that will be one of 3/4/5 based on host paging mode, and it
> will apply to all N_CR3...
> 
> > But shadow EPT does not have such annoyance. Is above understanding correct?
> 
> ... Right, because shadow EPT cannot have less than 4 levels, and it can
> always use 4 levels if that's what L1 uses.

Interesting. :) Thanks a lot for the explanation!

B.R.
Yu
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 974cbfb1eefe..6d16f75cc8da 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -723,7 +723,6 @@  struct kvm_vcpu_arch {
 
 	u64 reserved_gpa_bits;
 	int maxphyaddr;
-	int max_tdp_level;
 
 	/* emulate context */
 
@@ -1747,8 +1746,8 @@  void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
 
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
-		       int tdp_huge_page_level);
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+		       int tdp_max_root_level, int tdp_huge_page_level);
 
 static inline u16 kvm_read_ldt(void)
 {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 66f7f5bc3482..c11ee4531f6d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -97,6 +97,7 @@  module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
 bool tdp_enabled = false;
 
 static int max_huge_page_level __read_mostly;
+static int tdp_root_level __read_mostly;
 static int max_tdp_level __read_mostly;
 
 enum {
@@ -4562,6 +4563,10 @@  static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
 
 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
 {
+	/* tdp_root_level is architecture forced level, use it if nonzero */
+	if (tdp_root_level)
+		return tdp_root_level;
+
 	/* Use 5-level TDP if and only if it's useful/necessary. */
 	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
 		return 4;
@@ -5253,10 +5258,11 @@  void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 	 */
 }
 
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
-		       int tdp_huge_page_level)
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+		       int tdp_max_root_level, int tdp_huge_page_level)
 {
 	tdp_enabled = enable_tdp;
+	tdp_root_level = tdp_forced_root_level;
 	max_tdp_level = tdp_max_root_level;
 
 	/*
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e8ccab50ebf6..f361d466e18e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1015,7 +1015,9 @@  static __init int svm_hardware_setup(void)
 	if (!boot_cpu_has(X86_FEATURE_NPT))
 		npt_enabled = false;
 
-	kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
+	/* Force VM NPT level equal to the host's max NPT level */
+	kvm_configure_mmu(npt_enabled, get_max_npt_level(),
+			  get_max_npt_level(), PG_LEVEL_1G);
 	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
 
 	/* Note, SEV setup consumes npt_enabled. */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 927a552393b9..034e1397c7d5 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7803,7 +7803,8 @@  static __init int hardware_setup(void)
 		ept_lpage_level = PG_LEVEL_2M;
 	else
 		ept_lpage_level = PG_LEVEL_4K;
-	kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
+	kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
+			  ept_lpage_level);
 
 	/*
 	 * Only enable PML when hardware supports PML feature, and both EPT