
[07/23] KVM: MMU: remove kvm_mmu_calc_root_page_role

Message ID 20220204115718.14934-8-pbonzini@redhat.com (mailing list archive)
State New, archived
Series KVM: MMU: MMU role refactoring

Commit Message

Paolo Bonzini Feb. 4, 2022, 11:57 a.m. UTC
Since the guest PGD is now loaded after the MMU has been set up
completely, the desired role for a cache hit is simply the current
mmu_role.  There is no need to compute it again, so __kvm_mmu_new_pgd
can be folded in kvm_mmu_new_pgd.

As an aside, the !tdp_enabled case in the function was dead code,
and that also gets mopped up as a side effect.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 29 ++++-------------------------
 1 file changed, 4 insertions(+), 25 deletions(-)
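
For reference, a condensed sketch of kvm_mmu_new_pgd() after the fold (taken
from the patch below; the post-switch SPTE prefetch and write-flooding-count
reset are elided):

void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
{
	/* The role is simply read back from the already-initialized MMU ... */
	union kvm_mmu_page_role new_role = vcpu->arch.mmu->mmu_role.base;

	/* ... and used to look for a matching cached root. */
	if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
		return;
	}

	/* Cache hit: the elided tail reloads the PGD and clears the root's
	 * write-flooding count, exactly as __kvm_mmu_new_pgd() did before. */
}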

Comments

David Matlack Feb. 4, 2022, 7:32 p.m. UTC | #1
On Fri, Feb 04, 2022 at 06:57:02AM -0500, Paolo Bonzini wrote:
> Since the guest PGD is now loaded after the MMU has been set up
> completely, the desired role for a cache hit is simply the current
> mmu_role.  There is no need to compute it again, so __kvm_mmu_new_pgd
> can be folded in kvm_mmu_new_pgd.
> 
> As an aside, the !tdp_enabled case in the function was dead code,
> and that also gets mopped up as a side effect.

Couldn't the !tdp_enabled case be called via kvm_set_cr3() ->
kvm_mmu_new_pgd()?

> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/mmu/mmu.c | 29 ++++-------------------------
>  1 file changed, 4 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index b8ab16323629..42475e4c2a48 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -190,8 +190,6 @@ struct kmem_cache *mmu_page_header_cache;
>  static struct percpu_counter kvm_total_used_mmu_pages;
>  
>  static void mmu_spte_set(u64 *sptep, u64 spte);
> -static union kvm_mmu_page_role
> -kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
>  
>  struct kvm_mmu_role_regs {
>  	const unsigned long cr0;
> @@ -4159,9 +4157,9 @@ static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
>  	return false;
>  }
>  
> -static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
> -			      union kvm_mmu_page_role new_role)
> +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
>  {
> +	union kvm_mmu_page_role new_role = vcpu->arch.mmu->mmu_role.base;

nit: Blank line after variable declarations.

>  	if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
>  		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
>  		return;
> @@ -4196,11 +4194,6 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
>  		__clear_sp_write_flooding_count(
>  				to_shadow_page(vcpu->arch.mmu->root_hpa));
>  }
> -
> -void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
> -{
> -	__kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu));
> -}
>  EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
>  
>  static unsigned long get_cr3(struct kvm_vcpu *vcpu)
> @@ -4871,7 +4864,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
>  
>  	shadow_mmu_init_context(vcpu, context, &regs, new_role);
>  	reset_shadow_zero_bits_mask(vcpu, context, is_efer_nx(context));
> -	__kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base);
> +	kvm_mmu_new_pgd(vcpu, nested_cr3);
>  }
>  EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
>  
> @@ -4923,7 +4916,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
>  		reset_ept_shadow_zero_bits_mask(context, execonly);
>  	}
>  
> -	__kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base);
> +	kvm_mmu_new_pgd(vcpu, new_eptp);
>  }
>  EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
>  
> @@ -5009,20 +5002,6 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu)
>  }
>  EXPORT_SYMBOL_GPL(kvm_init_mmu);
>  
> -static union kvm_mmu_page_role
> -kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
> -{
> -	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
> -	union kvm_mmu_role role;
> -
> -	if (tdp_enabled)
> -		role = kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, true);
> -	else
> -		role = kvm_calc_shadow_mmu_root_page_role(vcpu, &regs, true);
> -
> -	return role.base;
> -}
> -
>  void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
>  {
>  	/*
> -- 
> 2.31.1
> 
>
Paolo Bonzini Feb. 5, 2022, 2:46 p.m. UTC | #2
On 2/4/22 20:32, David Matlack wrote:
> On Fri, Feb 04, 2022 at 06:57:02AM -0500, Paolo Bonzini wrote:
>> Since the guest PGD is now loaded after the MMU has been set up
>> completely, the desired role for a cache hit is simply the current
>> mmu_role.  There is no need to compute it again, so __kvm_mmu_new_pgd
>> can be folded in kvm_mmu_new_pgd.
>>
>> As an aside, the !tdp_enabled case in the function was dead code,
>> and that also gets mopped up as a side effect.
> Couldn't the !tdp_enabled case be called via kvm_set_cr3() ->
> kvm_mmu_new_pgd()?
> 

Right, it's not dead code.  Rather, there never was a need for 
kvm_mmu_calc_root_page_role(vcpu) as opposed to just using what is 
already in vcpu->arch.mmu.
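
Concretely, a minimal before/after sketch (lifted from the hunks above, not a
complete function):

	/* Before: the role was recomputed from vCPU register state on every
	 * PGD switch, whether TDP was enabled or not. */
	__kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu));

	/* After: kvm_init_mmu() has already stored the role, so it is reused. */
	union kvm_mmu_page_role new_role = vcpu->arch.mmu->mmu_role.base;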

Paolo
Sean Christopherson Feb. 10, 2022, 12:47 a.m. UTC | #3
On Fri, Feb 04, 2022, Paolo Bonzini wrote:
> Since the guest PGD is now loaded after the MMU has been set up
> completely, the desired role for a cache hit is simply the current
> mmu_role.  There is no need to compute it again, so __kvm_mmu_new_pgd
> can be folded in kvm_mmu_new_pgd.
> 
> As an aside, the !tdp_enabled case in the function was dead code,
> and that also gets mopped up as a side effect.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---

...

> @@ -4871,7 +4864,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
>  
>  	shadow_mmu_init_context(vcpu, context, &regs, new_role);
>  	reset_shadow_zero_bits_mask(vcpu, context, is_efer_nx(context));
> -	__kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base);
> +	kvm_mmu_new_pgd(vcpu, nested_cr3);

I'm not a fan of this change.  Conceptually it makes perfect sense, but I really,
really don't like the hidden dependency between shadow_mmu_init_context() and
kvm_mmu_new_pgd().

It's less bad once patch 01 is jettisoned, but I still don't love it.  Yes, it's
silly to call __kvm_mmu_new_pgd() before the role is set, but it is robust.  And
either way we end up with an incoherent state, e.g. with this change, the role of
the shadow page pointed at by mmu->root_hpa doesn't match the role of the mmu itself.

Furthermore, the next patch to take advantage of this change is subtly broken/wrong.
It drops kvm_mmu_calc_root_page_role() from kvm_mmu_new_pgd() under the guise that
kvm_mmu_new_pgd() is called with an up-to-date mmu_role, but that is incorrect for
both nested VM-Enter with TDP disabled in the host and nested VM-Exit (regardless of
TDP enabling).

Both nested_vmx_load_cr3() and nested_svm_load_cr3() load a new CR3 before updating
the mmu.  Why, I have no idea.  Probably a historical wart.

The nested mess is likely easily solved, I don't see any obvious issue with swapping
the order.  But I still don't love the subtlety.  I do like shaving cycles, just
not the subtlety...

If we do rework things to have kvm_mmu_new_pgd() pull the role from the mmu, then
we should first add a long overdue audit/warn that KVM never runs with a mmu_role
that isn't consistent with respect to its root SP's role.  E.g. finally get rid of
mmu_audit.c, add a proper CONFIG_KVM_AUDIT_MMU (which I'll happily turn on for all
my kernels), and start adding WARNs and other assertions.
Paolo Bonzini Feb. 10, 2022, 9:52 a.m. UTC | #4
On 2/10/22 01:47, Sean Christopherson wrote:
> The nested mess is likely easily solved, I don't see any obvious issue with swapping
> the order.  But I still don't love the subtlety.  I do like shaving cycles, just
> not the subtlety...

Not so easily, but it's doable and it's essentially what I did in the 
other series (the one that reworks the root cache).

Quick spoiler: there's a complicated dependency between the _old_ values 
in kvm_mmu and the root cache, so that the root cache code currently 
needs both the old MMU state (especially shadow_root_level/root_level) 
and the new role.

kvm_mmu_reset_context does the expensive kvm_mmu_unload to cop out of 
having to know in advance the new role; the crux of the other series is 
to remove that need, so that kvm_mmu_reset_context does not have to cop 
out anymore.

> If we do rework things to have kvm_mmu_new_pgd() pull the role from the mmu, then
> we should first add a long overdue audit/warn that KVM never runs with a mmu_role
> that isn't consistent with respect to its root SP's role.

There's a much cheaper check that can be done to enforce the invariant 
that kvm_mmu_new_pgd must follow kvm_init_mmu: kvm_init_mmu sets a 
not_ready flag, kvm_mmu_new_pgd clears it, and kvm_mmu_reload screams if 
it sees not_ready == 1.
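
Something along these lines, for example (an illustrative sketch: the flag name
comes from the paragraph above, while the WARN placement is just one way to
"scream"):

	/* In kvm_init_mmu(): the role is final, but no root matching it has
	 * been loaded yet. */
	vcpu->arch.mmu->not_ready = true;

	/* In kvm_mmu_new_pgd(): a root consistent with the new role is now
	 * installed (or will be built on the next fault). */
	vcpu->arch.mmu->not_ready = false;

	/* In kvm_mmu_reload(): complain if kvm_mmu_new_pgd() never ran after
	 * kvm_init_mmu(). */
	WARN_ON_ONCE(vcpu->arch.mmu->not_ready);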

Paolo
Sean Christopherson Feb. 10, 2022, 5:29 p.m. UTC | #5
On Thu, Feb 10, 2022, Paolo Bonzini wrote:
> On 2/10/22 01:47, Sean Christopherson wrote:
> > The nested mess is likely easily solved, I don't see any obvious issue with swapping
> > the order.  But I still don't love the subtlety.  I do like shaving cycles, just
> > not the subtlety...
> 
> Not so easily, but it's doable and it's essentially what I did in the other
> series (the one that reworks the root cache).

Sounds like I should review that series first?
Paolo Bonzini Feb. 10, 2022, 5:43 p.m. UTC | #6
On 2/10/22 18:29, Sean Christopherson wrote:
> On Thu, Feb 10, 2022, Paolo Bonzini wrote:
>> On 2/10/22 01:47, Sean Christopherson wrote:
>>> The nested mess is likely easily solved, I don't see any obvious issue with swapping
>>> the order.  But I still don't love the subtlety.  I do like shaving cycles, just
>>> not the subtlety...
>>
>> Not so easily, but it's doable and it's essentially what I did in the other
>> series (the one that reworks the root cache).
> 
> Sounds like I should reveiw that series first?

Yeah, this one is still a nice step in the direction of separating guest pt walk 
from shadow pt build(*), but with no immediate use for TDP MMU root reuse.

The original idea was to use the MMU role to decide whether to do 
kvm_mmu_unload(), but that would have still been a bandaid---inefficient 
and a bad idea overall.  Patches 6+7 of this series (once fixed, because 
they were buggy as hell) turned out to be enough to tame the PGD cache 
and remove kvm_mmu_unload() altogether from kvm_mmu_reset_context().

Paolo

(*) Your idea of detecting stale roots is quite easily implemented on 
top of these, for example, because a root is stale if and only if the 
root_role changes.
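
For illustration only, that staleness check could be as simple as a role
comparison (names here follow the current code, e.g. mmu_role.base rather than
the root_role mentioned above; this is a guess, not an actual implementation):

	/* A cached root is only reusable if it was built for the current role. */
	struct kvm_mmu_page *root = to_shadow_page(vcpu->arch.mmu->root_hpa);
	union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;

	if (root && root->role.word != role.word)
		/* The root predates the latest role change: treat it as stale. */
		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);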

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b8ab16323629..42475e4c2a48 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -190,8 +190,6 @@  struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
-static union kvm_mmu_page_role
-kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
 
 struct kvm_mmu_role_regs {
 	const unsigned long cr0;
@@ -4159,9 +4157,9 @@  static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
 	return false;
 }
 
-static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
-			      union kvm_mmu_page_role new_role)
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
 {
+	union kvm_mmu_page_role new_role = vcpu->arch.mmu->mmu_role.base;
 	if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
 		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
 		return;
@@ -4196,11 +4194,6 @@  static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
 		__clear_sp_write_flooding_count(
 				to_shadow_page(vcpu->arch.mmu->root_hpa));
 }
-
-void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
-{
-	__kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu));
-}
 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
 
 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
@@ -4871,7 +4864,7 @@  void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
 
 	shadow_mmu_init_context(vcpu, context, &regs, new_role);
 	reset_shadow_zero_bits_mask(vcpu, context, is_efer_nx(context));
-	__kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base);
+	kvm_mmu_new_pgd(vcpu, nested_cr3);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
 
@@ -4923,7 +4916,7 @@  void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 		reset_ept_shadow_zero_bits_mask(context, execonly);
 	}
 
-	__kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base);
+	kvm_mmu_new_pgd(vcpu, new_eptp);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
 
@@ -5009,20 +5002,6 @@  void kvm_init_mmu(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_init_mmu);
 
-static union kvm_mmu_page_role
-kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
-	union kvm_mmu_role role;
-
-	if (tdp_enabled)
-		role = kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, true);
-	else
-		role = kvm_calc_shadow_mmu_root_page_role(vcpu, &regs, true);
-
-	return role.base;
-}
-
 void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
 {
 	/*