
[v2,1/7] KVM: x86/mmu: Track if shadow MMU active

Message ID: 20210429211833.3361994-2-bgardon@google.com
Series: Lazily allocate memslot rmaps

Commit Message

Ben Gardon April 29, 2021, 9:18 p.m. UTC
Add a field to each VM to track if the shadow / legacy MMU is actually
in use. If the shadow MMU is not in use, then that knowledge opens the
door to other optimizations which will be added in future patches.

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu/mmu.c          | 10 +++++++++-
 arch/x86/kvm/mmu/mmu_internal.h |  2 ++
 arch/x86/kvm/mmu/tdp_mmu.c      |  6 ++++--
 arch/x86/kvm/mmu/tdp_mmu.h      |  4 ++--
 5 files changed, 19 insertions(+), 5 deletions(-)

Comments

Paolo Bonzini May 3, 2021, 1:42 p.m. UTC | #1
On 29/04/21 23:18, Ben Gardon wrote:
> +void activate_shadow_mmu(struct kvm *kvm)
> +{
> +	kvm->arch.shadow_mmu_active = true;
> +}
> +

I think there's no lock protecting both the write and the read side.
Therefore this should be an smp_store_release, and all checks in
patch 2 should be an smp_load_acquire.
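
For illustration, a minimal sketch of that pairing might look like the
following (the reader-side helper below is hypothetical and only stands in
for the checks that patch 2 adds; it is not code from the posted series):

void activate_shadow_mmu(struct kvm *kvm)
{
	/* Publish the flag only after the state it guards is initialized. */
	smp_store_release(&kvm->arch.shadow_mmu_active, true);
}

/* Hypothetical reader; pairs with the smp_store_release() above. */
static bool shadow_mmu_is_active(struct kvm *kvm)
{
	return smp_load_acquire(&kvm->arch.shadow_mmu_active);
}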

Also, the assignments to slot->arch.rmap in patch 4 (alloc_memslot_rmap)
should be an rcu_assign_pointer, while __gfn_to_rmap must be changed like so:

+	struct kvm_rmap_head *head;
...
-	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
+	head = srcu_dereference(slot->arch.rmap[level - PG_LEVEL_4K], &kvm->srcu,
+				 lockdep_is_held(&kvm->slots_arch_lock));
+       return &head[idx];

Paolo
Ben Gardon May 4, 2021, 5:26 p.m. UTC | #2
On Mon, May 3, 2021 at 6:42 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 29/04/21 23:18, Ben Gardon wrote:
> > +void activate_shadow_mmu(struct kvm *kvm)
> > +{
> > +     kvm->arch.shadow_mmu_active = true;
> > +}
> > +
>
> I think there's no lock protecting both the write and the read side.
> Therefore this should be an smp_store_release, and all checks in
> patch 2 should be an smp_load_acquire.

That makes sense.

>
> Also, the assignments to slot->arch.rmap in patch 4 (alloc_memslot_rmap)
> should be an rcu_assign_pointer, while __gfn_to_rmap must be changed like so:
>
> +       struct kvm_rmap_head *head;
> ...
> -       return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
> +       head = srcu_dereference(slot->arch.rmap[level - PG_LEVEL_4K], &kvm->srcu,
> +                                lockdep_is_held(&kvm->slots_arch_lock));
> +       return &head[idx];

I'm not sure I fully understand why this becomes necessary after patch
4. Isn't it already needed since the memslots are protected by RCU? Or
is there already a higher level rcu dereference?

__kvm_memslots already does an srcu dereference, so is there a path
where we aren't getting the slots from that function where this is
needed?

I wouldn't say that the rmaps are protected by RCU in any way that is
separate from the memslots.

>
> Paolo
>
Sean Christopherson May 4, 2021, 7:55 p.m. UTC | #3
On Thu, Apr 29, 2021, Ben Gardon wrote:
> Add a field to each VM to track if the shadow / legacy MMU is actually
> in use. If the shadow MMU is not in use, then that knowledge opens the
> door to other optimizations which will be added in future patches.
> 
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  2 ++
>  arch/x86/kvm/mmu/mmu.c          | 10 +++++++++-
>  arch/x86/kvm/mmu/mmu_internal.h |  2 ++
>  arch/x86/kvm/mmu/tdp_mmu.c      |  6 ++++--
>  arch/x86/kvm/mmu/tdp_mmu.h      |  4 ++--
>  5 files changed, 19 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ad22d4839bcc..3900dcf2439e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1122,6 +1122,8 @@ struct kvm_arch {
>  	 */
>  	spinlock_t tdp_mmu_pages_lock;
>  #endif /* CONFIG_X86_64 */
> +
> +	bool shadow_mmu_active;

I'm not a fan of the name: "shadow mmu" in KVM almost always means shadow paging
of some form, whereas this covers both shadow paging and legacy TDP support.

But, I think we can avoid bikeshedding by simply eliminating this flag.  More
in later patches.

>  };
>  
>  struct kvm_vm_stat {
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 930ac8a7e7c9..3975272321d0 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3110,6 +3110,11 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  	return ret;
>  }
>  
> +void activate_shadow_mmu(struct kvm *kvm)
> +{
> +	kvm->arch.shadow_mmu_active = true;
> +}
> +
>  static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
>  			       struct list_head *invalid_list)
>  {
> @@ -3280,6 +3285,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>  		}
>  	}
>  
> +	activate_shadow_mmu(vcpu->kvm);
> +
>  	write_lock(&vcpu->kvm->mmu_lock);
>  	r = make_mmu_pages_available(vcpu);
>  	if (r < 0)
> @@ -5467,7 +5474,8 @@ void kvm_mmu_init_vm(struct kvm *kvm)
>  {
>  	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
>  
> -	kvm_mmu_init_tdp_mmu(kvm);
> +	if (!kvm_mmu_init_tdp_mmu(kvm))
> +		activate_shadow_mmu(kvm);

Doesn't come into play yet, but I would strongly prefer to open code setting the
necessary flag instead of relying on the helper to never fail.
Paolo Bonzini May 4, 2021, 8:18 p.m. UTC | #4
On 04/05/21 19:26, Ben Gardon wrote:
> On Mon, May 3, 2021 at 6:42 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>> On 29/04/21 23:18, Ben Gardon wrote:
>>> +void activate_shadow_mmu(struct kvm *kvm)
>>> +{
>>> +     kvm->arch.shadow_mmu_active = true;
>>> +}
>>> +
>>
>> I think there's no lock protecting both the write and the read side.
>> Therefore this should be an smp_store_release, and all checks in
>> patch 2 should be an smp_load_acquire.
> 
> That makes sense.
> 
>>
>> Also, the assignments to slot->arch.rmap in patch 4 (alloc_memslot_rmap)
>> should be an rcu_assign_pointer, while __gfn_to_rmap must be changed like so:
>>
>> +       struct kvm_rmap_head *head;
>> ...
>> -       return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
>> +       head = srcu_dereference(slot->arch.rmap[level - PG_LEVEL_4K], &kvm->srcu,
>> +                                lockdep_is_held(&kvm->slots_arch_lock));
>> +       return &head[idx];
> 
> I'm not sure I fully understand why this becomes necessary after patch
> 4. Isn't it already needed since the memslots are protected by RCU? Or
> is there already a higher level rcu dereference?
> 
> __kvm_memslots already does an srcu dereference, so is there a path
> where we aren't getting the slots from that function where this is
> needed?

There are two points of view:

1) the easier one is just CONFIG_PROVE_RCU debugging: the rmaps need to 
be accessed under RCU because the memslots can disappear as soon as 
kvm->srcu is unlocked.

2) the harder one (though at this point I'm better at figuring out these 
ordering bugs than "traditional" mutex races) is what the happens-before
relation [1] looks like.  Consider what happens if the rmaps are
allocated by *another thread* after the slots have been fetched.

thread 1		thread 2		thread 3
allocate memslots
rcu_assign_pointer
			slots = srcu_dereference
						allocate rmap
						rcu_assign_pointer
			head = slot->arch.rmap[]

Here, thread 3 is allocating the rmaps in the SRCU-protected
kvm_memslots.  Those rmaps did not exist at the time thread 1 did the
rcu_assign_pointer (which synchronizes with thread 2's srcu_dereference
that retrieves slots), so they were not covered by the release
semantics of that rcu_assign_pointer or by the "consume" semantics of
the corresponding srcu_dereference.  Therefore, thread 2 needs another
srcu_dereference when retrieving them.
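
Concretely, a sketch of the two sides under that model (the function names
and the lpages/idx parameters are illustrative, not the actual patch 4 /
__gfn_to_rmap code; srcu_dereference_check() is the variant that accepts the
extra lockdep expression):

/* Thread 3 (sketch of the patch 4 allocation): publish one level's rmaps. */
static int memslot_rmap_alloc_level(struct kvm_memory_slot *slot, int level,
				    unsigned long lpages)
{
	struct kvm_rmap_head *rmap;

	rmap = kvcalloc(lpages, sizeof(*rmap), GFP_KERNEL_ACCOUNT);
	if (!rmap)
		return -ENOMEM;

	/* Release: a reader that sees the pointer also sees the zeroed array. */
	rcu_assign_pointer(slot->arch.rmap[level - PG_LEVEL_4K], rmap);
	return 0;
}

/* Thread 2 (sketch of __gfn_to_rmap): the read side orders against that store. */
static struct kvm_rmap_head *gfn_to_rmap_sketch(struct kvm *kvm,
						struct kvm_memory_slot *slot,
						int level, unsigned long idx)
{
	struct kvm_rmap_head *head;

	head = srcu_dereference_check(slot->arch.rmap[level - PG_LEVEL_4K],
				      &kvm->srcu,
				      lockdep_is_held(&kvm->slots_arch_lock));
	return &head[idx];
}

Without the srcu_dereference on the read side, nothing orders thread 2's load
of the rmap pointer against thread 3's initialization of the array it points to.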

Paolo

[1] https://lwn.net/Articles/844224/

> I wouldn't say that the rmaps are protected by RCU in any way that is
> separate from the memslots.
Paolo Bonzini May 4, 2021, 8:26 p.m. UTC | #5
On 04/05/21 21:55, Sean Christopherson wrote:
> But, I think we can avoid bikeshedding by simply eliminating this flag.  More
> in later patches.

Are you thinking of checking slot->arch.rmap[0] directly?  That should 
work indeed.
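
For example, a sketch of what such a check might look like (the helper name
is hypothetical, and it assumes all levels of a slot's rmaps are allocated
together):

static inline bool memslot_has_rmaps(const struct kvm_memory_slot *slot)
{
	/* rmap[0] is the PG_LEVEL_4K array (assumed allocated with the rest). */
	return !!slot->arch.rmap[0];
}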

>> -	kvm_mmu_init_tdp_mmu(kvm);
>> +	if (!kvm_mmu_init_tdp_mmu(kvm))
>> +		activate_shadow_mmu(kvm);
> Doesn't come into play yet, but I would strongly prefer to open code setting the
> necessary flag instead of relying on the helper to never fail.
> 

You mean

kvm->arch.shadow_mmu_active = !kvm_mmu_init_tdp_mmu(kvm);

(which would assign to alloc_memslot_rmaps instead if shadow_mmu_active 
is removed)?  That makes sense.

Paolo
Sean Christopherson May 4, 2021, 8:36 p.m. UTC | #6
On Tue, May 04, 2021, Paolo Bonzini wrote:
> On 04/05/21 21:55, Sean Christopherson wrote:
> > But, I think we can avoid bikeshedding by simply eliminating this flag.  More
> > in later patches.
> 
> Are you thinking of checking slot->arch.rmap[0] directly?  That should work
> indeed.
> 
> > > -	kvm_mmu_init_tdp_mmu(kvm);
> > > +	if (!kvm_mmu_init_tdp_mmu(kvm))
> > > +		activate_shadow_mmu(kvm);
> > Doesn't come into play yet, but I would strongly prefer to open code setting the
> > necessary flag instead of relying on the helper to never fail.
> > 
> 
> You mean
> 
> kvm->arch.shadow_mmu_active = !kvm_mmu_init_tdp_mmu(kvm);
> 
> (which would assign to alloc_memslot_rmaps instead if shadow_mmu_active is
> removed)?  That makes sense.

Ya, that or:

	if (!kvm_mmu_init_tdp_mmu(kvm))
		kvm->arch.memslots_have_rmaps = true;

I don't have a preference between the two variants.

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ad22d4839bcc..3900dcf2439e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1122,6 +1122,8 @@  struct kvm_arch {
 	 */
 	spinlock_t tdp_mmu_pages_lock;
 #endif /* CONFIG_X86_64 */
+
+	bool shadow_mmu_active;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 930ac8a7e7c9..3975272321d0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3110,6 +3110,11 @@  static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	return ret;
 }
 
+void activate_shadow_mmu(struct kvm *kvm)
+{
+	kvm->arch.shadow_mmu_active = true;
+}
+
 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 			       struct list_head *invalid_list)
 {
@@ -3280,6 +3285,8 @@  static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		}
 	}
 
+	activate_shadow_mmu(vcpu->kvm);
+
 	write_lock(&vcpu->kvm->mmu_lock);
 	r = make_mmu_pages_available(vcpu);
 	if (r < 0)
@@ -5467,7 +5474,8 @@  void kvm_mmu_init_vm(struct kvm *kvm)
 {
 	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
 
-	kvm_mmu_init_tdp_mmu(kvm);
+	if (!kvm_mmu_init_tdp_mmu(kvm))
+		activate_shadow_mmu(kvm);
 
 	node->track_write = kvm_mmu_pte_write;
 	node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index f2546d6d390c..297a911c018c 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -165,4 +165,6 @@  void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
+void activate_shadow_mmu(struct kvm *kvm);
+
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 83cbdbe5de5a..5342aca2c8e0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -14,10 +14,10 @@  static bool __read_mostly tdp_mmu_enabled = false;
 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
 
 /* Initializes the TDP MMU for the VM, if enabled. */
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
+bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 {
 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
-		return;
+		return false;
 
 	/* This should not be changed for the lifetime of the VM. */
 	kvm->arch.tdp_mmu_enabled = true;
@@ -25,6 +25,8 @@  void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
+
+	return true;
 }
 
 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 5fdf63090451..b046ab5137a1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -80,12 +80,12 @@  int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
 			 int *root_level);
 
 #ifdef CONFIG_X86_64
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+bool kvm_mmu_init_tdp_mmu(struct kvm *kvm);
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
 static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
 static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
 #else
-static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {}
+static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; }
 static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
 static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
 static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }