
[v4,7/7] KVM: x86/mmu: Lazily allocate memslot rmaps

Message ID 20210511171610.170160-8-bgardon@google.com
State New, archived
Series Lazily allocate memslot rmaps

Commit Message

Ben Gardon May 11, 2021, 5:16 p.m. UTC
If the TDP MMU is in use, wait to allocate the rmaps until the shadow
MMU is actually used (i.e. a nested VM is launched). This saves memory
equal to 0.2% of guest memory in cases where the TDP MMU is used and
there are no nested guests involved.

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu/mmu.c          | 53 +++++++++++++++++++++++----------
 arch/x86/kvm/mmu/tdp_mmu.c      |  6 ++--
 arch/x86/kvm/mmu/tdp_mmu.h      |  4 +--
 arch/x86/kvm/x86.c              | 45 +++++++++++++++++++++++++++-
 5 files changed, 89 insertions(+), 21 deletions(-)

Comments

Sean Christopherson May 11, 2021, 6:57 p.m. UTC | #1
On Tue, May 11, 2021, Ben Gardon wrote:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> @@ -10935,6 +10937,46 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
>  	return 0;
>  }
>  
> +int alloc_all_memslots_rmaps(struct kvm *kvm)
> +{
> +	struct kvm_memslots *slots;
> +	struct kvm_memory_slot *slot;
> +	int r = 0;

No need to initialize r.  And then it makes sense to put i and r on the same
line.
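
I.e. just:

	int i, r;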

> +	int i;
> +
> +	/*
> +	 * Check memslots_have_rmaps early before acquiring the
> +	 * slots_arch_lock below.
> +	 */
> +	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps))
> +		return 0;
> +
> +	mutex_lock(&kvm->slots_arch_lock);
> +
> +	/*
> +	 * Read memslots_have_rmaps again, under the slots arch lock,
> +	 * before allocating the rmaps
> +	 */
> +	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps))
> +		return 0;

This fails to drop slots_arch_lock.
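
E.g. the recheck under the lock needs something like (untested):

	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps)) {
		mutex_unlock(&kvm->slots_arch_lock);
		return 0;
	}

or a goto to a common unlock-and-return path at the end of the function.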

> +
> +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> +		slots = __kvm_memslots(kvm, i);
> +		kvm_for_each_memslot(slot, slots) {
> +			r = memslot_rmap_alloc(slot, slot->npages);
> +			if (r) {
> +				mutex_unlock(&kvm->slots_arch_lock);
> +				return r;
> +			}
> +		}
> +	}
> +
> +	/* Write rmap pointers before memslots_have_rmaps */
> +	smp_store_release(&kvm->arch.memslots_have_rmaps, true);
> +	mutex_unlock(&kvm->slots_arch_lock);
> +	return 0;
> +}
> +
>  static int kvm_alloc_memslot_metadata(struct kvm *kvm,
>  				      struct kvm_memory_slot *slot,
>  				      unsigned long npages)
> @@ -10949,7 +10991,8 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
>  	 */
>  	memset(&slot->arch, 0, sizeof(slot->arch));
>  
> -	if (kvm->arch.memslots_have_rmaps) {
> +	/* Read memslots_have_rmaps before allocating the rmaps */
> +	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps)) {
>  		r = memslot_rmap_alloc(slot, npages);
>  		if (r)
>  			return r;
> -- 
> 2.31.1.607.g51e8a6a459-goog
>
Sean Christopherson May 11, 2021, 8:03 p.m. UTC | #2
On Tue, May 11, 2021, Ben Gardon wrote:
> If the TDP MMU is in use, wait to allocate the rmaps until the shadow
> MMU is actually used (i.e. a nested VM is launched). This saves memory
> equal to 0.2% of guest memory in cases where the TDP MMU is used and
> there are no nested guests involved.
> 
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  2 ++
>  arch/x86/kvm/mmu/mmu.c          | 53 +++++++++++++++++++++++----------
>  arch/x86/kvm/mmu/tdp_mmu.c      |  6 ++--
>  arch/x86/kvm/mmu/tdp_mmu.h      |  4 +--
>  arch/x86/kvm/x86.c              | 45 +++++++++++++++++++++++++++-
>  5 files changed, 89 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index fc75ed49bfee..7b65f82ade1c 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1868,4 +1868,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
>  
>  int kvm_cpu_dirty_log_size(void);
>  
> +int alloc_all_memslots_rmaps(struct kvm *kvm);
> +
>  #endif /* _ASM_X86_KVM_HOST_H */
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index b0bdb924d519..183afccd2944 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1190,7 +1190,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
>  		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
>  				slot->base_gfn + gfn_offset, mask, true);
>  
> -	if (!kvm->arch.memslots_have_rmaps)
> +	/* Read memslots_have_rmaps before the rmaps themselves */

IIRC, you open coded reading memslots_have_rmaps because of a circular
dependency, but you can solve that simply by defining the helper in mmu.h
instead of kvm_host.h.

And I think you could even make it static in mmu.c and omit the smp_load_acquire
from the three users in x86.c, though that's probably not worth it.

Either way, reading the same comment over and over and over, just to make
checkpatch happy, gets more than a bit tedious.

That would also allow you to elaborate on why the smp_load_acquire() is
necessary, and preferably what it pairs with.
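
E.g. a rough sketch of such a helper in mmu.h (untested, name and placement
up for debate):

	static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
	{
		/*
		 * Pairs with the smp_store_release() in alloc_all_memslots_rmaps()
		 * so that the rmap arrays are visible before memslots_have_rmaps
		 * is observed as true.
		 */
		return smp_load_acquire(&kvm->arch.memslots_have_rmaps);
	}

Then the call sites collapse to kvm_memslots_have_rmaps(kvm) and the ordering
comment lives in exactly one place.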

> +	if (!smp_load_acquire(&kvm->arch.memslots_have_rmaps))
>  		return;
>  
>  	while (mask) {
Sean Christopherson May 11, 2021, 8:04 p.m. UTC | #3
On Tue, May 11, 2021, Sean Christopherson wrote:
> On Tue, May 11, 2021, Ben Gardon wrote:
> > If the TDP MMU is in use, wait to allocate the rmaps until the shadow
> > MMU is actually used (i.e. a nested VM is launched). This saves memory
> > equal to 0.2% of guest memory in cases where the TDP MMU is used and
> > there are no nested guests involved.
> > 
> > Signed-off-by: Ben Gardon <bgardon@google.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |  2 ++
> >  arch/x86/kvm/mmu/mmu.c          | 53 +++++++++++++++++++++++----------
> >  arch/x86/kvm/mmu/tdp_mmu.c      |  6 ++--
> >  arch/x86/kvm/mmu/tdp_mmu.h      |  4 +--
> >  arch/x86/kvm/x86.c              | 45 +++++++++++++++++++++++++++-
> >  5 files changed, 89 insertions(+), 21 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index fc75ed49bfee..7b65f82ade1c 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1868,4 +1868,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
> >  
> >  int kvm_cpu_dirty_log_size(void);
> >  
> > +int alloc_all_memslots_rmaps(struct kvm *kvm);
> > +
> >  #endif /* _ASM_X86_KVM_HOST_H */
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index b0bdb924d519..183afccd2944 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -1190,7 +1190,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
> >  		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
> >  				slot->base_gfn + gfn_offset, mask, true);
> >  
> > -	if (!kvm->arch.memslots_have_rmaps)
> > +	/* Read memslots_have_rmaps before the rmaps themselves */
> 
> IIRC, you open coded reading memslots_have_rmaps because of a circular
> dependency, but you can solve that simply by defining the helper in mmu.h
> instead of kvm_host.h.
> 
> And I think you could even make it static in mmu.c and omit the smp_load_acquire
> from the three users in x86.c, though that's probably not worth it.
> 
> Either way, reading the same comment over and over and over, just to make
> checkpatch happy, gets more than a bit tedious.
> 
> That would also allow you to elaborate on why the smp_load_acquire() is
> necessary, and preferably what it pairs with.

Belated thought: you could also introduce the helper in patch 06 in order to
minimize thrash in this patch.

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fc75ed49bfee..7b65f82ade1c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1868,4 +1868,6 @@  static inline int kvm_cpu_get_apicid(int mps_cpu)
 
 int kvm_cpu_dirty_log_size(void);
 
+int alloc_all_memslots_rmaps(struct kvm *kvm);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b0bdb924d519..183afccd2944 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1190,7 +1190,8 @@  static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
 				slot->base_gfn + gfn_offset, mask, true);
 
-	if (!kvm->arch.memslots_have_rmaps)
+	/* Read memslots_have_rmaps before the rmaps themselves */
+	if (!smp_load_acquire(&kvm->arch.memslots_have_rmaps))
 		return;
 
 	while (mask) {
@@ -1223,7 +1224,8 @@  static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
 				slot->base_gfn + gfn_offset, mask, false);
 
-	if (!kvm->arch.memslots_have_rmaps)
+	/* Read memslots_have_rmaps before the rmaps themselves */
+	if (!smp_load_acquire(&kvm->arch.memslots_have_rmaps))
 		return;
 
 	while (mask) {
@@ -1268,7 +1270,8 @@  bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 	int i;
 	bool write_protected = false;
 
-	if (kvm->arch.memslots_have_rmaps) {
+	/* Read memslots_have_rmaps before the rmaps themselves */
+	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps)) {
 		for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
 			rmap_head = __gfn_to_rmap(gfn, i, slot);
 			write_protected |= __rmap_write_protect(kvm, rmap_head,
@@ -1446,7 +1449,8 @@  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	bool flush = false;
 
-	if (kvm->arch.memslots_have_rmaps)
+	/* Read memslots_have_rmaps before the rmaps themselves */
+	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps))
 		flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
 
 	if (is_tdp_mmu_enabled(kvm))
@@ -1459,7 +1463,8 @@  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	bool flush = false;
 
-	if (kvm->arch.memslots_have_rmaps)
+	/* Read memslots_have_rmaps before the rmaps themselves */
+	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps))
 		flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
 
 	if (is_tdp_mmu_enabled(kvm))
@@ -1515,7 +1520,8 @@  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	bool young = false;
 
-	if (kvm->arch.memslots_have_rmaps)
+	/* Read memslots_have_rmaps before the rmaps themselves */
+	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps))
 		young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
 
 	if (is_tdp_mmu_enabled(kvm))
@@ -1528,7 +1534,8 @@  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	bool young = false;
 
-	if (kvm->arch.memslots_have_rmaps)
+	/* Read memslots_have_rmaps before the rmaps themselves */
+	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps))
 		young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
 
 	if (is_tdp_mmu_enabled(kvm))
@@ -3295,6 +3302,10 @@  static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		}
 	}
 
+	r = alloc_all_memslots_rmaps(vcpu->kvm);
+	if (r)
+		return r;
+
 	write_lock(&vcpu->kvm->mmu_lock);
 	r = make_mmu_pages_available(vcpu);
 	if (r < 0)
@@ -5455,7 +5466,8 @@  static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 	 */
 	kvm_reload_remote_mmus(kvm);
 
-	if (kvm->arch.memslots_have_rmaps)
+	/* Read memslots_have_rmaps before the rmaps themselves */
+	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps))
 		kvm_zap_obsolete_pages(kvm);
 
 	write_unlock(&kvm->mmu_lock);
@@ -5483,9 +5495,13 @@  void kvm_mmu_init_vm(struct kvm *kvm)
 {
 	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
 
-	kvm_mmu_init_tdp_mmu(kvm);
-
-	kvm->arch.memslots_have_rmaps = true;
+	if (!kvm_mmu_init_tdp_mmu(kvm))
+		/*
+		 * No smp_load/store wrappers needed here as we are in
+		 * VM init and there cannot be any memslots / other threads
+		 * accessing this struct kvm yet.
+		 */
+		kvm->arch.memslots_have_rmaps = true;
 
 	node->track_write = kvm_mmu_pte_write;
 	node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
@@ -5508,7 +5524,8 @@  void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 	int i;
 	bool flush = false;
 
-	if (kvm->arch.memslots_have_rmaps) {
+	/* Read memslots_have_rmaps before the rmaps themselves */
+	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps)) {
 		write_lock(&kvm->mmu_lock);
 		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 			slots = __kvm_memslots(kvm, i);
@@ -5559,7 +5576,8 @@  void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 {
 	bool flush = false;
 
-	if (kvm->arch.memslots_have_rmaps) {
+	/* Read memslots_have_rmaps before the rmaps themselves */
+	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps)) {
 		write_lock(&kvm->mmu_lock);
 		flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
 					  start_level, KVM_MAX_HUGEPAGE_LEVEL,
@@ -5635,7 +5653,8 @@  void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 	struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
 	bool flush;
 
-	if (kvm->arch.memslots_have_rmaps) {
+	/* Read memslots_have_rmaps before the rmaps themselves */
+	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps)) {
 		write_lock(&kvm->mmu_lock);
 		flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
 		if (flush)
@@ -5672,7 +5691,8 @@  void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 {
 	bool flush = false;
 
-	if (kvm->arch.memslots_have_rmaps) {
+	/* Read memslots_have_rmaps before the rmaps themselves */
+	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps)) {
 		write_lock(&kvm->mmu_lock);
 		flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty,
 					 false);
@@ -5705,7 +5725,8 @@  void kvm_mmu_zap_all(struct kvm *kvm)
 	if (is_tdp_mmu_enabled(kvm))
 		kvm_tdp_mmu_zap_all(kvm);
 
-	if (!kvm->arch.memslots_have_rmaps) {
+	/* Read memslots_have_rmaps before the rmaps themselves */
+	if (!smp_load_acquire(&kvm->arch.memslots_have_rmaps)) {
 		write_unlock(&kvm->mmu_lock);
 		return;
 	}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 95eeb5ac6a8a..ea00c9502ba1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -14,10 +14,10 @@  static bool __read_mostly tdp_mmu_enabled = false;
 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
 
 /* Initializes the TDP MMU for the VM, if enabled. */
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
+bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 {
 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
-		return;
+		return false;
 
 	/* This should not be changed for the lifetime of the VM. */
 	kvm->arch.tdp_mmu_enabled = true;
@@ -25,6 +25,8 @@  void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
+
+	return true;
 }
 
 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 5fdf63090451..b046ab5137a1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -80,12 +80,12 @@  int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
 			 int *root_level);
 
 #ifdef CONFIG_X86_64
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+bool kvm_mmu_init_tdp_mmu(struct kvm *kvm);
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
 static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
 static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
 #else
-static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {}
+static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; }
 static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
 static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
 static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03b6bcff2a53..fdc1b2759771 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10920,6 +10920,8 @@  static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
 		int lpages;
 		int level = i + 1;
 
+		WARN_ON(slot->arch.rmap[i]);
+
 		lpages = gfn_to_index(slot->base_gfn + npages - 1,
 				      slot->base_gfn, level) + 1;
 
@@ -10935,6 +10937,46 @@  static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
 	return 0;
 }
 
+int alloc_all_memslots_rmaps(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *slot;
+	int r = 0;
+	int i;
+
+	/*
+	 * Check memslots_have_rmaps early before acquiring the
+	 * slots_arch_lock below.
+	 */
+	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps))
+		return 0;
+
+	mutex_lock(&kvm->slots_arch_lock);
+
+	/*
+	 * Read memslots_have_rmaps again, under the slots arch lock,
+	 * before allocating the rmaps
+	 */
+	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps))
+		return 0;
+
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		slots = __kvm_memslots(kvm, i);
+		kvm_for_each_memslot(slot, slots) {
+			r = memslot_rmap_alloc(slot, slot->npages);
+			if (r) {
+				mutex_unlock(&kvm->slots_arch_lock);
+				return r;
+			}
+		}
+	}
+
+	/* Write rmap pointers before memslots_have_rmaps */
+	smp_store_release(&kvm->arch.memslots_have_rmaps, true);
+	mutex_unlock(&kvm->slots_arch_lock);
+	return 0;
+}
+
 static int kvm_alloc_memslot_metadata(struct kvm *kvm,
 				      struct kvm_memory_slot *slot,
 				      unsigned long npages)
@@ -10949,7 +10991,8 @@  static int kvm_alloc_memslot_metadata(struct kvm *kvm,
 	 */
 	memset(&slot->arch, 0, sizeof(slot->arch));
 
-	if (kvm->arch.memslots_have_rmaps) {
+	/* Read memslots_have_rmaps before allocating the rmaps */
+	if (smp_load_acquire(&kvm->arch.memslots_have_rmaps)) {
 		r = memslot_rmap_alloc(slot, npages);
 		if (r)
 			return r;