
[11/24] kvm: x86/mmu: Put TDP MMU PT walks in RCU read-critical section

Message ID: 20210112181041.356734-12-bgardon@google.com
State: New, archived
Series: Allow parallel page faults with TDP MMU

Commit Message

Ben Gardon Jan. 12, 2021, 6:10 p.m. UTC
In order to enable concurrent modifications to the paging structures in
the TDP MMU, threads must be able to safely remove pages of page table
memory while other threads are traversing the same memory. To ensure
threads do not access PT memory after it is freed, protect PT memory
with RCU.
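
The resulting pattern is roughly the following (illustrative sketch, not the
literal diff; the freeing side, via call_rcu(), is added later in this series):

        /*
         * Reader side: walk the paging structures inside an RCU
         * read-side critical section.
         */
        rcu_read_lock();
        tdp_root_for_each_pte(iter, root, start, end) {
                /* read/modify SPTEs through the iterator */
        }
        rcu_read_unlock();

        /*
         * Removal side (later in the series): free a page of page table
         * memory only after a grace period, once no walker can still
         * reference it.
         */
        call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);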

Reviewed-by: Peter Feiner <pfeiner@google.com>

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 53 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)

Comments

Sean Christopherson Jan. 20, 2021, 10:19 p.m. UTC | #1
On Tue, Jan 12, 2021, Ben Gardon wrote:                                         
> In order to enable concurrent modifications to the paging structures in       
> the TDP MMU, threads must be able to safely remove pages of page table        
> memory while other threads are traversing the same memory. To ensure          
> threads do not access PT memory after it is freed, protect PT memory          
> with RCU.                                                                     
                                                                                
Normally I like splitting up patches, but the three RCU patches (11-13) probably
need to be combined into a single patch.  I assume you introduced the RCU
readers in a separate patch to isolate deadlocks, but it's impossible to give
this patch a proper review without peeking ahead to see what's actually
being protected with RCU.
                                                                                
The combined changelog should also explain why READING_SHADOW_PAGE_TABLES isn't 
a good solution.  I suspect the answer is because the longer-running walks would
disable IRQs for too long, but that should be explicitly documented.

> Reviewed-by: Peter Feiner <pfeiner@google.com>                                
>                                                                               
> Signed-off-by: Ben Gardon <bgardon@google.com>                                
> ---                                                                           
>  arch/x86/kvm/mmu/tdp_mmu.c | 53 ++++++++++++++++++++++++++++++++++++--       
>  1 file changed, 51 insertions(+), 2 deletions(-)                             
>                                                                               
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c          
> index e8f35cd46b4c..662907d374b3 100644                                       
> --- a/arch/x86/kvm/mmu/tdp_mmu.c                                              
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c                                              
> @@ -458,11 +458,14 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
>   * Return true if this function yielded, the TLBs were flushed, and the      
>   * iterator's traversal was reset. Return false if a yield was not needed.   
>   */                                                                          
> -static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
> +static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm,                 
> +             struct tdp_iter *iter)                                          
                                                                                
Unrelated newline.                                                              
                                                                                
>  {                                                                            
>       if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {                 
>               kvm_flush_remote_tlbs(kvm);                                     
> +             rcu_read_unlock();                                              
                                                                                
I'm 99% certain rcu_read_unlock() can be moved before the TLB flush.  IIUC, RCU 
is protecting only the host kernel's software walks; the only true "writer" is  
immediately preceded by a remote TLB flush (in patch 13).                       
                                                                                
        kvm_flush_remote_tlbs_with_address(kvm, gfn,                            
                                           KVM_PAGES_PER_HPAGE(level));         
                                                                                
        call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);                  
                                                                                
That also resolves an inconsistency with zap_gfn_range(), which unlocks before
doing the remote flush.  Ditto for zap_collapsible_spte_range(), and I think a
few other flows.
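
I.e., something along these lines in tdp_mmu_iter_flush_cond_resched()
(untested, just to show the suggested ordering):

        if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
                rcu_read_unlock();
                kvm_flush_remote_tlbs(kvm);
                cond_resched_lock(&kvm->mmu_lock);
                rcu_read_lock();
                tdp_iter_refresh_walk(iter);
                return true;
        }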

>  		cond_resched_lock(&kvm->mmu_lock);
> +		rcu_read_lock();
>  		tdp_iter_refresh_walk(iter);
>  		return true;
>  	} else
> @@ -483,7 +486,9 @@ static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *it
>  static bool tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
>  {
>  	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
> +		rcu_read_unlock();
>  		cond_resched_lock(&kvm->mmu_lock);
> +		rcu_read_lock();
>  		tdp_iter_refresh_walk(iter);
>  		return true;
>  	} else
> @@ -508,6 +513,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
>  	gfn_t last_goal_gfn = start;
>  	bool flush_needed = false;
>  
> +	rcu_read_lock();
> +
>  	tdp_root_for_each_pte(iter, root, start, end) {
>  		/* Ensure forward progress has been made before yielding. */
>  		if (can_yield && iter.goal_gfn != last_goal_gfn &&
> @@ -538,6 +545,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
>  		tdp_mmu_set_spte(kvm, &iter, 0);
>  		flush_needed = true;
>  	}
> +
> +	rcu_read_unlock();

Unlock before TLB flush.  <-------

>  	return flush_needed;
>  }

...

> @@ -844,6 +863,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	u64 new_spte;
>  	int need_flush = 0;
>  
> +	rcu_read_lock();
> +
>  	WARN_ON(pte_huge(*ptep));
>  
>  	new_pfn = pte_pfn(*ptep);
> @@ -872,6 +893,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	if (need_flush)
>  		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
>  
> +	rcu_read_unlock();

Unlock before flush?
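
I.e., roughly (same reordering as above, untested):

        rcu_read_unlock();

        if (need_flush)
                kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

        return 0;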

> +
>  	return 0;
>  }
>  
  
...

> @@ -1277,10 +1322,14 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
>  
>  	*root_level = vcpu->arch.mmu->shadow_root_level;
>  
> +	rcu_read_lock();

Hrm, isn't this an existing bug?  And also not really the correct fix?  mmu_lock
is not held here, so the existing code has no protections.  Using
walk_shadow_page_lockless_begin/end() feels more appropriate for this particular
walk.
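
E.g., assuming the begin/end helpers (currently local to mmu.c) were made
available to the TDP MMU, something like this untested sketch:

        walk_shadow_page_lockless_begin(vcpu);

        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                leaf = iter.level;
                sptes[leaf] = iter.old_spte;
        }

        walk_shadow_page_lockless_end(vcpu);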

> +
>  	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
>  		leaf = iter.level;
>  		sptes[leaf] = iter.old_spte;
>  	}
>  
> +	rcu_read_unlock();
> +
>  	return leaf;
>  }
> -- 
> 2.30.0.284.gd98b1dd5eaa7-goog
>

Patch

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index e8f35cd46b4c..662907d374b3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -458,11 +458,14 @@  static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
  * Return true if this function yielded, the TLBs were flushed, and the
  * iterator's traversal was reset. Return false if a yield was not needed.
  */
-static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
+static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm,
+		struct tdp_iter *iter)
 {
 	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
 		kvm_flush_remote_tlbs(kvm);
+		rcu_read_unlock();
 		cond_resched_lock(&kvm->mmu_lock);
+		rcu_read_lock();
 		tdp_iter_refresh_walk(iter);
 		return true;
 	} else
@@ -483,7 +486,9 @@  static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *it
 static bool tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
 {
 	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+		rcu_read_unlock();
 		cond_resched_lock(&kvm->mmu_lock);
+		rcu_read_lock();
 		tdp_iter_refresh_walk(iter);
 		return true;
 	} else
@@ -508,6 +513,8 @@  static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 	gfn_t last_goal_gfn = start;
 	bool flush_needed = false;
 
+	rcu_read_lock();
+
 	tdp_root_for_each_pte(iter, root, start, end) {
 		/* Ensure forward progress has been made before yielding. */
 		if (can_yield && iter.goal_gfn != last_goal_gfn &&
@@ -538,6 +545,8 @@  static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 		tdp_mmu_set_spte(kvm, &iter, 0);
 		flush_needed = true;
 	}
+
+	rcu_read_unlock();
 	return flush_needed;
 }
 
@@ -650,6 +659,9 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 					huge_page_disallowed, &req_level);
 
 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
+
+	rcu_read_lock();
+
 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
 		if (nx_huge_page_workaround_enabled)
 			disallowed_hugepage_adjust(iter.old_spte, gfn,
@@ -693,11 +705,14 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 		}
 	}
 
-	if (WARN_ON(iter.level != level))
+	if (WARN_ON(iter.level != level)) {
+		rcu_read_unlock();
 		return RET_PF_RETRY;
+	}
 
 	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
 					      pfn, prefault);
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -768,6 +783,8 @@  static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
 	int young = 0;
 	u64 new_spte = 0;
 
+	rcu_read_lock();
+
 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
 		/*
 		 * If we have a non-accessed entry we don't need to change the
@@ -799,6 +816,8 @@  static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
 		trace_kvm_age_page(iter.gfn, iter.level, slot, young);
 	}
 
+	rcu_read_unlock();
+
 	return young;
 }
 
@@ -844,6 +863,8 @@  static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
 	u64 new_spte;
 	int need_flush = 0;
 
+	rcu_read_lock();
+
 	WARN_ON(pte_huge(*ptep));
 
 	new_pfn = pte_pfn(*ptep);
@@ -872,6 +893,8 @@  static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
 	if (need_flush)
 		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
 
+	rcu_read_unlock();
+
 	return 0;
 }
 
@@ -896,6 +919,8 @@  static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 	gfn_t last_goal_gfn = start;
 	bool spte_set = false;
 
+	rcu_read_lock();
+
 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
 
 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
@@ -924,6 +949,8 @@  static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
 		spte_set = true;
 	}
+
+	rcu_read_unlock();
 	return spte_set;
 }
 
@@ -966,6 +993,8 @@  static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 	gfn_t last_goal_gfn = start;
 	bool spte_set = false;
 
+	rcu_read_lock();
+
 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
 		/* Ensure forward progress has been made before yielding. */
 		if (iter.goal_gfn != last_goal_gfn &&
@@ -994,6 +1023,8 @@  static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
 		spte_set = true;
 	}
+
+	rcu_read_unlock();
 	return spte_set;
 }
 
@@ -1035,6 +1066,8 @@  static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 	struct tdp_iter iter;
 	u64 new_spte;
 
+	rcu_read_lock();
+
 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
 				    gfn + BITS_PER_LONG) {
 		if (!mask)
@@ -1060,6 +1093,8 @@  static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 
 		mask &= ~(1UL << (iter.gfn - gfn));
 	}
+
+	rcu_read_unlock();
 }
 
 /*
@@ -1100,6 +1135,8 @@  static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 	gfn_t last_goal_gfn = start;
 	bool spte_set = false;
 
+	rcu_read_lock();
+
 	tdp_root_for_each_pte(iter, root, start, end) {
 		/* Ensure forward progress has been made before yielding. */
 		if (iter.goal_gfn != last_goal_gfn &&
@@ -1125,6 +1162,7 @@  static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 		spte_set = true;
 	}
 
+	rcu_read_unlock();
 	return spte_set;
 }
 
@@ -1163,6 +1201,8 @@  static void zap_collapsible_spte_range(struct kvm *kvm,
 	gfn_t last_goal_gfn = start;
 	bool spte_set = false;
 
+	rcu_read_lock();
+
 	tdp_root_for_each_pte(iter, root, start, end) {
 		/* Ensure forward progress has been made before yielding. */
 		if (iter.goal_gfn != last_goal_gfn &&
@@ -1190,6 +1230,7 @@  static void zap_collapsible_spte_range(struct kvm *kvm,
 		spte_set = true;
 	}
 
+	rcu_read_unlock();
 	if (spte_set)
 		kvm_flush_remote_tlbs(kvm);
 }
@@ -1226,6 +1267,8 @@  static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
 	u64 new_spte;
 	bool spte_set = false;
 
+	rcu_read_lock();
+
 	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
 		if (!is_writable_pte(iter.old_spte))
 			break;
@@ -1237,6 +1280,8 @@  static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
 		spte_set = true;
 	}
 
+	rcu_read_unlock();
+
 	return spte_set;
 }
 
@@ -1277,10 +1322,14 @@  int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
 
 	*root_level = vcpu->arch.mmu->shadow_root_level;
 
+	rcu_read_lock();
+
 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
 		leaf = iter.level;
 		sptes[leaf] = iter.old_spte;
 	}
 
+	rcu_read_unlock();
+
 	return leaf;
 }