
[RFC,11/28] kvm: mmu: Optimize for freeing direct MMU PTs on teardown

Message ID 20190926231824.149014-12-bgardon@google.com (mailing list archive)
State New, archived
Series kvm: mmu: Rework the x86 TDP direct mapped case

Commit Message

Ben Gardon Sept. 26, 2019, 11:18 p.m. UTC
Waiting for a TLB flush and an RCU grace period before freeing page
table memory guarantees safety in steady-state operation; however, these
protections are not always necessary. On VM teardown, only one thread is
operating on the paging structures and no vCPUs are running. As a
result, a fast path can be added to the disconnected page table handler
which frees the memory immediately. Add the fast path and use it when
tearing down VMs.

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/mmu.c | 44 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)

Comments

Sean Christopherson Dec. 2, 2019, 11:54 p.m. UTC | #1
On Thu, Sep 26, 2019 at 04:18:07PM -0700, Ben Gardon wrote:
> Waiting for a TLB flush and an RCU grace period before freeing page
> table memory guarantees safety in steady-state operation; however,
> these protections are not always necessary. On VM teardown, only one
> thread is operating on the paging structures and no vCPUs are running.
> As a result, a fast path can be added to the disconnected page table
> handler which frees the memory immediately. Add the fast path and use
> it when tearing down VMs.
> 
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---

...

> @@ -1849,13 +1863,20 @@ static void handle_disconnected_pt(struct kvm *kvm, int as_id,
>  		 * try to map in an entry there or try to free any child page
>  		 * table the entry might have pointed to.
>  		 */
> -		mark_pte_disconnected(kvm, as_id, gfn, &pt[i], level);
> +		mark_pte_disconnected(kvm, as_id, gfn, &pt[i], level,
> +				      vm_teardown);
>  
>  		gfn += KVM_PAGES_PER_HPAGE(level);
>  	}
>  
> -	page = pfn_to_page(pfn);
> -	direct_mmu_disconnected_pt_list_add(kvm, page);
> +	if (vm_teardown) {
> +		BUG_ON(atomic_read(&kvm->online_vcpus) != 0);

BUG() isn't justified here, e.g.

	if (vm_teardown && !WARN_ON_ONCE(atomic_read(&kvm->online_vcpus)))

> +		cond_resched();
> +		free_page((unsigned long)pt);
> +	} else {
> +		page = pfn_to_page(pfn);
> +		direct_mmu_disconnected_pt_list_add(kvm, page);
> +	}
>  }
>  
>  /**
> @@ -1866,6 +1887,8 @@ static void handle_disconnected_pt(struct kvm *kvm, int as_id,
>   * @old_pte: The value of the PTE before the atomic compare / exchange
>   * @new_pte: The value of the PTE after the atomic compare / exchange
>   * @level: the level of the PT the PTE is part of in the paging structure
> + * @vm_teardown: all vCPUs are paused and the VM is being torn down. Yield and
> + *	free child page table memory immediately.
>   *
>   * Handle bookkeeping that might result from the modification of a PTE.
>   * This function should be called in the same RCU read critical section as the
> @@ -1874,7 +1897,8 @@ static void handle_disconnected_pt(struct kvm *kvm, int as_id,
>   * setting the dirty bit on a pte.
>   */
>  static void handle_changed_pte(struct kvm *kvm, int as_id, gfn_t gfn,
> -			       u64 old_pte, u64 new_pte, int level)
> +			       u64 old_pte, u64 new_pte, int level,
> +			       bool vm_teardown)
>  {
>  	bool was_present = is_present_direct_pte(old_pte);
>  	bool is_present = is_present_direct_pte(new_pte);
> @@ -1920,7 +1944,7 @@ static void handle_changed_pte(struct kvm *kvm, int as_id, gfn_t gfn,
>  		 * pointed to must be freed.
>  		 */
>  		handle_disconnected_pt(kvm, as_id, gfn, spte_to_pfn(old_pte),
> -				       child_level);
> +				       child_level, vm_teardown);
>  	}
>  }
>  
> @@ -5932,7 +5956,7 @@ static void kvm_mmu_uninit_direct_mmu(struct kvm *kvm)
>  	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
>  		handle_disconnected_pt(kvm, i, 0,
>  			(kvm_pfn_t)(kvm->arch.direct_root_hpa[i] >> PAGE_SHIFT),
> -			PT64_ROOT_4LEVEL);
> +			PT64_ROOT_4LEVEL, true);
>  }
>  
>  /* The return value indicates if tlb flush on all vcpus is needed. */
> -- 
> 2.23.0.444.g18eeb5a265-goog
>
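Applied to the hunk above, the suggested check lets the teardown branch
degrade gracefully instead of killing the host when a vCPU is
unexpectedly still online. A minimal sketch of the reworked branch
(hypothetical; not part of the posted patch):

	if (vm_teardown && !WARN_ON_ONCE(atomic_read(&kvm->online_vcpus))) {
		/*
		 * No vCPUs can touch the paging structure, so free the
		 * page immediately. Yield first: teardown walks the
		 * entire structure and could otherwise trigger soft
		 * lockups.
		 */
		cond_resched();
		free_page((unsigned long)pt);
	} else {
		/*
		 * Steady state, or a vCPU was unexpectedly still online:
		 * fall back to deferring the free until after a TLB
		 * flush and an RCU grace period.
		 */
		page = pfn_to_page(pfn);
		direct_mmu_disconnected_pt_list_add(kvm, page);
	}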

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 317e9238f17b2..263718d49f730 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1795,7 +1795,8 @@ static void direct_mmu_disconnected_pt_list_add(struct kvm *kvm,
 
 
 static void handle_changed_pte(struct kvm *kvm, int as_id, gfn_t gfn,
-			       u64 old_pte, u64 new_pte, int level);
+			       u64 old_pte, u64 new_pte, int level,
+			       bool vm_teardown);
 
 /**
  * mark_pte_disconnected - Mark a PTE as part of a disconnected PT
@@ -1805,16 +1806,19 @@ static void handle_changed_pte(struct kvm *kvm, int as_id, gfn_t gfn,
  * @ptep: a pointer to the PTE to be marked disconnected
  * @level: the level of the PT this PTE was a part of, when it was part of the
  *	paging structure
+ * @vm_teardown: all vCPUs are paused and the VM is being torn down. Yield and
+ *	free child page table memory immediately.
  */
 static void mark_pte_disconnected(struct kvm *kvm, int as_id, gfn_t gfn,
-				  u64 *ptep, int level)
+				  u64 *ptep, int level, bool vm_teardown)
 {
 	u64 old_pte;
 
 	old_pte = xchg(ptep, DISCONNECTED_PTE);
 	BUG_ON(old_pte == DISCONNECTED_PTE);
 
-	handle_changed_pte(kvm, as_id, gfn, old_pte, DISCONNECTED_PTE, level);
+	handle_changed_pte(kvm, as_id, gfn, old_pte, DISCONNECTED_PTE, level,
+			   vm_teardown);
 }
 
 /**
@@ -1825,6 +1829,8 @@ static void mark_pte_disconnected(struct kvm *kvm, int as_id, gfn_t gfn,
  * @pt_base_gfn: the base GFN that was mapped by the first PTE in the PT
  * @pfn: The physical frame number of the disconnected PT page
  * @level: the level of the PT, when it was part of the paging structure
+ * @vm_teardown: all vCPUs are paused and the VM is being torn down. Yield and
+ *	free child page table memory immediately.
  *
  * Given a pointer to a page table that has been removed from the paging
  * structure and its level, recursively free child page tables and mark their
@@ -1834,9 +1840,17 @@ static void mark_pte_disconnected(struct kvm *kvm, int as_id, gfn_t gfn,
  * page table or its children because it has been atomically removed from the
  * root of the paging structure, so no other thread will be trying to free the
  * memory.
+ *
+ * If vm_teardown=true, this function will yield while handling the
+ * disconnected page tables and will free memory immediately. This option
+ * should only be used during VM teardown when no other CPUs are accessing the
+ * direct paging structures. Yielding is necessary because the paging structure
+ * could be quite large, and freeing it without yielding would induce
+ * soft-lockups or scheduler warnings.
  */
 static void handle_disconnected_pt(struct kvm *kvm, int as_id,
-				   gfn_t pt_base_gfn, kvm_pfn_t pfn, int level)
+				   gfn_t pt_base_gfn, kvm_pfn_t pfn, int level,
+				   bool vm_teardown)
 {
 	int i;
 	gfn_t gfn = pt_base_gfn;
@@ -1849,13 +1863,20 @@ static void handle_disconnected_pt(struct kvm *kvm, int as_id,
 		 * try to map in an entry there or try to free any child page
 		 * table the entry might have pointed to.
 		 */
-		mark_pte_disconnected(kvm, as_id, gfn, &pt[i], level);
+		mark_pte_disconnected(kvm, as_id, gfn, &pt[i], level,
+				      vm_teardown);
 
 		gfn += KVM_PAGES_PER_HPAGE(level);
 	}
 
-	page = pfn_to_page(pfn);
-	direct_mmu_disconnected_pt_list_add(kvm, page);
+	if (vm_teardown) {
+		BUG_ON(atomic_read(&kvm->online_vcpus) != 0);
+		cond_resched();
+		free_page((unsigned long)pt);
+	} else {
+		page = pfn_to_page(pfn);
+		direct_mmu_disconnected_pt_list_add(kvm, page);
+	}
 }
 
 /**
@@ -1866,6 +1887,8 @@ static void handle_disconnected_pt(struct kvm *kvm, int as_id,
  * @old_pte: The value of the PTE before the atomic compare / exchange
  * @new_pte: The value of the PTE after the atomic compare / exchange
  * @level: the level of the PT the PTE is part of in the paging structure
+ * @vm_teardown: all vCPUs are paused and the VM is being torn down. Yield and
+ *	free child page table memory immediately.
  *
  * Handle bookkeeping that might result from the modification of a PTE.
  * This function should be called in the same RCU read critical section as the
@@ -1874,7 +1897,8 @@ static void handle_disconnected_pt(struct kvm *kvm, int as_id,
  * setting the dirty bit on a pte.
  */
 static void handle_changed_pte(struct kvm *kvm, int as_id, gfn_t gfn,
-			       u64 old_pte, u64 new_pte, int level)
+			       u64 old_pte, u64 new_pte, int level,
+			       bool vm_teardown)
 {
 	bool was_present = is_present_direct_pte(old_pte);
 	bool is_present = is_present_direct_pte(new_pte);
@@ -1920,7 +1944,7 @@ static void handle_changed_pte(struct kvm *kvm, int as_id, gfn_t gfn,
 		 * pointed to must be freed.
 		 */
 		handle_disconnected_pt(kvm, as_id, gfn, spte_to_pfn(old_pte),
-				       child_level);
+				       child_level, vm_teardown);
 	}
 }
 
@@ -5932,7 +5956,7 @@ static void kvm_mmu_uninit_direct_mmu(struct kvm *kvm)
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
 		handle_disconnected_pt(kvm, i, 0,
 			(kvm_pfn_t)(kvm->arch.direct_root_hpa[i] >> PAGE_SHIFT),
-			PT64_ROOT_4LEVEL);
+			PT64_ROOT_4LEVEL, true);
 }
 
 /* The return value indicates if tlb flush on all vcpus is needed. */
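The yielding requirement called out in handle_disconnected_pt()'s
comment is the classic long-running-free pattern: a fully populated
4-level paging structure can run to hundreds of thousands of table pages
for a large guest, and freeing them in a tight loop would starve the
scheduler and trip the soft-lockup detector. A freestanding sketch of
the pattern (illustrative only; free_pt_pages() is a hypothetical
helper, not from the patch):

	static void free_pt_pages(unsigned long *pages, unsigned long nr_pages)
	{
		unsigned long i;

		for (i = 0; i < nr_pages; i++) {
			/* Give the scheduler a chance to run between frees. */
			cond_resched();
			free_page(pages[i]);
		}
	}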