[v2,4/6] KVM: x86/mmu: Track the number of TDP MMU pages, but not the actual pages

Message ID 20220723012325.1715714-5-seanjc@google.com (mailing list archive)
State New, archived
Series KVM: x86: Apply NX mitigation more precisely

Commit Message

Sean Christopherson July 23, 2022, 1:23 a.m. UTC
Track the number of TDP MMU "shadow" pages instead of tracking the pages
themselves. With the NX huge page list manipulation moved out of the common
linking flow, eliminating the list-based tracking means the happy path of
adding a shadow page doesn't need to acquire a spinlock and can instead
inc/dec an atomic.

Keep the tracking, as the WARN during TDP MMU teardown on leaked shadow
pages is very, very useful for detecting KVM bugs.

Tracking the number of pages will also make it trivial to expose the
counter to userspace as a stat in the future, which may or may not be
desirable.

Note, the TDP MMU needs to use a separate counter (and stat if that ever
comes to be) from the existing n_used_mmu_pages. The TDP MMU doesn't bother
supporting the shrinker nor does it honor KVM_SET_NR_MMU_PAGES (because the
TDP MMU consumes so few pages relative to shadow paging), and including TDP
MMU pages in that counter would break both the shrinker and shadow MMUs,
e.g. if a VM is using nested TDP.

Cc: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Mingwei Zhang <mizhang@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_host.h | 11 +++--------
 arch/x86/kvm/mmu/tdp_mmu.c      | 19 +++++++++----------
 2 files changed, 12 insertions(+), 18 deletions(-)

Comments

David Matlack July 25, 2022, 11:21 p.m. UTC | #1
On Sat, Jul 23, 2022 at 01:23:23AM +0000, Sean Christopherson wrote:
> Track the number of TDP MMU "shadow" pages instead of tracking the pages
> themselves. With the NX huge page list manipulation moved out of the common
> linking flow, eliminating the list-based tracking means the happy path of
> adding a shadow page doesn't need to acquire a spinlock and can instead
> inc/dec an atomic.
> 
> Keep the tracking, as the WARN during TDP MMU teardown on leaked shadow
> pages is very, very useful for detecting KVM bugs.
> 
> Tracking the number of pages will also make it trivial to expose the
> counter to userspace as a stat in the future, which may or may not be
> desirable.
> 
> Note, the TDP MMU needs to use a separate counter (and stat if that ever
> comes to be) from the existing n_used_mmu_pages. The TDP MMU doesn't bother
> supporting the shrinker nor does it honor KVM_SET_NR_MMU_PAGES (because the
> TDP MMU consumes so few pages relative to shadow paging), and including TDP
> MMU pages in that counter would break both the shrinker and shadow MMUs,
> e.g. if a VM is using nested TDP.
> 
> Cc: Yosry Ahmed <yosryahmed@google.com>
> Reviewed-by: Mingwei Zhang <mizhang@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>

Reviewed-by: David Matlack <dmatlack@google.com>

> ---
>  arch/x86/include/asm/kvm_host.h | 11 +++--------
>  arch/x86/kvm/mmu/tdp_mmu.c      | 19 +++++++++----------
>  2 files changed, 12 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 246b69262b93..5c269b2556d6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1271,6 +1271,9 @@ struct kvm_arch {
>  	 */
>  	bool tdp_mmu_enabled;
>  
> +	/* The number of TDP MMU pages across all roots. */
> +	atomic64_t tdp_mmu_pages;

This is the number of non-root TDP MMU pages, right?

> +
>  	/*
>  	 * List of struct kvm_mmu_pages being used as roots.
>  	 * All struct kvm_mmu_pages in the list should have
> @@ -1291,18 +1294,10 @@ struct kvm_arch {
>  	 */
>  	struct list_head tdp_mmu_roots;
>  
> -	/*
> -	 * List of struct kvmp_mmu_pages not being used as roots.
> -	 * All struct kvm_mmu_pages in the list should have
> -	 * tdp_mmu_page set and a tdp_mmu_root_count of 0.
> -	 */
> -	struct list_head tdp_mmu_pages;
> -
>  	/*
>  	 * Protects accesses to the following fields when the MMU lock
>  	 * is held in read mode:
>  	 *  - tdp_mmu_roots (above)
> -	 *  - tdp_mmu_pages (above)
>  	 *  - the link field of struct kvm_mmu_pages used by the TDP MMU
>  	 *  - possible_nx_huge_pages;
>  	 *  - the possible_nx_huge_page_link field of struct kvm_mmu_pages used
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 626c40ec2af9..fea22dc481a0 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -29,7 +29,6 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
>  	kvm->arch.tdp_mmu_enabled = true;
>  	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
>  	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
> -	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
>  	kvm->arch.tdp_mmu_zap_wq = wq;
>  	return 1;
>  }
> @@ -54,7 +53,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
>  	/* Also waits for any queued work items.  */
>  	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
>  
> -	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
> +	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
>  	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
>  
>  	/*
> @@ -386,16 +385,18 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
>  static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
>  			      bool shared)
>  {
> +	atomic64_dec(&kvm->arch.tdp_mmu_pages);
> +
> +	if (!sp->nx_huge_page_disallowed)
> +		return;
> +
>  	if (shared)
>  		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
>  	else
>  		lockdep_assert_held_write(&kvm->mmu_lock);
>  
> -	list_del(&sp->link);
> -	if (sp->nx_huge_page_disallowed) {
> -		sp->nx_huge_page_disallowed = false;
> -		untrack_possible_nx_huge_page(kvm, sp);
> -	}
> +	sp->nx_huge_page_disallowed = false;
> +	untrack_possible_nx_huge_page(kvm, sp);
>  
>  	if (shared)
>  		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> @@ -1132,9 +1133,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
>  		tdp_mmu_set_spte(kvm, iter, spte);
>  	}
>  
> -	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> -	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
> -	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> +	atomic64_inc(&kvm->arch.tdp_mmu_pages);
>  
>  	return 0;
>  }
> -- 
> 2.37.1.359.gd136c6c3e2-goog
>
Sean Christopherson July 25, 2022, 11:27 p.m. UTC | #2
On Mon, Jul 25, 2022, David Matlack wrote:
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 246b69262b93..5c269b2556d6 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1271,6 +1271,9 @@ struct kvm_arch {
> >  	 */
> >  	bool tdp_mmu_enabled;
> >  
> > +	/* The number of TDP MMU pages across all roots. */
> > +	atomic64_t tdp_mmu_pages;
> 
> This is the number of non-root TDP MMU pages, right?

Yes.
Yan Zhao July 27, 2022, 2:41 a.m. UTC | #3
On Sat, Jul 23, 2022 at 01:23:23AM +0000, Sean Christopherson wrote:

<snip>

> @@ -386,16 +385,18 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
>  static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
>  			      bool shared)
>  {
> +	atomic64_dec(&kvm->arch.tdp_mmu_pages);
> +
> +	if (!sp->nx_huge_page_disallowed)
> +		return;
> +
Does this read of sp->nx_huge_page_disallowed also need to be protected by
tdp_mmu_pages_lock in the shared path?

Thanks
Yan

>  	if (shared)
>  		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
>  	else
>  		lockdep_assert_held_write(&kvm->mmu_lock);
>  
> -	list_del(&sp->link);
> -	if (sp->nx_huge_page_disallowed) {
> -		sp->nx_huge_page_disallowed = false;
> -		untrack_possible_nx_huge_page(kvm, sp);
> -	}
> +	sp->nx_huge_page_disallowed = false;
> +	untrack_possible_nx_huge_page(kvm, sp);
>  
>  	if (shared)
>  		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> @@ -1132,9 +1133,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
>  		tdp_mmu_set_spte(kvm, iter, spte);
>  	}
>  
> -	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> -	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
> -	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> +	atomic64_inc(&kvm->arch.tdp_mmu_pages);
>  
>  	return 0;
>  }
> -- 
> 2.37.1.359.gd136c6c3e2-goog
>
Sean Christopherson July 27, 2022, 7:04 p.m. UTC | #4
On Wed, Jul 27, 2022, Yan Zhao wrote:
> On Sat, Jul 23, 2022 at 01:23:23AM +0000, Sean Christopherson wrote:
> 
> <snip>
> 
> > @@ -386,16 +385,18 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
> >  static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
> >  			      bool shared)
> >  {
> > +	atomic64_dec(&kvm->arch.tdp_mmu_pages);
> > +
> > +	if (!sp->nx_huge_page_disallowed)
> > +		return;
> > +
> Does this read of sp->nx_huge_page_disallowed also need to be protected by
> tdp_mmu_pages_lock in the shared path?


No, because only one CPU can call tdp_mmu_unlink_sp() for a shadow page.  E.g. in
a shared walk, the SPTE is zapped atomically and only the CPU that "wins" gets to
unlink the sp.  The extra lock is needed to prevent list corruption, but the
sp itself is thread safe.
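
For illustration only (hypothetical helper name, not KVM's actual zapping code), the single-winner guarantee boils down to zapping the SPTE with a compare-and-exchange, so that exactly one racing CPU observes the old shadow-present value:

	/*
	 * Sketch: zap a SPTE exclusively.  Only the CPU whose cmpxchg
	 * succeeds "wins" and goes on to call tdp_mmu_unlink_sp(); the
	 * losers see a changed SPTE and back off.
	 */
	static bool zap_spte_exclusive(u64 *sptep, u64 old_spte)
	{
		return cmpxchg64(sptep, old_spte, 0ull) == old_spte;
	}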

FWIW, even if that guarantee didn't hold, checking the flag outside of tdp_mmu_pages_lock
is safe because false positives are ok.  untrack_possible_nx_huge_page() checks that
the shadow page is actually on the list, i.e. it's a nop if a different task unlinks
the page first.
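
As a rough sketch (based on the untrack helper added earlier in this series; details may differ), the nop behavior comes from the list_empty() check, which makes a redundant or racing call harmless:

	void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
	{
		/* Already unlinked by another path/task: nothing to do. */
		if (list_empty(&sp->possible_nx_huge_page_link))
			return;

		--kvm->stat.nx_lpage_splits;
		list_del_init(&sp->possible_nx_huge_page_link);
	}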

False negatives need to be avoided, but nx_huge_page_disallowed is cleared only
when untrack_possible_nx_huge_page() is guaranteed to be called, i.e. genuine
false negatives can't occur.

Hmm, but I think there's a missing smp_rmb(), which is needed to ensure
nx_huge_page_disallowed is read after observing the shadow-present SPTE (that's
being unlinked).  I'll add that in the next version.
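
A minimal sketch of one possible placement for that barrier (illustrative only; the actual follow-up may differ):

	static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
				      bool shared)
	{
		atomic64_dec(&kvm->arch.tdp_mmu_pages);

		/*
		 * Sketch: ensure nx_huge_page_disallowed is read only after
		 * the shadow-present SPTE that pointed at this sp was
		 * observed (and zapped), pairing with the ordering on the
		 * link side.
		 */
		smp_rmb();

		if (!sp->nx_huge_page_disallowed)
			return;

		/* ... remainder as in the patch above ... */
	}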
Yan Zhao July 29, 2022, 1:02 a.m. UTC | #5
On Wed, Jul 27, 2022 at 07:04:35PM +0000, Sean Christopherson wrote:
> On Wed, Jul 27, 2022, Yan Zhao wrote:
> > On Sat, Jul 23, 2022 at 01:23:23AM +0000, Sean Christopherson wrote:
> > 
> > <snip>
> > 
> > > @@ -386,16 +385,18 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
> > >  static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
> > >  			      bool shared)
> > >  {
> > > +	atomic64_dec(&kvm->arch.tdp_mmu_pages);
> > > +
> > > +	if (!sp->nx_huge_page_disallowed)
> > > +		return;
> > > +
> > Does this read of sp->nx_huge_page_disallowed also need to be protected by
> > tdp_mmu_pages_lock in the shared path?
> 
> 
> No, because only one CPU can call tdp_mmu_unlink_sp() for a shadow page.  E.g. in
> a shared walk, the SPTE is zapped atomically and only the CPU that "wins" gets to
> unlink the s[.  The extra lock is needed to prevent list corruption, but the
> sp itself is thread safe.
> 
> FWIW, even if that guarantee didn't hold, checking the flag outside of tdp_mmu_pages_lock
> is safe because false positives are ok.  untrack_possible_nx_huge_page() checks that
> the shadow page is actually on the list, i.e. it's a nop if a different task unlinks
> the page first.
> 
> False negatives need to be avoided, but nx_huge_page_disallowed is cleared only
> when untrack_possible_nx_huge_page() is guaranteed to be called, i.e. genuine
> false negatives can't occur.
> 
> Hmm, but I think there's a missing smp_rmb(), which is needed to ensure
> nx_huge_page_disallowed is read after observing the shadow-present SPTE (that's
> being unlinked).  I'll add that in the next version.

It makes sense. Thanks for such a detailed explanation!

Thanks
Yan

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 246b69262b93..5c269b2556d6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1271,6 +1271,9 @@  struct kvm_arch {
 	 */
 	bool tdp_mmu_enabled;
 
+	/* The number of TDP MMU pages across all roots. */
+	atomic64_t tdp_mmu_pages;
+
 	/*
 	 * List of struct kvm_mmu_pages being used as roots.
 	 * All struct kvm_mmu_pages in the list should have
@@ -1291,18 +1294,10 @@  struct kvm_arch {
 	 */
 	struct list_head tdp_mmu_roots;
 
-	/*
-	 * List of struct kvmp_mmu_pages not being used as roots.
-	 * All struct kvm_mmu_pages in the list should have
-	 * tdp_mmu_page set and a tdp_mmu_root_count of 0.
-	 */
-	struct list_head tdp_mmu_pages;
-
 	/*
 	 * Protects accesses to the following fields when the MMU lock
 	 * is held in read mode:
 	 *  - tdp_mmu_roots (above)
-	 *  - tdp_mmu_pages (above)
 	 *  - the link field of struct kvm_mmu_pages used by the TDP MMU
 	 *  - possible_nx_huge_pages;
 	 *  - the possible_nx_huge_page_link field of struct kvm_mmu_pages used
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 626c40ec2af9..fea22dc481a0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -29,7 +29,6 @@  int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 	kvm->arch.tdp_mmu_enabled = true;
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
-	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
 	kvm->arch.tdp_mmu_zap_wq = wq;
 	return 1;
 }
@@ -54,7 +53,7 @@  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 	/* Also waits for any queued work items.  */
 	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
 
-	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
+	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
 
 	/*
@@ -386,16 +385,18 @@  static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
 			      bool shared)
 {
+	atomic64_dec(&kvm->arch.tdp_mmu_pages);
+
+	if (!sp->nx_huge_page_disallowed)
+		return;
+
 	if (shared)
 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 	else
 		lockdep_assert_held_write(&kvm->mmu_lock);
 
-	list_del(&sp->link);
-	if (sp->nx_huge_page_disallowed) {
-		sp->nx_huge_page_disallowed = false;
-		untrack_possible_nx_huge_page(kvm, sp);
-	}
+	sp->nx_huge_page_disallowed = false;
+	untrack_possible_nx_huge_page(kvm, sp);
 
 	if (shared)
 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
@@ -1132,9 +1133,7 @@  static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
 		tdp_mmu_set_spte(kvm, iter, spte);
 	}
 
-	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
-	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
-	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+	atomic64_inc(&kvm->arch.tdp_mmu_pages);
 
 	return 0;
 }