Message ID | 20220723012325.1715714-5-seanjc@google.com |
---|---|
State | New, archived |
Series | KVM: x86: Apply NX mitigation more precisely |
On Sat, Jul 23, 2022 at 01:23:23AM +0000, Sean Christopherson wrote:
> Track the number of TDP MMU "shadow" pages instead of tracking the pages
> themselves. With the NX huge page list manipulation moved out of the common
> linking flow, eliminating the list-based tracking means the happy path of
> adding a shadow page doesn't need to acquire a spinlock and can instead
> inc/dec an atomic.
>
> Keep the tracking as the WARN during TDP MMU teardown on leaked shadow
> pages is very, very useful for detecting KVM bugs.
>
> Tracking the number of pages will also make it trivial to expose the
> counter to userspace as a stat in the future, which may or may not be
> desirable.
>
> Note, the TDP MMU needs to use a separate counter (and stat if that ever
> comes to be) from the existing n_used_mmu_pages. The TDP MMU doesn't bother
> supporting the shrinker nor does it honor KVM_SET_NR_MMU_PAGES (because the
> TDP MMU consumes so few pages relative to shadow paging), and including TDP
> MMU pages in that counter would break both the shrinker and shadow MMUs,
> e.g. if a VM is using nested TDP.
>
> Cc: Yosry Ahmed <yosryahmed@google.com>
> Reviewed-by: Mingwei Zhang <mizhang@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>

Reviewed-by: David Matlack <dmatlack@google.com>

> ---
>  arch/x86/include/asm/kvm_host.h | 11 +++--------
>  arch/x86/kvm/mmu/tdp_mmu.c      | 19 +++++++++----------
>  2 files changed, 12 insertions(+), 18 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 246b69262b93..5c269b2556d6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1271,6 +1271,9 @@ struct kvm_arch {
>  	 */
>  	bool tdp_mmu_enabled;
>
> +	/* The number of TDP MMU pages across all roots. */
> +	atomic64_t tdp_mmu_pages;

This is the number of non-root TDP MMU pages, right?

> +
>  	/*
>  	 * List of struct kvm_mmu_pages being used as roots.
>  	 * All struct kvm_mmu_pages in the list should have
> @@ -1291,18 +1294,10 @@ struct kvm_arch {
>  	 */
>  	struct list_head tdp_mmu_roots;
>
> -	/*
> -	 * List of struct kvmp_mmu_pages not being used as roots.
> -	 * All struct kvm_mmu_pages in the list should have
> -	 * tdp_mmu_page set and a tdp_mmu_root_count of 0.
> -	 */
> -	struct list_head tdp_mmu_pages;
> -
>  	/*
>  	 * Protects accesses to the following fields when the MMU lock
>  	 * is held in read mode:
>  	 *  - tdp_mmu_roots (above)
> -	 *  - tdp_mmu_pages (above)
>  	 *  - the link field of struct kvm_mmu_pages used by the TDP MMU
>  	 *  - possible_nx_huge_pages;
>  	 *  - the possible_nx_huge_page_link field of struct kvm_mmu_pages used
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 626c40ec2af9..fea22dc481a0 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -29,7 +29,6 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
>  	kvm->arch.tdp_mmu_enabled = true;
>  	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
>  	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
> -	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
>  	kvm->arch.tdp_mmu_zap_wq = wq;
>  	return 1;
>  }
> @@ -54,7 +53,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
>  	/* Also waits for any queued work items. */
>  	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
>
> -	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
> +	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
>  	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
>
>  	/*
> @@ -386,16 +385,18 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
>  static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
>  			      bool shared)
>  {
> +	atomic64_dec(&kvm->arch.tdp_mmu_pages);
> +
> +	if (!sp->nx_huge_page_disallowed)
> +		return;
> +
>  	if (shared)
>  		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
>  	else
>  		lockdep_assert_held_write(&kvm->mmu_lock);
>
> -	list_del(&sp->link);
> -	if (sp->nx_huge_page_disallowed) {
> -		sp->nx_huge_page_disallowed = false;
> -		untrack_possible_nx_huge_page(kvm, sp);
> -	}
> +	sp->nx_huge_page_disallowed = false;
> +	untrack_possible_nx_huge_page(kvm, sp);
>
>  	if (shared)
>  		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> @@ -1132,9 +1133,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
>  		tdp_mmu_set_spte(kvm, iter, spte);
>  	}
>
> -	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> -	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
> -	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> +	atomic64_inc(&kvm->arch.tdp_mmu_pages);
>
>  	return 0;
>  }
> --
> 2.37.1.359.gd136c6c3e2-goog
>
On Mon, Jul 25, 2022, David Matlack wrote:
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 246b69262b93..5c269b2556d6 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1271,6 +1271,9 @@ struct kvm_arch {
> >  	 */
> >  	bool tdp_mmu_enabled;
> >
> > +	/* The number of TDP MMU pages across all roots. */
> > +	atomic64_t tdp_mmu_pages;
>
> This is the number of non-root TDP MMU pages, right?

Yes.
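For reference, applying the tdp_mmu.c hunk quoted above yields an unlink path along the lines below. This is a sketch reconstructed from the diff rather than a verbatim copy of the tree, with comments added to reflect that the counter covers only non-root TDP MMU pages, as confirmed in the exchange above.

static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	/* Non-root pages only; roots are tracked separately on tdp_mmu_roots. */
	atomic64_dec(&kvm->arch.tdp_mmu_pages);

	/* Happy path: no NX huge page bookkeeping, so no spinlock is needed. */
	if (!sp->nx_huge_page_disallowed)
		return;

	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}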
On Sat, Jul 23, 2022 at 01:23:23AM +0000, Sean Christopherson wrote:

<snip>

> @@ -386,16 +385,18 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
>  static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
>  			      bool shared)
>  {
> +	atomic64_dec(&kvm->arch.tdp_mmu_pages);
> +
> +	if (!sp->nx_huge_page_disallowed)
> +		return;
> +
Does this read of sp->nx_huge_page_disallowed also need to be protected by
tdp_mmu_pages_lock in shared path?

Thanks
Yan

>  	if (shared)
>  		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
>  	else
>  		lockdep_assert_held_write(&kvm->mmu_lock);
>
> -	list_del(&sp->link);
> -	if (sp->nx_huge_page_disallowed) {
> -		sp->nx_huge_page_disallowed = false;
> -		untrack_possible_nx_huge_page(kvm, sp);
> -	}
> +	sp->nx_huge_page_disallowed = false;
> +	untrack_possible_nx_huge_page(kvm, sp);
>
>  	if (shared)
>  		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> @@ -1132,9 +1133,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
>  		tdp_mmu_set_spte(kvm, iter, spte);
>  	}
>
> -	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> -	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
> -	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> +	atomic64_inc(&kvm->arch.tdp_mmu_pages);
>
>  	return 0;
>  }
> --
> 2.37.1.359.gd136c6c3e2-goog
>
On Wed, Jul 27, 2022, Yan Zhao wrote:
> On Sat, Jul 23, 2022 at 01:23:23AM +0000, Sean Christopherson wrote:
>
> <snip>
>
> > @@ -386,16 +385,18 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
> >  static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
> >  			      bool shared)
> >  {
> > +	atomic64_dec(&kvm->arch.tdp_mmu_pages);
> > +
> > +	if (!sp->nx_huge_page_disallowed)
> > +		return;
> > +
> Does this read of sp->nx_huge_page_disallowed also need to be protected by
> tdp_mmu_pages_lock in shared path?

No, because only one CPU can call tdp_mmu_unlink_sp() for a shadow page.  E.g. in
a shared walk, the SPTE is zapped atomically and only the CPU that "wins" gets to
unlink the sp.  The extra lock is needed to prevent list corruption, but the sp
itself is thread safe.

FWIW, even if that guarantee didn't hold, checking the flag outside of
tdp_mmu_pages_lock is safe because false positives are ok.
untrack_possible_nx_huge_page() checks that the shadow page is actually on the
list, i.e. it's a nop if a different task unlinks the page first.

False negatives need to be avoided, but nx_huge_page_disallowed is cleared only
when untrack_possible_nx_huge_page() is guaranteed to be called, i.e. true false
negatives can't occur.

Hmm, but I think there's a missing smp_rmb(), which is needed to ensure
nx_huge_page_disallowed is read after observing the shadow-present SPTE (that's
being unlinked).  I'll add that in the next version.
On Wed, Jul 27, 2022 at 07:04:35PM +0000, Sean Christopherson wrote:
> On Wed, Jul 27, 2022, Yan Zhao wrote:
> > On Sat, Jul 23, 2022 at 01:23:23AM +0000, Sean Christopherson wrote:
> >
> > <snip>
> >
> > > @@ -386,16 +385,18 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
> > >  static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
> > >  			      bool shared)
> > >  {
> > > +	atomic64_dec(&kvm->arch.tdp_mmu_pages);
> > > +
> > > +	if (!sp->nx_huge_page_disallowed)
> > > +		return;
> > > +
> > Does this read of sp->nx_huge_page_disallowed also need to be protected by
> > tdp_mmu_pages_lock in shared path?
>
> No, because only one CPU can call tdp_mmu_unlink_sp() for a shadow page.  E.g. in
> a shared walk, the SPTE is zapped atomically and only the CPU that "wins" gets to
> unlink the sp.  The extra lock is needed to prevent list corruption, but the sp
> itself is thread safe.
>
> FWIW, even if that guarantee didn't hold, checking the flag outside of
> tdp_mmu_pages_lock is safe because false positives are ok.
> untrack_possible_nx_huge_page() checks that the shadow page is actually on the
> list, i.e. it's a nop if a different task unlinks the page first.
>
> False negatives need to be avoided, but nx_huge_page_disallowed is cleared only
> when untrack_possible_nx_huge_page() is guaranteed to be called, i.e. true false
> negatives can't occur.
>
> Hmm, but I think there's a missing smp_rmb(), which is needed to ensure
> nx_huge_page_disallowed is read after observing the shadow-present SPTE (that's
> being unlinked).  I'll add that in the next version.

It makes sense. Thanks for such detailed explanation!

Thanks
Yan
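A possible shape for the smp_rmb() Sean mentions is sketched below. The thread does not show the next version of the patch, so the placement and the pairing described in the comment are assumptions rather than the posted code.

static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	atomic64_dec(&kvm->arch.tdp_mmu_pages);

	/*
	 * Assumed placement: ensure nx_huge_page_disallowed is read after
	 * observing the shadow-present SPTE that is being unlinked, pairing
	 * with the ordering provided on the link side when the "winning" CPU
	 * made the SPTE shadow-present.
	 */
	smp_rmb();

	if (!sp->nx_huge_page_disallowed)
		return;

	/* NX huge page untracking proceeds under tdp_mmu_pages_lock as above. */
}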
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 246b69262b93..5c269b2556d6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1271,6 +1271,9 @@ struct kvm_arch {
 	 */
 	bool tdp_mmu_enabled;
 
+	/* The number of TDP MMU pages across all roots. */
+	atomic64_t tdp_mmu_pages;
+
 	/*
 	 * List of struct kvm_mmu_pages being used as roots.
 	 * All struct kvm_mmu_pages in the list should have
@@ -1291,18 +1294,10 @@ struct kvm_arch {
 	 */
 	struct list_head tdp_mmu_roots;
 
-	/*
-	 * List of struct kvmp_mmu_pages not being used as roots.
-	 * All struct kvm_mmu_pages in the list should have
-	 * tdp_mmu_page set and a tdp_mmu_root_count of 0.
-	 */
-	struct list_head tdp_mmu_pages;
-
 	/*
 	 * Protects accesses to the following fields when the MMU lock
 	 * is held in read mode:
 	 *  - tdp_mmu_roots (above)
-	 *  - tdp_mmu_pages (above)
 	 *  - the link field of struct kvm_mmu_pages used by the TDP MMU
 	 *  - possible_nx_huge_pages;
 	 *  - the possible_nx_huge_page_link field of struct kvm_mmu_pages used
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 626c40ec2af9..fea22dc481a0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -29,7 +29,6 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 	kvm->arch.tdp_mmu_enabled = true;
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
-	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
 	kvm->arch.tdp_mmu_zap_wq = wq;
 	return 1;
 }
@@ -54,7 +53,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 	/* Also waits for any queued work items. */
 	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
 
-	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
+	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
 
 	/*
@@ -386,16 +385,18 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
 			      bool shared)
 {
+	atomic64_dec(&kvm->arch.tdp_mmu_pages);
+
+	if (!sp->nx_huge_page_disallowed)
+		return;
+
 	if (shared)
 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 	else
 		lockdep_assert_held_write(&kvm->mmu_lock);
 
-	list_del(&sp->link);
-	if (sp->nx_huge_page_disallowed) {
-		sp->nx_huge_page_disallowed = false;
-		untrack_possible_nx_huge_page(kvm, sp);
-	}
+	sp->nx_huge_page_disallowed = false;
+	untrack_possible_nx_huge_page(kvm, sp);
 
 	if (shared)
 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
@@ -1132,9 +1133,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
 		tdp_mmu_set_spte(kvm, iter, spte);
 	}
 
-	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
-	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
-	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+	atomic64_inc(&kvm->arch.tdp_mmu_pages);
 
 	return 0;
 }