
[15/28] KVM: x86/mmu: Take TDP MMU roots off list when invalidating all roots

Message ID 20211120045046.3940942-16-seanjc@google.com (mailing list archive)
State New, archived
Series KVM: x86/mmu: Overhaul TDP MMU zapping and flushing

Commit Message

Sean Christopherson Nov. 20, 2021, 4:50 a.m. UTC
Take TDP MMU roots off the list of roots when they're invalidated instead
of walking later on to find the roots that were just invalidated.  In
addition to making the flow more straightforward, this allows warning
if something attempts to elevate the refcount of an invalid root, which
should be unreachable (no longer on the list so can't be reached by MMU
notifier, and vCPUs must reload a new root before installing new SPTE).

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c     |   6 +-
 arch/x86/kvm/mmu/tdp_mmu.c | 171 ++++++++++++++++++++-----------------
 arch/x86/kvm/mmu/tdp_mmu.h |  14 ++-
 3 files changed, 108 insertions(+), 83 deletions(-)

Comments

Ben Gardon Nov. 22, 2021, 10:20 p.m. UTC | #1
"/

On Fri, Nov 19, 2021 at 8:51 PM Sean Christopherson <seanjc@google.com> wrote:
>
> Take TDP MMU roots off the list of roots when they're invalidated instead
> of walking later on to find the roots that were just invalidated.  In
> addition to making the flow more straightforward, this allows warning
> if something attempts to elevate the refcount of an invalid root, which
> should be unreachable (no longer on the list so can't be reached by MMU
> notifier, and vCPUs must reload a new root before installing new SPTE).
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>

There are a bunch of awesome little cleanups and unrelated fixes
included in this commit that could be factored out.

I'm skeptical of immediately moving the invalidated roots into another
list as that seems like it has a lot of potential for introducing
weird races. I'm not sure it actually solves a problem either. Part of
the motive from the commit description "this allows warning if
something attempts to elevate the refcount of an invalid root" can be
achieved already without moving the roots into a separate list.

Maybe this would seem more straightforward with some of the little
cleanups factored out, but this feels more complicated to me.

> ---
>  arch/x86/kvm/mmu/mmu.c     |   6 +-
>  arch/x86/kvm/mmu/tdp_mmu.c | 171 ++++++++++++++++++++-----------------
>  arch/x86/kvm/mmu/tdp_mmu.h |  14 ++-
>  3 files changed, 108 insertions(+), 83 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e00e46205730..e3cd330c9532 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5664,6 +5664,8 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
>   */
>  static void kvm_mmu_zap_all_fast(struct kvm *kvm)
>  {
> +       LIST_HEAD(invalidated_roots);
> +
>         lockdep_assert_held(&kvm->slots_lock);
>
>         write_lock(&kvm->mmu_lock);
> @@ -5685,7 +5687,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
>          * could drop the MMU lock and yield.
>          */
>         if (is_tdp_mmu_enabled(kvm))
> -               kvm_tdp_mmu_invalidate_all_roots(kvm);
> +               kvm_tdp_mmu_invalidate_all_roots(kvm, &invalidated_roots);
>
>         /*
>          * Notify all vcpus to reload its shadow page table and flush TLB.
> @@ -5703,7 +5705,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
>
>         if (is_tdp_mmu_enabled(kvm)) {
>                 read_lock(&kvm->mmu_lock);
> -               kvm_tdp_mmu_zap_invalidated_roots(kvm);
> +               kvm_tdp_mmu_zap_invalidated_roots(kvm, &invalidated_roots);
>                 read_unlock(&kvm->mmu_lock);
>         }
>  }
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index ca6b30a7130d..085f6b09e5f3 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -94,9 +94,17 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
>
>         WARN_ON(!root->tdp_mmu_page);
>
> -       spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> -       list_del_rcu(&root->link);
> -       spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> +       /*
> +        * Remove the root from tdp_mmu_roots, unless the root is invalid in
> +        * which case the root was pulled off tdp_mmu_roots when it was
> +        * invalidated.  Note, this must be an RCU-protected deletion to avoid
> +        * use-after-free in the yield-safe iterator!
> +        */
> +       if (!root->role.invalid) {
> +               spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> +               list_del_rcu(&root->link);
> +               spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> +       }
>
>         /*
>          * A TLB flush is not necessary as KVM performs a local TLB flush when
> @@ -105,18 +113,23 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
>          * invalidates any paging-structure-cache entries, i.e. TLB entries for
>          * intermediate paging structures, that may be zapped, as such entries
>          * are associated with the ASID on both VMX and SVM.
> +        *
> +        * WARN if a flush is reported for an invalid root, as its child SPTEs
> +        * should have been zapped by kvm_tdp_mmu_zap_invalidated_roots(), and
> +        * inserting new SPTEs under an invalid root is a KVM bug.
>          */
> -       (void)zap_gfn_range(kvm, root, 0, -1ull, true, false, shared);
> +       if (zap_gfn_range(kvm, root, 0, -1ull, true, false, shared))
> +               WARN_ON_ONCE(root->role.invalid);
>
>         call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
>  }
>
>  /*
> - * Finds the next valid root after root (or the first valid root if root
> - * is NULL), takes a reference on it, and returns that next root. If root
> - * is not NULL, this thread should have already taken a reference on it, and
> - * that reference will be dropped. If no valid root is found, this
> - * function will return NULL.
> + * Finds the next root after @prev_root (or the first root if @prev_root is NULL
> + * or invalid), takes a reference on it, and returns that next root.  If root is
> + * not NULL, this thread should have already taken a reference on it, and that
> + * reference will be dropped. If no valid root is found, this function will
> + * return NULL.
>   */
>  static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
>                                               struct kvm_mmu_page *prev_root,
> @@ -124,6 +137,27 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
>  {
>         struct kvm_mmu_page *next_root;
>
> +       lockdep_assert_held(&kvm->mmu_lock);
> +
> +       /*
> +        * Restart the walk if the previous root was invalidated, which can
> +        * happen if the caller drops mmu_lock when yielding.  Restarting the
> +        * walke is necessary because invalidating a root also removes it from

Nit: *walk

> +        * tdp_mmu_roots.  Restarting is safe and correct because invalidating
> +        * a root is done if and only if _all_ roots are invalidated, i.e. any
> +        * root on tdp_mmu_roots was added _after_ the invalidation event.
> +        */
> +       if (prev_root && prev_root->role.invalid) {
> +               kvm_tdp_mmu_put_root(kvm, prev_root, shared);
> +               prev_root = NULL;
> +       }
> +
> +       /*
> +        * Finding the next root must be done under RCU read lock.  Although
> +        * @prev_root itself cannot be removed from tdp_mmu_roots because this
> +        * task holds a reference, its next and prev pointers can be modified
> +        * when freeing a different root.  Ditto for tdp_mmu_roots itself.
> +        */

I'm not sure this is correct with the rest of the changes in this
patch. The new version of invalidate_roots removes roots from the list
immediately, even if they have a non-zero ref-count.

>         rcu_read_lock();
>
>         if (prev_root)
> @@ -230,10 +264,13 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
>         root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
>         refcount_set(&root->tdp_mmu_root_count, 1);
>
> -       spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> -       list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
> -       spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> -
> +       /*
> +        * Because mmu_lock must be held for write to ensure that KVM doesn't
> +        * create multiple roots for a given role, this does not need to use
> +        * an RCU-friendly variant as readers of tdp_mmu_roots must also hold
> +        * mmu_lock in some capacity.
> +        */

I doubt we're doing it now, but in principle we could allocate new
roots with mmu_lock in read + tdp_mmu_pages_lock. That might be better
than depending on the write lock.

> +       list_add(&root->link, &kvm->arch.tdp_mmu_roots);
>  out:
>         return __pa(root->spt);
>  }
> @@ -814,28 +851,6 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
>                 kvm_flush_remote_tlbs(kvm);
>  }
>
> -static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
> -                                                 struct kvm_mmu_page *prev_root)
> -{
> -       struct kvm_mmu_page *next_root;
> -
> -       if (prev_root)
> -               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
> -                                                 &prev_root->link,
> -                                                 typeof(*prev_root), link);
> -       else
> -               next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
> -                                                  typeof(*next_root), link);
> -
> -       while (next_root && !(next_root->role.invalid &&
> -                             refcount_read(&next_root->tdp_mmu_root_count)))
> -               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
> -                                                 &next_root->link,
> -                                                 typeof(*next_root), link);
> -
> -       return next_root;
> -}
> -
>  /*
>   * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
>   * invalidated root, they will not be freed until this function drops the
> @@ -844,22 +859,21 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
>   * only has to do a trivial amount of work. Since the roots are invalid,
>   * no new SPTEs should be created under them.
>   */
> -void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
> +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm,
> +                                      struct list_head *invalidated_roots)
>  {
> -       struct kvm_mmu_page *next_root;
> -       struct kvm_mmu_page *root;
> +       struct kvm_mmu_page *root, *tmp;
>
> +       lockdep_assert_held(&kvm->slots_lock);
>         lockdep_assert_held_read(&kvm->mmu_lock);
>
> -       rcu_read_lock();
> -
> -       root = next_invalidated_root(kvm, NULL);
> -
> -       while (root) {
> -               next_root = next_invalidated_root(kvm, root);
> -
> -               rcu_read_unlock();
> -
> +       /*
> +        * Put the ref to each root, acquired by kvm_tdp_mmu_put_root().  The

Nit: s/kvm_tdp_mmu_put_root/kvm_tdp_mmu_get_root/

> +        * safe variant is required even though kvm_tdp_mmu_put_root() doesn't
> +        * explicitly remove the root from the invalid list, as this task does
> +        * not take rcu_read_lock() and so the list object itself can be freed.
> +        */
> +       list_for_each_entry_safe(root, tmp, invalidated_roots, link) {
>                 /*
>                  * A TLB flush is unnecessary, invalidated roots are guaranteed
>                  * to be unreachable by the guest (see kvm_tdp_mmu_put_root()
> @@ -870,49 +884,50 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
>                  * blip and not a functional issue.
>                  */
>                 (void)zap_gfn_range(kvm, root, 0, -1ull, true, false, true);
> -
> -               /*
> -                * Put the reference acquired in
> -                * kvm_tdp_mmu_invalidate_roots
> -                */
>                 kvm_tdp_mmu_put_root(kvm, root, true);
> -
> -               root = next_root;
> -
> -               rcu_read_lock();
>         }
> -
> -       rcu_read_unlock();
>  }
>
>  /*
> - * Mark each TDP MMU root as invalid so that other threads
> - * will drop their references and allow the root count to
> - * go to 0.
> + * Mark each TDP MMU root as invalid so that other threads will drop their
> + * references and allow the root count to go to 0.
>   *
> - * Also take a reference on all roots so that this thread
> - * can do the bulk of the work required to free the roots
> - * once they are invalidated. Without this reference, a
> - * vCPU thread might drop the last reference to a root and
> - * get stuck with tearing down the entire paging structure.
> + * Take a reference on each root and move it to a local list so that this task
> + * can do the actual work required to free the roots once they are invalidated,
> + * e.g. zap the SPTEs and trigger a remote TLB flush. Without this reference, a
> + * vCPU task might drop the last reference to a root and get stuck with tearing
> + * down the entire paging structure.
>   *
> - * Roots which have a zero refcount should be skipped as
> - * they're already being torn down.
> - * Already invalid roots should be referenced again so that
> - * they aren't freed before kvm_tdp_mmu_zap_all_fast is
> - * done with them.
> + * Roots which have a zero refcount are skipped as they're already being torn
> + * down.  Encountering a root that is already invalid is a KVM bug, as this is
> + * the only path that is allowed to invalidate roots and (a) it's proteced by

Nit: protected

> + * slots_lock and (b) pulls each root off tdp_mmu_roots.
>   *
> - * This has essentially the same effect for the TDP MMU
> - * as updating mmu_valid_gen does for the shadow MMU.
> + * This has essentially the same effect for the TDP MMU as updating
> + * mmu_valid_gen does for the shadow MMU.
>   */
> -void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
> +void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm,
> +                                     struct list_head *invalidated_roots)
>  {
> -       struct kvm_mmu_page *root;
> +       struct kvm_mmu_page *root, *tmp;
>
> +       /*
> +        * mmu_lock must be held for write, moving entries off an RCU-protected
> +        * list is not safe, entries can only be deleted.   All accesses to
> +        * tdp_mmu_roots are required to hold mmu_lock in some capacity, thus
> +        * holding it for write ensures there are no concurrent readers.
> +        */
>         lockdep_assert_held_write(&kvm->mmu_lock);
> -       list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
> -               if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
> -                       root->role.invalid = true;
> +
> +       list_for_each_entry_safe(root, tmp, &kvm->arch.tdp_mmu_roots, link) {
> +               if (!kvm_tdp_mmu_get_root(root))
> +                       continue;
> +
> +               list_move_tail(&root->link, invalidated_roots);
> +
> +               WARN_ON_ONCE(root->role.invalid);
> +               root->role.invalid = true;
> +       }
>  }
>
>  /*
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> index 599714de67c3..ced6d8e47362 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.h
> +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> @@ -9,7 +9,13 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
>
>  __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
>  {
> -       if (root->role.invalid)
> +       /*
> +        * Acquiring a reference on an invalid root is a KVM bug.  Invalid roots
> +        * are supposed to be reachable only by references that were acquired
> +        * before the invalidation, and taking an additional reference to an
> +        * invalid root is not allowed.
> +        */
> +       if (WARN_ON_ONCE(root->role.invalid))
>                 return false;
>
>         return refcount_inc_not_zero(&root->tdp_mmu_root_count);
> @@ -44,8 +50,10 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>  }
>
>  void kvm_tdp_mmu_zap_all(struct kvm *kvm);
> -void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
> -void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
> +void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm,
> +                                     struct list_head *invalidated_roots);
> +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm,
> +                                      struct list_head *invalidated_roots);
>
>  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
>
> --
> 2.34.0.rc2.393.gf8c9666880-goog
>
Sean Christopherson Nov. 22, 2021, 11:08 p.m. UTC | #2
On Mon, Nov 22, 2021, Ben Gardon wrote:
> On Fri, Nov 19, 2021 at 8:51 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > Take TDP MMU roots off the list of roots when they're invalidated instead
> > of walking later on to find the roots that were just invalidated.  In
> > addition to making the flow more straightforward, this allows warning
> > if something attempts to elevate the refcount of an invalid root, which
> > should be unreachable (no longer on the list so can't be reached by MMU
> > notifier, and vCPUs must reload a new root before installing new SPTE).
> >
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> 
> There are a bunch of awesome little cleanups and unrelated fixes
> included in this commit that could be factored out.
> 
> I'm skeptical of immediately moving the invalidated roots into another
> list as that seems like it has a lot of potential for introducing
> weird races.

I disagree, the entire premise of fast invalidate is that there can't be races,
i.e. mmu_lock must be held for write.  IMO, it's actually the opposite, as the only
reason leaving roots on the per-VM list doesn't have weird races is because slots_lock
is held.  If slots_lock weren't required to do a fast zap, which is feasible for the
TDP MMU since it doesn't rely on the memslots generation, then it would be possible
for multiple calls to kvm_tdp_mmu_zap_invalidated_roots() to run in parallel.  And in
that case, leaving roots on the per-VM list would lead to a single instance of a
"fast zap" zapping roots it didn't invalidate.  That's wouldn't be problematic per se,
but I don't like not having a clear "owner" of the invalidated root.

> I'm not sure it actually solves a problem either. Part of
> the motive from the commit description "this allows warning if
> something attempts to elevate the refcount of an invalid root" can be
> achieved already without moving the roots into a separate list.

Hmm, true in the sense that kvm_tdp_mmu_get_root() could be converted to a WARN,
but that would require tdp_mmu_next_root() to manually skip invalid roots.
kvm_tdp_mmu_get_vcpu_root_hpa() is naturally safe because page_role_for_level()
will never set the invalid flag.

I don't care too much about adding a manual check in tdp_mmu_next_root(), what I don't
like is that a WARN in kvm_tdp_mmu_get_root() wouldn't be establishing an invariant
that invalidated roots are unreachable, it would simply be forcing callers to check
role.invalid.
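
For concreteness, a rough sketch of that alternative (an illustration, not code
from this series): tdp_mmu_next_root() would have to skip invalid roots itself so
the walker never trips a WARN in kvm_tdp_mmu_get_root(), e.g.

	next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
					   typeof(*next_root), link);

	/*
	 * Skip invalid roots by hand before taking a reference; with a WARN
	 * in kvm_tdp_mmu_get_root(), every caller is forced to do this check.
	 */
	while (next_root && (next_root->role.invalid ||
			     !kvm_tdp_mmu_get_root(next_root)))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &next_root->link,
						  typeof(*next_root), link);

(prev_root handling omitted for brevity.)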

> Maybe this would seem more straightforward with some of the little
> cleanups factored out, but this feels more complicated to me.
> > @@ -124,6 +137,27 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
> >  {
> >         struct kvm_mmu_page *next_root;
> >
> > +       lockdep_assert_held(&kvm->mmu_lock);
> > +
> > +       /*
> > +        * Restart the walk if the previous root was invalidated, which can
> > +        * happen if the caller drops mmu_lock when yielding.  Restarting the
> > +        * walke is necessary because invalidating a root also removes it from
> 
> Nit: *walk
> 
> > +        * tdp_mmu_roots.  Restarting is safe and correct because invalidating
> > +        * a root is done if and only if _all_ roots are invalidated, i.e. any
> > +        * root on tdp_mmu_roots was added _after_ the invalidation event.
> > +        */
> > +       if (prev_root && prev_root->role.invalid) {
> > +               kvm_tdp_mmu_put_root(kvm, prev_root, shared);
> > +               prev_root = NULL;
> > +       }
> > +
> > +       /*
> > +        * Finding the next root must be done under RCU read lock.  Although
> > +        * @prev_root itself cannot be removed from tdp_mmu_roots because this
> > +        * task holds a reference, its next and prev pointers can be modified
> > +        * when freeing a different root.  Ditto for tdp_mmu_roots itself.
> > +        */
> 
> I'm not sure this is correct with the rest of the changes in this
> patch. The new version of invalidate_roots removes roots from the list
> immediately, even if they have a non-zero ref-count.

Roots don't have to be invalidated to be removed, e.g. if the last reference is
put due to kvm_mmu_reset_context().  Or did I misunderstand?

> >         rcu_read_lock();
> >
> >         if (prev_root)
> > @@ -230,10 +264,13 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
> >         root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
> >         refcount_set(&root->tdp_mmu_root_count, 1);
> >
> > -       spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> > -       list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
> > -       spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> > -
> > +       /*
> > +        * Because mmu_lock must be held for write to ensure that KVM doesn't
> > +        * create multiple roots for a given role, this does not need to use
> > +        * an RCU-friendly variant as readers of tdp_mmu_roots must also hold
> > +        * mmu_lock in some capacity.
> > +        */
> 
> I doubt we're doing it now, but in principle we could allocate new
> roots with mmu_lock in read + tdp_mmu_pages_lock. That might be better
> than depending on the write lock.

We're not, this function does lockdep_assert_held_write(&kvm->mmu_lock) a few
lines above.  I don't have a preference between using mmu_lock.read+tdp_mmu_pages_lock
versus mmu_lock.write, but I do care that the current code doesn't incorrectly imply
that it's possible for something else to be walking the roots while this runs.

Either way, this should definitely be a separate patch, pretty sure I just lost
track of it.
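
For reference, the mmu_lock.read + tdp_mmu_pages_lock variant would look roughly
like the pre-patch code (a sketch under that assumption, not part of this series);
the existing "is there already a root for this role" check would also need to run
under the spinlock so two vCPUs can't install duplicate roots:

	lockdep_assert_held_read(&kvm->mmu_lock);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	/* Keep the RCU-friendly add if other walkers only hold mmu_lock for read. */
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);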
 
> > +       list_add(&root->link, &kvm->arch.tdp_mmu_roots);
> >  out:
> >         return __pa(root->spt);
> >  }
Ben Gardon Nov. 23, 2021, 12:03 a.m. UTC | #3
On Mon, Nov 22, 2021 at 3:08 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Mon, Nov 22, 2021, Ben Gardon wrote:
> > On Fri, Nov 19, 2021 at 8:51 PM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > Take TDP MMU roots off the list of roots when they're invalidated instead
> > > of walking later on to find the roots that were just invalidated.  In
> > > addition to making the flow more straightforward, this allows warning
> > > if something attempts to elevate the refcount of an invalid root, which
> > > should be unreachable (no longer on the list so can't be reached by MMU
> > > notifier, and vCPUs must reload a new root before installing new SPTE).
> > >
> > > Signed-off-by: Sean Christopherson <seanjc@google.com>
> >
> > There are a bunch of awesome little cleanups and unrelated fixes
> > included in this commit that could be factored out.
> >
> > I'm skeptical of immediately moving the invalidated roots into another
> > list as that seems like it has a lot of potential for introducing
> > weird races.
>
> I disagree, the entire premise of fast invalidate is that there can't be races,
> i.e. mmu_lock must be held for write.  IMO, it's actually the opposite, as the only
> reason leaving roots on the per-VM list doesn't have weird races is because slots_lock
> is held.  If slots_lock weren't required to do a fast zap, which is feasible for the
> TDP MMU since it doesn't rely on the memslots generation, then it would be possible
> for multiple calls to kvm_tdp_mmu_zap_invalidated_roots() to run in parallel.  And in
> that case, leaving roots on the per-VM list would lead to a single instance of a
> "fast zap" zapping roots it didn't invalidate.  That's wouldn't be problematic per se,
> but I don't like not having a clear "owner" of the invalidated root.

That's a good point, the potential interleaving of zap_alls would be gross.

My mental model for the invariant here was "roots that are still in
use are on the roots list," but I can see how "the roots list contains
all valid, in-use roots" could be a more useful invariant.

>
> > I'm not sure it actually solves a problem either. Part of
> > the motive from the commit description "this allows warning if
> > something attempts to elevate the refcount of an invalid root" can be
> > achieved already without moving the roots into a separate list.
>
> Hmm, true in the sense that kvm_tdp_mmu_get_root() could be converted to a WARN,
> but that would require tdp_mmu_next_root() to manually skip invalid roots.
> kvm_tdp_mmu_get_vcpu_root_hpa() is naturally safe because page_role_for_level()
> will never set the invalid flag.
>
> I don't care too much about adding a manual check in tdp_mmu_next_root(), what I don't
> like is that a WARN in kvm_tdp_mmu_get_root() wouldn't be establishing an invariant
> that invalidated roots are unreachable, it would simply be forcing callers to check
> role.invalid.

That makes sense.

>
> > Maybe this would seem more straightforward with some of the little
> > cleanups factored out, but this feels more complicated to me.
> > > @@ -124,6 +137,27 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
> > >  {
> > >         struct kvm_mmu_page *next_root;
> > >
> > > +       lockdep_assert_held(&kvm->mmu_lock);
> > > +
> > > +       /*
> > > +        * Restart the walk if the previous root was invalidated, which can
> > > +        * happen if the caller drops mmu_lock when yielding.  Restarting the
> > > +        * walke is necessary because invalidating a root also removes it from
> >
> > Nit: *walk
> >
> > > +        * tdp_mmu_roots.  Restarting is safe and correct because invalidating
> > > +        * a root is done if and only if _all_ roots are invalidated, i.e. any
> > > +        * root on tdp_mmu_roots was added _after_ the invalidation event.
> > > +        */
> > > +       if (prev_root && prev_root->role.invalid) {
> > > +               kvm_tdp_mmu_put_root(kvm, prev_root, shared);
> > > +               prev_root = NULL;
> > > +       }
> > > +
> > > +       /*
> > > +        * Finding the next root must be done under RCU read lock.  Although
> > > +        * @prev_root itself cannot be removed from tdp_mmu_roots because this
> > > +        * task holds a reference, its next and prev pointers can be modified
> > > +        * when freeing a different root.  Ditto for tdp_mmu_roots itself.
> > > +        */
> >
> > I'm not sure this is correct with the rest of the changes in this
> > patch. The new version of invalidate_roots removes roots from the list
> > immediately, even if they have a non-zero ref-count.
>
> Roots don't have to be invalidated to be removed, e.g. if the last reference is
> put due to kvm_mmu_reset_context().  Or did I misunderstand?

Ah, sorry that was kind of a pedantic comment. I think the comment
should say "@prev_root itself cannot be removed from tdp_mmu_roots
because this task holds the MMU lock (in read or write mode)"
With the other changes in the patch, holding a reference on a root
provides protection against it being freed, but not against it being
removed from the roots list.

>
> > >         rcu_read_lock();
> > >
> > >         if (prev_root)
> > > @@ -230,10 +264,13 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
> > >         root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
> > >         refcount_set(&root->tdp_mmu_root_count, 1);
> > >
> > > -       spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> > > -       list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
> > > -       spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> > > -
> > > +       /*
> > > +        * Because mmu_lock must be held for write to ensure that KVM doesn't
> > > +        * create multiple roots for a given role, this does not need to use
> > > +        * an RCU-friendly variant as readers of tdp_mmu_roots must also hold
> > > +        * mmu_lock in some capacity.
> > > +        */
> >
> > I doubt we're doing it now, but in principle we could allocate new
> > roots with mmu_lock in read + tdp_mmu_pages_lock. That might be better
> > than depending on the write lock.
>
> We're not, this function does lockdep_assert_held_write(&kvm->mmu_lock) a few
> lines above.  I don't have a preference between using mmu_lock.read+tdp_mmu_pages_lock
> versus mmu_lock.write, but I do care that the current code doesn't incorrectly imply
> that it's possible for something else to be walking the roots while this runs.

That makes sense. It seems like it'd be pretty easy to let this run
under the read lock, but I'd be fine with either way. I agree both
options are an improvement.

>
> Either way, this should definitely be a separate patch, pretty sure I just lost
> track of it.
>
> > > +       list_add(&root->link, &kvm->arch.tdp_mmu_roots);
> > >  out:
> > >         return __pa(root->spt);
> > >  }
Sean Christopherson Dec. 14, 2021, 11:34 p.m. UTC | #4
On Mon, Nov 22, 2021, Ben Gardon wrote:
> On Mon, Nov 22, 2021 at 3:08 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Mon, Nov 22, 2021, Ben Gardon wrote:
> > > On Fri, Nov 19, 2021 at 8:51 PM Sean Christopherson <seanjc@google.com> wrote:
> > > >
> > > > Take TDP MMU roots off the list of roots when they're invalidated instead
> > > > of walking later on to find the roots that were just invalidated.  In
> > > > addition to making the flow more straightforward, this allows warning
> > > > if something attempts to elevate the refcount of an invalid root, which
> > > > should be unreachable (no longer on the list so can't be reached by MMU
> > > > notifier, and vCPUs must reload a new root before installing new SPTE).
> > > >
> > > > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > >
> > > There are a bunch of awesome little cleanups and unrelated fixes
> > > included in this commit that could be factored out.
> > >
> > > I'm skeptical of immediately moving the invalidated roots into another
> > > list as that seems like it has a lot of potential for introducing
> > > weird races.
> >
> > I disagree, the entire premise of fast invalidate is that there can't be races,
> > i.e. mmu_lock must be held for write.  IMO, it's actually the opposite, as the only
> > reason leaving roots on the per-VM list doesn't have weird races is because slots_lock
> > is held.  If slots_lock weren't required to do a fast zap, which is feasible for the
> > TDP MMU since it doesn't rely on the memslots generation, then it would be possible
> > for multiple calls to kvm_tdp_mmu_zap_invalidated_roots() to run in parallel.  And in
> > that case, leaving roots on the per-VM list would lead to a single instance of a
> > "fast zap" zapping roots it didn't invalidate.  That's wouldn't be problematic per se,
> > but I don't like not having a clear "owner" of the invalidated root.
> 
> That's a good point, the potential interleaving of zap_alls would be gross.
> 
> My mental model for the invariant here was "roots that are still in
> use are on the roots list," but I can see how "the roots list contains
> all valid, in-use roots" could be a more useful invariant.

Sadly, my idea of taking invalid roots off the list ain't gonna happen.

The immediate issue is that the TDP MMU doesn't zap invalid roots in mmu_notifier
callbacks.  This leads to use-after-free and other issues if the mmu_notifier runs
to completion while an invalid root zapper yields, as KVM fails to honor the
requirement that there must be _no_ references to the page after the mmu_notifier
returns.

This is most noticeable with set_nx_huge_pages() + kvm_mmu_notifier_release(),
but the bug exists between kvm_mmu_notifier_invalidate_range_start() and memslot
updates as well.  The pages aren't accessible by the guest, and KVM won't read or
write the data itself, but KVM will trigger e.g. kvm_set_pfn_dirty() when zapping
SPTEs, _after_ the mmu_notifier returns.
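
To illustrate (a simplified sketch, not the actual notifier code; the function name
and exact arguments are assumptions): an mmu_notifier-driven zap discovers roots by
walking the per-VM list, so a root that was moved onto a private list at
invalidation time is never visited, and its SPTEs (and e.g. the kvm_set_pfn_dirty()
calls made when they're eventually zapped) can outlive the notifier's return:

	static bool tdp_mmu_notifier_zap_sketch(struct kvm *kvm, gfn_t start, gfn_t end)
	{
		struct kvm_mmu_page *root;
		bool flush = false;

		/* Only roots still on tdp_mmu_roots are reachable here. */
		list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
			flush = zap_gfn_range(kvm, root, start, end, true, flush, false);

		return flush;
	}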

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e00e46205730..e3cd330c9532 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5664,6 +5664,8 @@  static void kvm_zap_obsolete_pages(struct kvm *kvm)
  */
 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 {
+	LIST_HEAD(invalidated_roots);
+
 	lockdep_assert_held(&kvm->slots_lock);
 
 	write_lock(&kvm->mmu_lock);
@@ -5685,7 +5687,7 @@  static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 	 * could drop the MMU lock and yield.
 	 */
 	if (is_tdp_mmu_enabled(kvm))
-		kvm_tdp_mmu_invalidate_all_roots(kvm);
+		kvm_tdp_mmu_invalidate_all_roots(kvm, &invalidated_roots);
 
 	/*
 	 * Notify all vcpus to reload its shadow page table and flush TLB.
@@ -5703,7 +5705,7 @@  static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 
 	if (is_tdp_mmu_enabled(kvm)) {
 		read_lock(&kvm->mmu_lock);
-		kvm_tdp_mmu_zap_invalidated_roots(kvm);
+		kvm_tdp_mmu_zap_invalidated_roots(kvm, &invalidated_roots);
 		read_unlock(&kvm->mmu_lock);
 	}
 }
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index ca6b30a7130d..085f6b09e5f3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -94,9 +94,17 @@  void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
 
 	WARN_ON(!root->tdp_mmu_page);
 
-	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
-	list_del_rcu(&root->link);
-	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+	/*
+	 * Remove the root from tdp_mmu_roots, unless the root is invalid in
+	 * which case the root was pulled off tdp_mmu_roots when it was
+	 * invalidated.  Note, this must be an RCU-protected deletion to avoid
+	 * use-after-free in the yield-safe iterator!
+	 */
+	if (!root->role.invalid) {
+		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+		list_del_rcu(&root->link);
+		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+	}
 
 	/*
 	 * A TLB flush is not necessary as KVM performs a local TLB flush when
@@ -105,18 +113,23 @@  void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
 	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
 	 * intermediate paging structures, that may be zapped, as such entries
 	 * are associated with the ASID on both VMX and SVM.
+	 *
+	 * WARN if a flush is reported for an invalid root, as its child SPTEs
+	 * should have been zapped by kvm_tdp_mmu_zap_invalidated_roots(), and
+	 * inserting new SPTEs under an invalid root is a KVM bug.
 	 */
-	(void)zap_gfn_range(kvm, root, 0, -1ull, true, false, shared);
+	if (zap_gfn_range(kvm, root, 0, -1ull, true, false, shared))
+		WARN_ON_ONCE(root->role.invalid);
 
 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
 }
 
 /*
- * Finds the next valid root after root (or the first valid root if root
- * is NULL), takes a reference on it, and returns that next root. If root
- * is not NULL, this thread should have already taken a reference on it, and
- * that reference will be dropped. If no valid root is found, this
- * function will return NULL.
+ * Finds the next root after @prev_root (or the first root if @prev_root is NULL
+ * or invalid), takes a reference on it, and returns that next root.  If root is
+ * not NULL, this thread should have already taken a reference on it, and that
+ * reference will be dropped. If no valid root is found, this function will
+ * return NULL.
  */
 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
 					      struct kvm_mmu_page *prev_root,
@@ -124,6 +137,27 @@  static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
 {
 	struct kvm_mmu_page *next_root;
 
+	lockdep_assert_held(&kvm->mmu_lock);
+
+	/*
+	 * Restart the walk if the previous root was invalidated, which can
+	 * happen if the caller drops mmu_lock when yielding.  Restarting the
+	 * walke is necessary because invalidating a root also removes it from
+	 * tdp_mmu_roots.  Restarting is safe and correct because invalidating
+	 * a root is done if and only if _all_ roots are invalidated, i.e. any
+	 * root on tdp_mmu_roots was added _after_ the invalidation event.
+	 */
+	if (prev_root && prev_root->role.invalid) {
+		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
+		prev_root = NULL;
+	}
+
+	/*
+	 * Finding the next root must be done under RCU read lock.  Although
+	 * @prev_root itself cannot be removed from tdp_mmu_roots because this
+	 * task holds a reference, its next and prev pointers can be modified
+	 * when freeing a different root.  Ditto for tdp_mmu_roots itself.
+	 */
 	rcu_read_lock();
 
 	if (prev_root)
@@ -230,10 +264,13 @@  hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
 	refcount_set(&root->tdp_mmu_root_count, 1);
 
-	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
-	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
-	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
-
+	/*
+	 * Because mmu_lock must be held for write to ensure that KVM doesn't
+	 * create multiple roots for a given role, this does not need to use
+	 * an RCU-friendly variant as readers of tdp_mmu_roots must also hold
+	 * mmu_lock in some capacity.
+	 */
+	list_add(&root->link, &kvm->arch.tdp_mmu_roots);
 out:
 	return __pa(root->spt);
 }
@@ -814,28 +851,6 @@  void kvm_tdp_mmu_zap_all(struct kvm *kvm)
 		kvm_flush_remote_tlbs(kvm);
 }
 
-static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
-						  struct kvm_mmu_page *prev_root)
-{
-	struct kvm_mmu_page *next_root;
-
-	if (prev_root)
-		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
-						  &prev_root->link,
-						  typeof(*prev_root), link);
-	else
-		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
-						   typeof(*next_root), link);
-
-	while (next_root && !(next_root->role.invalid &&
-			      refcount_read(&next_root->tdp_mmu_root_count)))
-		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
-						  &next_root->link,
-						  typeof(*next_root), link);
-
-	return next_root;
-}
-
 /*
  * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
  * invalidated root, they will not be freed until this function drops the
@@ -844,22 +859,21 @@  static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
  * only has to do a trivial amount of work. Since the roots are invalid,
  * no new SPTEs should be created under them.
  */
-void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm,
+				       struct list_head *invalidated_roots)
 {
-	struct kvm_mmu_page *next_root;
-	struct kvm_mmu_page *root;
+	struct kvm_mmu_page *root, *tmp;
 
+	lockdep_assert_held(&kvm->slots_lock);
 	lockdep_assert_held_read(&kvm->mmu_lock);
 
-	rcu_read_lock();
-
-	root = next_invalidated_root(kvm, NULL);
-
-	while (root) {
-		next_root = next_invalidated_root(kvm, root);
-
-		rcu_read_unlock();
-
+	/*
+	 * Put the ref to each root, acquired by kvm_tdp_mmu_put_root().  The
+	 * safe variant is required even though kvm_tdp_mmu_put_root() doesn't
+	 * explicitly remove the root from the invalid list, as this task does
+	 * not take rcu_read_lock() and so the list object itself can be freed.
+	 */
+	list_for_each_entry_safe(root, tmp, invalidated_roots, link) {
 		/*
 		 * A TLB flush is unnecessary, invalidated roots are guaranteed
 		 * to be unreachable by the guest (see kvm_tdp_mmu_put_root()
@@ -870,49 +884,50 @@  void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
 		 * blip and not a functional issue.
 		 */
 		(void)zap_gfn_range(kvm, root, 0, -1ull, true, false, true);
-
-		/*
-		 * Put the reference acquired in
-		 * kvm_tdp_mmu_invalidate_roots
-		 */
 		kvm_tdp_mmu_put_root(kvm, root, true);
-
-		root = next_root;
-
-		rcu_read_lock();
 	}
-
-	rcu_read_unlock();
 }
 
 /*
- * Mark each TDP MMU root as invalid so that other threads
- * will drop their references and allow the root count to
- * go to 0.
+ * Mark each TDP MMU root as invalid so that other threads will drop their
+ * references and allow the root count to go to 0.
  *
- * Also take a reference on all roots so that this thread
- * can do the bulk of the work required to free the roots
- * once they are invalidated. Without this reference, a
- * vCPU thread might drop the last reference to a root and
- * get stuck with tearing down the entire paging structure.
+ * Take a reference on each root and move it to a local list so that this task
+ * can do the actual work required to free the roots once they are invalidated,
+ * e.g. zap the SPTEs and trigger a remote TLB flush. Without this reference, a
+ * vCPU task might drop the last reference to a root and get stuck with tearing
+ * down the entire paging structure.
  *
- * Roots which have a zero refcount should be skipped as
- * they're already being torn down.
- * Already invalid roots should be referenced again so that
- * they aren't freed before kvm_tdp_mmu_zap_all_fast is
- * done with them.
+ * Roots which have a zero refcount are skipped as they're already being torn
+ * down.  Encountering a root that is already invalid is a KVM bug, as this is
+ * the only path that is allowed to invalidate roots and (a) it's proteced by
+ * slots_lock and (b) pulls each root off tdp_mmu_roots.
  *
- * This has essentially the same effect for the TDP MMU
- * as updating mmu_valid_gen does for the shadow MMU.
+ * This has essentially the same effect for the TDP MMU as updating
+ * mmu_valid_gen does for the shadow MMU.
  */
-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm,
+				      struct list_head *invalidated_roots)
 {
-	struct kvm_mmu_page *root;
+	struct kvm_mmu_page *root, *tmp;
 
+	/*
+	 * mmu_lock must be held for write, moving entries off an RCU-protected
+	 * list is not safe, entries can only be deleted.   All accesses to
+	 * tdp_mmu_roots are required to hold mmu_lock in some capacity, thus
+	 * holding it for write ensures there are no concurrent readers.
+	 */
 	lockdep_assert_held_write(&kvm->mmu_lock);
-	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
-		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
-			root->role.invalid = true;
+
+	list_for_each_entry_safe(root, tmp, &kvm->arch.tdp_mmu_roots, link) {
+		if (!kvm_tdp_mmu_get_root(root))
+			continue;
+
+		list_move_tail(&root->link, invalidated_roots);
+
+		WARN_ON_ONCE(root->role.invalid);
+		root->role.invalid = true;
+	}
 }
 
 /*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 599714de67c3..ced6d8e47362 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -9,7 +9,13 @@  hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
 
 __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
 {
-	if (root->role.invalid)
+	/*
+	 * Acquiring a reference on an invalid root is a KVM bug.  Invalid roots
+	 * are supposed to be reachable only by references that were acquired
+	 * before the invalidation, and taking an additional reference to an
+	 * invalid root is not allowed.
+	 */
+	if (WARN_ON_ONCE(root->role.invalid))
 		return false;
 
 	return refcount_inc_not_zero(&root->tdp_mmu_root_count);
@@ -44,8 +50,10 @@  static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 }
 
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
-void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm,
+				      struct list_head *invalidated_roots);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm,
+				       struct list_head *invalidated_roots);
 
 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);