
KVM: Move gfn_to_pfn_cache invalidation to invalidate_range_end hook

Message ID f862cefff2ed3f4211b69d785670f41667703cf3.camel@infradead.org (mailing list archive)
State New, archived
Series KVM: Move gfn_to_pfn_cache invalidation to invalidate_range_end hook

Commit Message

David Woodhouse Aug. 5, 2024, 11:08 a.m. UTC
From: David Woodhouse <dwmw@amazon.co.uk>

The existing retry loop in hva_to_pfn_retry() is extremely pessimistic.
If there is an invalidation running concurrently, it is effectively just
a complex busy wait loop because its local mmu_notifier_retry_cache()
function will always return true.

It ends up functioning as a very unfair read/write lock. If userspace is
acting as a 'writer', performing many unrelated MM changes, then the
hva_to_pfn_retry() function acting as the 'reader' just backs off and
keeps retrying for ever, not making any progress.

Solve this by introducing a separate 'validating' flag to the GPC, so
that it can be marked invalid before it's even mapped. This allows the
invalidation to be moved to the range_end hook, and the retry loop in
hva_to_pfn_retry() can be changed to loop only if its particular uHVA
has been affected.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
I note I'm deleting a big comment in kvm_main.c about doing the
invalidation before acquiring mmu_lock. But we don't hold the lock
in the range_end callback either, do we?
 
 include/linux/kvm_types.h |  1 +
 virt/kvm/kvm_main.c       | 14 ++------
 virt/kvm/kvm_mm.h         | 12 +++----
 virt/kvm/pfncache.c       | 75 +++++++++++++++++++--------------------
 4 files changed, 45 insertions(+), 57 deletions(-)

Comments

Sean Christopherson Aug. 6, 2024, 12:45 a.m. UTC | #1
On Mon, Aug 05, 2024, David Woodhouse wrote:
> From: David Woodhouse <dwmw@amazon.co.uk>
> 
> The existing retry loop in hva_to_pfn_retry() is extremely pessimistic.
> If there is an invalidation running concurrently, it is effectively just
> a complex busy wait loop because its local mmu_notifier_retry_cache()
> function will always return true.
> 
> It ends up functioning as a very unfair read/write lock. If userspace is
> acting as a 'writer', performing many unrelated MM changes, then the
> hva_to_pfn_retry() function acting as the 'reader' just backs off and
> keeps retrying for ever, not making any progress.
> 
> Solve this by introducing a separate 'validating' flag to the GPC, so
> that it can be marked invalid before it's even mapped. This allows the
> invalidation to be moved to the range_end hook, and the retry loop in
> hva_to_pfn_retry() can be changed to loop only if its particular uHVA
> has been affected.

I think I'm missing something.  How does letting hva_to_pfn_retry() make
progress allow KVM as a whole to make forward progress?  Doesn't getting past
hva_to_pfn_retry() just move the problem to kvm_gpc_check()?

kvm_gpc_refresh() can't be called with gpc->lock held, and nor does it return
with gpc->lock held, so a racing mmu_notifier invalidation can/will acquire
gpc->lock and clear gpc->active, no?
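
For reference, the usual consumer pattern is to check under the read lock and
refresh with the lock dropped, roughly like this (simplified sketch; the
wrapper name and error handling are invented for illustration):

static void example_use_gpc(struct gfn_to_pfn_cache *gpc)
{
	unsigned long flags;

	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
		read_unlock_irqrestore(&gpc->lock, flags);

		/* Refresh runs without gpc->lock held and may sleep. */
		if (kvm_gpc_refresh(gpc, PAGE_SIZE))
			return;

		read_lock_irqsave(&gpc->lock, flags);
	}

	/* gpc->khva may only be dereferenced while the lock is held. */

	read_unlock_irqrestore(&gpc->lock, flags);
}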

Oh, by "unrelated", you mean _completely_ unrelated?  As in, KVM happens to do a
refresh when userspace is blasting MADV_DONTNEED, and gets stuck retrying for
no good reason?

Servicing guest page faults has the same problem, which is why
mmu_invalidate_retry_gfn() was added.  Supporting hva-only GPCs made our lives a
little harder, but not horrifically so (there are ordering differences regardless).

Woefully incomplete, but I think this is the gist of what you want:

diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index f0039efb9e1e..1c4c95ab7d0a 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -28,6 +28,26 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
        struct gfn_to_pfn_cache *gpc;
 
        spin_lock(&kvm->gpc_lock);
+
+       if (likely(kvm_is_error_hva(kvm->mmu_gpc_invalidate_range_start))) {
+               kvm->mmu_gpc_invalidate_range_start = start;
+               kvm->mmu_gpc_invalidate_range_end = end;
+       } else {
+               /*
+                * Fully tracking multiple concurrent ranges has diminishing
+                * returns. Keep things simple and just find the minimal range
+                * which includes the current and new ranges. As there won't be
+                * enough information to subtract a range after its invalidate
+                * completes, any ranges invalidated concurrently will
+                * accumulate and persist until all outstanding invalidates
+                * complete.
+                */
+               kvm->mmu_gpc_invalidate_range_start =
+                       min(kvm->mmu_gpc_invalidate_range_start, start);
+               kvm->mmu_gpc_invalidate_range_end =
+                       max(kvm->mmu_gpc_invalidate_range_end, end);
+       }
+
        list_for_each_entry(gpc, &kvm->gpc_list, list) {
                read_lock_irq(&gpc->lock);
 
@@ -124,8 +144,11 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 #endif
 }
 
-static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
+static inline bool mmu_notifier_retry_cache(struct gfn_to_pfn_cache *gpc,
+                                           unsigned long mmu_seq)
 {
+       struct kvm *kvm = gpc->kvm;
+
        /*
         * mn_active_invalidate_count acts for all intents and purposes
         * like mmu_invalidate_in_progress here; but the latter cannot
@@ -138,7 +161,9 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s
         * be elevated before the mmu_notifier acquires gpc->lock, and
         * isn't dropped until after mmu_invalidate_seq is updated.
         */
-       if (kvm->mn_active_invalidate_count)
+       if (kvm->mn_active_invalidate_count &&
+           gpc->uhva >= kvm->mmu_gpc_invalidate_range_start &&
+           gpc->uhva < kvm->mmu_gpc_invalidate_range_end)
                return true;
 
        /*
@@ -224,7 +249,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
                 * attempting to refresh.
                 */
                WARN_ON_ONCE(gpc->valid);
-       } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
+       } while (mmu_notifier_retry_cache(gpc, mmu_seq));
 
        gpc->valid = true;
        gpc->pfn = new_pfn;

> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> ---
> I note I'm deleting a big comment in kvm_main.c about doing the
> invalidation before acquiring mmu_lock. But we don't hold the lock
> in the range_end callback either, do we?

Correct, __kvm_handle_hva_range() acquires and releases mmu_lock.  However, the
intent of the comment was to clarify why GPCs are invalidated in
kvm_mmu_notifier_invalidate_range_start(), as opposed to kvm_mmu_invalidate_begin()
which _is_ called under mmu_lock and is also called if and only if KVM has a
relevant memslot.  E.g. that's why the comment also talks about memslot overlap
checks.

>  
>  include/linux/kvm_types.h |  1 +
>  virt/kvm/kvm_main.c       | 14 ++------
>  virt/kvm/kvm_mm.h         | 12 +++----
>  virt/kvm/pfncache.c       | 75 +++++++++++++++++++--------------------
>  4 files changed, 45 insertions(+), 57 deletions(-)
> 
> diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
> index 827ecc0b7e10..30ed1019cfc6 100644
> --- a/include/linux/kvm_types.h
> +++ b/include/linux/kvm_types.h
> @@ -69,6 +69,7 @@ struct gfn_to_pfn_cache {
>  	void *khva;
>  	kvm_pfn_t pfn;
>  	bool active;
> +	bool validating;

This is a confusing name, partly because KVM usually deals with invalidation
events, but also because it's sticky and stays set long after the act of
validating the GPC is complete.

Something like "needs_invalidation" is the best I can come up with, but I believe
this bikeshed is moot (see above and below).

>  	bool valid;
>  };
>  
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d0788d0a72cc..ffd6ab4c2a16 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -777,18 +777,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
>  	kvm->mn_active_invalidate_count++;
>  	spin_unlock(&kvm->mn_invalidate_lock);
>  
> -	/*
> -	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
> -	 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
> -	 * each cache's lock.  There are relatively few caches in existence at
> -	 * any given time, and the caches themselves can check for hva overlap,
> -	 * i.e. don't need to rely on memslot overlap checks for performance.
> -	 * Because this runs without holding mmu_lock, the pfn caches must use
> -	 * mn_active_invalidate_count (see above) instead of
> -	 * mmu_invalidate_in_progress.
> -	 */
> -	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
> -
>  	/*
>  	 * If one or more memslots were found and thus zapped, notify arch code
>  	 * that guest memory has been reclaimed.  This needs to be done *after*
> @@ -849,6 +837,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
>  	wake = !kvm->mn_active_invalidate_count;
>  	spin_unlock(&kvm->mn_invalidate_lock);
>  
> +	gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);

We can't do this.  The contract with mmu_notifiers is that secondary MMUs must
unmap the hva before returning from invalidate_range_start(), and must not create
new mappings until invalidate_range_end().
David Woodhouse Aug. 6, 2024, 9:06 a.m. UTC | #2
On Mon, 2024-08-05 at 17:45 -0700, Sean Christopherson wrote:
> On Mon, Aug 05, 2024, David Woodhouse wrote:
> > From: David Woodhouse <dwmw@amazon.co.uk>
> > 
> > The existing retry loop in hva_to_pfn_retry() is extremely pessimistic.
> > If there is an invalidation running concurrently, it is effectively just
> > a complex busy wait loop because its local mmu_notifier_retry_cache()
> > function will always return true.
> > 
> > It ends up functioning as a very unfair read/write lock. If userspace is
> > acting as a 'writer', performing many unrelated MM changes, then the
> > hva_to_pfn_retry() function acting as the 'reader' just backs off and
> > keeps retrying for ever, not making any progress.
> > 
> > Solve this by introducing a separate 'validating' flag to the GPC, so
> > that it can be marked invalid before it's even mapped. This allows the
> > invalidation to be moved to the range_end hook, and the retry loop in
> > hva_to_pfn_retry() can be changed to loop only if its particular uHVA
> > has been affected.
> 
> I think I'm missing something.  How does letting hva_to_pfn_retry() make
> progress allow KVM as a whole to make forward progress?  Doesn't getting past
> hva_to_pfn_retry() just move the problem to kvm_gpc_check()?
> 
> kvm_gpc_refresh() can't be called with gpc->lock held, and nor does it return
> with gpc->lock held, so a racing mmu_notifier invalidation can/will acquire
> gpc->lock and clear gpc->active, no?
> 
> Oh, by "unrelated", you mean _completely_ unrelated?  As in, KVM happens to do a
> refresh when userspace is blasting MADV_DONTNEED, and gets stuck retrying for
> no good reason?

Right. And again, I have no idea why userspace is doing that, and we
need to make it stop. But that's not really the point; the kernel
should be able to make progress anyway.

> Servicing guest page faults has the same problem, which is why
> mmu_invalidate_retry_gfn() was added.  Supporting hva-only GPCs made our lives a
> little harder, but not horrifically so (there are ordering differences regardless).
> 
> Woefully incomplete, but I think this is the gist of what you want:

Hm, maybe. It does mean that migration occurring all through memory
(indeed, just one at top and bottom of guest memory space) would
perturb GPCs which remain present.

> > @@ -849,6 +837,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> >         wake = !kvm->mn_active_invalidate_count;
> >         spin_unlock(&kvm->mn_invalidate_lock);
> >  
> > +       gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
> 
> We can't do this.  The contract with mmu_notifiers is that secondary MMUs must
> unmap the hva before returning from invalidate_range_start(), and must not create
> new mappings until invalidate_range_end().

But in the context of the GPC, it is only "mapped" when the ->valid bit is set. 

Even the invalidation callback just clears the valid bit, and that
means nobody is allowed to dereference the ->khva any more. It doesn't
matter that the underlying (stale) PFN is still kmapped.

Can we not apply the same logic to the hva_to_pfn_retry() loop? Yes, it
might kmap a page that gets removed, but it's not actually created a
new mapping if it hasn't set the ->valid bit.

I don't think this version quite meets the constraints, and I might
need to hook *both* the start and end notifiers, and might not like it
once I get there. But I'll have a go...
Sean Christopherson Aug. 6, 2024, 2:03 p.m. UTC | #3
On Tue, Aug 06, 2024, David Woodhouse wrote:
> On Mon, 2024-08-05 at 17:45 -0700, Sean Christopherson wrote:
> > On Mon, Aug 05, 2024, David Woodhouse wrote:
> > > From: David Woodhouse <dwmw@amazon.co.uk>
> > Servicing guest page faults has the same problem, which is why
> > mmu_invalidate_retry_gfn() was added.  Supporting hva-only GPCs made our lives a
> > little harder, but not horrifically so (there are ordering differences regardless).
> > 
> > Woefully incomplete, but I think this is the gist of what you want:
> 
> Hm, maybe. It does mean that migration occurring all through memory
> (indeed, just one at top and bottom of guest memory space) would
> perturb GPCs which remain present.

If that happens with a real world VMM, and it's not a blatant VMM goof, then we
can fix KVM.  The stage-2 page fault path hammers the mmu_notifier retry logic
far more than GPCs, so if a range-based check is inadequate for some use case,
then we definitely need to fix both.

In short, I don't see any reason to invent something different for GPCs.

> > > @@ -849,6 +837,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> > >         wake = !kvm->mn_active_invalidate_count;
> > >         spin_unlock(&kvm->mn_invalidate_lock);
> > >  
> > > +       gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
> > 
> > We can't do this.  The contract with mmu_notifiers is that secondary MMUs must
> > unmap the hva before returning from invalidate_range_start(), and must not create
> > new mappings until invalidate_range_end().
> 
> But in the context of the GPC, it is only "mapped" when the ->valid bit is set. 
> 
> Even the invalidation callback just clears the valid bit, and that
> means nobody is allowed to dereference the ->khva any more. It doesn't
> matter that the underlying (stale) PFN is still kmapped.
> 
> Can we not apply the same logic to the hva_to_pfn_retry() loop? Yes, it
> might kmap a page that gets removed, but it's not actually created a
> new mapping if it hasn't set the ->valid bit.
> 
> I don't think this version quite meets the constraints, and I might
> need to hook *both* the start and end notifiers, and might not like it
> once I get there. But I'll have a go...

I'm pretty sure you're going to need the range-based retry logic.  KVM can't
safely set gpc->valid until mn_active_invalidate_count reaches zero, so if a GPC
refresh comes along after mn_active_invalidate_count has been elevated, it won't
be able to set gpc->valid until the MADV_DONTNEED storm goes away.  Without
range-based tracking, there's no way to know if a previous invalidation was
relevant to the GPC.
David Woodhouse Aug. 6, 2024, 2:24 p.m. UTC | #4
On Tue, 2024-08-06 at 07:03 -0700, Sean Christopherson wrote:
> On Tue, Aug 06, 2024, David Woodhouse wrote:
> > On Mon, 2024-08-05 at 17:45 -0700, Sean Christopherson wrote:
> > > On Mon, Aug 05, 2024, David Woodhouse wrote:
> > > > From: David Woodhouse <dwmw@amazon.co.uk>
> > > Servicing guest page faults has the same problem, which is why
> > > mmu_invalidate_retry_gfn() was added.  Supporting hva-only GPCs made our lives a
> > > little harder, but not horrifically so (there are ordering differences regardless).
> > > 
> > > Woefully incomplete, but I think this is the gist of what you want:
> > 
> > Hm, maybe. It does mean that migration occurring all through memory
> > (indeed, just one at top and bottom of guest memory space) would
> > perturb GPCs which remain present.
> 
> If that happens with a real world VMM, and it's not a blatant VMM goof, then we
> can fix KVM.  The stage-2 page fault path hammers the mmu_notifier retry logic
> far more than GPCs, so if a range-based check is inadequate for some use case,
> then we definitely need to fix both.
> 
> In short, I don't see any reason to invent something different for GPCs.
> 
> > > > @@ -849,6 +837,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> > > >         wake = !kvm->mn_active_invalidate_count;
> > > >         spin_unlock(&kvm->mn_invalidate_lock);
> > > >  
> > > > +       gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
> > > 
> > > We can't do this.  The contract with mmu_notifiers is that secondary MMUs must
> > > unmap the hva before returning from invalidate_range_start(), and must not create
> > > new mappings until invalidate_range_end().

Looking at that assertion harder... where is that rule written? It
seems counter-intuitive to me; that isn't how TLBs work. Another CPU
can populate a TLB entry right up to the moment the PTE is actually
*changed* in the page tables, and then the CPU which is
modifying/zapping the PTE needs to perform a remote TLB flush. That
remote TLB flush is analogous to the invalidate_range_end() call,
surely?

I'm fairly sure that's how it works for PASID support too; nothing
prevents the IOMMU+device from populating an IOTLB entry until the PTE
is actually changed in the process page tables.

So why can't we do the same for the GPC?

> > But in the context of the GPC, it is only "mapped" when the ->valid bit is set. 
> > 
> > Even the invalidation callback just clears the valid bit, and that
> > means nobody is allowed to dereference the ->khva any more. It doesn't
> > matter that the underlying (stale) PFN is still kmapped.
> > 
> > Can we not apply the same logic to the hva_to_pfn_retry() loop? Yes, it
> > might kmap a page that gets removed, but it's not actually created a
> > new mapping if it hasn't set the ->valid bit.
> > 
> > I don't think this version quite meets the constraints, and I might
> > need to hook *both* the start and end notifiers, and might not like it
> > once I get there. But I'll have a go...
> 
> I'm pretty sure you're going to need the range-based retry logic.  KVM can't
> safely set gpc->valid until mn_active_invalidate_count reaches zero, so if a GPC
> refresh comes along after mn_active_invalidate_count has been elevated, it won't
> be able to set gpc->valid until the MADV_DONTNEED storm goes away.  Without
> range-based tracking, there's no way to know if a previous invalidation was
> relevant to the GPC.

If it is indeed the case that KVM can't just behave like a normal TLB,
and so can't set gpc->valid until mn_active_invalidate_count reaches
zero, it still only needs to *wait* (or spin, maybe). It certainly
doesn't need to keep looping and remapping the same PFN over and over
again, as it does at the moment.

When mn_active_invalidate_count does reach zero, either the young GPC
will have been invalidated by clearing the (to be renamed) ->validating
flag, or it won't have been. If it *has* been invalidated, that's when
hva_to_pfn_retry() needs to go one more time round its full loop.

So it just needs to wait until any pending (relevant) invalidations
have completed, *then* check and potentially loop once more.

And yes, making that *wait* range-based does make some sense, I
suppose. It becomes "wait for gpc->uhva not to be within the range of
kvm->mmu_gpc_invalidate_range_{start,end}."

Except... that range can never shrink *except* when
mn_active_invalidate_count becomes zero, can it? So if we do end up
waiting, the wake condition is *still* just that the count has become
zero. There's already a wakeup in that case, on
kvm->mn_memslots_update_rcuwait. Can I wait on that?
Sean Christopherson Aug. 6, 2024, 3:57 p.m. UTC | #5
On Tue, Aug 06, 2024, David Woodhouse wrote:
> On Tue, 2024-08-06 at 07:03 -0700, Sean Christopherson wrote:
> > On Tue, Aug 06, 2024, David Woodhouse wrote:
> > > On Mon, 2024-08-05 at 17:45 -0700, Sean Christopherson wrote:
> > > > On Mon, Aug 05, 2024, David Woodhouse wrote:
> > > > > From: David Woodhouse <dwmw@amazon.co.uk>
> > > > Servicing guest page faults has the same problem, which is why
> > > > mmu_invalidate_retry_gfn() was added.  Supporting hva-only GPCs made our lives a
> > > > little harder, but not horrifically so (there are ordering differences regardless).
> > > > 
> > > > Woefully incomplete, but I think this is the gist of what you want:
> > > 
> > > Hm, maybe. It does mean that migration occurring all through memory
> > > (indeed, just one at top and bottom of guest memory space) would
> > > perturb GPCs which remain present.
> > 
> > If that happens with a real world VMM, and it's not a blatant VMM goof, then we
> > can fix KVM.  The stage-2 page fault path hammers the mmu_notifier retry logic
> > far more than GPCs, so if a range-based check is inadequate for some use case,
> > then we definitely need to fix both.
> > 
> > In short, I don't see any reason to invent something different for GPCs.
> > 
> > > > > @@ -849,6 +837,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> > > > >         wake = !kvm->mn_active_invalidate_count;
> > > > >         spin_unlock(&kvm->mn_invalidate_lock);
> > > > >  
> > > > > +       gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
> > > > 
> > > > We can't do this.  The contract with mmu_notifiers is that secondary MMUs must
> > > > unmap the hva before returning from invalidate_range_start(), and must not create
> > > > new mappings until invalidate_range_end().
> 
> Looking at that assertion harder... where is that rule written?

The big comment for invalidate_range_{start,end}() in include/linux/mmu_notifier.h.
The relevant snippets are:

	 * If the subsystem can't guarantee that no additional references are
	 * taken to the pages in the range, it has to implement the
	 * invalidate_range() notifier to remove any references taken after
	 * invalidate_range_start().

	 * invalidate_range_start() is called when all pages in the
	 * range are still mapped and have at least a refcount of one.
	 *
	 * invalidate_range_end() is called when all pages in the
	 * range have been unmapped and the pages have been freed by
	 * the VM.

The last one is key: the pages have already been freed when invalidate_range_end()
is called, and so unmapping at that time would be too late.
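
That's also why KVM's own notifier handling bumps a counter in start() and a
sequence count in end(); very roughly (simplified from kvm_main.c, with the
locking and range bookkeeping elided):

	/*
	 * invalidate_range_start(): zap the secondary mappings now, and note
	 * that an invalidation is in flight so that nothing re-establishes a
	 * mapping in the meantime.
	 */
	kvm->mmu_invalidate_in_progress++;
	/* ... record/merge the affected range, zap SPTEs ... */

	/*
	 * invalidate_range_end(): the pages may already be freed by now; bump
	 * the sequence so that anyone who snapshotted it before faulting can
	 * see that they raced with an invalidation.
	 */
	kvm->mmu_invalidate_seq++;
	smp_wmb();
	kvm->mmu_invalidate_in_progress--;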

> It seems counter-intuitive to me; that isn't how TLBs work. Another CPU can
> populate a TLB entry right up to the moment the PTE is actually *changed* in
> the page tables, and then the CPU which is modifying/zapping the PTE needs to
> perform a remote TLB flush. That remote TLB flush is analogous to the
> invalidate_range_end() call, surely?

KVM's usage isn't about (hardware) TLBs.  Ah, and the history is even somewhat
evident in the above comment I referenced.  invalidate_range() no longer exists,
it was morphed into arch_invalidate_secondary_tlbs().  For secondary MMUs that
reuse the primary MMU's PTEs, mmu_notifier_arch_invalidate_secondary_tlbs() is
indeed called after the PTEs have been modified.

KVM's usage is different.  Because KVM has its own (Secondary) PTEs (commit
1af5a8109904 ("mmu_notifiers: rename invalidate_range notifier") calls them
"software TLBs", but I find that to be a confusing description), zapping on-demand
when the primary PTEs are modified is tricky and ultimately undesirable.

E.g. invoking mmu_notifiers while holding a PTE lock would prevent KVM from
blocking, which can be problematic if KVM needs to zap a large number of SPTEs.

And doing invalidation on-demand for each primary PTE would be suboptimal for
cases where a large VMA range is unmapped/modified, e.g. KVM would get a large
number of invalidation events instead of one big, all-encompassing invalidation.

The obvious downside is what you've run into, where the start+end approach forces
KVM to wait for all in-flight invalidations to go away.  But again, in practice
the rudimentary range tracking suffices for all known use cases.

> I'm fairly sure that's how it works for PASID support too; nothing
> prevents the IOMMU+device from populating an IOTLB entry until the PTE
> is actually changed in the process page tables.
> 
> So why can't we do the same for the GPC?
> 
> > > But in the context of the GPC, it is only "mapped" when the ->valid bit is set. 
> > > 
> > > Even the invalidation callback just clears the valid bit, and that
> > > means nobody is allowed to dereference the ->khva any more. It doesn't
> > > matter that the underlying (stale) PFN is still kmapped.
> > > 
> > > Can we not apply the same logic to the hva_to_pfn_retry() loop? Yes, it
> > > might kmap a page that gets removed, but it's not actually created a
> > > new mapping if it hasn't set the ->valid bit.
> > > 
> > > I don't think this version quite meets the constraints, and I might
> > > need to hook *both* the start and end notifiers, and might not like it
> > > once I get there. But I'll have a go...
> > 
> > I'm pretty sure you're going to need the range-based retry logic.  KVM can't
> > safely set gpc->valid until mn_active_invalidate_count reaches zero, so if a GPC
> > refresh comes along after mn_active_invalidate_count has been elevated, it won't
> > be able to set gpc->valid until the MADV_DONTNEED storm goes away.  Without
> > range-based tracking, there's no way to know if a previous invalidation was
> > relevant to the GPC.
> 
> If it is indeed the case that KVM can't just behave like a normal TLB,
> and so can't set gpc->valid until mn_active_invalidate_count reaches
> zero, it still only needs to *wait* (or spin, maybe). It certainly
> doesn't need to keep looping and remapping the same PFN over and over
> again, as it does at the moment.
> 
> When mn_active_invalidate_count does reach zero, either the young GPC
> will have been invalidated by clearing the (to be renamed) ->validating
> flag, or it won't have been. If it *has* been invalidated, that's when
> hva_to_pfn_retry() needs to go one more time round its full loop.
> 
> So it just needs to wait until any pending (relevant) invalidations
> have completed, *then* check and potentially loop once more.
> 
> And yes, making that *wait* range-based does make some sense, I
> suppose. It becomes "wait for gpc->uhva not to be within the range of
> kvm->mmu_gpc_invalidate_range_{start,end}."

Yep, exactly.  Without range-based tracking, there's no way for KVM to know when
a relevant in-flight invalidation has completed.

> Except... that range can never shrink *except* when
> mn_active_invalidate_count becomes zero, can it?

Not without more sophisticated logic, no.  E.g. if KVM supported tracking multiple
distinct ranges, then individual invalidation ranges could be dropped.  But to
to avoid memory allocations in invalidate_range_start(), KVM would still need to
hardcode the maximum number of in-flight ranges.  E.g. even if KVM used a dynamic
container, we'd probably want the container entries to be "allocated" out of a
cache, and that cache would need a maximum capacity.

With a max limit on the number of ranges, KVM would still be forced to combine
ranges if there are too many in-flight invalidations.

So, because tracking a single range has sufficed for all known use cases, and
it's significantly simpler than tracking multiple ranges, AFAIK no one has pursued
a multi-range tracking implementation.
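
Purely for illustration (hypothetical, not something KVM implements), a
fixed-capacity tracker degrades to merging anyway once it runs out of slots:

#define MAX_INFLIGHT_RANGES	4	/* hypothetical hardcoded limit */

struct inflight_range {
	unsigned long start, end;
};

static struct inflight_range ranges[MAX_INFLIGHT_RANGES];
static int nr_ranges;

static void track_invalidation(unsigned long start, unsigned long end)
{
	if (nr_ranges < MAX_INFLIGHT_RANGES) {
		ranges[nr_ranges].start = start;
		ranges[nr_ranges].end = end;
		nr_ranges++;
		return;
	}

	/* Out of slots: fold into the last entry, losing precision. */
	ranges[MAX_INFLIGHT_RANGES - 1].start =
		min(ranges[MAX_INFLIGHT_RANGES - 1].start, start);
	ranges[MAX_INFLIGHT_RANGES - 1].end =
		max(ranges[MAX_INFLIGHT_RANGES - 1].end, end);
}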

> So if we do end up waiting, the wake condition is *still* just that the count
> has become zero. There's already a wakeup in that case, on
> kvm->mn_memslots_update_rcuwait. Can I wait on that?

I suspect you're trying to solve a problem that doesn't exist in practice.
hva_to_pfn_retry() already has a cond_resched(), so getting stuck for a long
duration isn't fatal, just suboptimal.  And similar to the range-based tracking,
_if_ there's a problem in practice, then it also affects guest page faults.  KVM
simply resumes the vCPU and keeps re-faulting until the in-flight invalidation(s)
has gone away.
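
For reference, the fault path pattern is roughly (heavily simplified from the
x86 code, with the exact arguments elided):

	mmu_seq = kvm->mmu_invalidate_seq;
	smp_rmb();

	/* Resolve the pfn with mmu_lock dropped; this may block. */
	pfn = __gfn_to_pfn_memslot(slot, gfn, /* ... */);

	write_lock(&kvm->mmu_lock);
	if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
		/*
		 * Raced with an invalidation of this gfn; resume the vCPU
		 * and let it fault again.
		 */
		write_unlock(&kvm->mmu_lock);
		return RET_PF_RETRY;
	}
	/* ... install the SPTE ... */
	write_unlock(&kvm->mmu_lock);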

Not without reworking mn_memslots_update_rcuwait.  KVM assumes there is at most
one waiter, as that wait+wake combination is specifically to handle the case where
a _relevant_ in-flight mmu_notifier invalidation needs to block a userspace memslot
deletion.  KVM takes mmu_lock in invalidate_range_{start,end}() if and only if
there is an overlapping memslot, and so KVM needs to prevent a memslot from being
deleted between start() and end().
David Woodhouse Aug. 6, 2024, 4:40 p.m. UTC | #6
On Tue, 2024-08-06 at 08:57 -0700, Sean Christopherson wrote:
> The last one is key: the pages have already been freed when invalidate_range_end()
> is called, and so unmapping at that time would be too late.

OK, thanks.

> The obvious downside is what you've run into, where the start+end approach forces
> KVM to wait for all in-flight invalidations to go away.  But again, in practice
> the rudimentary range tracking suffices for all known use cases.

Makes sense.

> I suspect you're trying to solve a problem that doesn't exist in practice.
> hva_to_pfn_retry() already has a cond_resched(), so getting stuck for a long
> duration isn't fatal, just suboptimal.  And similar to the range-based tracking,
> _if_ there's a problem in practice, then it also affects guest page faults.  KVM
> simply resumes the vCPU and keeps re-faulting until the in-flight invalidation(s)
> has gone away.

Yeah. When it's looping on actual page faults it's easy to pretend it
isn't a busy-loop :)

Making it wait is fairly simple; it would be easy to just make it
cond_resched() instead though. Here's what I have so far (incremental).

https://git.infradead.org/?p=users/dwmw2/linux.git;a=commitdiff;h=pfncache-2

I need to revive the torture test you had at the end of your earlier series.

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 689e8be873a7..9b5261e11802 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -770,6 +770,9 @@ struct kvm {
 	/* For management / invalidation of gfn_to_pfn_caches */
 	spinlock_t gpc_lock;
 	struct list_head gpc_list;
+	u64 mmu_gpc_invalidate_range_start;
+	u64 mmu_gpc_invalidate_range_end;
+	wait_queue_head_t gpc_invalidate_wq;
 
 	/*
 	 * created_vcpus is protected by kvm->lock, and is incremented
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ffd6ab4c2a16..a243f9f8a9c0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -775,6 +775,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 */
 	spin_lock(&kvm->mn_invalidate_lock);
 	kvm->mn_active_invalidate_count++;
+	if (likely(kvm->mn_active_invalidate_count == 1)) {
+		kvm->mmu_gpc_invalidate_range_start = range->start;
+		kvm->mmu_gpc_invalidate_range_end = range->end;
+	} else {
+		/*
+		 * Fully tracking multiple concurrent ranges has diminishing
+		 * returns. Keep things simple and just find the minimal range
+		 * which includes the current and new ranges. As there won't be
+		 * enough information to subtract a range after its invalidate
+		 * completes, any ranges invalidated concurrently will
+		 * accumulate and persist until all outstanding invalidates
+		 * complete.
+		 */
+		kvm->mmu_gpc_invalidate_range_start =
+			min(kvm->mmu_gpc_invalidate_range_start, range->start);
+		kvm->mmu_gpc_invalidate_range_end =
+			max(kvm->mmu_gpc_invalidate_range_end, range->end);
+	}
 	spin_unlock(&kvm->mn_invalidate_lock);
 
 	/*
@@ -830,21 +848,27 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 
 	__kvm_handle_hva_range(kvm, &hva_range);
 
+	gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
+
 	/* Pairs with the increment in range_start(). */
 	spin_lock(&kvm->mn_invalidate_lock);
 	if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
 		--kvm->mn_active_invalidate_count;
 	wake = !kvm->mn_active_invalidate_count;
+	if (wake) {
+		kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+		kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
+	}
 	spin_unlock(&kvm->mn_invalidate_lock);
 
-	gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
-
 	/*
 	 * There can only be one waiter, since the wait happens under
 	 * slots_lock.
 	 */
-	if (wake)
+	if (wake) {
+		wake_up(&kvm->gpc_invalidate_wq);
 		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
+	}
 }
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
@@ -1154,7 +1178,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 
 	INIT_LIST_HEAD(&kvm->gpc_list);
 	spin_lock_init(&kvm->gpc_lock);
-
+	init_waitqueue_head(&kvm->gpc_invalidate_wq);
+	kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+	kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
 	INIT_LIST_HEAD(&kvm->devices);
 	kvm->max_vcpus = KVM_MAX_VCPUS;
 
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 187b58150ef7..dad32ef5ac87 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -133,6 +133,39 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 #endif
 }
 
+static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
+{
+	/*
+	 * No need for locking on GPC here because these fields are protected
+	 * by gpc->refresh_lock.
+	 */
+	return unlikely(gpc->kvm->mn_active_invalidate_count) &&
+		(gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
+		(gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
+}
+
+static void gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc)
+{
+	spin_lock(&gpc->kvm->mn_invalidate_lock);
+	if (gpc_invalidations_pending(gpc)) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		for (;;) {
+			prepare_to_wait(&gpc->kvm->gpc_invalidate_wq, &wait,
+					TASK_UNINTERRUPTIBLE);
+
+			if (!gpc_invalidations_pending(gpc))
+				break;
+
+			spin_unlock(&gpc->kvm->mn_invalidate_lock);
+			schedule();
+			spin_lock(&gpc->kvm->mn_invalidate_lock);
+		}
+		finish_wait(&gpc->kvm->gpc_invalidate_wq, &wait);
+	}
+	spin_unlock(&gpc->kvm->mn_invalidate_lock);
+}
+
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 {
 	/* Note, the new page offset may be different than the old! */
@@ -205,6 +238,15 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 			goto out_error;
 		}
 
+		/*
+		 * Avoid populating a GPC with a user address which is already
+		 * being invalidated, if the invalidate_range_start() notifier
+		 * has already been called. The actual invalidation happens in
+		 * the invalidate_range_end() callback, so wait until there are
+		 * no active invalidations (for the relevant HVA).
+		 */
+		gpc_wait_for_invalidations(gpc);
+
 		write_lock_irq(&gpc->lock);
 
 		/*
David Woodhouse Aug. 22, 2024, 9 a.m. UTC | #7
On Tue, 2024-08-06 at 07:03 -0700, Sean Christopherson wrote:
> On Tue, Aug 06, 2024, David Woodhouse wrote:
> > On Mon, 2024-08-05 at 17:45 -0700, Sean Christopherson wrote:
> > > On Mon, Aug 05, 2024, David Woodhouse wrote:
> > > > From: David Woodhouse <dwmw@amazon.co.uk>
> > > Servicing guest page faults has the same problem, which is why
> > > mmu_invalidate_retry_gfn() was added.  Supporting hva-only GPCs made our lives a
> > > little harder, but not horrifically so (there are ordering differences regardless).
> > > 
> > > Woefully incomplete, but I think this is the gist of what you want:
> > 
> > Hm, maybe. It does mean that migration occurring all through memory
> > (indeed, just one at top and bottom of guest memory space) would
> > perturb GPCs which remain present.
> 
> If that happens with a real world VMM, and it's not a blatant VMM goof, then we
> can fix KVM.  The stage-2 page fault path hammers the mmu_notifier retry logic
> far more than GPCs, so if a range-based check is inadequate for some use case,
> then we definitely need to fix both.

FWIW this turned out to be because someone used C++ where they
shouldn't have done. Opinion is divided on whether to call that a
'blatant VMM goof'.

But if you use std::vector for something that's modified that many
times a second, and free() implicitly calls madvise(MADV_DONTNEED)
every time, you get what you deserve :)

We should still fix it in the kernel.

Patch

diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 827ecc0b7e10..30ed1019cfc6 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -69,6 +69,7 @@  struct gfn_to_pfn_cache {
 	void *khva;
 	kvm_pfn_t pfn;
 	bool active;
+	bool validating;
 	bool valid;
 };
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d0788d0a72cc..ffd6ab4c2a16 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -777,18 +777,6 @@  static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	kvm->mn_active_invalidate_count++;
 	spin_unlock(&kvm->mn_invalidate_lock);
 
-	/*
-	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
-	 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
-	 * each cache's lock.  There are relatively few caches in existence at
-	 * any given time, and the caches themselves can check for hva overlap,
-	 * i.e. don't need to rely on memslot overlap checks for performance.
-	 * Because this runs without holding mmu_lock, the pfn caches must use
-	 * mn_active_invalidate_count (see above) instead of
-	 * mmu_invalidate_in_progress.
-	 */
-	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
-
 	/*
 	 * If one or more memslots were found and thus zapped, notify arch code
 	 * that guest memory has been reclaimed.  This needs to be done *after*
@@ -849,6 +837,8 @@  static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	wake = !kvm->mn_active_invalidate_count;
 	spin_unlock(&kvm->mn_invalidate_lock);
 
+	gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
+
 	/*
 	 * There can only be one waiter, since the wait happens under
 	 * slots_lock.
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 715f19669d01..34e4e67f09f8 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -24,13 +24,13 @@  kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
 		     bool *async, bool write_fault, bool *writable);
 
 #ifdef CONFIG_HAVE_KVM_PFNCACHE
-void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
-				       unsigned long start,
-				       unsigned long end);
+void gfn_to_pfn_cache_invalidate(struct kvm *kvm,
+				 unsigned long start,
+				 unsigned long end);
 #else
-static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
-						     unsigned long start,
-						     unsigned long end)
+static inline void gfn_to_pfn_cache_invalidate(struct kvm *kvm,
+					       unsigned long start,
+					       unsigned long end)
 {
 }
 #endif /* HAVE_KVM_PFNCACHE */
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index f0039efb9e1e..187b58150ef7 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -20,10 +20,14 @@ 
 #include "kvm_mm.h"
 
 /*
- * MMU notifier 'invalidate_range_start' hook.
+ * MMU notifier 'invalidate_range_end' hook. The hva_to_pfn_retry() function
+ * below may look up a PFN just before it is zapped, and may be mapping it
+ * concurrently (with the GPC lock dropped). By using a separate 'validating'
+ * flag, the invalidation can occur concurrently, causing hva_to_pfn_retry()
+ * to drop its result and retry correctly.
  */
-void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
-				       unsigned long end)
+void gfn_to_pfn_cache_invalidate(struct kvm *kvm, unsigned long start,
+				 unsigned long end)
 {
 	struct gfn_to_pfn_cache *gpc;
 
@@ -32,7 +36,7 @@  void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
 		read_lock_irq(&gpc->lock);
 
 		/* Only a single page so no need to care about length */
-		if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+		if (gpc->validating && !is_error_noslot_pfn(gpc->pfn) &&
 		    gpc->uhva >= start && gpc->uhva < end) {
 			read_unlock_irq(&gpc->lock);
 
@@ -45,9 +49,11 @@  void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
 			 */
 
 			write_lock_irq(&gpc->lock);
-			if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
-			    gpc->uhva >= start && gpc->uhva < end)
+			if (gpc->validating && !is_error_noslot_pfn(gpc->pfn) &&
+			    gpc->uhva >= start && gpc->uhva < end) {
+				gpc->validating = false;
 				gpc->valid = false;
+			}
 			write_unlock_irq(&gpc->lock);
 			continue;
 		}
@@ -93,6 +99,9 @@  bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
 	if (!gpc->valid)
 		return false;
 
+	/* It can never be valid unless it was once validating! */
+	WARN_ON_ONCE(!gpc->validating);
+
 	return true;
 }
 
@@ -124,41 +133,12 @@  static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 #endif
 }
 
-static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
-{
-	/*
-	 * mn_active_invalidate_count acts for all intents and purposes
-	 * like mmu_invalidate_in_progress here; but the latter cannot
-	 * be used here because the invalidation of caches in the
-	 * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
-	 * is elevated.
-	 *
-	 * Note, it does not matter that mn_active_invalidate_count
-	 * is not protected by gpc->lock.  It is guaranteed to
-	 * be elevated before the mmu_notifier acquires gpc->lock, and
-	 * isn't dropped until after mmu_invalidate_seq is updated.
-	 */
-	if (kvm->mn_active_invalidate_count)
-		return true;
-
-	/*
-	 * Ensure mn_active_invalidate_count is read before
-	 * mmu_invalidate_seq.  This pairs with the smp_wmb() in
-	 * mmu_notifier_invalidate_range_end() to guarantee either the
-	 * old (non-zero) value of mn_active_invalidate_count or the
-	 * new (incremented) value of mmu_invalidate_seq is observed.
-	 */
-	smp_rmb();
-	return kvm->mmu_invalidate_seq != mmu_seq;
-}
-
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 {
 	/* Note, the new page offset may be different than the old! */
 	void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
 	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
 	void *new_khva = NULL;
-	unsigned long mmu_seq;
 
 	lockdep_assert_held(&gpc->refresh_lock);
 
@@ -172,8 +152,16 @@  static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 	gpc->valid = false;
 
 	do {
-		mmu_seq = gpc->kvm->mmu_invalidate_seq;
-		smp_rmb();
+		/*
+		 * The translation made by hva_to_pfn() below could be made
+		 * invalid as soon as it's mapped. But the uhva is already
+		 * known and that's all that gfn_to_pfn_cache_invalidate()
+		 * looks at. So set the 'validating' flag to allow the GPC
+		 * to be marked invalid from the moment the lock is dropped,
+		 * before the corresponding PFN is even found (and, more to
+		 * the point, immediately afterwards).
+		 */
+		gpc->validating = true;
 
 		write_unlock_irq(&gpc->lock);
 
@@ -224,7 +212,14 @@  static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 		 * attempting to refresh.
 		 */
 		WARN_ON_ONCE(gpc->valid);
-	} while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
+
+		/*
+		 * Since gfn_to_pfn_cache_invalidate() is called from the
+		 * kvm_mmu_notifier_invalidate_range_end() callback, it can
+		 * invalidate the GPC the moment after hva_to_pfn() returned
+		 * a valid PFN. If that happens, retry.
+		 */
+	} while (!gpc->validating);
 
 	gpc->valid = true;
 	gpc->pfn = new_pfn;
@@ -339,6 +334,7 @@  static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l
 	 */
 	if (ret) {
 		gpc->valid = false;
+		gpc->validating = false;
 		gpc->pfn = KVM_PFN_ERR_FAULT;
 		gpc->khva = NULL;
 	}
@@ -383,7 +379,7 @@  void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
 	gpc->pfn = KVM_PFN_ERR_FAULT;
 	gpc->gpa = INVALID_GPA;
 	gpc->uhva = KVM_HVA_ERR_BAD;
-	gpc->active = gpc->valid = false;
+	gpc->active = gpc->valid = gpc->validating = false;
 }
 
 static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva,
@@ -453,6 +449,7 @@  void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
 		write_lock_irq(&gpc->lock);
 		gpc->active = false;
 		gpc->valid = false;
+		gpc->validating = false;
 
 		/*
 		 * Leave the GPA => uHVA cache intact, it's protected by the