diff mbox series

[RFC,v3,04/19] KVM: x86: mmu: allow to enable write tracking externally

Message ID 20220427200314.276673-5-mlevitsk@redhat.com (mailing list archive)
State New, archived
Headers show
Series RFC: nested AVIC | expand

Commit Message

Maxim Levitsky April 27, 2022, 8:02 p.m. UTC
This will be used to enable write tracking from nested AVIC code
and can also be used to enable write tracking in GVT-g module
when it actually uses it as opposed to always enabling it,
when the module is compiled in the kernel.

No functional change intended.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/include/asm/kvm_host.h       |  2 +-
 arch/x86/include/asm/kvm_page_track.h |  1 +
 arch/x86/kvm/mmu.h                    |  8 +++++---
 arch/x86/kvm/mmu/mmu.c                | 17 ++++++++++-------
 arch/x86/kvm/mmu/page_track.c         | 10 ++++++++--
 5 files changed, 25 insertions(+), 13 deletions(-)

Comments

Sean Christopherson May 19, 2022, 4:27 p.m. UTC | #1
On Wed, Apr 27, 2022, Maxim Levitsky wrote:
> This will be used to enable write tracking from nested AVIC code
> and can also be used to enable write tracking in GVT-g module
> when it actually uses it as opposed to always enabling it,
> when the module is compiled in the kernel.

Wrap at ~75.

> No functional change intended.
> 
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h       |  2 +-
>  arch/x86/include/asm/kvm_page_track.h |  1 +
>  arch/x86/kvm/mmu.h                    |  8 +++++---
>  arch/x86/kvm/mmu/mmu.c                | 17 ++++++++++-------
>  arch/x86/kvm/mmu/page_track.c         | 10 ++++++++--
>  5 files changed, 25 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 636df87542555..fc7df778a3d71 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1254,7 +1254,7 @@ struct kvm_arch {
>  	 * is used as one input when determining whether certain memslot
>  	 * related allocations are necessary.
>  	 */

The above comment needs to be rewritten.

> -	bool shadow_root_allocated;
> +	bool mmu_page_tracking_enabled;
>  #if IS_ENABLED(CONFIG_HYPERV)
>  	hpa_t	hv_root_tdp;
> diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
> index eb186bc57f6a9..955a5ae07b10e 100644
> --- a/arch/x86/include/asm/kvm_page_track.h
> +++ b/arch/x86/include/asm/kvm_page_track.h
> @@ -50,6 +50,7 @@ int kvm_page_track_init(struct kvm *kvm);
>  void kvm_page_track_cleanup(struct kvm *kvm);
>  
>  bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
> +int kvm_page_track_write_tracking_enable(struct kvm *kvm);
>  int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
>  
>  void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 671cfeccf04e9..44d15551f7156 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -269,7 +269,7 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>  int kvm_mmu_post_init_vm(struct kvm *kvm);
>  void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
>  
> -static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
> +static inline bool mmu_page_tracking_enabled(struct kvm *kvm)
>  {
>  	/*
>  	 * Read shadow_root_allocated before related pointers. Hence, threads
> @@ -277,9 +277,11 @@ static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
>  	 * see the pointers. Pairs with smp_store_release in
>  	 * mmu_first_shadow_root_alloc.
>  	 */

This comment also needs to be rewritten.

> -	return smp_load_acquire(&kvm->arch.shadow_root_allocated);
> +	return smp_load_acquire(&kvm->arch.mmu_page_tracking_enabled);
>  }

...

> diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
> index 2e09d1b6249f3..8857d629036d7 100644
> --- a/arch/x86/kvm/mmu/page_track.c
> +++ b/arch/x86/kvm/mmu/page_track.c
> @@ -21,10 +21,16 @@
>  
>  bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)

This can be static, it's now used only by page_track.c.

>  {
> -	return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
> -	       !tdp_enabled || kvm_shadow_root_allocated(kvm);
> +	return mmu_page_tracking_enabled(kvm);
>  }
>  
> +int kvm_page_track_write_tracking_enable(struct kvm *kvm)

This is too similar to the "enabled" version; "kvm_page_track_enable_write_tracking()"
would maintain namespacing and be less confusing.

Hmm, I'd probably vote to make this a "static inline" in kvm_page_track.h, and
rename mmu_enable_write_tracking() to kvm_mmu_enable_write_tracking and export.
Not a strong preference, just feels silly to export a one-liner.

> +{
> +	return mmu_enable_write_tracking(kvm);
> +}
> +EXPORT_SYMBOL_GPL(kvm_page_track_write_tracking_enable);
> +
> +
>  void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
>  {
>  	int i;
> -- 
> 2.26.3
>
Sean Christopherson May 19, 2022, 4:37 p.m. UTC | #2
On Wed, Apr 27, 2022, Maxim Levitsky wrote:
> @@ -5753,6 +5752,10 @@ int kvm_mmu_init_vm(struct kvm *kvm)
>  	node->track_write = kvm_mmu_pte_write;
>  	node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
>  	kvm_page_track_register_notifier(kvm, node);

Can you add a patch to move this call to kvm_page_track_register_notifier() into
mmu_enable_write_tracking(), and simultaneously add a WARN in the register path
that page tracking is enabled?

Oh, actually, a better idea. Add an inner __kvm_page_track_register_notifier()
that is not exported and thus used only by KVM, invoke mmu_enable_write_tracking()
from the exported kvm_page_track_register_notifier(), and then do the above.
That will require modifying KVMGT and KVM in a single patch, but that's ok.

That will avoid any possibility of an external user failing to enabling tracking
before registering its notifier, and also avoids bikeshedding over what to do with
the one-line wrapper to enable tracking.
Maxim Levitsky May 22, 2022, 10:21 a.m. UTC | #3
On Thu, 2022-05-19 at 16:27 +0000, Sean Christopherson wrote:
> On Wed, Apr 27, 2022, Maxim Levitsky wrote:
> > This will be used to enable write tracking from nested AVIC code
> > and can also be used to enable write tracking in GVT-g module
> > when it actually uses it as opposed to always enabling it,
> > when the module is compiled in the kernel.
> 
> Wrap at ~75.
Well, the checkpatch.pl didn't complain, so I didn't notice.

> 
> > No functional change intended.
> > 
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h       |  2 +-
> >  arch/x86/include/asm/kvm_page_track.h |  1 +
> >  arch/x86/kvm/mmu.h                    |  8 +++++---
> >  arch/x86/kvm/mmu/mmu.c                | 17 ++++++++++-------
> >  arch/x86/kvm/mmu/page_track.c         | 10 ++++++++--
> >  5 files changed, 25 insertions(+), 13 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 636df87542555..fc7df778a3d71 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1254,7 +1254,7 @@ struct kvm_arch {
> >  	 * is used as one input when determining whether certain memslot
> >  	 * related allocations are necessary.
> >  	 */
> 
> The above comment needs to be rewritten.
Good catch, thank a lot!!

> 
> > -	bool shadow_root_allocated;
> > +	bool mmu_page_tracking_enabled;
> >  #if IS_ENABLED(CONFIG_HYPERV)
> >  	hpa_t	hv_root_tdp;
> > diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
> > index eb186bc57f6a9..955a5ae07b10e 100644
> > --- a/arch/x86/include/asm/kvm_page_track.h
> > +++ b/arch/x86/include/asm/kvm_page_track.h
> > @@ -50,6 +50,7 @@ int kvm_page_track_init(struct kvm *kvm);
> >  void kvm_page_track_cleanup(struct kvm *kvm);
> >  
> >  bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
> > +int kvm_page_track_write_tracking_enable(struct kvm *kvm);
> >  int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
> >  
> >  void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
> > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> > index 671cfeccf04e9..44d15551f7156 100644
> > --- a/arch/x86/kvm/mmu.h
> > +++ b/arch/x86/kvm/mmu.h
> > @@ -269,7 +269,7 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
> >  int kvm_mmu_post_init_vm(struct kvm *kvm);
> >  void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
> >  
> > -static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
> > +static inline bool mmu_page_tracking_enabled(struct kvm *kvm)
> >  {
> >  	/*
> >  	 * Read shadow_root_allocated before related pointers. Hence, threads
> > @@ -277,9 +277,11 @@ static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
> >  	 * see the pointers. Pairs with smp_store_release in
> >  	 * mmu_first_shadow_root_alloc.
> >  	 */
> 
> This comment also needs to be rewritten.
Also thanks a lot, next time I'll check comments better.

> 
> > -	return smp_load_acquire(&kvm->arch.shadow_root_allocated);
> > +	return smp_load_acquire(&kvm->arch.mmu_page_tracking_enabled);
> >  }
> 
> ...
> 
> > diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
> > index 2e09d1b6249f3..8857d629036d7 100644
> > --- a/arch/x86/kvm/mmu/page_track.c
> > +++ b/arch/x86/kvm/mmu/page_track.c
> > @@ -21,10 +21,16 @@
> >  
> >  bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
> 
> This can be static, it's now used only by page_track.c.
I'll fix this.
> 
> >  {
> > -	return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
> > -	       !tdp_enabled || kvm_shadow_root_allocated(kvm);
> > +	return mmu_page_tracking_enabled(kvm);
> >  }
> >  
> > +int kvm_page_track_write_tracking_enable(struct kvm *kvm)
> 
> This is too similar to the "enabled" version; "kvm_page_track_enable_write_tracking()"
> would maintain namespacing and be less confusing.
Makes sense, thanks, will do!

> 
> Hmm, I'd probably vote to make this a "static inline" in kvm_page_track.h, and
> rename mmu_enable_write_tracking() to kvm_mmu_enable_write_tracking and export.
> Not a strong preference, just feels silly to export a one-liner.

The sole reason I did it this way, because 'page_track.c' this way contains all the interfaces
that an external user of write tracking needs to use.

> 
> > +{
> > +	return mmu_enable_write_tracking(kvm);
> > +}
> > +EXPORT_SYMBOL_GPL(kvm_page_track_write_tracking_enable);
> > +
> > +
> >  void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
> >  {
> >  	int i;
> > -- 
> > 2.26.3
> > 

Best regards,
	Maxim Levitsky
Maxim Levitsky May 22, 2022, 10:22 a.m. UTC | #4
On Thu, 2022-05-19 at 16:37 +0000, Sean Christopherson wrote:
> On Wed, Apr 27, 2022, Maxim Levitsky wrote:
> > @@ -5753,6 +5752,10 @@ int kvm_mmu_init_vm(struct kvm *kvm)
> >  	node->track_write = kvm_mmu_pte_write;
> >  	node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
> >  	kvm_page_track_register_notifier(kvm, node);
> 
> Can you add a patch to move this call to kvm_page_track_register_notifier() into
> mmu_enable_write_tracking(), and simultaneously add a WARN in the register path
> that page tracking is enabled?
> 
> Oh, actually, a better idea. Add an inner __kvm_page_track_register_notifier()
> that is not exported and thus used only by KVM, invoke mmu_enable_write_tracking()
> from the exported kvm_page_track_register_notifier(), and then do the above.
> That will require modifying KVMGT and KVM in a single patch, but that's ok.
> 
> That will avoid any possibility of an external user failing to enabling tracking
> before registering its notifier, and also avoids bikeshedding over what to do with
> the one-line wrapper to enable tracking.
> 

This is a good idea as well, especially looking at kvmgt and seeing that
it registers the page track notifier, when the vGPU is opened.

I'll do this in the next series.

Thanks for the review!

Best regards,
	Maxim Levitsky
Maxim Levitsky July 20, 2022, 2:42 p.m. UTC | #5
On Sun, 2022-05-22 at 13:22 +0300, Maxim Levitsky wrote:
> On Thu, 2022-05-19 at 16:37 +0000, Sean Christopherson wrote:
> > On Wed, Apr 27, 2022, Maxim Levitsky wrote:
> > > @@ -5753,6 +5752,10 @@ int kvm_mmu_init_vm(struct kvm *kvm)
> > >         node->track_write = kvm_mmu_pte_write;
> > >         node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
> > >         kvm_page_track_register_notifier(kvm, node);
> > 
> > Can you add a patch to move this call to kvm_page_track_register_notifier() into
> > mmu_enable_write_tracking(), and simultaneously add a WARN in the register path
> > that page tracking is enabled?
> > 
> > Oh, actually, a better idea. Add an inner __kvm_page_track_register_notifier()
> > that is not exported and thus used only by KVM, invoke mmu_enable_write_tracking()
> > from the exported kvm_page_track_register_notifier(), and then do the above.
> > That will require modifying KVMGT and KVM in a single patch, but that's ok.
> > 
> > That will avoid any possibility of an external user failing to enabling tracking
> > before registering its notifier, and also avoids bikeshedding over what to do with
> > the one-line wrapper to enable tracking.
> > 
> 
> This is a good idea as well, especially looking at kvmgt and seeing that
> it registers the page track notifier, when the vGPU is opened.
> 
> I'll do this in the next series.
> 
> Thanks for the review!

After putting some thought into this, I am not 100% sure anymore I want to do it this way.
 
Let me explain the current state of things:

For mmu: 
- write tracking notifier is registered on VM initialization (that is pretty much always),
and if it is called because write tracking was enabled due to some other reason
(currently only KVMGT), it checks the number of shadow mmu pages and if zero, bails out.
 
- write tracking enabled when shadow root is allocated.
 
This can be kept as is by using the __kvm_page_track_register_notifier as you suggested.
 
For KVMGT:
- both write tracking and notifier are enabled when an vgpu mdev device is first opened.
That 'works' only because KVMGT doesn't allow to assign more that one mdev to same VM,
thus a per VM notifier and the write tracking for that VM are enabled at the same time
 
 
Now for nested AVIC, this is what I would like to do:
 
- just like mmu, I prefer to register the write tracking notifier, when the VM is created.
- just like mmu, write tracking should only be enabled when nested AVIC is actually used
  first time, so that write tracking is not always enabled when you just boot a VM with nested avic supported,
  since the VM might not use nested at all.
 
Thus I either need to use the __kvm_page_track_register_notifier too for AVIC (and thus need to export it)
or I need to have a boolean (nested_avic_was_used_once) and register the write tracking
notifier only when false and do it not on VM creation but on first attempt to use nested AVIC.
 
Do you think this is worth it? I mean there is some value of registering the notifier only when needed
(this way it is not called for nothing) but it does complicate things a bit.
 
I can also stash this boolean (like 'bool registered;') into the 'struct kvm_page_track_notifier_node', 
and thus allow the kvm_page_track_register_notifier to be called more that once - 
then I can also get rid of __kvm_page_track_register_notifier. 

What do you think about this?
 
Best regards,
	Maxim Levitsky


> 
> Best regards,
>         Maxim Levitsky
Sean Christopherson July 25, 2022, 4:08 p.m. UTC | #6
On Wed, Jul 20, 2022, Maxim Levitsky wrote:
> On Sun, 2022-05-22 at 13:22 +0300, Maxim Levitsky wrote:
> > On Thu, 2022-05-19 at 16:37 +0000, Sean Christopherson wrote:
> > > On Wed, Apr 27, 2022, Maxim Levitsky wrote:
> > > > @@ -5753,6 +5752,10 @@ int kvm_mmu_init_vm(struct kvm *kvm)
> Now for nested AVIC, this is what I would like to do:
>  
> - just like mmu, I prefer to register the write tracking notifier, when the
>   VM is created.
>
> - just like mmu, write tracking should only be enabled when nested AVIC is
>   actually used first time, so that write tracking is not always enabled when
>   you just boot a VM with nested avic supported, since the VM might not use
>   nested at all.
>  
> Thus I either need to use the __kvm_page_track_register_notifier too for AVIC
> (and thus need to export it) or I need to have a boolean
> (nested_avic_was_used_once) and register the write tracking notifier only
> when false and do it not on VM creation but on first attempt to use nested
> AVIC.
>  
> Do you think this is worth it? I mean there is some value of registering the
> notifier only when needed (this way it is not called for nothing) but it does
> complicate things a bit.

Compared to everything else that you're doing in the nested AVIC code, refcounting
the shared kvm_page_track_notifier_node object is a trivial amount of complexity.

And on that topic, do you have performance numbers to justify using a single
shared node?  E.g. if every table instance has its own notifier, then no additional
refcounting is needed.  It's not obvious that a shared node will provide better
performance, e.g. if there are only a handful of AVIC tables being shadowed, then
a linear walk of all nodes is likely fast enough, and doesn't bring the risk of
a write potentially being stalled due to having to acquire a VM-scoped mutex.

> I can also stash this boolean (like 'bool registered;') into the 'struct
> kvm_page_track_notifier_node',  and thus allow the
> kvm_page_track_register_notifier to be called more that once -  then I can
> also get rid of __kvm_page_track_register_notifier. 

No, allowing redundant registration without proper refcounting leads to pain,
e.g. X registers, Y registers, X unregisters, kaboom.
Maxim Levitsky July 28, 2022, 7:46 a.m. UTC | #7
On Mon, 2022-07-25 at 16:08 +0000, Sean Christopherson wrote:
> On Wed, Jul 20, 2022, Maxim Levitsky wrote:
> > On Sun, 2022-05-22 at 13:22 +0300, Maxim Levitsky wrote:
> > > On Thu, 2022-05-19 at 16:37 +0000, Sean Christopherson wrote:
> > > > On Wed, Apr 27, 2022, Maxim Levitsky wrote:
> > > > > @@ -5753,6 +5752,10 @@ int kvm_mmu_init_vm(struct kvm *kvm)
> > Now for nested AVIC, this is what I would like to do:
> >  
> > - just like mmu, I prefer to register the write tracking notifier, when the
> >   VM is created.
> > 
> > - just like mmu, write tracking should only be enabled when nested AVIC is
> >   actually used first time, so that write tracking is not always enabled when
> >   you just boot a VM with nested avic supported, since the VM might not use
> >   nested at all.
> >  
> > Thus I either need to use the __kvm_page_track_register_notifier too for AVIC
> > (and thus need to export it) or I need to have a boolean
> > (nested_avic_was_used_once) and register the write tracking notifier only
> > when false and do it not on VM creation but on first attempt to use nested
> > AVIC.
> >  
> > Do you think this is worth it? I mean there is some value of registering the
> > notifier only when needed (this way it is not called for nothing) but it does
> > complicate things a bit.
> 
> Compared to everything else that you're doing in the nested AVIC code, refcounting
> the shared kvm_page_track_notifier_node object is a trivial amount of complexity.
Makes sense.

> 
> And on that topic, do you have performance numbers to justify using a single
> shared node?  E.g. if every table instance has its own notifier, then no additional
> refcounting is needed. 

The thing is that KVM goes over the list of notifiers and calls them for every write from the emulator
in fact even just for mmio write, and when you enable write tracking on a page,
you just write protect the page and add a mark in the page track array, which is roughly 

'don't install spte, don't install mmio spte, but just emulate the page fault if it hits this page'

So adding more than a bare minimum to this list, seems just a bit wrong.


>  It's not obvious that a shared node will provide better
> performance, e.g. if there are only a handful of AVIC tables being shadowed, then
> a linear walk of all nodes is likely fast enough, and doesn't bring the risk of
> a write potentially being stalled due to having to acquire a VM-scoped mutex.

The thing is that if I register multiple notifiers, they all will be called anyway,
but yes I can use container_of, and discover which table the notifier belongs to,
instead of having a hash table where I lookup the GFN of the fault.

The above means practically that all the shadow physid tables will be in a linear
list of notifiers, so I could indeed avoid per vm mutex on the write tracking,
however for simplicity I probably will still need it because I do modify the page,
and having per physid table mutex complicates things.

Currently in my code the locking is very simple and somewhat dumb, but the performance
is very good because the code isn't executed often, most of the time the AVIC hardware
works alone without any VM exits.

Once the code is accepted upstream, it's one of the things that can be improved.


Note though that I still need a hash table and a mutex because on each VM entry,
the guest can use a different physid table, so I need to lookup it, and create it,
if not found, which would require read/write of the hash table and thus a mutex.



> 
> > I can also stash this boolean (like 'bool registered;') into the 'struct
> > kvm_page_track_notifier_node',  and thus allow the
> > kvm_page_track_register_notifier to be called more that once -  then I can
> > also get rid of __kvm_page_track_register_notifier. 
> 
> No, allowing redundant registration without proper refcounting leads to pain,
> e.g. X registers, Y registers, X unregisters, kaboom.
> 

True, but then what about adding a refcount to 'struct kvm_page_track_notifier_node'
instead of a boolean, and allowing redundant registration? 
Probably not worth it, in which case I am OK to add a refcount to my avic code.

Or maybe just scrap the whole thing and just leave registration and activation of the
write tracking as two separate things? Honestly now that looks like the most clean
solution.

Best regards,
	Maxim Levitsky
Maxim Levitsky Aug. 1, 2022, 3:53 p.m. UTC | #8
On Thu, 2022-07-28 at 10:46 +0300, Maxim Levitsky wrote:
> On Mon, 2022-07-25 at 16:08 +0000, Sean Christopherson wrote:
> > On Wed, Jul 20, 2022, Maxim Levitsky wrote:
> > > On Sun, 2022-05-22 at 13:22 +0300, Maxim Levitsky wrote:
> > > > On Thu, 2022-05-19 at 16:37 +0000, Sean Christopherson wrote:
> > > > > On Wed, Apr 27, 2022, Maxim Levitsky wrote:
> > > > > > @@ -5753,6 +5752,10 @@ int kvm_mmu_init_vm(struct kvm *kvm)
> > > Now for nested AVIC, this is what I would like to do:
> > >  
> > > - just like mmu, I prefer to register the write tracking notifier, when the
> > >   VM is created.
> > > 
> > > - just like mmu, write tracking should only be enabled when nested AVIC is
> > >   actually used first time, so that write tracking is not always enabled when
> > >   you just boot a VM with nested avic supported, since the VM might not use
> > >   nested at all.
> > >  
> > > Thus I either need to use the __kvm_page_track_register_notifier too for AVIC
> > > (and thus need to export it) or I need to have a boolean
> > > (nested_avic_was_used_once) and register the write tracking notifier only
> > > when false and do it not on VM creation but on first attempt to use nested
> > > AVIC.
> > >  
> > > Do you think this is worth it? I mean there is some value of registering the
> > > notifier only when needed (this way it is not called for nothing) but it does
> > > complicate things a bit.
> > 
> > Compared to everything else that you're doing in the nested AVIC code, refcounting
> > the shared kvm_page_track_notifier_node object is a trivial amount of complexity.
> Makes sense.
> 
> > And on that topic, do you have performance numbers to justify using a single
> > shared node?  E.g. if every table instance has its own notifier, then no additional
> > refcounting is needed. 
> 
> The thing is that KVM goes over the list of notifiers and calls them for every write from the emulator
> in fact even just for mmio write, and when you enable write tracking on a page,
> you just write protect the page and add a mark in the page track array, which is roughly 
> 
> 'don't install spte, don't install mmio spte, but just emulate the page fault if it hits this page'
> 
> So adding more than a bare minimum to this list, seems just a bit wrong.
> 
> 
> >  It's not obvious that a shared node will provide better
> > performance, e.g. if there are only a handful of AVIC tables being shadowed, then
> > a linear walk of all nodes is likely fast enough, and doesn't bring the risk of
> > a write potentially being stalled due to having to acquire a VM-scoped mutex.
> 
> The thing is that if I register multiple notifiers, they all will be called anyway,
> but yes I can use container_of, and discover which table the notifier belongs to,
> instead of having a hash table where I lookup the GFN of the fault.
> 
> The above means practically that all the shadow physid tables will be in a linear
> list of notifiers, so I could indeed avoid per vm mutex on the write tracking,
> however for simplicity I probably will still need it because I do modify the page,
> and having per physid table mutex complicates things.
> 
> Currently in my code the locking is very simple and somewhat dumb, but the performance
> is very good because the code isn't executed often, most of the time the AVIC hardware
> works alone without any VM exits.
> 
> Once the code is accepted upstream, it's one of the things that can be improved.
> 
> 
> Note though that I still need a hash table and a mutex because on each VM entry,
> the guest can use a different physid table, so I need to lookup it, and create it,
> if not found, which would require read/write of the hash table and thus a mutex.
> 
> 
> 
> > > I can also stash this boolean (like 'bool registered;') into the 'struct
> > > kvm_page_track_notifier_node',  and thus allow the
> > > kvm_page_track_register_notifier to be called more that once -  then I can
> > > also get rid of __kvm_page_track_register_notifier. 
> > 
> > No, allowing redundant registration without proper refcounting leads to pain,
> > e.g. X registers, Y registers, X unregisters, kaboom.
> > 
> 
> True, but then what about adding a refcount to 'struct kvm_page_track_notifier_node'
> instead of a boolean, and allowing redundant registration? 
> Probably not worth it, in which case I am OK to add a refcount to my avic code.
> 
> Or maybe just scrap the whole thing and just leave registration and activation of the
> write tracking as two separate things? Honestly now that looks like the most clean
> solution.


Kind ping on this. Do you still want me to enable write tracking on the notifier registeration,
or scrap the idea?


Best regards,
	Maxim Levitsky
> 
> Best regards,
> 	Maxim Levitsky
Sean Christopherson Aug. 1, 2022, 5:20 p.m. UTC | #9
On Thu, Jul 28, 2022, Maxim Levitsky wrote:
> On Mon, 2022-07-25 at 16:08 +0000, Sean Christopherson wrote:
> > On Wed, Jul 20, 2022, Maxim Levitsky wrote:
> > And on that topic, do you have performance numbers to justify using a single
> > shared node?  E.g. if every table instance has its own notifier, then no additional
> > refcounting is needed. 
> 
> The thing is that KVM goes over the list of notifiers and calls them for
> every write from the emulator in fact even just for mmio write, and when you
> enable write tracking on a page, you just write protect the page and add a
> mark in the page track array, which is roughly 
> 
> 'don't install spte, don't install mmio spte, but just emulate the page fault if it hits this page'
> 
> So adding more than a bare minimum to this list, seems just a bit wrong.

Hmm, I see what you're saying.  To some extent, having a minimal page tracker
implementation is just that, an implementation detail.  But for better or worse,
the existing API effectively pushes range checking to the callers.  I agree that
breaking from that pattern would be odd.

> >  It's not obvious that a shared node will provide better performance, e.g.
> >  if there are only a handful of AVIC tables being shadowed, then a linear
> >  walk of all nodes is likely fast enough, and doesn't bring the risk of a
> >  write potentially being stalled due to having to acquire a VM-scoped
> >  mutex.
> 
> The thing is that if I register multiple notifiers, they all will be called anyway,
> but yes I can use container_of, and discover which table the notifier belongs to,
> instead of having a hash table where I lookup the GFN of the fault.
> 
> The above means practically that all the shadow physid tables will be in a linear
> list of notifiers, so I could indeed avoid per vm mutex on the write tracking,
> however for simplicity I probably will still need it because I do modify the page,
> and having per physid table mutex complicates things.
> 
> Currently in my code the locking is very simple and somewhat dumb, but the performance
> is very good because the code isn't executed often, most of the time the AVIC hardware
> works alone without any VM exits.

Yes, but because the code isn't executed often, pretty much any solution will
provide good performance.

> Once the code is accepted upstream, it's one of the things that can be improved.
> 
> Note though that I still need a hash table and a mutex because on each VM entry,
> the guest can use a different physid table, so I need to lookup it, and create it,
> if not found, which would require read/write of the hash table and thus a mutex.

One of the points I'm trying to make is that a hash table isn't strictly required.
E.g. if I understand the update rules correctly, I believe tables can be tracked
via an RCU-protected list, with vCPUs taking a spinlock and doing synchronize_rcu()
when adding/removing a table.  That would avoid having to take any "real" locks in
the page track notifier.

The VM-scoped mutex worries me as it will be a bottleneck if L1 is running multiple
L2 VMs.  E.g. if L1 is frequently switching vmcs12 and thus avic_physical_id, then
nested VMRUN will effectively get serialized.  That is mitigated to some extent by
an RCU-protected list, as a sane L1 will use a single table for each L2, and so a
vCPU will need to add/remove a table if and only if it's the first/last vCPU to
start/stop running an L2 VM.

> > > I can also stash this boolean (like 'bool registered;') into the 'struct
> > > kvm_page_track_notifier_node',  and thus allow the
> > > kvm_page_track_register_notifier to be called more that once -  then I can
> > > also get rid of __kvm_page_track_register_notifier. 
> > 
> > No, allowing redundant registration without proper refcounting leads to pain,
> > e.g. X registers, Y registers, X unregisters, kaboom.
> > 
> 
> True, but then what about adding a refcount to 'struct kvm_page_track_notifier_node'
> instead of a boolean, and allowing redundant registration?
> Probably not worth it, in which case I am OK to add a refcount to my avic code.

Ya, I would rather force AVIC to do the refcounting.  Existing users don't need a
refcount, and doing the refcounting in AVIC code means kvm_page_track_notifier_node
can WARN on redundant registration, i.e. can sanity check the AVIC code to some
extent.

> Or maybe just scrap the whole thing and just leave registration and
> activation of the write tracking as two separate things? Honestly now that
> looks like the most clean solution.

It's the easiest, but IMO it's not the cleanest.  Allowing notifiers to be
registered without tracking being enabled is undesirable, especially since we know
we can prevent it.
Maxim Levitsky Aug. 8, 2022, 1:13 p.m. UTC | #10
On Mon, 2022-08-01 at 17:20 +0000, Sean Christopherson wrote:
> On Thu, Jul 28, 2022, Maxim Levitsky wrote:
> > On Mon, 2022-07-25 at 16:08 +0000, Sean Christopherson wrote:
> > > On Wed, Jul 20, 2022, Maxim Levitsky wrote:
> > > And on that topic, do you have performance numbers to justify using a single
> > > shared node?  E.g. if every table instance has its own notifier, then no additional
> > > refcounting is needed. 
> > 
> > The thing is that KVM goes over the list of notifiers and calls them for
> > every write from the emulator in fact even just for mmio write, and when you
> > enable write tracking on a page, you just write protect the page and add a
> > mark in the page track array, which is roughly 
> > 
> > 'don't install spte, don't install mmio spte, but just emulate the page fault if it hits this page'
> > 
> > So adding more than a bare minimum to this list, seems just a bit wrong.
> 
> Hmm, I see what you're saying.  To some extent, having a minimal page tracker
> implementation is just that, an implementation detail.  But for better or worse,
> the existing API effectively pushes range checking to the callers.  I agree that
> breaking from that pattern would be odd.
> 
> > >  It's not obvious that a shared node will provide better performance, e.g.
> > >  if there are only a handful of AVIC tables being shadowed, then a linear
> > >  walk of all nodes is likely fast enough, and doesn't bring the risk of a
> > >  write potentially being stalled due to having to acquire a VM-scoped
> > >  mutex.
> > 
> > The thing is that if I register multiple notifiers, they all will be called anyway,
> > but yes I can use container_of, and discover which table the notifier belongs to,
> > instead of having a hash table where I lookup the GFN of the fault.
> > 
> > The above means practically that all the shadow physid tables will be in a linear
> > list of notifiers, so I could indeed avoid per vm mutex on the write tracking,
> > however for simplicity I probably will still need it because I do modify the page,
> > and having per physid table mutex complicates things.
> > 
> > Currently in my code the locking is very simple and somewhat dumb, but the performance
> > is very good because the code isn't executed often, most of the time the AVIC hardware
> > works alone without any VM exits.
> 
> Yes, but because the code isn't executed often, pretty much any solution will
> provide good performance.
> 
> > Once the code is accepted upstream, it's one of the things that can be improved.
> > 
> > Note though that I still need a hash table and a mutex because on each VM entry,
> > the guest can use a different physid table, so I need to lookup it, and create it,
> > if not found, which would require read/write of the hash table and thus a mutex.
> 
> One of the points I'm trying to make is that a hash table isn't strictly required.
> E.g. if I understand the update rules correctly, I believe tables can be tracked
> via an RCU-protected list, with vCPUs taking a spinlock and doing synchronize_rcu()
> when adding/removing a table.  That would avoid having to take any "real" locks in
> the page track notifier.
> 
> The VM-scoped mutex worries me as it will be a bottleneck if L1 is running multiple
> L2 VMs.  E.g. if L1 is frequently switching vmcs12 and thus avic_physical_id, then
> nested VMRUN will effectively get serialized.  That is mitigated to some extent by
> an RCU-protected list, as a sane L1 will use a single table for each L2, and so a
> vCPU will need to add/remove a table if and only if it's the first/last vCPU to
> start/stop running an L2 VM.

Hi Sean, Paolo, and everyone else who wants to review my nested AVIC work.
 
I would like to explain the design choices for locking, and life cycle of the shadow physid tables, and I hope
that this will make it easier for you to review my code and/or make some suggestions on how to improve it.
 
=====================================================================================================================
Explanation of the AVIC physid page (AVIC physical ID table)
=====================================================================================================================
 
This table gives a vCPU enough knowledge of its peers to send them IPIs without VM exit.
 
A vCPU doesn’t use this table to send IPIs to itself and or to process its own interrupts from its own
IRR/ISR. It accesses its APIC backing page directly.
 
This table contains an entry for each vCPU, and each entry contains 2 things:
 
1. A physical address of a peer’s vCPU APIC backing page, so that when sending IPIs to this vCPU
   It can set them in the IRR location in this page (this differs from APICv, which uses PIR bitmaps).
 
   NOTE1: There is also a ‘V’(valid) bit attached to the address - when clear, then whole entry is invalid
   and trying to use it will trigger a VM exit for unicast IPIs, but for broadcast interrupts the
   entry will be ignored.
 
   NOTE2: This part of the entry is not supposed to change during the lifetime of a VM.
 
2. An apic id of a physical vCPU where this vCPU is running (in case of nesting, this would be L1 APIC id).
 
   This field allows AVIC to ring the doorbell on the target physical CPU to make its AVIC process the 
   incoming interrupt request.
 
   It also has a ‘IR’ bit (is running) which when clear indicates that the target vCPU is not running anywhere
   thus the field content is not valid.
 
   - This field is supposed to be changed by L1 once in a while when it either migrates
     the L2's vCPUs around and/or schedules them in/out
 
   - Write tracking of the guest physid table ensures that the shadow physid table is kept up to date.
 
   - In addition to that, the L1's vCPUs can be migrated and/or scheduled in/out, which would 
     lead to an update of the shadow table as well.
     (similar how mmu notifiers need to update the shadow tables, not because of a guest 
     lead change but due to host triggered change)
 
 
- All vCPUs of a nested VM are supposed to share the same physid page, and the page is supposed to contain
  entries such as each entry points to unique apic backing page and contains the L1’s physical apic id,
  On which this nested vCPU runs now (or has is_running=1 meaning that this vCPU is scheduled out)
 
- The number of entries in the physid table (aka max guest apic id) is not specified in it, bur rather it is given
  In the vmcb that references it (also all vmcbs of a guest should have the same value).
 
NOTE: while I say ‘supposed’ I understand that a malicious guest will try to bend each of these
  assumptions and AFAIK I do handle (but often in a slow way) all these unusual cases while
  still following the AVIC spec.
 
=====================================================================================================================
Lifecycle of the shadow physid pages
=====================================================================================================================
 
- An empty shadow physid page is created when a nested entry with AVIC is attempted with a new physid table.
  New shadow physid table is created, and has 0 entries, thus it needs to be synced.
 
- On each VM entry, if the vCPU’s shadow physid table is not NULL but is not synced, then all the entries in the
  table are created (synced):
 
  - the apic backing page pointed by the entry is pinned in ram and its real physical address is written 
    in the shadow entry
 
  - the L1 vCPU in the entry, when valid (is_running=1) is translated to L0 apic id based on which CPU, the L1 vCPU 
    runs, and the value is written in the shadow entry.
 
- On nested VM exit, pretty much nothing is done in regard to shadow physid tables:
  the vCPU keeps its shadow physid table, its shadow entries are still valid and point to pinned apic backing pages.
 
- Once L1 is running, if it is about to schedule the L2’s vCPU off, it can toggle is_running bit, which will trigger
   write tracking and update the shadow physid table.
 
 
- On another nested VM entry with *same* physid table, nothing happens
  (Unless for some reason the guest increased the number of entries, then new entries are synced, which
  is very rare to happen - can only practically happen when nested CPU hotplug happens)
 
- On another nested VM entry with a different physid table:
 
  - The current table refcount is decreased, and the table is freed if it reaches 0. Freeing triggers unpinning of
    all guest apic backing pages referenced by the page.
 
    This relatively simple approach means that if L1 switches a lot between nested guests, and these  guests don't
    have many vCPUs, it would be possible that all nested vCPUs would switch to one  physid page and then to another
    thus triggering freeing of the first and creating of the second page  and then vice versa.
 
    In my testing that doesn't happen that often, unless there is quite some oversubscription  and/or double nesting
    (which leads to L1 running two guests (01 and 02) and switching between them like crazy.
 
    The design choice was made to avoid keeping a cache of physid tables (like mmu does) and  shrinking it once in
    a while.
 
    The problem with such cache is that each inactive physid table in it (which might very well be already reused 
    for something else), will keep all its entries pinned in the guest memory.
 
    With this design choice, the maximum number of shadow physid tables is the number of vCPUs.
 
  - new physid table is looked up in the hash table and created if not found there.
 
 
- When a vCPU disables nesting (clears EFER.SVME) and/or the VM is shut down the physid table that belongs to it,
  has its refcount decreased as well, which can also lead to its freeing.
  
  So when L1 fully disables nesting (in KVM case, means that it destroys all VMs), then all shadow physid
  pages will be freed.
 
 
- When L1 vCPU is migrated across physical cpus and/or scheduled in/out, all shadow physid table's entries which
  reference this vCPU, are updated.
 
  NOTE: usually there will be just one or zero such entries, but if this L1 vCPU is oversubscribed, it is possible 
  that two physid tables would contain entries that reference this vCPU, if two guests are running almost at the 
  same time on this vCPU. 
 
  It can't happen if the nested guest is KVM, because KVM always unloads the previous vCPU before it loads the
  next one, which will lead to setting is_running to 0 on the previous vCPU.
 
  In case of double nesting, KVM also clears is_running bit of L1 guest before running L2.
 
  A linked list of only the entries themselves is kept in each L1's vCPU, and it is protected from races vs write
  tracking by a spinlock.
 
=====================================================================================================================
Locking in the nested AVIC
=====================================================================================================================
 
First of all I use two locks.
 
1. a per VM mutex that roughly shares the same purpose as 'kvm->mmu_lock' and protects the hash table, and also just  
   serializes some operations.
 
2. a per VM spinlock which protects access to the physical CPU portion of physid tables. It is either taken with the
   mutex held or taken alone.
 
The choice of two locks is a bit odd, and I might be able to only have a single spinlock.
 
Let me now explain how the locking is used and how it compares with kvm’s mmu lock:
 
======================================
-> Nested VM entry
======================================
 
  mutex -> spinlock
 
  Mutex ensures that KVM doesn’t race against another nested VM entry which is also trying to create the 
  shadow physid page
 
  Spinlock ensures that we don't race with one of the vCPU schedule in/out, updating the is_running bit,
 
  kvm's mmu:
        - kvm_mmu_load is called when current mmu root is invalid
        - mmu lock is taken, and a new mmu root page is created or existing one looked up in the hash table
 
======================================
-> VM entry
======================================
 
  mutex -> spinlock
 
  (done only when KVM_REQ_APIC_PAGE_RELOAD is pending)
 
  Very similar to the nested VM entry, and in practice will happen *very rarely* because this can happen only if a 
  memslot that *contains* the page got flushed, or if write tracking detected unusual write to the page
  (like update of the avic backing page)
 
  kvm’s mmu:
	- kvm_mmu_load is called when current mmu root is invalid
	- mmu lock is taken, and a new mmu root page is created or existing one looked up in the hash table
 
======================================
-> Write tracking <-
======================================
 
   mutex -> spinlock
 
   Also like the above. 
 
   - Updates only the is_running bits in the shadow physid table.
 
   - Otherwise all entries in the table are erased and the KVM_REQ_APIC_PAGE_RELOAD request raised, which ensures 
     that if that table is used on another CPU, it will sync it before using it again.
     
     That is also very rare to happen, unless the guest stopped using the page as a physid page, in which case
     the page will be just dropped by vCPUs which still reference it but don’t use it.
 
   kvm’s mmu:
 
   - kvm_mmu_pte_write is called
 
   - mmu lock is taken, and a new mmu root page is created or existing one looked up in the hash table
 
   - if unaligned write / write flooding is detected, the page is zapped
 
   - for zapped root pages, since they are still could be in use by other cpus, this removes the table from 
     the linked list + raises KVM_REQ_MMU_FREE_OBSOLETE_ROOTS)
 
   - KVM_REQ_MMU_FREE_OBSOLETE_ROOTS makes each vcpu get rid of its mmu root if zapped, and later will lead
     to 'kvm_mmu_load' creating a new root shadow page
 
     (this is similar to raising KVM_REQ_APIC_PAGE_RELOAD)
 
======================================
-> Memslot flush <-
======================================
 
    mutex -> spinlock
 
   - Memslot flush happens very rarely, and leads to erase of all shadow physid tables in the memslot.
     and raising of KVM_REQ_APIC_PAGE_RELOAD which if some vCPUs use the page, will make them re-sync it.
 
   kvm’s mmu:
       kvm_mmu_invalidate_zap_pages_in_memslot is called which
	   - takes mmu lock
	   - zaps *all* the shadow pages (kvm_mmu_zap_all_fast)
	   - raises KVM_REQ_MMU_FREE_OBSOLETE_ROOTS to get re-create all the current mmu roots
 
======================================
-> L1 vCPU schedule in/out <-
======================================
 
   *only spinlock is taken*
 
    Here the KVM only updates the is_running bit in shadow physid tables that reference it using a linked list of 
    these entries.
 
    Can be optimized to avoid taking spinlock if the linked list is empty, using the correct memory barriers.
 
    kvm mmu: No equivalent.
 
======================================
-> Unaccelerated IPI emulation <-
======================================
 
   * no lock are taken *
 
   Guest physid table is read to determine guest value of is_running bit. This is done without locking vs
   write tracking because the guest must itself insure that it either has locking or barriers to avoid a race here.
 
======================================
-> Nested doorbell emulation <-
======================================
 
   * no lock are taken *
 
   Thankfully the code doesn't need physid table at all, it just needs to translate the L1's apic ID to the L0's 
   apic id and ring the real doorbell.
 
=====================================================================================================================
Ideas for improvement:
=====================================================================================================================
 
1. Stopping pinning the avic backing pages. 
 
   While these pages have to be pinned when a vCPU uses it directly, they don't have to be pinned when a 
   the physid table references it if they could be pinned on demand.
 
   Many of these tables might not be used anymore and until KVM finds out, these backing pages will be pinned 
   for nothing.
 
   The problem with this is that keeping 'V' (valid/present) bit off in the shadow table isn't suitable for 
   on demand access to these entries like one would do in paging - the reason - when sending a broadcast interrupt
   through AVIC, it ignores the non valid entries and doesn't VMexit - which makes sense but ruins the plan.
 
   However there is a way to overcome it. An valid shadow physid table entry is created which points to a 'dummy' 
   page, and doesn't have the 'is_running' bit set. 
 
   For such entry, AVIC will set IRR bits in that dummy page, and then signal unaccelerated IPI vm exit,
   and then KVM can detect the condition, locate and swap in the AVIC backing page and write the bit there manually,
   by looking at what was written in the ICR (that is thankfully in the vm exit info field).
 
   This, together with hooking into mmu notifier to erase shadow physid entries, when an apic backing page is swapped out, 
   should make it work.

   The downside of this is that I will have to emulate more of the AVIC, I will have to set the IRR bits manually
   in the apic backing pages I just pinned.
   And I need a hash to track all avic backing pages, so that when I get mmu notifier notification, I can know
   that a page is a apic backing page, and I also need to know which physid table references it (I need a sort of
   'rmap' for this).
 
2. Use just a spinlock.
 
  - I have to use a spinlock because this is the only locking primitive that can be used from L1's vCPU load/put
    functions which are called from schedule().
 
  - I can avoid using the mutex, which is currently used because allocation of physid table can sleep and also
    because pinning of avic backing pages can sleep and accessing the guest physid table can sleep as well, so by having
    a spinlock, I can take it only in short critical sections where I update the is_running bit in the shadow table
    and nowhere else.
 
    KVM's mmu avoids first issue by having a pre-allocated cache of mmu pages and for the second issue, 
    it either uses atomic guest access functions and retires if they fail (need sleep), or pre-caches the values
    (like in mmu page walk struct) then takes the mmu spinlock, and then uses the read values.

 
Your feedback, ideas, and of course review of the patches is very welcome!
 
Best regards,
	Maxim Levitsky


> 
> > > > I can also stash this boolean (like 'bool registered;') into the 'struct
> > > > kvm_page_track_notifier_node',  and thus allow the
> > > > kvm_page_track_register_notifier to be called more that once -  then I can
> > > > also get rid of __kvm_page_track_register_notifier. 
> > > 
> > > No, allowing redundant registration without proper refcounting leads to pain,
> > > e.g. X registers, Y registers, X unregisters, kaboom.
> > > 
> > 
> > True, but then what about adding a refcount to 'struct kvm_page_track_notifier_node'
> > instead of a boolean, and allowing redundant registration?
> > Probably not worth it, in which case I am OK to add a refcount to my avic code.
> 
> Ya, I would rather force AVIC to do the refcounting.  Existing users don't need a
> refcount, and doing the refcounting in AVIC code means kvm_page_track_notifier_node
> can WARN on redundant registration, i.e. can sanity check the AVIC code to some
> extent.
> 
> > Or maybe just scrap the whole thing and just leave registration and
> > activation of the write tracking as two separate things? Honestly now that
> > looks like the most clean solution.
> 
> It's the easiest, but IMO it's not the cleanest.  Allowing notifiers to be
> registered without tracking being enabled is undesirable, especially since we know
> we can prevent it.
>
Sean Christopherson Sept. 29, 2022, 10:38 p.m. UTC | #11
On Mon, Aug 08, 2022, Maxim Levitsky wrote:
> Hi Sean, Paolo, and everyone else who wants to review my nested AVIC work.

Before we dive deep into design details, I think we should first decide whether
or not nested AVIC is worth pursing/supporting.

  - Rome has a ucode/silicon bug with no known workaround and no anticipated fix[*];
    AMD's recommended "workaround" is to disable AVIC.
  - AVIC is not available in Milan, which may or may not be related to the
    aforementioned bug.
  - AVIC is making a comeback on Zen4, but Zen4 comes with x2AVIC.
  - x2APIC is likely going to become ubiquitous, e.g. Intel is effectively
    requiring x2APIC to fudge around xAPIC bugs.
  - It's actually quite realistic to effectively force the guest to use x2APIC,
    at least if it's a Linux guest.  E.g. turn x2APIC on in BIOS, which is often
    (always?) controlled by the host, and Linux will use x2APIC.

In other words, given that AVIC is well on its way to becoming a "legacy" feature,
IMO there needs to be a fairly strong use case to justify taking on this much code
and complexity.  ~1500 lines of code to support a feature that has historically
been buggy _without_ nested support is going to require a non-trivial amount of
effort to review, stabilize, and maintain.

[*] 1235 "Guest With AVIC (Advanced Virtual Interrupt Controller) Enabled May Fail
    to Process IPI (Inter-Processor Interrupt) Until Guest Is Re-Scheduled" in
    https://www.amd.com/system/files/TechDocs/56323-PUB_1.00.pdf
Maxim Levitsky Oct. 3, 2022, 7:27 a.m. UTC | #12
On Thu, 2022-09-29 at 22:38 +0000, Sean Christopherson wrote:
> On Mon, Aug 08, 2022, Maxim Levitsky wrote:
> > Hi Sean, Paolo, and everyone else who wants to review my nested AVIC work.
> 
> Before we dive deep into design details, I think we should first decide whether
> or not nested AVIC is worth pursing/supporting.
> 
>   - Rome has a ucode/silicon bug with no known workaround and no anticipated fix[*];
>     AMD's recommended "workaround" is to disable AVIC.
>   - AVIC is not available in Milan, which may or may not be related to the
>     aforementioned bug.
>   - AVIC is making a comeback on Zen4, but Zen4 comes with x2AVIC.
>   - x2APIC is likely going to become ubiquitous, e.g. Intel is effectively
>     requiring x2APIC to fudge around xAPIC bugs.
>   - It's actually quite realistic to effectively force the guest to use x2APIC,
>     at least if it's a Linux guest.  E.g. turn x2APIC on in BIOS, which is often
>     (always?) controlled by the host, and Linux will use x2APIC.
> 
> In other words, given that AVIC is well on its way to becoming a "legacy" feature,
> IMO there needs to be a fairly strong use case to justify taking on this much code
> and complexity.  ~1500 lines of code to support a feature that has historically
> been buggy _without_ nested support is going to require a non-trivial amount of
> effort to review, stabilize, and maintain.
> 
> [*] 1235 "Guest With AVIC (Advanced Virtual Interrupt Controller) Enabled May Fail
>     to Process IPI (Inter-Processor Interrupt) Until Guest Is Re-Scheduled" in
>     https://www.amd.com/system/files/TechDocs/56323-PUB_1.00.pdf
> 

I am afraid that you mixed things up:

You mistake is that x2avic is just a minor addition to AVIC. It is still for
all practical purposes the same feature.

 
1. The AVIC is indeed kind of broken on Zen2 (but AFAIK for all practical purposes,
   including nested it works fine, the errata only shows up in a unit test and/or
   under very specific workloads (most of the time a delayed wakeup doesn't cause a hang).
   Yet, I agree that for production Zen2 should not have AVIC enabled.
 

2. Zen3 does indeed have AVIC soft disabled in CPUID. AFAIK it works just fine,
   but I understand that customers won't use it against AMD's guidance.
 
 
3. On Zen4, AVIC is fully enabled and also extended to support x2apic mode.
   The fact that AVIC was extended to support X2apic mode also shows that AMD
   is committed to supporting it.
 
 
My nested AVIC code technically doesn't expose x2avic to the guest, but it
is pretty much trivial to add (I am only waiting to get my hands on Zen4 machine
to do it), and also even in its current form it would work just fine if the host
uses normal AVIC .
 
(or even doesn't use AVIC at all - the nested AVIC code works just fine
even if the host has its AVIC inhibited for some reason).
 
Adding nested x2avic support is literally about not passing through that MMIO address,
Enabling the x2avic bit in int_ctl, and opening up the access to x2apic msrs.
Plus I need to do some minor changes in unaccelerated IPI handler, dealing
With read-only logical ID and such.
 
Physid tables, apic backing pages, doorbell emulation, 
everything is pretty much unchanged.
 
So AVIC is nothing but a legacy feature, and my nested AVIC code will support
both nested AVIC and nested X2AVIC.
 
Best regards,
	Maxim Levitsky
Sean Christopherson Nov. 10, 2022, 12:47 a.m. UTC | #13
Sorry for the super slow reply, I don't have a good excuse other than I needed to
take break from AVIC code...

On Mon, Oct 03, 2022, Maxim Levitsky wrote:
> On Thu, 2022-09-29 at 22:38 +0000, Sean Christopherson wrote:
> > On Mon, Aug 08, 2022, Maxim Levitsky wrote:
> > > Hi Sean, Paolo, and everyone else who wants to review my nested AVIC work.
> > 
> > Before we dive deep into design details, I think we should first decide whether
> > or not nested AVIC is worth pursing/supporting.
> > 
> >   - Rome has a ucode/silicon bug with no known workaround and no anticipated fix[*];
> >     AMD's recommended "workaround" is to disable AVIC.
> >   - AVIC is not available in Milan, which may or may not be related to the
> >     aforementioned bug.
> >   - AVIC is making a comeback on Zen4, but Zen4 comes with x2AVIC.
> >   - x2APIC is likely going to become ubiquitous, e.g. Intel is effectively
> >     requiring x2APIC to fudge around xAPIC bugs.
> >   - It's actually quite realistic to effectively force the guest to use x2APIC,
> >     at least if it's a Linux guest.  E.g. turn x2APIC on in BIOS, which is often
> >     (always?) controlled by the host, and Linux will use x2APIC.
> > 
> > In other words, given that AVIC is well on its way to becoming a "legacy" feature,
> > IMO there needs to be a fairly strong use case to justify taking on this much code
> > and complexity.  ~1500 lines of code to support a feature that has historically
> > been buggy _without_ nested support is going to require a non-trivial amount of
> > effort to review, stabilize, and maintain.
> > 
> > [*] 1235 "Guest With AVIC (Advanced Virtual Interrupt Controller) Enabled May Fail
> >     to Process IPI (Inter-Processor Interrupt) Until Guest Is Re-Scheduled" in
> >     https://www.amd.com/system/files/TechDocs/56323-PUB_1.00.pdf
> > 
> 
> I am afraid that you mixed things up:
> 
> You mistake is that x2avic is just a minor addition to AVIC. It is still for
> all practical purposes the same feature.

...

> Physid tables, apic backing pages, doorbell emulation, 
> everything is pretty much unchanged.

Ya, it finally clicked for me that KVM would needs to shadow the physical ID
tables irrespective of x2APIC.

I'm still very hesitant to support full virtualization of nested (x2)AVIC.  The
complexity and amount of code is daunting, and nSVM has lower hanging fruit that
we should pick before going after full nested (x2)AVIC, e.g. SVM's TLB flushing
needs a serious overhaul.  And if we go through the pain for SVM, I think we'd
probably want to come up with a solution that can be at least shared shared with
VMX's IPI virtualization.

As an intermediate step, can we expose (x2)AVIC to L2 without any shadowing?
E.g. run all L2s with a single dummy physical ID table and emulate IPIs in KVM?

If that works, that seems like a logical first step even if we want to eventually
support nested IPI virtualization.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 636df87542555..fc7df778a3d71 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1254,7 +1254,7 @@  struct kvm_arch {
 	 * is used as one input when determining whether certain memslot
 	 * related allocations are necessary.
 	 */
-	bool shadow_root_allocated;
+	bool mmu_page_tracking_enabled;
 
 #if IS_ENABLED(CONFIG_HYPERV)
 	hpa_t	hv_root_tdp;
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
index eb186bc57f6a9..955a5ae07b10e 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -50,6 +50,7 @@  int kvm_page_track_init(struct kvm *kvm);
 void kvm_page_track_cleanup(struct kvm *kvm);
 
 bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
+int kvm_page_track_write_tracking_enable(struct kvm *kvm);
 int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
 
 void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 671cfeccf04e9..44d15551f7156 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -269,7 +269,7 @@  int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
 int kvm_mmu_post_init_vm(struct kvm *kvm);
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
 
-static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
+static inline bool mmu_page_tracking_enabled(struct kvm *kvm)
 {
 	/*
 	 * Read shadow_root_allocated before related pointers. Hence, threads
@@ -277,9 +277,11 @@  static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
 	 * see the pointers. Pairs with smp_store_release in
 	 * mmu_first_shadow_root_alloc.
 	 */
-	return smp_load_acquire(&kvm->arch.shadow_root_allocated);
+	return smp_load_acquire(&kvm->arch.mmu_page_tracking_enabled);
 }
 
+int mmu_enable_write_tracking(struct kvm *kvm);
+
 #ifdef CONFIG_X86_64
 static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
 #else
@@ -288,7 +290,7 @@  static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
 
 static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
 {
-	return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm);
+	return !is_tdp_mmu_enabled(kvm) || mmu_page_tracking_enabled(kvm);
 }
 
 static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 904f0faff2186..fb744616bf7df 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3389,7 +3389,7 @@  static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 	return r;
 }
 
-static int mmu_first_shadow_root_alloc(struct kvm *kvm)
+int mmu_enable_write_tracking(struct kvm *kvm)
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *slot;
@@ -3399,21 +3399,20 @@  static int mmu_first_shadow_root_alloc(struct kvm *kvm)
 	 * Check if this is the first shadow root being allocated before
 	 * taking the lock.
 	 */
-	if (kvm_shadow_root_allocated(kvm))
+	if (mmu_page_tracking_enabled(kvm))
 		return 0;
 
 	mutex_lock(&kvm->slots_arch_lock);
 
 	/* Recheck, under the lock, whether this is the first shadow root. */
-	if (kvm_shadow_root_allocated(kvm))
+	if (mmu_page_tracking_enabled(kvm))
 		goto out_unlock;
 
 	/*
 	 * Check if anything actually needs to be allocated, e.g. all metadata
 	 * will be allocated upfront if TDP is disabled.
 	 */
-	if (kvm_memslots_have_rmaps(kvm) &&
-	    kvm_page_track_write_tracking_enabled(kvm))
+	if (kvm_memslots_have_rmaps(kvm) && mmu_page_tracking_enabled(kvm))
 		goto out_success;
 
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
@@ -3443,7 +3442,7 @@  static int mmu_first_shadow_root_alloc(struct kvm *kvm)
 	 * all the related pointers are set.
 	 */
 out_success:
-	smp_store_release(&kvm->arch.shadow_root_allocated, true);
+	smp_store_release(&kvm->arch.mmu_page_tracking_enabled, true);
 
 out_unlock:
 	mutex_unlock(&kvm->slots_arch_lock);
@@ -3480,7 +3479,7 @@  static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		}
 	}
 
-	r = mmu_first_shadow_root_alloc(vcpu->kvm);
+	r = mmu_enable_write_tracking(vcpu->kvm);
 	if (r)
 		return r;
 
@@ -5753,6 +5752,10 @@  int kvm_mmu_init_vm(struct kvm *kvm)
 	node->track_write = kvm_mmu_pte_write;
 	node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
 	kvm_page_track_register_notifier(kvm, node);
+
+	if (IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) || !tdp_enabled)
+		mmu_enable_write_tracking(kvm);
+
 	return 0;
 }
 
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 2e09d1b6249f3..8857d629036d7 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -21,10 +21,16 @@ 
 
 bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
 {
-	return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
-	       !tdp_enabled || kvm_shadow_root_allocated(kvm);
+	return mmu_page_tracking_enabled(kvm);
 }
 
+int kvm_page_track_write_tracking_enable(struct kvm *kvm)
+{
+	return mmu_enable_write_tracking(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_page_track_write_tracking_enable);
+
+
 void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
 {
 	int i;