
[19/27] KVM: x86/mmu: Use page-track notifiers iff there are external users

Message ID 20221223005739.1295925-20-seanjc@google.com (mailing list archive)
State New, archived
Series drm/i915/gvt: KVM: KVMGT fixes and page-track cleanups

Commit Message

Sean Christopherson Dec. 23, 2022, 12:57 a.m. UTC
Disable the page-track notifier code at compile time if there are no
external users, i.e. if CONFIG_KVM_EXTERNAL_WRITE_TRACKING=n.  KVM itself
now hooks emulated writes directly instead of relying on the page-track
mechanism.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_host.h       |  2 ++
 arch/x86/include/asm/kvm_page_track.h |  2 ++
 arch/x86/kvm/mmu/page_track.c         |  9 ++++----
 arch/x86/kvm/mmu/page_track.h         | 30 +++++++++++++++++++++++----
 4 files changed, 35 insertions(+), 8 deletions(-)

Comments

Yan Zhao Dec. 28, 2022, 6:56 a.m. UTC | #1
On Fri, Dec 23, 2022 at 12:57:31AM +0000, Sean Christopherson wrote:
> Disable the page-track notifier code at compile time if there are no
> external users, i.e. if CONFIG_KVM_EXTERNAL_WRITE_TRACKING=n.  KVM itself
> now hooks emulated writes directly instead of relying on the page-track
> mechanism.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h       |  2 ++
>  arch/x86/include/asm/kvm_page_track.h |  2 ++
>  arch/x86/kvm/mmu/page_track.c         |  9 ++++----
>  arch/x86/kvm/mmu/page_track.h         | 30 +++++++++++++++++++++++----
>  4 files changed, 35 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index eec424fac0ba..e8f8e1bd96c7 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1223,7 +1223,9 @@ struct kvm_arch {
>  	 * create an NX huge page (without hanging the guest).
>  	 */
>  	struct list_head possible_nx_huge_pages;
> +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
>  	struct kvm_page_track_notifier_head track_notifier_head;
> +#endif
>  	/*
>  	 * Protects marking pages unsync during page faults, as TDP MMU page
>  	 * faults only take mmu_lock for read.  For simplicity, the unsync
> diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
> index deece45936a5..53c2adb25a07 100644
> --- a/arch/x86/include/asm/kvm_page_track.h
> +++ b/arch/x86/include/asm/kvm_page_track.h
> @@ -55,6 +55,7 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm,
>  				     struct kvm_memory_slot *slot, gfn_t gfn,
>  				     enum kvm_page_track_mode mode);
>  
> +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
>  enum pg_level kvm_page_track_max_mapping_level(struct kvm *kvm, gfn_t gfn,
>  					       enum pg_level max_level);
>  
> @@ -64,5 +65,6 @@ kvm_page_track_register_notifier(struct kvm *kvm,
>  void
>  kvm_page_track_unregister_notifier(struct kvm *kvm,
>  				   struct kvm_page_track_notifier_node *n);
> +#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
>  
>  #endif
> diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
> index 2b302fd2c5dd..f932909aa9b5 100644
> --- a/arch/x86/kvm/mmu/page_track.c
> +++ b/arch/x86/kvm/mmu/page_track.c
> @@ -193,6 +193,7 @@ bool kvm_slot_page_track_is_active(struct kvm *kvm,
>  	return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
>  }
>  
> +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
>  void kvm_page_track_cleanup(struct kvm *kvm)
>  {
>  	struct kvm_page_track_notifier_head *head;
> @@ -208,6 +209,7 @@ int kvm_page_track_init(struct kvm *kvm)
>  	head = &kvm->arch.track_notifier_head;
>  	INIT_HLIST_HEAD(&head->track_notifier_list);
>  	return init_srcu_struct(&head->track_srcu);
> +	return 0;
Double "return"s.


>  }
>  
>  /*
> @@ -254,8 +256,8 @@ EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
>   * The node should figure out if the written page is the one that node is
>   * interested in by itself.
>   */
> -void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
> -			  int bytes)
> +void __kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
> +			    int bytes)
>  {
>  	struct kvm_page_track_notifier_head *head;
>  	struct kvm_page_track_notifier_node *n;
> @@ -272,8 +274,6 @@ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
>  		if (n->track_write)
>  			n->track_write(gpa, new, bytes, n);
>  	srcu_read_unlock(&head->track_srcu, idx);
> -
> -	kvm_mmu_track_write(vcpu, gpa, new, bytes);
>  }
>  
>  /*
> @@ -316,3 +316,4 @@ enum pg_level kvm_page_track_max_mapping_level(struct kvm *kvm, gfn_t gfn,
>  	return max_level;
>  }
>  EXPORT_SYMBOL_GPL(kvm_page_track_max_mapping_level);
> +#endif
> diff --git a/arch/x86/kvm/mmu/page_track.h b/arch/x86/kvm/mmu/page_track.h
> index 89712f123ad3..1b363784aa4a 100644
> --- a/arch/x86/kvm/mmu/page_track.h
> +++ b/arch/x86/kvm/mmu/page_track.h
> @@ -6,8 +6,6 @@
>  
>  #include <asm/kvm_page_track.h>
>  
> -int kvm_page_track_init(struct kvm *kvm);
> -void kvm_page_track_cleanup(struct kvm *kvm);
>  
>  bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
>  int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
> @@ -21,13 +19,37 @@ bool kvm_slot_page_track_is_active(struct kvm *kvm,
>  				   const struct kvm_memory_slot *slot,
>  				   gfn_t gfn, enum kvm_page_track_mode mode);
>  
> -void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
> -			  int bytes);
> +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
> +int kvm_page_track_init(struct kvm *kvm);
> +void kvm_page_track_cleanup(struct kvm *kvm);
> +
> +void __kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
> +			    int bytes);
>  void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
>  
>  static inline bool kvm_page_track_has_external_user(struct kvm *kvm)
>  {
>  	return hlist_empty(&kvm->arch.track_notifier_head.track_notifier_list);
>  }
> +#else
> +static inline int kvm_page_track_init(struct kvm *kvm) { return 0; }
> +static inline void kvm_page_track_cleanup(struct kvm *kvm) { }
> +
> +static inline void __kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> +					  const u8 *new, int bytes) { }
> +static inline void kvm_page_track_delete_slot(struct kvm *kvm,
> +					      struct kvm_memory_slot *slot) { }
> +
> +static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return false; }
> +
> +#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
> +
> +static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> +					const u8 *new, int bytes)
> +{
> +	__kvm_page_track_write(vcpu, gpa, new, bytes);
> +
Why not convert "vcpu" to "kvm" in __kvm_page_track_write()?
i.e.
void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes);


Thanks
Yan

> +	kvm_mmu_track_write(vcpu, gpa, new, bytes);
> +}
>  
>  #endif /* __KVM_X86_PAGE_TRACK_H */
> -- 
> 2.39.0.314.g84b9a713c41-goog
>
Sean Christopherson Jan. 4, 2023, 12:50 a.m. UTC | #2
On Wed, Dec 28, 2022, Yan Zhao wrote:
> On Fri, Dec 23, 2022 at 12:57:31AM +0000, Sean Christopherson wrote:
> > diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
> > index 2b302fd2c5dd..f932909aa9b5 100644
> > --- a/arch/x86/kvm/mmu/page_track.c
> > +++ b/arch/x86/kvm/mmu/page_track.c
> > @@ -193,6 +193,7 @@ bool kvm_slot_page_track_is_active(struct kvm *kvm,
> >  	return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
> >  }
> >  
> > +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
> >  void kvm_page_track_cleanup(struct kvm *kvm)
> >  {
> >  	struct kvm_page_track_notifier_head *head;
> > @@ -208,6 +209,7 @@ int kvm_page_track_init(struct kvm *kvm)
> >  	head = &kvm->arch.track_notifier_head;
> >  	INIT_HLIST_HEAD(&head->track_notifier_list);
> >  	return init_srcu_struct(&head->track_srcu);
> > +	return 0;
> Double "return"s.

Huh, I'm surprised this didn't throw a warning.  I'm pretty sure I screwed up a
refactoring; I originally had the "return 0" in an #else branch.
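
For reference, the intended shape (a quick sketch of what the next version
should look like, i.e. with the stray "return 0" dropped):

int kvm_page_track_init(struct kvm *kvm)
{
	struct kvm_page_track_notifier_head *head;

	head = &kvm->arch.track_notifier_head;
	INIT_HLIST_HEAD(&head->track_notifier_list);
	return init_srcu_struct(&head->track_srcu);
}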

> > +#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
> > +
> > +static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> > +					const u8 *new, int bytes)
> > +{
> > +	__kvm_page_track_write(vcpu, gpa, new, bytes);
> > +
> Why not convert "vcpu" to "kvm" in __kvm_page_track_write()?

No reason, I just overlooked the opportunistic cleanup.  I'll do this in the next
version.

Thanks much for the reviews!
Like Xu Aug. 7, 2023, 12:01 p.m. UTC | #3
On 23/12/2022 8:57 am, Sean Christopherson wrote:
> +static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> +					const u8 *new, int bytes)
> +{
> +	__kvm_page_track_write(vcpu, gpa, new, bytes);
> +
> +	kvm_mmu_track_write(vcpu, gpa, new, bytes);
> +}

The kvm_mmu_track_write() is only used for x86, where the incoming parameter
"u8 *new" has not been required since 0e0fee5c539b ("kvm: mmu: Fix race in
emulated page table writes"), please help confirm if it's still needed ? Thanks.
A minor clean up is proposed.
Sean Christopherson Aug. 7, 2023, 5:19 p.m. UTC | #4
On Mon, Aug 07, 2023, Like Xu wrote:
> On 23/12/2022 8:57 am, Sean Christopherson wrote:
> > +static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> > +					const u8 *new, int bytes)
> > +{
> > +	__kvm_page_track_write(vcpu, gpa, new, bytes);
> > +
> > +	kvm_mmu_track_write(vcpu, gpa, new, bytes);
> > +}
> 
> The kvm_mmu_track_write() is only used for x86, where the incoming parameter
> "u8 *new" has not been required since 0e0fee5c539b ("kvm: mmu: Fix race in
> emulated page table writes"), please help confirm if it's still needed? Thanks.
> A minor cleanup is proposed.

Hmm, unless I'm misreading things, KVMGT ultimately doesn't consume @new either.
So I think we can remove @new from kvm_page_track_write() entirely.

Feel free to send a patch, otherwise I'll get to it later this week.
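
Something like the below, presumably (untested sketch, hypothetical final
signatures, assuming nothing else grows a dependency on @new):

static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
					int bytes)
{
	/* Sketch: @new dropped from the entire write-tracking path. */
	__kvm_page_track_write(vcpu->kvm, gpa, bytes);

	kvm_mmu_track_write(vcpu, gpa, bytes);
}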
Yan Zhao Aug. 9, 2023, 1:02 a.m. UTC | #5
On Mon, Aug 07, 2023 at 10:19:07AM -0700, Sean Christopherson wrote:
> On Mon, Aug 07, 2023, Like Xu wrote:
> > On 23/12/2022 8:57 am, Sean Christopherson wrote:
> > > +static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> > > +					const u8 *new, int bytes)
> > > +{
> > > +	__kvm_page_track_write(vcpu, gpa, new, bytes);
> > > +
> > > +	kvm_mmu_track_write(vcpu, gpa, new, bytes);
> > > +}
> > 
> > The kvm_mmu_track_write() is only used for x86, where the incoming parameter
> > "u8 *new" has not been required since 0e0fee5c539b ("kvm: mmu: Fix race in
> > emulated page table writes"), please help confirm if it's still needed? Thanks.
> > A minor cleanup is proposed.
> 
> Hmm, unless I'm misreading things, KVMGT ultimately doesn't consume @new either.
> So I think we can remove @new from kvm_page_track_write() entirely.
Sorry for the late reply.
Yes, KVMGT does not consume @new and it reads the guest PTE again in the
page track write handler.

But I have a couple of questions related to the mentioned commit as
below:

(1) If "re-reading the current value of the guest PTE after the MMU lock has
been acquired", then should KVMGT acquire the MMU lock too?
If so, could we move the MMU lock and unlock into kvm_page_track_write(),
as it's common to both?

(2) Even if KVMGT consumes @new,
will kvm_page_track_write() be called once or twice if there are two
concurrent emulated writes?


commit 0e0fee5c539b61fdd098332e0e2cc375d9073706
Author: Junaid Shahid <junaids@google.com>
Date:   Wed Oct 31 14:53:57 2018 -0700

    kvm: mmu: Fix race in emulated page table writes
    
    When a guest page table is updated via an emulated write,
    kvm_mmu_pte_write() is called to update the shadow PTE using the just
    written guest PTE value. But if two emulated guest PTE writes happened
    concurrently, it is possible that the guest PTE and the shadow PTE end
    up being out of sync. Emulated writes do not mark the shadow page as
    unsync-ed, so this inconsistency will not be resolved even by a guest TLB
    flush (unless the page was marked as unsync-ed at some other point).
    
    This is fixed by re-reading the current value of the guest PTE after the
    MMU lock has been acquired instead of just using the value that was
    written prior to calling kvm_mmu_pte_write().
Sean Christopherson Aug. 9, 2023, 2:33 p.m. UTC | #6
On Wed, Aug 09, 2023, Yan Zhao wrote:
> On Mon, Aug 07, 2023 at 10:19:07AM -0700, Sean Christopherson wrote:
> > On Mon, Aug 07, 2023, Like Xu wrote:
> > > On 23/12/2022 8:57 am, Sean Christopherson wrote:
> > > > +static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> > > > +					const u8 *new, int bytes)
> > > > +{
> > > > +	__kvm_page_track_write(vcpu, gpa, new, bytes);
> > > > +
> > > > +	kvm_mmu_track_write(vcpu, gpa, new, bytes);
> > > > +}
> > > 
> > > The kvm_mmu_track_write() is only used for x86, where the incoming parameter
> > > "u8 *new" has not been required since 0e0fee5c539b ("kvm: mmu: Fix race in
> > > emulated page table writes"), please help confirm if it's still needed? Thanks.
> > > A minor cleanup is proposed.
> > 
> > Hmm, unless I'm misreading things, KVMGT ultimately doesn't consume @new either.
> > So I think we can remove @new from kvm_page_track_write() entirely.
> Sorry for the late reply.
> Yes, KVMGT does not consume @new and it reads the guest PTE again in the
> page track write handler.
> 
> But I have a couple of questions related to the mentioned commit as
> below:
> 
> (1) If "re-reading the current value of the guest PTE after the MMU lock has
> been acquired", then should KVMGT acquire the MMU lock too?

No.  If applicable, KVMGT should read the new/current value after acquiring
whatever lock protects the generation (or update) of the shadow entries.  I
suspect KVMGT already does this, but I don't have time to confirm that at this
exact moment.

The race that was fixed in KVM was:

  vCPU0         vCPU1   
  write X
                 write Y
                 sync SPTE w/ Y
  sync SPTE w/ X

Reading the value after acquiring mmu_lock ensures that both vCPUs will see whatever
value "loses" the race, i.e. whatever written value is processed second ('Y' in the
above sequence).
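
I.e. the pattern from 0e0fee5c539b, roughly (a from-memory sketch, not the
exact upstream code):

static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
			      const u8 *new, int bytes)
{
	u64 gentry;

	write_lock(&vcpu->kvm->mmu_lock);

	/*
	 * Re-read the guest PTE under mmu_lock instead of trusting @new,
	 * so that the value processed second is what both vCPUs sync.
	 */
	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);

	/* ... walk the affected shadow pages and update SPTEs w/ gentry ... */

	write_unlock(&vcpu->kvm->mmu_lock);
}
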

> If so, could we move the MMU lock and unlock into kvm_page_track_write(),
> as it's common to both?
> 
> (2) Even if KVMGT consumes @new,
> will kvm_page_track_write() be called once or twice if there are two
> concurrent emulated writes?

Twice, kvm_page_track_write() is wired up directly to the emulation of the write,
i.e. there is no batching.
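
For reference, the emulation-side call site (pre-existing code, which also
shows up as context in the diff further down this thread):

int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
			const void *val, int bytes)
{
	int ret;

	ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
	if (ret < 0)
		return 0;

	/* Notify page-track users once per emulated write, no batching. */
	kvm_page_track_write(vcpu, gpa, val, bytes);
	return 1;
}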
Yan Zhao Aug. 9, 2023, 11:21 p.m. UTC | #7
On Wed, Aug 09, 2023 at 07:33:45AM -0700, Sean Christopherson wrote:
> On Wed, Aug 09, 2023, Yan Zhao wrote:
> > On Mon, Aug 07, 2023 at 10:19:07AM -0700, Sean Christopherson wrote:
> > > On Mon, Aug 07, 2023, Like Xu wrote:
> > > > On 23/12/2022 8:57 am, Sean Christopherson wrote:
> > > > > +static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> > > > > +					const u8 *new, int bytes)
> > > > > +{
> > > > > +	__kvm_page_track_write(vcpu, gpa, new, bytes);
> > > > > +
> > > > > +	kvm_mmu_track_write(vcpu, gpa, new, bytes);
> > > > > +}
> > > > 
> > > > The kvm_mmu_track_write() is only used for x86, where the incoming parameter
> > > > "u8 *new" has not been required since 0e0fee5c539b ("kvm: mmu: Fix race in
> > > > emulated page table writes"), please help confirm if it's still needed ? Thanks.
> > > > A minor clean up is proposed.
> > > 
> > > Hmm, unless I'm misreading things, KVMGT ultimately doesn't consume @new either.
> > > So I think we can remove @new from kvm_page_track_write() entirely.
> > Sorry for the late reply.
> > Yes, KVMGT does not consume @new and it reads the guest PTE again in the
> > page track write handler.
> > 
> > But I have a couple of questions related to the mentioned commit as
> > below:
> > 
> > (1) If "re-reading the current value of the guest PTE after the MMU lock has
> > been acquired", then should KVMGT acquire the MMU lock too?
> 
> No.  If applicable, KVMGT should read the new/current value after acquiring
> whatever lock protects the generation (or update) of the shadow entries.  I
> suspect KVMGT already does this, but I don't have time to confirm that at this
I think the mutex lock and unlock of info->vgpu_lock you added in
kvmgt_page_track_write() are the counterpart :)
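i.e., sketching this series' KVMGT handler from memory (exact form may
differ):

static void kvmgt_page_track_write(gpa_t gpa, const u8 *val, int len,
				   struct kvm_page_track_notifier_node *node)
{
	struct intel_vgpu *info =
		container_of(node, struct intel_vgpu, track_node);

	/* vgpu_lock plays the role mmu_lock plays for KVM's own sync. */
	mutex_lock(&info->vgpu_lock);

	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
		intel_vgpu_page_track_handler(info, gpa, (void *)val, len);

	mutex_unlock(&info->vgpu_lock);
}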

> exact moment.
> 
> The race that was fixed in KVM was:
> 
>   vCPU0         vCPU1   
>   write X
>                  write Y
>                  sync SPTE w/ Y
>   sync SPTE w/ X
> 
> Reading the value after acquiring mmu_lock ensures that both vCPUs will see whatever
> value "loses" the race, i.e. whatever written value is processed second ('Y' in the
> above sequence).
I suspect that vCPU0 may still generate a wrong SPTE if vCPU1 wrote 4
bytes while vCPU0 wrote 8 bytes, though the chances are very low.


> 
> > If so, could we move the MMU lock and unlock into kvm_page_track_write(),
> > as it's common to both?
> > 
> > (2) Even if KVMGT consumes @new,
> > will kvm_page_track_write() be called once or twice if there are two
> > concurrent emulated writes?
> 
> Twice, kvm_page_track_write() is wired up directly to the emulation of the write,
> i.e. there is no batching.
Yan Zhao Aug. 10, 2023, 3:02 a.m. UTC | #8
On Thu, Aug 10, 2023 at 07:21:03AM +0800, Yan Zhao wrote:
> On Wed, Aug 09, 2023 at 07:33:45AM -0700, Sean Christopherson wrote:
> > On Wed, Aug 09, 2023, Yan Zhao wrote:
> > > On Mon, Aug 07, 2023 at 10:19:07AM -0700, Sean Christopherson wrote:
> > > > On Mon, Aug 07, 2023, Like Xu wrote:
> > > > > On 23/12/2022 8:57 am, Sean Christopherson wrote:
> > > > > > +static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> > > > > > +					const u8 *new, int bytes)
> > > > > > +{
> > > > > > +	__kvm_page_track_write(vcpu, gpa, new, bytes);
> > > > > > +
> > > > > > +	kvm_mmu_track_write(vcpu, gpa, new, bytes);
> > > > > > +}
> > > > > 
> > > > > The kvm_mmu_track_write() is only used for x86, where the incoming parameter
> > > > > "u8 *new" has not been required since 0e0fee5c539b ("kvm: mmu: Fix race in
> > > > > emulated page table writes"), please help confirm if it's still needed? Thanks.
> > > > > A minor cleanup is proposed.
> > > > 
> > > > Hmm, unless I'm misreading things, KVMGT ultimately doesn't consume @new either.
> > > > So I think we can remove @new from kvm_page_track_write() entirely.
> > > Sorry for the late reply.
> > > Yes, KVMGT does not consume @new and it reads the guest PTE again in the
> > > page track write handler.
> > > 
> > > But I have a couple of questions related to the mentioned commit as
> > > below:
> > > 
> > > (1) If "re-reading the current value of the guest PTE after the MMU lock has
> > > been acquired", then should KVMGT acquire the MMU lock too?
> > 
> > No.  If applicable, KVMGT should read the new/current value after acquiring
> > whatever lock protects the generation (or update) of the shadow entries.  I
> > suspect KVMGT already does this, but I don't have time to confirm that at this
> I think the mutex lock and unlock of info->vgpu_lock you added in
> kvmgt_page_track_write() are the counterpart :)
> 
> > exact moment.
> > 
> > The race that was fixed in KVM was:
> > 
> >   vCPU0         vCPU1   
> >   write X
> >                  write Y
> >                  sync SPTE w/ Y
> >   sync SPTE w/ X
> > 
> > Reading the value after acquiring mmu_lock ensures that both vCPUs will see whatever
> > value "loses" the race, i.e. whatever written value is processed second ('Y' in the
> > above sequence).
> I suspect that vCPU0 may still generate a wrong SPTE if vCPU1 wrote 4
> bytes while vCPU0 wrote 8 bytes, though the chances are very low.
> 
This could happen in the sequence below:
vCPU0 updates a PTE to AABBCCDD;
vCPU1 updates a PTE to EEFFGGHH in two writes.
(each character stands for a byte)

vCPU0                  vCPU1   
write AABBCCDD
                       write GGHH
                       detect 4-byte write and hold off sync
sync SPTE w/ AABBGGHH
                       write EEFF
                       sync SPTE w/ EEFFGGHH


Do you think the serialization work below is worthwhile?

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a915e23d61fa..51cd0ab73529 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1445,6 +1445,8 @@ struct kvm_arch {
         */
 #define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1)
        struct kvm_mmu_memory_cache split_desc_cache;
+
+       struct xarray track_writing_range;
 };

 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index fd04e618ad2d..4b271701dcf6 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -142,12 +142,14 @@ void kvm_page_track_cleanup(struct kvm *kvm)

        head = &kvm->arch.track_notifier_head;
        cleanup_srcu_struct(&head->track_srcu);
+       xa_destroy(&kvm->arch.track_writing_range);
 }

 int kvm_page_track_init(struct kvm *kvm)
 {
        struct kvm_page_track_notifier_head *head;

+       xa_init(&kvm->arch.track_writing_range);
        head = &kvm->arch.track_notifier_head;
        INIT_HLIST_HEAD(&head->track_notifier_list);
        return init_srcu_struct(&head->track_srcu);
diff --git a/arch/x86/kvm/mmu/page_track.h b/arch/x86/kvm/mmu/page_track.h
index 62f98c6c5af3..1829792b9892 100644
--- a/arch/x86/kvm/mmu/page_track.h
+++ b/arch/x86/kvm/mmu/page_track.h
 static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return false; }

 #endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */

-static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                                       const u8 *new, int bytes)
+static inline void kvm_page_track_write_begin(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                             int bytes)
 {
+       struct kvm *kvm = vcpu->kvm;
+       gfn_t gfn = gpa_to_gfn(gpa);
+
+       WARN_ON(gfn != gpa_to_gfn(gpa + bytes - 1));
+
+       if (!kvm_page_track_write_tracking_enabled(kvm))
+               return;
+
+retry:
+       if (xa_insert(&kvm->arch.track_writing_range, gfn, xa_mk_value(gfn),
+                     GFP_KERNEL_ACCOUNT)) {
+               cpu_relax();
+               goto retry;
+       }
+       return;
+}
+
+static inline void kvm_page_track_write_abort(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                             int bytes)
+{
+       if (!kvm_page_track_write_tracking_enabled(vcpu->kvm))
+               return;
+
+       xa_erase(&vcpu->kvm->arch.track_writing_range, gpa_to_gfn(gpa));
+}
+
+static inline void kvm_page_track_write_end(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                           const u8 *new, int bytes)
+{
+       if (!kvm_page_track_write_tracking_enabled(vcpu->kvm))
+               return;
+
        __kvm_page_track_write(vcpu->kvm, gpa, new, bytes);

        kvm_mmu_track_write(vcpu, gpa, new, bytes);
+
+       xa_erase(&vcpu->kvm->arch.track_writing_range, gpa_to_gfn(gpa));
 }

 #endif /* __KVM_X86_PAGE_TRACK_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 05a68d7d99fe..9b75829d5d7a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7544,10 +7544,13 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 {
        int ret;

+       kvm_page_track_write_begin(vcpu, gpa, bytes);
        ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
-       if (ret < 0)
+       if (ret < 0) {
+               kvm_page_track_write_abort(vcpu, gpa, bytes);
                return 0;
-       kvm_page_track_write(vcpu, gpa, val, bytes);
+       }
+       kvm_page_track_write_end(vcpu, gpa, val, bytes);
        return 1;
 }

@@ -7792,6 +7795,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,

        hva += offset_in_page(gpa);

+       kvm_page_track_write_begin(vcpu, gpa, bytes);
        switch (bytes) {
        case 1:
                r = emulator_try_cmpxchg_user(u8, hva, old, new);
@@ -7809,12 +7813,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
                BUG();
        }

-       if (r < 0)
+       if (r < 0) {
+               kvm_page_track_write_abort(vcpu, gpa, bytes);
                return X86EMUL_UNHANDLEABLE;
-       if (r)
+       }
+       if (r) {
+               kvm_page_track_write_abort(vcpu, gpa, bytes);
                return X86EMUL_CMPXCHG_FAILED;
+       }

-       kvm_page_track_write(vcpu, gpa, new, bytes);
+       kvm_page_track_write_end(vcpu, gpa, new, bytes);

        return X86EMUL_CONTINUE;
Sean Christopherson Aug. 10, 2023, 3:41 p.m. UTC | #9
On Thu, Aug 10, 2023, Yan Zhao wrote:
> On Thu, Aug 10, 2023 at 07:21:03AM +0800, Yan Zhao wrote:
> > > Reading the value after acquiring mmu_lock ensures that both vCPUs will see whatever
> > > value "loses" the race, i.e. whatever written value is processed second ('Y' in the
> > > above sequence).
> > I suspect that vCPU0 may still generate a wrong SPTE if vCPU1 wrote 4
> > bytes while vCPU0 wrote 8 bytes, though the chances are very low.
> > 
> This could happen in the sequence below:
> vCPU0 updates a PTE to AABBCCDD;
> vCPU1 updates a PTE to EEFFGGHH in two writes.
> (each character stands for a byte)
> 
> vCPU0                  vCPU1   
> write AABBCCDD
>                        write GGHH
>                        detect 4-byte write and hold off sync
> sync SPTE w/ AABBGGHH
>                        write EEFF
>                        sync SPTE w/ EEFFGGHH
> 
> 
> Do you think the serialization work below is worthwhile?

No, because I don't see any KVM bugs with the above sequence.  If the guest doesn't
ensure *all* writes from vCPU0 and vCPU1 are fully serialized, then it is completely
legal for hardware (KVM in this case) to consume AABBGGHH as a PTE.  The only thing
the guest shouldn't see is EEFFCCDD, but I don't see how that can happen.
Yan Zhao Aug. 11, 2023, 5:57 a.m. UTC | #10
On Thu, Aug 10, 2023 at 08:41:14AM -0700, Sean Christopherson wrote:
> On Thu, Aug 10, 2023, Yan Zhao wrote:
> > On Thu, Aug 10, 2023 at 07:21:03AM +0800, Yan Zhao wrote:
> > > > Reading the value after acquiring mmu_lock ensures that both vCPUs will see whatever
> > > > value "loses" the race, i.e. whatever written value is processed second ('Y' in the
> > > > above sequence).
> > > I suspect that vCPU0 may still generate a wrong SPTE if vCPU1 wrote 4
> > > bytes while vCPU0 wrote 8 bytes, though the chances are very low.
> > > 
> > This could happen in the sequence below:
> > vCPU0 updates a PTE to AABBCCDD;
> > vCPU1 updates a PTE to EEFFGGHH in two writes.
> > (each character stands for a byte)
> > 
> > vCPU0                  vCPU1   
> > write AABBCCDD
> >                        write GGHH
> >                        detect 4-byte write and hold off sync
> > sync SPTE w/ AABBGGHH
> >                        write EEFF
> >                        sync SPTE w/ EEFFGGHH
> > 
> > 
> > Do you think the serialization work below is worthwhile?
> 
> No, because I don't see any KVM bugs with the above sequence.  If the guest doesn't
> ensure *all* writes from vCPU0 and vCPU1 are fully serialized, then it is completely
> legal for hardware (KVM in this case) to consume AABBGGHH as a PTE.  The only thing
> the guest shouldn't see is EEFFCCDD, but I don't see how that can happen.
Ok, though it still feels a little odd that while a 1st cmpxchg instruction on a
GPA is still under emulation, a 2nd or 3rd... cmpxchg instruction to the same GPA
may have returned, and they all succeeded :)

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index eec424fac0ba..e8f8e1bd96c7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1223,7 +1223,9 @@  struct kvm_arch {
 	 * create an NX huge page (without hanging the guest).
 	 */
 	struct list_head possible_nx_huge_pages;
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
 	struct kvm_page_track_notifier_head track_notifier_head;
+#endif
 	/*
 	 * Protects marking pages unsync during page faults, as TDP MMU page
 	 * faults only take mmu_lock for read.  For simplicity, the unsync
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
index deece45936a5..53c2adb25a07 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -55,6 +55,7 @@  void kvm_slot_page_track_remove_page(struct kvm *kvm,
 				     struct kvm_memory_slot *slot, gfn_t gfn,
 				     enum kvm_page_track_mode mode);
 
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
 enum pg_level kvm_page_track_max_mapping_level(struct kvm *kvm, gfn_t gfn,
 					       enum pg_level max_level);
 
@@ -64,5 +65,6 @@  kvm_page_track_register_notifier(struct kvm *kvm,
 void
 kvm_page_track_unregister_notifier(struct kvm *kvm,
 				   struct kvm_page_track_notifier_node *n);
+#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
 
 #endif
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 2b302fd2c5dd..f932909aa9b5 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -193,6 +193,7 @@  bool kvm_slot_page_track_is_active(struct kvm *kvm,
 	return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
 }
 
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
 void kvm_page_track_cleanup(struct kvm *kvm)
 {
 	struct kvm_page_track_notifier_head *head;
@@ -208,6 +209,7 @@  int kvm_page_track_init(struct kvm *kvm)
 	head = &kvm->arch.track_notifier_head;
 	INIT_HLIST_HEAD(&head->track_notifier_list);
 	return init_srcu_struct(&head->track_srcu);
+	return 0;
 }
 
 /*
@@ -254,8 +256,8 @@  EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
  * The node should figure out if the written page is the one that node is
  * interested in by itself.
  */
-void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
-			  int bytes)
+void __kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+			    int bytes)
 {
 	struct kvm_page_track_notifier_head *head;
 	struct kvm_page_track_notifier_node *n;
@@ -272,8 +274,6 @@  void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
 		if (n->track_write)
 			n->track_write(gpa, new, bytes, n);
 	srcu_read_unlock(&head->track_srcu, idx);
-
-	kvm_mmu_track_write(vcpu, gpa, new, bytes);
 }
 
 /*
@@ -316,3 +316,4 @@  enum pg_level kvm_page_track_max_mapping_level(struct kvm *kvm, gfn_t gfn,
 	return max_level;
 }
 EXPORT_SYMBOL_GPL(kvm_page_track_max_mapping_level);
+#endif
diff --git a/arch/x86/kvm/mmu/page_track.h b/arch/x86/kvm/mmu/page_track.h
index 89712f123ad3..1b363784aa4a 100644
--- a/arch/x86/kvm/mmu/page_track.h
+++ b/arch/x86/kvm/mmu/page_track.h
@@ -6,8 +6,6 @@ 
 
 #include <asm/kvm_page_track.h>
 
-int kvm_page_track_init(struct kvm *kvm);
-void kvm_page_track_cleanup(struct kvm *kvm);
 
 bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
 int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
@@ -21,13 +19,37 @@  bool kvm_slot_page_track_is_active(struct kvm *kvm,
 				   const struct kvm_memory_slot *slot,
 				   gfn_t gfn, enum kvm_page_track_mode mode);
 
-void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
-			  int bytes);
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+int kvm_page_track_init(struct kvm *kvm);
+void kvm_page_track_cleanup(struct kvm *kvm);
+
+void __kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+			    int bytes);
 void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
 
 static inline bool kvm_page_track_has_external_user(struct kvm *kvm)
 {
 	return hlist_empty(&kvm->arch.track_notifier_head.track_notifier_list);
 }
+#else
+static inline int kvm_page_track_init(struct kvm *kvm) { return 0; }
+static inline void kvm_page_track_cleanup(struct kvm *kvm) { }
+
+static inline void __kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+					  const u8 *new, int bytes) { }
+static inline void kvm_page_track_delete_slot(struct kvm *kvm,
+					      struct kvm_memory_slot *slot) { }
+
+static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return false; }
+
+#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
+
+static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+					const u8 *new, int bytes)
+{
+	__kvm_page_track_write(vcpu, gpa, new, bytes);
+
+	kvm_mmu_track_write(vcpu, gpa, new, bytes);
+}
 
 #endif /* __KVM_X86_PAGE_TRACK_H */