[RFC,v2,16/69] KVM: x86/mmu: Zap only leaf SPTEs for deleted/moved memslot by default

Message ID 78d02fee3a21741cc26f6b6b2fba258cd52f2c3c.1625186503.git.isaku.yamahata@intel.com
State New, archived
Series KVM: X86: TDX support

Commit Message

Isaku Yamahata July 2, 2021, 10:04 p.m. UTC
From: Sean Christopherson <sean.j.christopherson@intel.com>

Zap only leaf SPTEs when deleting/moving a memslot by default, and add a
module param to allow reverting to the old behavior of zapping all SPTEs
at all levels and memslots when any memslot is updated.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/kvm/mmu/mmu.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)
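
For reference, the parameter is registered read-only (mode 0444): it can be inspected via /sys/module/kvm/parameters/memslot_update_zap_all, but it must be set when kvm.ko is loaded, e.g. with kvm.memslot_update_zap_all=1 on the kernel command line. An annotated copy of the two registration lines from the hunk below (the comments are added here for explanation and are not in the patch):

static bool __read_mostly memslot_update_zap_all;	/* defaults to false: leaf-only zap */
module_param(memslot_update_zap_all, bool, 0444);	/* 0444: readable, not writable after load */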

Comments

Paolo Bonzini July 6, 2021, 1:44 p.m. UTC | #1
On 03/07/21 00:04, isaku.yamahata@intel.com wrote:
> From: Sean Christopherson <sean.j.christopherson@intel.com>
> 
> Zap only leaf SPTEs when deleting/moving a memslot by default, and add a
> module param to allow reverting to the old behavior of zapping all SPTEs
> at all levels and memslots when any memslot is updated.
> 
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>   arch/x86/kvm/mmu/mmu.c | 21 ++++++++++++++++++++-
>   1 file changed, 20 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 8d5876dfc6b7..5b8a640f8042 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -85,6 +85,9 @@ __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
>   static bool __read_mostly force_flush_and_sync_on_reuse;
>   module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
>   
> +static bool __read_mostly memslot_update_zap_all;
> +module_param(memslot_update_zap_all, bool, 0444);
> +
>   /*
>    * When setting this variable to true it enables Two-Dimensional-Paging
>    * where the hardware walks 2 page tables:
> @@ -5480,11 +5483,27 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
>   	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
>   }
>   
> +static void kvm_mmu_zap_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
> +{
> +	/*
> +	 * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst
> +	 * case scenario we'll have unused shadow pages lying around until they
> +	 * are recycled due to age or when the VM is destroyed.
> +	 */
> +	write_lock(&kvm->mmu_lock);
> +	slot_handle_level(kvm, slot, kvm_zap_rmapp, PG_LEVEL_4K,
> +			  KVM_MAX_HUGEPAGE_LEVEL, true);
> +	write_unlock(&kvm->mmu_lock);
> +}
> +
>   static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
>   			struct kvm_memory_slot *slot,
>   			struct kvm_page_track_notifier_node *node)
>   {
> -	kvm_mmu_zap_all_fast(kvm);
> +	if (memslot_update_zap_all)
> +		kvm_mmu_zap_all_fast(kvm);
> +	else
> +		kvm_mmu_zap_memslot(kvm, slot);
>   }
>   
>   void kvm_mmu_init_vm(struct kvm *kvm)
> 

This is the old patch that broke VFIO for some unknown reason.  The 
commit message should at least say why memslot_update_zap_all is not 
true by default.  Also, IIUC the bug is still there with NX hugepage 
splits
disabled, but what if the TDP MMU is enabled?

Paolo
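
Context for the TDP MMU question: slot_handle_level() walks only the shadow-MMU rmaps, so kvm_mmu_zap_memslot() as posted would not touch leaf SPTEs the TDP MMU holds for the slot. A minimal sketch of one way to cover both MMUs, reusing the existing kvm_zap_gfn_range() helper (which takes mmu_lock itself and zaps the given range in both the rmaps and, when enabled, the TDP MMU); this is an editorial illustration, not something the series does:

static void kvm_mmu_zap_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	/* Zap SPTEs covering the slot's gfn range in both the shadow MMU and TDP MMU. */
	kvm_zap_gfn_range(kvm, slot->base_gfn, slot->base_gfn + slot->npages);
}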
Sean Christopherson July 13, 2021, 8:17 p.m. UTC | #2
On Tue, Jul 06, 2021, Paolo Bonzini wrote:
> On 03/07/21 00:04, isaku.yamahata@intel.com wrote:
> > From: Sean Christopherson <sean.j.christopherson@intel.com>
> > 
> > Zap only leaf SPTEs when deleting/moving a memslot by default, and add a
> > module param to allow reverting to the old behavior of zapping all SPTEs
> > at all levels and memslots when any memslot is updated.
> > 
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > ---
> >   arch/x86/kvm/mmu/mmu.c | 21 ++++++++++++++++++++-
> >   1 file changed, 20 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 8d5876dfc6b7..5b8a640f8042 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -85,6 +85,9 @@ __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
> >   static bool __read_mostly force_flush_and_sync_on_reuse;
> >   module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
> > +static bool __read_mostly memslot_update_zap_all;
> > +module_param(memslot_update_zap_all, bool, 0444);
> > +
> >   /*
> >    * When setting this variable to true it enables Two-Dimensional-Paging
> >    * where the hardware walks 2 page tables:
> > @@ -5480,11 +5483,27 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
> >   	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
> >   }
> > +static void kvm_mmu_zap_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
> > +{
> > +	/*
> > +	 * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst
> > +	 * case scenario we'll have unused shadow pages lying around until they
> > +	 * are recycled due to age or when the VM is destroyed.
> > +	 */
> > +	write_lock(&kvm->mmu_lock);
> > +	slot_handle_level(kvm, slot, kvm_zap_rmapp, PG_LEVEL_4K,
> > +			  KVM_MAX_HUGEPAGE_LEVEL, true);
> > +	write_unlock(&kvm->mmu_lock);
> > +}
> > +
> >   static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
> >   			struct kvm_memory_slot *slot,
> >   			struct kvm_page_track_notifier_node *node)
> >   {
> > -	kvm_mmu_zap_all_fast(kvm);
> > +	if (memslot_update_zap_all)
> > +		kvm_mmu_zap_all_fast(kvm);
> > +	else
> > +		kvm_mmu_zap_memslot(kvm, slot);
> >   }
> >   void kvm_mmu_init_vm(struct kvm *kvm)
> > 
> 
> This is the old patch that broke VFIO for some unknown reason.

Yes, my white whale :-/

> The commit message should at least say why memslot_update_zap_all is not true
> by default.  Also, IIUC the bug is still there with NX hugepage splits disabled,

I strongly suspect the bug is also there with hugepage splits enabled; it's just
masked and/or harder to hit.

> but what if the TDP MMU is enabled?

This should not be a module param.  IIRC, the original code I wrote had it as a
per-VM flag that wasn't even exposed to the user, i.e. TDX guests always do the
partial flush and non-TDX guests always do the full flush.  I think that's the
least awful approach if we can't figure out the underlying bug before TDX is
ready for inclusion.
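
For illustration, a minimal sketch of the per-VM approach described above, assuming a hypothetical kvm->arch.memslot_zap_leafs_only flag that KVM would set internally at VM creation (the field name is invented here, not taken from the series):

static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
			struct kvm_memory_slot *slot,
			struct kvm_page_track_notifier_node *node)
{
	/* Hypothetical per-VM flag: true for TDX guests, false otherwise. */
	if (kvm->arch.memslot_zap_leafs_only)
		kvm_mmu_zap_memslot(kvm, slot);
	else
		kvm_mmu_zap_all_fast(kvm);
}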

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8d5876dfc6b7..5b8a640f8042 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -85,6 +85,9 @@  __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
 static bool __read_mostly force_flush_and_sync_on_reuse;
 module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
 
+static bool __read_mostly memslot_update_zap_all;
+module_param(memslot_update_zap_all, bool, 0444);
+
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
  * where the hardware walks 2 page tables:
@@ -5480,11 +5483,27 @@  static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
 	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
 }
 
+static void kvm_mmu_zap_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+	/*
+	 * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst
+	 * case scenario we'll have unused shadow pages lying around until they
+	 * are recycled due to age or when the VM is destroyed.
+	 */
+	write_lock(&kvm->mmu_lock);
+	slot_handle_level(kvm, slot, kvm_zap_rmapp, PG_LEVEL_4K,
+			  KVM_MAX_HUGEPAGE_LEVEL, true);
+	write_unlock(&kvm->mmu_lock);
+}
+
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
 			struct kvm_memory_slot *slot,
 			struct kvm_page_track_notifier_node *node)
 {
-	kvm_mmu_zap_all_fast(kvm);
+	if (memslot_update_zap_all)
+		kvm_mmu_zap_all_fast(kvm);
+	else
+		kvm_mmu_zap_memslot(kvm, slot);
 }
 
 void kvm_mmu_init_vm(struct kvm *kvm)