diff mbox series

[v4,18/30] KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()

Message ID 20220303193842.370645-19-pbonzini@redhat.com (mailing list archive)
State New, archived
Headers show
Series KVM: x86/mmu: Overhaul TDP MMU zapping and flushing | expand

Commit Message

Paolo Bonzini March 3, 2022, 7:38 p.m. UTC
From: Sean Christopherson <seanjc@google.com>

Zap only leaf SPTEs in the TDP MMU's zap_gfn_range(), and rename various
functions accordingly.  When removing mappings for functional correctness
(except for the stupid VFIO GPU passthrough memslots bug), zapping the
leaf SPTEs is sufficient as the paging structures themselves do not point
at guest memory and do not directly impact the final translation (in the
TDP MMU).

Note, this aligns the TDP MMU with the legacy/full MMU, which zaps only
the rmaps, a.k.a. leaf SPTEs, in kvm_zap_gfn_range() and
kvm_unmap_gfn_range().

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Ben Gardon <bgardon@google.com>
Message-Id: <20220226001546.360188-18-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c     |  4 ++--
 arch/x86/kvm/mmu/tdp_mmu.c | 41 ++++++++++----------------------------
 arch/x86/kvm/mmu/tdp_mmu.h |  8 +-------
 3 files changed, 14 insertions(+), 39 deletions(-)

Comments

Mingwei Zhang March 4, 2022, 1:16 a.m. UTC | #1
On Thu, Mar 03, 2022, Paolo Bonzini wrote:
> From: Sean Christopherson <seanjc@google.com>
> 
> Zap only leaf SPTEs in the TDP MMU's zap_gfn_range(), and rename various
> functions accordingly.  When removing mappings for functional correctness
> (except for the stupid VFIO GPU passthrough memslots bug), zapping the
> leaf SPTEs is sufficient as the paging structures themselves do not point
> at guest memory and do not directly impact the final translation (in the
> TDP MMU).
> 
> Note, this aligns the TDP MMU with the legacy/full MMU, which zaps only
> the rmaps, a.k.a. leaf SPTEs, in kvm_zap_gfn_range() and
> kvm_unmap_gfn_range().
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Reviewed-by: Ben Gardon <bgardon@google.com>
> Message-Id: <20220226001546.360188-18-seanjc@google.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/mmu/mmu.c     |  4 ++--
>  arch/x86/kvm/mmu/tdp_mmu.c | 41 ++++++++++----------------------------
>  arch/x86/kvm/mmu/tdp_mmu.h |  8 +-------
>  3 files changed, 14 insertions(+), 39 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 8408d7db8d2a..febdcaaa7b94 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5834,8 +5834,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
>  
>  	if (is_tdp_mmu_enabled(kvm)) {
>  		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
> -			flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
> -							  gfn_end, flush);
> +			flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
> +						      gfn_end, true, flush);
>  	}
>  
>  	if (flush)
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index f3939ce4a115..c71debdbc732 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -834,10 +834,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>  }
>  
>  /*
> - * Tears down the mappings for the range of gfns, [start, end), and frees the
> - * non-root pages mapping GFNs strictly within that range. Returns true if
> - * SPTEs have been cleared and a TLB flush is needed before releasing the
> - * MMU lock.
> + * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
> + * have been cleared and a TLB flush is needed before releasing the MMU lock.

I think the original code does not _over_ zapping. But the new version
does. Will that have some side effects? In particular, if the range is
within a huge page (or HugeTLB page of various sizes), then we choose to
zap it even if it is more than the range.

Regardless of side effect, I think we probably should mention that in
the comments?

>   *
>   * If can_yield is true, will release the MMU lock and reschedule if the
>   * scheduler needs the CPU or there is contention on the MMU lock. If this
> @@ -845,42 +843,25 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>   * the caller must ensure it does not supply too large a GFN range, or the
>   * operation can cause a soft lockup.
>   */
> -static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
> -			  gfn_t start, gfn_t end, bool can_yield, bool flush)
> +static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> +			      gfn_t start, gfn_t end, bool can_yield, bool flush)
>  {
> -	bool zap_all = (start == 0 && end >= tdp_mmu_max_gfn_host());
>  	struct tdp_iter iter;
>  
> -	/*
> -	 * No need to try to step down in the iterator when zapping all SPTEs,
> -	 * zapping the top-level non-leaf SPTEs will recurse on their children.
> -	 */
> -	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
> -
>  	end = min(end, tdp_mmu_max_gfn_host());
>  
>  	lockdep_assert_held_write(&kvm->mmu_lock);
>  
>  	rcu_read_lock();
>  
> -	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
> +	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
>  		if (can_yield &&
>  		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
>  			flush = false;
>  			continue;
>  		}
>  
> -		if (!is_shadow_present_pte(iter.old_spte))
> -			continue;
> -
> -		/*
> -		 * If this is a non-last-level SPTE that covers a larger range
> -		 * than should be zapped, continue, and zap the mappings at a
> -		 * lower level, except when zapping all SPTEs.
> -		 */
> -		if (!zap_all &&
> -		    (iter.gfn < start ||
> -		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
> +		if (!is_shadow_present_pte(iter.old_spte) ||
>  		    !is_last_spte(iter.old_spte, iter.level))
>  			continue;
>  
> @@ -898,13 +879,13 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
>   * SPTEs have been cleared and a TLB flush is needed before releasing the
>   * MMU lock.
>   */
> -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
> -				 gfn_t end, bool can_yield, bool flush)
> +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
> +			   bool can_yield, bool flush)
>  {
>  	struct kvm_mmu_page *root;
>  
>  	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
> -		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
> +		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, false);
>  
>  	return flush;
>  }
> @@ -1202,8 +1183,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>  bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
>  				 bool flush)
>  {
> -	return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
> -					   range->end, range->may_block, flush);
> +	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
> +				     range->end, range->may_block, flush);
>  }
>  
>  typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> index 5e5ef2576c81..54bc8118c40a 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.h
> +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> @@ -15,14 +15,8 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
>  void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
>  			  bool shared);
>  
> -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
> +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start,
>  				 gfn_t end, bool can_yield, bool flush);
> -static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
> -					     gfn_t start, gfn_t end, bool flush)
> -{
> -	return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush);
> -}
> -
>  bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void kvm_tdp_mmu_zap_all(struct kvm *kvm);
>  void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
> -- 
> 2.31.1
> 
>
Sean Christopherson March 4, 2022, 4:11 p.m. UTC | #2
On Fri, Mar 04, 2022, Mingwei Zhang wrote:
> On Thu, Mar 03, 2022, Paolo Bonzini wrote:
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index f3939ce4a115..c71debdbc732 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -834,10 +834,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> >  }
> >  
> >  /*
> > - * Tears down the mappings for the range of gfns, [start, end), and frees the
> > - * non-root pages mapping GFNs strictly within that range. Returns true if
> > - * SPTEs have been cleared and a TLB flush is needed before releasing the
> > - * MMU lock.
> > + * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
> > + * have been cleared and a TLB flush is needed before releasing the MMU lock.
> 
> I think the original code does not _over_ zapping. But the new version
> does.

No, the new version doesn't overzap.

> Will that have some side effects? In particular, if the range is
> within a huge page (or HugeTLB page of various sizes), then we choose to
> zap it even if it is more than the range.

The old version did that too.  KVM _must_ zap a hugepage that overlaps the range,
otherwise the guest would be able to access memory that has been freed/moved.  If
the operation has unmapped a subset of a hugepage, KVM needs to zap and rebuild
the portions that are still valid using smaller pages.

> Regardless of side effect, I think we probably should mention that in
> the comments?
> > -		/*
> > -		 * If this is a non-last-level SPTE that covers a larger range
> > -		 * than should be zapped, continue, and zap the mappings at a
> > -		 * lower level, except when zapping all SPTEs.
> > -		 */
> > -		if (!zap_all &&
> > -		    (iter.gfn < start ||
> > -		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
> > +		if (!is_shadow_present_pte(iter.old_spte) ||
> >  		    !is_last_spte(iter.old_spte, iter.level))

It's hard to see in the diff, but the key is the "!is_last_spte()" check.  The
check before was skipping non-leaf, a.k.a. shadow pages, if they weren't in the
range.  The new version _always_ skips shadow pages.  Hugepages will always
return true for is_last_spte() and will never be skipped.
Mingwei Zhang March 4, 2022, 6 p.m. UTC | #3
On Fri, Mar 04, 2022, Sean Christopherson wrote:
> On Fri, Mar 04, 2022, Mingwei Zhang wrote:
> > On Thu, Mar 03, 2022, Paolo Bonzini wrote:
> > > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > > index f3939ce4a115..c71debdbc732 100644
> > > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > > @@ -834,10 +834,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> > >  }
> > >  
> > >  /*
> > > - * Tears down the mappings for the range of gfns, [start, end), and frees the
> > > - * non-root pages mapping GFNs strictly within that range. Returns true if
> > > - * SPTEs have been cleared and a TLB flush is needed before releasing the
> > > - * MMU lock.
> > > + * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
> > > + * have been cleared and a TLB flush is needed before releasing the MMU lock.
> > 
> > I think the original code does not _over_ zapping. But the new version
> > does.
> 
> No, the new version doesn't overzap.

It does overzap, but it does not matter and the semantic does not
change.
> 
> > Will that have some side effects? In particular, if the range is
> > within a huge page (or HugeTLB page of various sizes), then we choose to
> > zap it even if it is more than the range.

ACK.
> 
> The old version did that too.  KVM _must_ zap a hugepage that overlaps the range,
> otherwise the guest would be able to access memory that has been freed/moved.  If
> the operation has unmapped a subset of a hugepage, KVM needs to zap and rebuild
> the portions that are still valid using smaller pages.
> 
> > Regardless of side effect, I think we probably should mention that in
> > the comments?
> > > -		/*
> > > -		 * If this is a non-last-level SPTE that covers a larger range
> > > -		 * than should be zapped, continue, and zap the mappings at a
> > > -		 * lower level, except when zapping all SPTEs.
> > > -		 */
> > > -		if (!zap_all &&
> > > -		    (iter.gfn < start ||
> > > -		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
> > > +		if (!is_shadow_present_pte(iter.old_spte) ||
> > >  		    !is_last_spte(iter.old_spte, iter.level))
> 
> It's hard to see in the diff, but the key is the "!is_last_spte()" check.  The
> check before was skipping non-leaf, a.k.a. shadow pages, if they weren't in the
> range.  The new version _always_ skips shadow pages.  Hugepages will always
> return true for is_last_spte() and will never be skipped.

ACK

Reviewed-by: Mingwei Zhang <mizhang@google.com>
Sean Christopherson March 4, 2022, 6:42 p.m. UTC | #4
On Fri, Mar 04, 2022, Mingwei Zhang wrote:
> On Fri, Mar 04, 2022, Sean Christopherson wrote:
> > On Fri, Mar 04, 2022, Mingwei Zhang wrote:
> > > On Thu, Mar 03, 2022, Paolo Bonzini wrote:
> > > > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > > > index f3939ce4a115..c71debdbc732 100644
> > > > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > > > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > > > @@ -834,10 +834,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> > > >  }
> > > >  
> > > >  /*
> > > > - * Tears down the mappings for the range of gfns, [start, end), and frees the
> > > > - * non-root pages mapping GFNs strictly within that range. Returns true if
> > > > - * SPTEs have been cleared and a TLB flush is needed before releasing the
> > > > - * MMU lock.
> > > > + * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
> > > > + * have been cleared and a TLB flush is needed before releasing the MMU lock.
> > > 
> > > I think the original code does not _over_ zapping. But the new version
> > > does.
> > 
> > No, the new version doesn't overzap.
> 
> It does overzap, but it does not matter and the semantic does not
> change.

Belaboring the point a bit... it very much matters, KVM must "overzap" for functional
correctness.  It's only an "overzap" from the perspective that KVM could thoeretically
shatter the hugepage then zap only the relevant small pages.  But it's not an overzap
in the sense that KVM absolutely has to zap the hugepage.  Even if KVM replaces it
with a shadow page, the hugepage is still being zapped, i.e. it's gone and KVM must do
a TLB flush regardless of whether or not there's a new mapping.
Vitaly Kuznetsov March 11, 2022, 3:09 p.m. UTC | #5
Paolo Bonzini <pbonzini@redhat.com> writes:

> From: Sean Christopherson <seanjc@google.com>
>
> Zap only leaf SPTEs in the TDP MMU's zap_gfn_range(), and rename various
> functions accordingly.  When removing mappings for functional correctness
> (except for the stupid VFIO GPU passthrough memslots bug), zapping the
> leaf SPTEs is sufficient as the paging structures themselves do not point
> at guest memory and do not directly impact the final translation (in the
> TDP MMU).
>
> Note, this aligns the TDP MMU with the legacy/full MMU, which zaps only
> the rmaps, a.k.a. leaf SPTEs, in kvm_zap_gfn_range() and
> kvm_unmap_gfn_range().
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Reviewed-by: Ben Gardon <bgardon@google.com>
> Message-Id: <20220226001546.360188-18-seanjc@google.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

I've noticed that multi-vCPU Hyper-V guests started crashing randomly on
boot with the latest kvm/queue and I've bisected the problem to this
particular patch. Basically, I'm not able to boot e.g. 16-vCPU guest
successfully anymore. Both Intel and AMD seem to be affected. Reverting
this commit saves the day.

Having some experience with similarly looking crashes in the past, I'd
suspect it is TLB flush related. I'd appreciate any thoughts.

> ---
>  arch/x86/kvm/mmu/mmu.c     |  4 ++--
>  arch/x86/kvm/mmu/tdp_mmu.c | 41 ++++++++++----------------------------
>  arch/x86/kvm/mmu/tdp_mmu.h |  8 +-------
>  3 files changed, 14 insertions(+), 39 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 8408d7db8d2a..febdcaaa7b94 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5834,8 +5834,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
>  
>  	if (is_tdp_mmu_enabled(kvm)) {
>  		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
> -			flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
> -							  gfn_end, flush);
> +			flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
> +						      gfn_end, true, flush);
>  	}
>  
>  	if (flush)
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index f3939ce4a115..c71debdbc732 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -834,10 +834,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>  }
>  
>  /*
> - * Tears down the mappings for the range of gfns, [start, end), and frees the
> - * non-root pages mapping GFNs strictly within that range. Returns true if
> - * SPTEs have been cleared and a TLB flush is needed before releasing the
> - * MMU lock.
> + * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
> + * have been cleared and a TLB flush is needed before releasing the MMU lock.
>   *
>   * If can_yield is true, will release the MMU lock and reschedule if the
>   * scheduler needs the CPU or there is contention on the MMU lock. If this
> @@ -845,42 +843,25 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>   * the caller must ensure it does not supply too large a GFN range, or the
>   * operation can cause a soft lockup.
>   */
> -static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
> -			  gfn_t start, gfn_t end, bool can_yield, bool flush)
> +static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> +			      gfn_t start, gfn_t end, bool can_yield, bool flush)
>  {
> -	bool zap_all = (start == 0 && end >= tdp_mmu_max_gfn_host());
>  	struct tdp_iter iter;
>  
> -	/*
> -	 * No need to try to step down in the iterator when zapping all SPTEs,
> -	 * zapping the top-level non-leaf SPTEs will recurse on their children.
> -	 */
> -	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
> -
>  	end = min(end, tdp_mmu_max_gfn_host());
>  
>  	lockdep_assert_held_write(&kvm->mmu_lock);
>  
>  	rcu_read_lock();
>  
> -	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
> +	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
>  		if (can_yield &&
>  		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
>  			flush = false;
>  			continue;
>  		}
>  
> -		if (!is_shadow_present_pte(iter.old_spte))
> -			continue;
> -
> -		/*
> -		 * If this is a non-last-level SPTE that covers a larger range
> -		 * than should be zapped, continue, and zap the mappings at a
> -		 * lower level, except when zapping all SPTEs.
> -		 */
> -		if (!zap_all &&
> -		    (iter.gfn < start ||
> -		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
> +		if (!is_shadow_present_pte(iter.old_spte) ||
>  		    !is_last_spte(iter.old_spte, iter.level))
>  			continue;
>  
> @@ -898,13 +879,13 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
>   * SPTEs have been cleared and a TLB flush is needed before releasing the
>   * MMU lock.
>   */
> -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
> -				 gfn_t end, bool can_yield, bool flush)
> +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
> +			   bool can_yield, bool flush)
>  {
>  	struct kvm_mmu_page *root;
>  
>  	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
> -		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
> +		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, false);
>  
>  	return flush;
>  }
> @@ -1202,8 +1183,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>  bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
>  				 bool flush)
>  {
> -	return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
> -					   range->end, range->may_block, flush);
> +	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
> +				     range->end, range->may_block, flush);
>  }
>  
>  typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> index 5e5ef2576c81..54bc8118c40a 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.h
> +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> @@ -15,14 +15,8 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
>  void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
>  			  bool shared);
>  
> -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
> +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start,
>  				 gfn_t end, bool can_yield, bool flush);
> -static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
> -					     gfn_t start, gfn_t end, bool flush)
> -{
> -	return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush);
> -}
> -
>  bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void kvm_tdp_mmu_zap_all(struct kvm *kvm);
>  void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
Mingwei Zhang March 13, 2022, 6:40 p.m. UTC | #6
On Thu, Mar 3, 2022 at 11:39 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> From: Sean Christopherson <seanjc@google.com>
>
> Zap only leaf SPTEs in the TDP MMU's zap_gfn_range(), and rename various
> functions accordingly.  When removing mappings for functional correctness
> (except for the stupid VFIO GPU passthrough memslots bug), zapping the
> leaf SPTEs is sufficient as the paging structures themselves do not point
> at guest memory and do not directly impact the final translation (in the
> TDP MMU).
>
> Note, this aligns the TDP MMU with the legacy/full MMU, which zaps only
> the rmaps, a.k.a. leaf SPTEs, in kvm_zap_gfn_range() and
> kvm_unmap_gfn_range().
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Reviewed-by: Ben Gardon <bgardon@google.com>
> Message-Id: <20220226001546.360188-18-seanjc@google.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/mmu/mmu.c     |  4 ++--
>  arch/x86/kvm/mmu/tdp_mmu.c | 41 ++++++++++----------------------------
>  arch/x86/kvm/mmu/tdp_mmu.h |  8 +-------
>  3 files changed, 14 insertions(+), 39 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 8408d7db8d2a..febdcaaa7b94 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5834,8 +5834,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
>
>         if (is_tdp_mmu_enabled(kvm)) {
>                 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
> -                       flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
> -                                                         gfn_end, flush);
> +                       flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
> +                                                     gfn_end, true, flush);
>         }
>
>         if (flush)
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index f3939ce4a115..c71debdbc732 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -834,10 +834,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>  }
>
>  /*
> - * Tears down the mappings for the range of gfns, [start, end), and frees the
> - * non-root pages mapping GFNs strictly within that range. Returns true if
> - * SPTEs have been cleared and a TLB flush is needed before releasing the
> - * MMU lock.
> + * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
> + * have been cleared and a TLB flush is needed before releasing the MMU lock.
>   *
>   * If can_yield is true, will release the MMU lock and reschedule if the
>   * scheduler needs the CPU or there is contention on the MMU lock. If this
> @@ -845,42 +843,25 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>   * the caller must ensure it does not supply too large a GFN range, or the
>   * operation can cause a soft lockup.
>   */
> -static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
> -                         gfn_t start, gfn_t end, bool can_yield, bool flush)
> +static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> +                             gfn_t start, gfn_t end, bool can_yield, bool flush)
>  {
> -       bool zap_all = (start == 0 && end >= tdp_mmu_max_gfn_host());
>         struct tdp_iter iter;
>
> -       /*
> -        * No need to try to step down in the iterator when zapping all SPTEs,
> -        * zapping the top-level non-leaf SPTEs will recurse on their children.
> -        */
> -       int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
> -
>         end = min(end, tdp_mmu_max_gfn_host());
>
>         lockdep_assert_held_write(&kvm->mmu_lock);
>
>         rcu_read_lock();
>
> -       for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
> +       for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
>                 if (can_yield &&
>                     tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
>                         flush = false;
>                         continue;
>                 }
>
> -               if (!is_shadow_present_pte(iter.old_spte))
> -                       continue;
> -
> -               /*
> -                * If this is a non-last-level SPTE that covers a larger range
> -                * than should be zapped, continue, and zap the mappings at a
> -                * lower level, except when zapping all SPTEs.
> -                */
> -               if (!zap_all &&
> -                   (iter.gfn < start ||
> -                    iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
> +               if (!is_shadow_present_pte(iter.old_spte) ||
>                     !is_last_spte(iter.old_spte, iter.level))
>                         continue;
>
> @@ -898,13 +879,13 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
>   * SPTEs have been cleared and a TLB flush is needed before releasing the
>   * MMU lock.
>   */
> -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
> -                                gfn_t end, bool can_yield, bool flush)
> +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
> +                          bool can_yield, bool flush)
>  {
>         struct kvm_mmu_page *root;
>
>         for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
> -               flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
> +               flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, false);

hmm, I think we might have to be very careful here. If we only zap
leafs, then there could be side effects. For instance, the code in
disallowed_hugepage_adjust() may not work as intended. If you check
the following condition in arch/x86/kvm/mmu/mmu.c:2918

if (cur_level > PG_LEVEL_4K &&
    cur_level == fault->goal_level &&
    is_shadow_present_pte(spte) &&
    !is_large_pte(spte)) {

If we previously use 4K mappings in this range due to various reasons
(dirty logging etc), then afterwards, we zap the range. Then the guest
touches a 4K and now we should map the range with whatever the maximum
level we can for the guest.

However, if we just zap only the leafs, then when the code comes to
the above location, is_shadow_present_pte(spte) will return true,
since the spte is a non-leaf (say a regular PMD entry). The whole if
statement will be true, then we never allow remapping guest memory
with huge pages.

>
>         return flush;
>  }
> @@ -1202,8 +1183,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>  bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
>                                  bool flush)
>  {
> -       return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
> -                                          range->end, range->may_block, flush);
> +       return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
> +                                    range->end, range->may_block, flush);
>  }
>
>  typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> index 5e5ef2576c81..54bc8118c40a 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.h
> +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> @@ -15,14 +15,8 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
>  void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
>                           bool shared);
>
> -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
> +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start,
>                                  gfn_t end, bool can_yield, bool flush);
> -static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
> -                                            gfn_t start, gfn_t end, bool flush)
> -{
> -       return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush);
> -}
> -
>  bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void kvm_tdp_mmu_zap_all(struct kvm *kvm);
>  void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
> --
> 2.31.1
>
>
Sean Christopherson March 25, 2022, 3:13 p.m. UTC | #7
On Sun, Mar 13, 2022, Mingwei Zhang wrote:
> On Thu, Mar 3, 2022 at 11:39 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
> > @@ -898,13 +879,13 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
> >   * SPTEs have been cleared and a TLB flush is needed before releasing the
> >   * MMU lock.
> >   */
> > -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
> > -                                gfn_t end, bool can_yield, bool flush)
> > +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
> > +                          bool can_yield, bool flush)
> >  {
> >         struct kvm_mmu_page *root;
> >
> >         for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
> > -               flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
> > +               flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, false);
> 
> hmm, I think we might have to be very careful here. If we only zap
> leafs, then there could be side effects. For instance, the code in
> disallowed_hugepage_adjust() may not work as intended. If you check
> the following condition in arch/x86/kvm/mmu/mmu.c:2918
> 
> if (cur_level > PG_LEVEL_4K &&
>     cur_level == fault->goal_level &&
>     is_shadow_present_pte(spte) &&
>     !is_large_pte(spte)) {
> 
> If we previously use 4K mappings in this range due to various reasons
> (dirty logging etc), then afterwards, we zap the range. Then the guest
> touches a 4K and now we should map the range with whatever the maximum
> level we can for the guest.
> 
> However, if we just zap only the leafs, then when the code comes to
> the above location, is_shadow_present_pte(spte) will return true,
> since the spte is a non-leaf (say a regular PMD entry). The whole if
> statement will be true, then we never allow remapping guest memory
> with huge pages.

But that's at worst a performance issue, and arguably working as intended.  The
zap in this case is never due to the _guest_ unmapping the pfn, so odds are good
the guest will want to map back in the same pfns with the same permissions.
Zapping shadow pages so that the guest can maybe create a hugepage may end up
being a lot of extra work for no benefit.  Or it may be a net positive.  Either
way, it's not a functional issue.
Mingwei Zhang March 26, 2022, 6:10 p.m. UTC | #8
On Fri, Mar 25, 2022, Sean Christopherson wrote:
> On Sun, Mar 13, 2022, Mingwei Zhang wrote:
> > On Thu, Mar 3, 2022 at 11:39 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
> > > @@ -898,13 +879,13 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
> > >   * SPTEs have been cleared and a TLB flush is needed before releasing the
> > >   * MMU lock.
> > >   */
> > > -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
> > > -                                gfn_t end, bool can_yield, bool flush)
> > > +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
> > > +                          bool can_yield, bool flush)
> > >  {
> > >         struct kvm_mmu_page *root;
> > >
> > >         for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
> > > -               flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
> > > +               flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, false);
> > 
> > hmm, I think we might have to be very careful here. If we only zap
> > leafs, then there could be side effects. For instance, the code in
> > disallowed_hugepage_adjust() may not work as intended. If you check
> > the following condition in arch/x86/kvm/mmu/mmu.c:2918
> > 
> > if (cur_level > PG_LEVEL_4K &&
> >     cur_level == fault->goal_level &&
> >     is_shadow_present_pte(spte) &&
> >     !is_large_pte(spte)) {
> > 
> > If we previously use 4K mappings in this range due to various reasons
> > (dirty logging etc), then afterwards, we zap the range. Then the guest
> > touches a 4K and now we should map the range with whatever the maximum
> > level we can for the guest.
> > 
> > However, if we just zap only the leafs, then when the code comes to
> > the above location, is_shadow_present_pte(spte) will return true,
> > since the spte is a non-leaf (say a regular PMD entry). The whole if
> > statement will be true, then we never allow remapping guest memory
> > with huge pages.
> 
> But that's at worst a performance issue, and arguably working as intended.  The
> zap in this case is never due to the _guest_ unmapping the pfn, so odds are good
> the guest will want to map back in the same pfns with the same permissions.
> Zapping shadow pages so that the guest can maybe create a hugepage may end up
> being a lot of extra work for no benefit.  Or it may be a net positive.  Either
> way, it's not a functional issue.

This should be a performance bug instead of a functional one. But it
does affect both dirty logging (before Ben's early page promotion) and
our demand paging. So I proposed the fix in here:

https://lore.kernel.org/lkml/20220323184915.1335049-2-mizhang@google.com/T/#me78d50ffac33f4f418432f7b171c50630414ef28

If we see memory corruptions, I bet it could only be that we miss some
TLB flushes, since this patch series is basically trying to avoid
immediate TLB flushing by simply changing ASID (assigning new root).

To debug, maybe force the TLB flushes after zap_gfn_range and see if the
problem still exist?
Sean Christopherson March 28, 2022, 3:06 p.m. UTC | #9
On Sat, Mar 26, 2022, Mingwei Zhang wrote:
> On Fri, Mar 25, 2022, Sean Christopherson wrote:
> > On Sun, Mar 13, 2022, Mingwei Zhang wrote:
> > > On Thu, Mar 3, 2022 at 11:39 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
> > > > @@ -898,13 +879,13 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
> > > >   * SPTEs have been cleared and a TLB flush is needed before releasing the
> > > >   * MMU lock.
> > > >   */
> > > > -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
> > > > -                                gfn_t end, bool can_yield, bool flush)
> > > > +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
> > > > +                          bool can_yield, bool flush)
> > > >  {
> > > >         struct kvm_mmu_page *root;
> > > >
> > > >         for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
> > > > -               flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
> > > > +               flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, false);
> > > 
> > > hmm, I think we might have to be very careful here. If we only zap
> > > leafs, then there could be side effects. For instance, the code in
> > > disallowed_hugepage_adjust() may not work as intended. If you check
> > > the following condition in arch/x86/kvm/mmu/mmu.c:2918
> > > 
> > > if (cur_level > PG_LEVEL_4K &&
> > >     cur_level == fault->goal_level &&
> > >     is_shadow_present_pte(spte) &&
> > >     !is_large_pte(spte)) {
> > > 
> > > If we previously use 4K mappings in this range due to various reasons
> > > (dirty logging etc), then afterwards, we zap the range. Then the guest
> > > touches a 4K and now we should map the range with whatever the maximum
> > > level we can for the guest.
> > > 
> > > However, if we just zap only the leafs, then when the code comes to
> > > the above location, is_shadow_present_pte(spte) will return true,
> > > since the spte is a non-leaf (say a regular PMD entry). The whole if
> > > statement will be true, then we never allow remapping guest memory
> > > with huge pages.
> > 
> > But that's at worst a performance issue, and arguably working as intended.  The
> > zap in this case is never due to the _guest_ unmapping the pfn, so odds are good
> > the guest will want to map back in the same pfns with the same permissions.
> > Zapping shadow pages so that the guest can maybe create a hugepage may end up
> > being a lot of extra work for no benefit.  Or it may be a net positive.  Either
> > way, it's not a functional issue.
> 
> This should be a performance bug instead of a functional one. But it
> does affect both dirty logging (before Ben's early page promotion) and
> our demand paging.

I'd buy the argument that KVM should zap shadow pages when zapping specifically to
recreate huge pages, but that's a different path entirely.  Disabling of dirty
logging uses a dedicated path, zap_collapsible_spte_range().

> So I proposed the fix in here:
> 
> https://lore.kernel.org/lkml/20220323184915.1335049-2-mizhang@google.com/T/#me78d50ffac33f4f418432f7b171c50630414ef28
> 
> If we see memory corruptions, I bet it could only be that we miss some
> TLB flushes, since this patch series is basically trying to avoid
> immediate TLB flushing by simply changing ASID (assigning new root).

Ya, it was a lost TLB flush goof.  My apologaies for not cc'ing you on the patch.

https://lore.kernel.org/all/20220325230348.2587437-1-seanjc@google.com

> To debug, maybe force the TLB flushes after zap_gfn_range and see if the
> problem still exist?
> 
>
diff mbox series

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8408d7db8d2a..febdcaaa7b94 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5834,8 +5834,8 @@  void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 
 	if (is_tdp_mmu_enabled(kvm)) {
 		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
-			flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
-							  gfn_end, flush);
+			flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
+						      gfn_end, true, flush);
 	}
 
 	if (flush)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index f3939ce4a115..c71debdbc732 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -834,10 +834,8 @@  bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 }
 
 /*
- * Tears down the mappings for the range of gfns, [start, end), and frees the
- * non-root pages mapping GFNs strictly within that range. Returns true if
- * SPTEs have been cleared and a TLB flush is needed before releasing the
- * MMU lock.
+ * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
+ * have been cleared and a TLB flush is needed before releasing the MMU lock.
  *
  * If can_yield is true, will release the MMU lock and reschedule if the
  * scheduler needs the CPU or there is contention on the MMU lock. If this
@@ -845,42 +843,25 @@  bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
  * the caller must ensure it does not supply too large a GFN range, or the
  * operation can cause a soft lockup.
  */
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-			  gfn_t start, gfn_t end, bool can_yield, bool flush)
+static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
+			      gfn_t start, gfn_t end, bool can_yield, bool flush)
 {
-	bool zap_all = (start == 0 && end >= tdp_mmu_max_gfn_host());
 	struct tdp_iter iter;
 
-	/*
-	 * No need to try to step down in the iterator when zapping all SPTEs,
-	 * zapping the top-level non-leaf SPTEs will recurse on their children.
-	 */
-	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
-
 	end = min(end, tdp_mmu_max_gfn_host());
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
 
 	rcu_read_lock();
 
-	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
+	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
 		if (can_yield &&
 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
 			flush = false;
 			continue;
 		}
 
-		if (!is_shadow_present_pte(iter.old_spte))
-			continue;
-
-		/*
-		 * If this is a non-last-level SPTE that covers a larger range
-		 * than should be zapped, continue, and zap the mappings at a
-		 * lower level, except when zapping all SPTEs.
-		 */
-		if (!zap_all &&
-		    (iter.gfn < start ||
-		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
+		if (!is_shadow_present_pte(iter.old_spte) ||
 		    !is_last_spte(iter.old_spte, iter.level))
 			continue;
 
@@ -898,13 +879,13 @@  static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
  * SPTEs have been cleared and a TLB flush is needed before releasing the
  * MMU lock.
  */
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
-				 gfn_t end, bool can_yield, bool flush)
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
+			   bool can_yield, bool flush)
 {
 	struct kvm_mmu_page *root;
 
 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
-		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
+		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, false);
 
 	return flush;
 }
@@ -1202,8 +1183,8 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
 				 bool flush)
 {
-	return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
-					   range->end, range->may_block, flush);
+	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
+				     range->end, range->may_block, flush);
 }
 
 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 5e5ef2576c81..54bc8118c40a 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -15,14 +15,8 @@  __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
 			  bool shared);
 
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start,
 				 gfn_t end, bool can_yield, bool flush);
-static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
-					     gfn_t start, gfn_t end, bool flush)
-{
-	return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush);
-}
-
 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);