
KVM: x86/mmu: simplify kvm_tdp_mmu_map flow when guest has to retry

Message ID 20221117161449.114086-1-pbonzini@redhat.com (mailing list archive)
State New, archived
Series KVM: x86/mmu: simplify kvm_tdp_mmu_map flow when guest has to retry

Commit Message

Paolo Bonzini Nov. 17, 2022, 4:14 p.m. UTC
A removed SPTE is never present, hence the "if" in kvm_tdp_mmu_map
only fails in the exact same conditions that the earlier loop
tested in order to issue a "break". So, instead of checking the
condition twice (upper level SPTEs could not be created or the SPTE
was frozen), just exit the loop with a goto---the usual poor man's C
replacement for RAII early returns.

While at it, do not use the "ret" variable for return values of
functions that do not return a RET_PF_* enum.  This is clearer
and also makes it possible to initialize ret to RET_PF_RETRY.

Suggested-by: Robert Hoo <robert.hu@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 40 ++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

Comments

David Matlack Nov. 17, 2022, 6:43 p.m. UTC | #1
On Thu, Nov 17, 2022 at 8:14 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> A removed SPTE is never present, hence the "if" in kvm_tdp_mmu_map
> only fails in the exact same conditions that the earlier loop
> tested in order to issue a "break". So, instead of checking the
> condition twice (upper level SPTEs could not be created or the SPTE
> was frozen), just exit the loop with a goto---the usual poor man's C
> replacement for RAII early returns.
>
> While at it, do not use the "ret" variable for return values of
> functions that do not return a RET_PF_* enum.  This is clearer
> and also makes it possible to initialize ret to RET_PF_RETRY.
>
> Suggested-by: Robert Hoo <robert.hu@linux.intel.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/mmu/tdp_mmu.c | 40 ++++++++++++++++++--------------------
>  1 file changed, 19 insertions(+), 21 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index e08596775427..771210ce5181 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1159,7 +1159,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>         struct kvm *kvm = vcpu->kvm;
>         struct tdp_iter iter;
>         struct kvm_mmu_page *sp;
> -       int ret;
> +       int ret = RET_PF_RETRY;
>
>         kvm_mmu_hugepage_adjust(vcpu, fault);
>
> @@ -1168,23 +1168,25 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>         rcu_read_lock();
>
>         tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
> +               int r;
> +
>                 if (fault->nx_huge_page_workaround_enabled)
>                         disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
>
>                 if (iter.level == fault->goal_level)
>                         break;
>
> -               /* Step down into the lower level page table if it exists. */
> -               if (is_shadow_present_pte(iter.old_spte) &&
> -                   !is_large_pte(iter.old_spte))
> -                       continue;
> -
>                 /*
>                  * If SPTE has been frozen by another thread, just give up and
>                  * retry, avoiding unnecessary page table allocation and free.
>                  */
>                 if (is_removed_spte(iter.old_spte))
> -                       break;
> +                       goto retry;
> +
> +               /* Step down into the lower level page table if it exists. */
> +               if (is_shadow_present_pte(iter.old_spte) &&
> +                   !is_large_pte(iter.old_spte))
> +                       continue;
>
>                 /*
>                  * The SPTE is either non-present or points to a huge page that
> @@ -1196,13 +1198,17 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>                 sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
>
>                 if (is_shadow_present_pte(iter.old_spte))
> -                       ret = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
> +                       r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
>                 else
> -                       ret = tdp_mmu_link_sp(kvm, &iter, sp, true);
> +                       r = tdp_mmu_link_sp(kvm, &iter, sp, true);

Can this fix be squashed into [1]? It seems like a serious enough bug.
If 2 threads race to update the same PTE, KVM will return -EBUSY out
to userspace from KVM_RUN, I think. I'm not sure about QEMU, but that
would be fatal for the VM in Vanadium.

[1] https://lore.kernel.org/kvm/20221109185905.486172-3-dmatlack@google.com/

>
> -               if (ret) {
> +               /*
> +                * Also force the guest to retry the access if the upper level SPTEs
> +                * aren't in place.
> +                */
> +               if (r) {
>                         tdp_mmu_free_sp(sp);
> -                       break;
> +                       goto retry;
>                 }
>
>                 if (fault->huge_page_disallowed &&
> @@ -1213,18 +1219,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>                 }
>         }
>
> -       /*
> -        * Force the guest to retry the access if the upper level SPTEs aren't
> -        * in place, or if the target leaf SPTE is frozen by another CPU.
> -        */
> -       if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
> -               rcu_read_unlock();
> -               return RET_PF_RETRY;
> -       }
> -
>         ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
> -       rcu_read_unlock();
>
> +retry:
> +       rcu_read_unlock();
>         return ret;
>  }
>
> --
> 2.31.1
>
Robert Hoo Nov. 18, 2022, 1:34 a.m. UTC | #2
On Thu, 2022-11-17 at 11:14 -0500, Paolo Bonzini wrote:
> A removed SPTE is never present, hence the "if" in kvm_tdp_mmu_map
> only fails in the exact same conditions that the earlier loop
> tested in order to issue a "break". So, instead of checking the
> condition twice (upper level SPTEs could not be created or the SPTE
> was frozen), just exit the loop with a goto---the usual poor man's C
> replacement for RAII early returns.
> 
> While at it, do not use the "ret" variable for return values of
> functions that do not return a RET_PF_* enum.  This is clearer
> and also makes it possible to initialize ret to RET_PF_RETRY.
> 
> Suggested-by: Robert Hoo <robert.hu@linux.intel.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/mmu/tdp_mmu.c | 40 ++++++++++++++++++----------------
> ----
>  1 file changed, 19 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index e08596775427..771210ce5181 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1159,7 +1159,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu,
> struct kvm_page_fault *fault)
>  	struct kvm *kvm = vcpu->kvm;
>  	struct tdp_iter iter;
>  	struct kvm_mmu_page *sp;
> -	int ret;
> +	int ret = RET_PF_RETRY;
>  
>  	kvm_mmu_hugepage_adjust(vcpu, fault);
>  
> @@ -1168,23 +1168,25 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu,
> struct kvm_page_fault *fault)
>  	rcu_read_lock();
>  
>  	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
> +		int r;
> +
>  		if (fault->nx_huge_page_workaround_enabled)
>  			disallowed_hugepage_adjust(fault,
> iter.old_spte, iter.level);
>  
And this can also be improved, I think:

        tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
-               if (fault->nx_huge_page_workaround_enabled)
+               if (fault->huge_page_disallowed)

In the case of !fault->exec && fault->nx_huge_page_workaround_enabled,
huge pages should still be allowed, shouldn't they?

If you agree, I can send out a patch for this. I've roughly tested
this with an ordinary guest boot, and it works normally.

>  		if (iter.level == fault->goal_level)
>  			break;
>  
> -		/* Step down into the lower level page table if it
> exists. */
> -		if (is_shadow_present_pte(iter.old_spte) &&
> -		    !is_large_pte(iter.old_spte))
> -			continue;
> -
>  		/*
>  		 * If SPTE has been frozen by another thread, just give
> up and
>  		 * retry, avoiding unnecessary page table allocation
> and free.
>  		 */
>  		if (is_removed_spte(iter.old_spte))
> -			break;
> +			goto retry;
> +
> +		/* Step down into the lower level page table if it
> exists. */
> +		if (is_shadow_present_pte(iter.old_spte) &&
> +		    !is_large_pte(iter.old_spte))
> +			continue;
>  
>  		/*
>  		 * The SPTE is either non-present or points to a huge
> page that
> @@ -1196,13 +1198,17 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu,
> struct kvm_page_fault *fault)
>  		sp->nx_huge_page_disallowed = fault-
> >huge_page_disallowed;
>  
>  		if (is_shadow_present_pte(iter.old_spte))
> -			ret = tdp_mmu_split_huge_page(kvm, &iter, sp,
> true);
> +			r = tdp_mmu_split_huge_page(kvm, &iter, sp,
> true);
>  		else
> -			ret = tdp_mmu_link_sp(kvm, &iter, sp, true);
> +			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
>  
> -		if (ret) {
> +		/*
> +		 * Also force the guest to retry the access if the
> upper level SPTEs
> +		 * aren't in place.
> +		 */
> +		if (r) {
>  			tdp_mmu_free_sp(sp);
> -			break;
> +			goto retry;
>  		}
>  
>  		if (fault->huge_page_disallowed &&
> @@ -1213,18 +1219,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu,
> struct kvm_page_fault *fault)
>  		}
>  	}
>  
> -	/*
> -	 * Force the guest to retry the access if the upper level SPTEs
> aren't
> -	 * in place, or if the target leaf SPTE is frozen by another
> CPU.
> -	 */
> -	if (iter.level != fault->goal_level ||
> is_removed_spte(iter.old_spte)) {
> -		rcu_read_unlock();
> -		return RET_PF_RETRY;
> -	}
> -
>  	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
> -	rcu_read_unlock();
>  
> +retry:
> +	rcu_read_unlock();
>  	return ret;
>  }
>
Robert Hoo Nov. 18, 2022, 2 a.m. UTC | #3
On Thu, 2022-11-17 at 10:43 -0800, David Matlack wrote:
> On Thu, Nov 17, 2022 at 8:14 AM Paolo Bonzini <pbonzini@redhat.com>
> wrote:
> > 
> > A removed SPTE is never present, hence the "if" in kvm_tdp_mmu_map
> > only fails in the exact same conditions that the earlier loop
> > tested in order to issue a "break". So, instead of checking the
> > condition twice (upper level SPTEs could not be created or the SPTE
> > was frozen), just exit the loop with a goto---the usual poor man's C
> > replacement for RAII early returns.
> > 
> > While at it, do not use the "ret" variable for return values of
> > functions that do not return a RET_PF_* enum.  This is clearer
> > and also makes it possible to initialize ret to RET_PF_RETRY.
> > 
> > Suggested-by: Robert Hoo <robert.hu@linux.intel.com>
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> >  arch/x86/kvm/mmu/tdp_mmu.c | 40 ++++++++++++++++++--------------
> > ------
> >  1 file changed, 19 insertions(+), 21 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c
> > b/arch/x86/kvm/mmu/tdp_mmu.c
> > index e08596775427..771210ce5181 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -1159,7 +1159,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu,
> > struct kvm_page_fault *fault)
> >         struct kvm *kvm = vcpu->kvm;
> >         struct tdp_iter iter;
> >         struct kvm_mmu_page *sp;
> > -       int ret;
> > +       int ret = RET_PF_RETRY;
> > 
> >         kvm_mmu_hugepage_adjust(vcpu, fault);
> > 
> > @@ -1168,23 +1168,25 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu,
> > struct kvm_page_fault *fault)
> >         rcu_read_lock();
> > 
> >         tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1)
> > {
> > +               int r;
> > +
> >                 if (fault->nx_huge_page_workaround_enabled)
> >                         disallowed_hugepage_adjust(fault,
> > iter.old_spte, iter.level);
> > 
> >                 if (iter.level == fault->goal_level)
> >                         break;
> > 
> > -               /* Step down into the lower level page table if it
> > exists. */
> > -               if (is_shadow_present_pte(iter.old_spte) &&
> > -                   !is_large_pte(iter.old_spte))
> > -                       continue;
> > -
> >                 /*
> >                  * If SPTE has been frozen by another thread, just
> > give up and
> >                  * retry, avoiding unnecessary page table
> > allocation and free.
> >                  */
> >                 if (is_removed_spte(iter.old_spte))
> > -                       break;
> > +                       goto retry;
> > +
> > +               /* Step down into the lower level page table if it
> > exists. */
> > +               if (is_shadow_present_pte(iter.old_spte) &&
> > +                   !is_large_pte(iter.old_spte))
> > +                       continue;
> > 
> >                 /*
> >                  * The SPTE is either non-present or points to a
> > huge page that
> > @@ -1196,13 +1198,17 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu,
> > struct kvm_page_fault *fault)
> >                 sp->nx_huge_page_disallowed = fault-
> > >huge_page_disallowed;
> > 
> >                 if (is_shadow_present_pte(iter.old_spte))
> > -                       ret = tdp_mmu_split_huge_page(kvm, &iter,
> > sp, true);
> > +                       r = tdp_mmu_split_huge_page(kvm, &iter, sp,
> > true);
> >                 else
> > -                       ret = tdp_mmu_link_sp(kvm, &iter, sp,
> > true);
> > +                       r = tdp_mmu_link_sp(kvm, &iter, sp, true);
> 
> Can this fix be squashed into [1]? It seems like a serious enough
> bug.
> If 2 threads race to update the same PTE, KVM will return -EBUSY out
> to userspace from KVM_RUN, I think. I'm not sure about QEMU, but that
> would be fatal for the VM in Vanadium.
> 
> [1] 
> https://lore.kernel.org/kvm/20221109185905.486172-3-dmatlack@google.com/
> 
I think in your patch it's all right, since before kvm_tdp_mmu_map()
returns, it must go through tdp_mmu_map_handle_target_level(), which
returns a RET_PF_* enum.

> > 
> > -               if (ret) {
> > +               /*
> > +                * Also force the guest to retry the access if the
> > upper level SPTEs
> > +                * aren't in place.
> > +                */
> > +               if (r) {
> >                         tdp_mmu_free_sp(sp);
> > -                       break;
> > +                       goto retry;
> >                 }
> > 
> >                 if (fault->huge_page_disallowed &&
> > @@ -1213,18 +1219,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu,
> > struct kvm_page_fault *fault)
> >                 }
> >         }
> > 
> > -       /*
> > -        * Force the guest to retry the access if the upper level
> > SPTEs aren't
> > -        * in place, or if the target leaf SPTE is frozen by
> > another CPU.
> > -        */
> > -       if (iter.level != fault->goal_level ||
> > is_removed_spte(iter.old_spte)) {
> > -               rcu_read_unlock();
> > -               return RET_PF_RETRY;
> > -       }
> > -
> >         ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
> > -       rcu_read_unlock();
> > 
> > +retry:
> > +       rcu_read_unlock();
> >         return ret;
> >  }
> > 
> > --
> > 2.31.1
> >
David Matlack Nov. 18, 2022, 3:49 a.m. UTC | #4
On Thu, Nov 17, 2022 at 6:01 PM Robert Hoo <robert.hu@linux.intel.com> wrote:
>
> On Thu, 2022-11-17 at 10:43 -0800, David Matlack wrote:
> > On Thu, Nov 17, 2022 at 8:14 AM Paolo Bonzini <pbonzini@redhat.com>
> > wrote:
> > >                 if (is_shadow_present_pte(iter.old_spte))
> > > -                       ret = tdp_mmu_split_huge_page(kvm, &iter,
> > > sp, true);
> > > +                       r = tdp_mmu_split_huge_page(kvm, &iter, sp,
> > > true);
> > >                 else
> > > -                       ret = tdp_mmu_link_sp(kvm, &iter, sp,
> > > true);
> > > +                       r = tdp_mmu_link_sp(kvm, &iter, sp, true);
> >
> > Can this fix be squashed into [1]? It seems like a serious enough
> > bug.
> > If 2 threads race to update the same PTE, KVM will return -EBUSY out
> > to userspace from KVM_RUN, I think. I'm not sure about QEMU, but that
> > would be fatal for the VM in Vanadium.
> >
> > [1]
> > https://lore.kernel.org/kvm/20221109185905.486172-3-dmatlack@google.com/
> >
> I think in your patch it's all right, since before kvm_tdp_mmu_map()
> returns, it must go through tdp_mmu_map_handle_target_level(), which
> returns a RET_PF_* enum.

Ah that's right. kvm_tdp_mmu_map() won't actually return 0/-EBUSY,
because it either returns RET_PF_RETRY or goes through
tdp_mmu_map_handle_target_level().
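
For reference, here is a condensed paraphrase of kvm_tdp_mmu_map() after the
patch (the NX huge page tracking, tracing and the disallowed_hugepage_adjust()
call are elided, so treat it as a sketch based on the diff above rather than
the literal kernel source). It shows why "ret" can only ever hold a RET_PF_*
value:

int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct kvm *kvm = vcpu->kvm;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret = RET_PF_RETRY;		/* default: make the guest retry */

	kvm_mmu_hugepage_adjust(vcpu, fault);
	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		int r;

		if (iter.level == fault->goal_level)
			break;

		/* Frozen by another thread: give up and retry. */
		if (is_removed_spte(iter.old_spte))
			goto retry;

		/* Step down into an existing lower level page table. */
		if (is_shadow_present_pte(iter.old_spte) &&
		    !is_large_pte(iter.old_spte))
			continue;

		/* Non-present or huge: allocate and install a child table. */
		sp = tdp_mmu_alloc_sp(vcpu);
		tdp_mmu_init_child_sp(sp, &iter);

		if (is_shadow_present_pte(iter.old_spte))
			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
		else
			r = tdp_mmu_link_sp(kvm, &iter, sp, true);

		/* Upper level SPTEs aren't in place: free sp and retry. */
		if (r) {
			tdp_mmu_free_sp(sp);
			goto retry;
		}
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
retry:
	rcu_read_unlock();
	return ret;
}

Any -EBUSY coming back from tdp_mmu_split_huge_page() or tdp_mmu_link_sp()
lands in the local "r" and only decides whether to jump to the retry label;
it is never stored in "ret", so nothing but RET_PF_* values reach the caller.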
David Matlack Nov. 18, 2022, 4 a.m. UTC | #5
On Thu, Nov 17, 2022 at 5:35 PM Robert Hoo <robert.hu@linux.intel.com> wrote:
>
> On Thu, 2022-11-17 at 11:14 -0500, Paolo Bonzini wrote:
> > +
> >               if (fault->nx_huge_page_workaround_enabled)
> >                       disallowed_hugepage_adjust(fault,
> > iter.old_spte, iter.level);
> >
> And here can also be improved, I think.
>
>         tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
> -               if (fault->nx_huge_page_workaround_enabled)
> +               if (fault->huge_page_disallowed)
>
> In the case of !fault->exec && fault->nx_huge_page_workaround_enabled,
> huge pages should still be allowed, shouldn't they?
>
> If you agree, I can send out a patch for this. I've roughly tested
> this with an ordinary guest boot, and it works normally.

This check handles the case where a read or write fault occurs within
a region that has already been split due to an NX huge page. If we
recovered the NX Huge Page on such faults, the guest could end up
continuously faulting on the same huge page (e.g. if writing to one
page and executing from another within a GPA region backed by a huge
page). So instead, NX Huge Page recovery is done periodically by a
background thread.

That being said, I'm not surprised you didn't encounter any issues
when testing. Now that the TDP MMU fully splits NX Huge Pages on
fault, such faults should be rare at best. Perhaps even impossible?
Hm, can we drop the call to disallowed_hugepage_adjust() entirely?
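
For context, a rough sketch of how the two flags under discussion are
populated; this is paraphrased from kvm_mmu_do_page_fault() and
kvm_mmu_hugepage_adjust() as I remember them, so treat the exact locations
and helper names as an assumption rather than the literal source:

	/* Assumed: set when the fault is created; tracks only the NX knob. */
	fault->nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm);

	/* Assumed: set in kvm_mmu_hugepage_adjust(); additionally gated on exec. */
	fault->huge_page_disallowed = fault->exec &&
				      fault->nx_huge_page_workaround_enabled;

So a read or write fault in a region that was split for an earlier exec fault
has huge_page_disallowed == false, yet it still takes the
disallowed_hugepage_adjust() path because the workaround flag is set, which is
exactly the case described above.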
Robert Hoo Nov. 18, 2022, 8:28 a.m. UTC | #6
On Thu, 2022-11-17 at 20:00 -0800, David Matlack wrote:
> On Thu, Nov 17, 2022 at 5:35 PM Robert Hoo <robert.hu@linux.intel.com
> > wrote:
> > 
> > On Thu, 2022-11-17 at 11:14 -0500, Paolo Bonzini wrote:
> > > +
> > >               if (fault->nx_huge_page_workaround_enabled)
> > >                       disallowed_hugepage_adjust(fault,
> > > iter.old_spte, iter.level);
> > > 
> > 
> > And here can also be improved, I think.
> > 
> >         tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1)
> > {
> > -               if (fault->nx_huge_page_workaround_enabled)
> > +               if (fault->huge_page_disallowed)
> > 
> > in the case of !fault->exec && fault-
> > >nx_huge_page_workaround_enabled,
> > huge page should be still allowed, shouldn't it?
> > 
> > If you agree, I can send out a patch for this. I've roughly tested
> > this, with an ordinary guest boot, works normally.
> 
> This check handles the case where a read or write fault occurs within
> a region that has already been split due to an NX huge page. 

When an NX huge page is split, the sub-SPTEs are installed, if my
understanding is right. So no fault should happen on the next r/w access.

> If we
> recovered the NX Huge Page on such faults, the guest could end up
> continuously faulting on the same huge page (e.g. if writing to one
> page and executing from another within a GPA region backed by a huge
> page). So instead, NX Huge Page recovery is done periodically by a
> background thread.

Do you mean the kvm_nx_huge_page_recovery_worker() kthread? My
understanding is that it recycles SPs that were created by NX huge page
splits. That would cause the above faults to happen, I guess, i.e. the
previously installed SPTE is zapped when the child SP is recycled.

OK, I understand your point now: if we let r/w access faults of the type
you mentioned skip disallowed_hugepage_adjust(), they will break out of
the loop and a huge page will be installed. Then the next exec access
will cause the huge page to be split; then the next r/w access fault
will install a huge page again ...
> 
> That being said, I'm not surprised you didn't encounter any issues
> when testing. Now that the TDP MMU fully splits NX Huge Pages on
> fault, such faults should be rare at best. Perhaps even impossible?

Possible, and not rare; I added debug info in
disallowed_hugepage_adjust() and it showed hits.

> Hm, can we drop the call to disallowed_hugepage_adjust() entirely?

I guess not; keep it as is. Though such faults may be rare, even
impossible, what if is_nx_huge_page_enabled() changes at run time?
E.g. NX huge pages go from enabled to disabled; keeping the check gives
KVM a chance to restore the huge page mapping.
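
For readers following the thread, here is a simplified paraphrase of the
check inside disallowed_hugepage_adjust() that the discussion revolves
around. The pfn adjustment is elided and the helper and field names are
from memory, so take it as an approximation, not the exact source:

	/*
	 * If a non-leaf child page table already exists at the level this
	 * fault would like to map with a huge page, and that child was
	 * created because an NX huge page was disallowed, step the goal
	 * level down instead of re-installing the huge page.
	 */
	if (cur_level > PG_LEVEL_4K &&
	    cur_level == fault->goal_level &&
	    is_shadow_present_pte(spte) &&
	    !is_large_pte(spte) &&
	    spte_to_child_sp(spte)->nx_huge_page_disallowed)
		fault->goal_level--;

This is what prevents the ping-pong described above: the read/write fault
keeps mapping at the lower level instead of restoring a huge page that the
next exec fault would only split again.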

Patch

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index e08596775427..771210ce5181 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1159,7 +1159,7 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	struct kvm *kvm = vcpu->kvm;
 	struct tdp_iter iter;
 	struct kvm_mmu_page *sp;
-	int ret;
+	int ret = RET_PF_RETRY;
 
 	kvm_mmu_hugepage_adjust(vcpu, fault);
 
@@ -1168,23 +1168,25 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	rcu_read_lock();
 
 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
+		int r;
+
 		if (fault->nx_huge_page_workaround_enabled)
 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
 
 		if (iter.level == fault->goal_level)
 			break;
 
-		/* Step down into the lower level page table if it exists. */
-		if (is_shadow_present_pte(iter.old_spte) &&
-		    !is_large_pte(iter.old_spte))
-			continue;
-
 		/*
 		 * If SPTE has been frozen by another thread, just give up and
 		 * retry, avoiding unnecessary page table allocation and free.
 		 */
 		if (is_removed_spte(iter.old_spte))
-			break;
+			goto retry;
+
+		/* Step down into the lower level page table if it exists. */
+		if (is_shadow_present_pte(iter.old_spte) &&
+		    !is_large_pte(iter.old_spte))
+			continue;
 
 		/*
 		 * The SPTE is either non-present or points to a huge page that
@@ -1196,13 +1198,17 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
 
 		if (is_shadow_present_pte(iter.old_spte))
-			ret = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
+			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
 		else
-			ret = tdp_mmu_link_sp(kvm, &iter, sp, true);
+			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
 
-		if (ret) {
+		/*
+		 * Also force the guest to retry the access if the upper level SPTEs
+		 * aren't in place.
+		 */
+		if (r) {
 			tdp_mmu_free_sp(sp);
-			break;
+			goto retry;
 		}
 
 		if (fault->huge_page_disallowed &&
@@ -1213,18 +1219,10 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		}
 	}
 
-	/*
-	 * Force the guest to retry the access if the upper level SPTEs aren't
-	 * in place, or if the target leaf SPTE is frozen by another CPU.
-	 */
-	if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
-		rcu_read_unlock();
-		return RET_PF_RETRY;
-	}
-
 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
-	rcu_read_unlock();
 
+retry:
+	rcu_read_unlock();
 	return ret;
 }