KVM: x86/MMU: Zap non-leaf SPTEs when disabling dirty logging

Message ID 20220525230904.1584480-1-bgardon@google.com (mailing list archive)
State New, archived
Series KVM: x86/MMU: Zap non-leaf SPTEs when disabling dirty logging

Commit Message

Ben Gardon May 25, 2022, 11:09 p.m. UTC
When disabling dirty logging, zap non-leaf parent entries to allow
replacement with huge pages instead of recursing and zapping all of the
child, leaf entries. This reduces the number of TLB flushes required.

Currently disabling dirty logging with the TDP MMU is extremely slow.
On a 96 vCPU / 96G VM backed with gigabyte pages, it takes ~200 seconds
to disable dirty logging with the TDP MMU, as opposed to ~4 seconds with
the shadow MMU. This patch reduces the disable dirty log time with the
TDP MMU to ~3 seconds.

Testing:
Ran KVM selftests and kvm-unit-tests on an Intel Haswell. This
patch introduced no new failures.

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/mmu/tdp_iter.c |  9 +++++++++
 arch/x86/kvm/mmu/tdp_iter.h |  1 +
 arch/x86/kvm/mmu/tdp_mmu.c  | 38 +++++++++++++++++++++++++++++++------
 3 files changed, 42 insertions(+), 6 deletions(-)
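
For a quick read of the diff below, this is roughly what the reworked
zap_collapsible_spte_range() loop does (a simplified sketch based on the
hunks in this patch, not a drop-in replacement):

    tdp_root_for_each_pte(iter, root, start, end) {
            if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
                    continue;

            /* Only present leaf SPTEs backed by a non-reserved PFN matter here. */
            if (!is_shadow_present_pte(iter.old_spte) ||
                !is_last_spte(iter.old_spte, iter.level) ||
                kvm_is_reserved_pfn(spte_to_pfn(iter.old_spte)))
                    continue;

            max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
                            spte_to_pfn(iter.old_spte), PG_LEVEL_NUM);
            if (max_mapping_level == iter.level)
                    continue;

            /* Step up to the level at which the range can be collapsed... */
            while (max_mapping_level > iter.level)
                    tdp_iter_step_up(&iter);

            /* ...and zap that parent SPTE; the next fault installs a huge page. */
            tdp_mmu_zap_spte_atomic(kvm, &iter);
    }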

Comments

Ben Gardon May 25, 2022, 11:12 p.m. UTC | #1
On Wed, May 25, 2022 at 4:09 PM Ben Gardon <bgardon@google.com> wrote:
>
> When disabling dirty logging, zap non-leaf parent entries to allow
> replacement with huge pages instead of recursing and zapping all of the
> child, leaf entries. This reduces the number of TLB flushes required.
>
> Currently disabling dirty logging with the TDP MMU is extremely slow.
> On a 96 vCPU / 96G VM backed with gigabyte pages, it takes ~200 seconds
> to disable dirty logging with the TDP MMU, as opposed to ~4 seconds with
> the shadow MMU. This patch reduces the disable dirty log time with the
> TDP MMU to ~3 seconds.
>

After Sean pointed out that I was changing the zapping scheme to zap
non-leaf SPTEs in my in-place promotion series, I started to wonder if
that would provide good-enough performance without the complexity of
in-place promo. Turns out it does! This relatively simple patch gives
essentially the same disable time as the in-place promo series.

The main downside to this approach is that it does cause all the vCPUs
to take page faults, so it may still be worth investigating in-place
promotion.

> Testing:
> Ran KVM selftests and kvm-unit-tests on an Intel Haswell. This
> patch introduced no new failures.
>
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>  arch/x86/kvm/mmu/tdp_iter.c |  9 +++++++++
>  arch/x86/kvm/mmu/tdp_iter.h |  1 +
>  arch/x86/kvm/mmu/tdp_mmu.c  | 38 +++++++++++++++++++++++++++++++------
>  3 files changed, 42 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> index 6d3b3e5a5533..ee4802d7b36c 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.c
> +++ b/arch/x86/kvm/mmu/tdp_iter.c
> @@ -145,6 +145,15 @@ static bool try_step_up(struct tdp_iter *iter)
>         return true;
>  }
>
> +/*
> + * Step the iterator back up a level in the paging structure. Should only be
> + * used when the iterator is below the root level.
> + */
> +void tdp_iter_step_up(struct tdp_iter *iter)
> +{
> +       WARN_ON(!try_step_up(iter));
> +}
> +
>  /*
>   * Step to the next SPTE in a pre-order traversal of the paging structure.
>   * To get to the next SPTE, the iterator either steps down towards the goal
> diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> index f0af385c56e0..adfca0cf94d3 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.h
> +++ b/arch/x86/kvm/mmu/tdp_iter.h
> @@ -114,5 +114,6 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
>                     int min_level, gfn_t next_last_level_gfn);
>  void tdp_iter_next(struct tdp_iter *iter);
>  void tdp_iter_restart(struct tdp_iter *iter);
> +void tdp_iter_step_up(struct tdp_iter *iter);
>
>  #endif /* __KVM_X86_MMU_TDP_ITER_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 841feaa48be5..7b9265d67131 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1742,12 +1742,12 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>         gfn_t start = slot->base_gfn;
>         gfn_t end = start + slot->npages;
>         struct tdp_iter iter;
> +       int max_mapping_level;
>         kvm_pfn_t pfn;
>
>         rcu_read_lock();
>
>         tdp_root_for_each_pte(iter, root, start, end) {
> -retry:
>                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
>                         continue;
>
> @@ -1755,15 +1755,41 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>                     !is_last_spte(iter.old_spte, iter.level))
>                         continue;
>
> +               /*
> +                * This is a leaf SPTE. Check if the PFN it maps can
> +                * be mapped at a higher level.
> +                */
>                 pfn = spte_to_pfn(iter.old_spte);
> -               if (kvm_is_reserved_pfn(pfn) ||
> -                   iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
> -                                                           pfn, PG_LEVEL_NUM))
> +
> +               if (kvm_is_reserved_pfn(pfn))
>                         continue;
>
> +               max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
> +                               iter.gfn, pfn, PG_LEVEL_NUM);
> +
> +               WARN_ON(max_mapping_level < iter.level);
> +
> +               /*
> +                * If this page is already mapped at the highest
> +                * viable level, there's nothing more to do.
> +                */
> +               if (max_mapping_level == iter.level)
> +                       continue;
> +
> +               /*
> +                * The page can be remapped at a higher level, so step
> +                * up to zap the parent SPTE.
> +                */
> +               while (max_mapping_level > iter.level)
> +                       tdp_iter_step_up(&iter);
> +
>                 /* Note, a successful atomic zap also does a remote TLB flush. */
> -               if (tdp_mmu_zap_spte_atomic(kvm, &iter))
> -                       goto retry;
> +               tdp_mmu_zap_spte_atomic(kvm, &iter);
> +
> +               /*
> +                * If the atomic zap fails, the iter will recurse back into
> +                * the same subtree to retry.
> +                */
>         }
>
>         rcu_read_unlock();
> --
> 2.36.1.124.g0e6072fb45-goog
>
Yuan Yao May 26, 2022, 1:30 a.m. UTC | #2
On Wed, May 25, 2022 at 11:09:04PM +0000, Ben Gardon wrote:
> When disabling dirty logging, zap non-leaf parent entries to allow
> replacement with huge pages instead of recursing and zapping all of the
> child, leaf entries. This reduces the number of TLB flushes required.
>
> Currently disabling dirty logging with the TDP MMU is extremely slow.
> On a 96 vCPU / 96G VM backed with gigabyte pages, it takes ~200 seconds
> to disable dirty logging with the TDP MMU, as opposed to ~4 seconds with
> the shadow MMU. This patch reduces the disable dirty log time with the
> TDP MMU to ~3 seconds.
>
> Testing:
> Ran KVM selftests and kvm-unit-tests on an Intel Haswell. This
> patch introduced no new failures.
>
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>  arch/x86/kvm/mmu/tdp_iter.c |  9 +++++++++
>  arch/x86/kvm/mmu/tdp_iter.h |  1 +
>  arch/x86/kvm/mmu/tdp_mmu.c  | 38 +++++++++++++++++++++++++++++++------
>  3 files changed, 42 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> index 6d3b3e5a5533..ee4802d7b36c 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.c
> +++ b/arch/x86/kvm/mmu/tdp_iter.c
> @@ -145,6 +145,15 @@ static bool try_step_up(struct tdp_iter *iter)
>  	return true;
>  }
>
> +/*
> + * Step the iterator back up a level in the paging structure. Should only be
> + * used when the iterator is below the root level.
> + */
> +void tdp_iter_step_up(struct tdp_iter *iter)
> +{
> +	WARN_ON(!try_step_up(iter));
> +}
> +
>  /*
>   * Step to the next SPTE in a pre-order traversal of the paging structure.
>   * To get to the next SPTE, the iterator either steps down towards the goal
> diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> index f0af385c56e0..adfca0cf94d3 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.h
> +++ b/arch/x86/kvm/mmu/tdp_iter.h
> @@ -114,5 +114,6 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
>  		    int min_level, gfn_t next_last_level_gfn);
>  void tdp_iter_next(struct tdp_iter *iter);
>  void tdp_iter_restart(struct tdp_iter *iter);
> +void tdp_iter_step_up(struct tdp_iter *iter);
>
>  #endif /* __KVM_X86_MMU_TDP_ITER_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 841feaa48be5..7b9265d67131 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1742,12 +1742,12 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>  	gfn_t start = slot->base_gfn;
>  	gfn_t end = start + slot->npages;
>  	struct tdp_iter iter;
> +	int max_mapping_level;
>  	kvm_pfn_t pfn;
>
>  	rcu_read_lock();
>
>  	tdp_root_for_each_pte(iter, root, start, end) {
> -retry:
>  		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
>  			continue;
>
> @@ -1755,15 +1755,41 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>  		    !is_last_spte(iter.old_spte, iter.level))
>  			continue;
>
> +		/*
> +		 * This is a leaf SPTE. Check if the PFN it maps can
> +		 * be mapped at a higher level.
> +		 */
>  		pfn = spte_to_pfn(iter.old_spte);
> -		if (kvm_is_reserved_pfn(pfn) ||
> -		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
> -							    pfn, PG_LEVEL_NUM))
> +
> +		if (kvm_is_reserved_pfn(pfn))
>  			continue;
>
> +		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
> +				iter.gfn, pfn, PG_LEVEL_NUM);
> +
> +		WARN_ON(max_mapping_level < iter.level);
> +
> +		/*
> +		 * If this page is already mapped at the highest
> +		 * viable level, there's nothing more to do.
> +		 */
> +		if (max_mapping_level == iter.level)
> +			continue;
> +
> +		/*
> +		 * The page can be remapped at a higher level, so step
> +		 * up to zap the parent SPTE.
> +		 */
> +		while (max_mapping_level > iter.level)
> +			tdp_iter_step_up(&iter);

So the benefit from this is:
Before: Zapping the 512 PTEs in a 4K-level page table does 512 TLB flushes.
Now: Zapping the single higher-level 2MB PTE does 1 TLB flush; it
     still tears down all 512 lower-level 4K PTEs, but only with atomic
     operations there, see handle_removed_pt().

Is my understanding correct?

> +
>  		/* Note, a successful atomic zap also does a remote TLB flush. */
> -		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
> -			goto retry;
> +		tdp_mmu_zap_spte_atomic(kvm, &iter);
> +
> +		/*
> +		 * If the atomic zap fails, the iter will recurse back into
> +		 * the same subtree to retry.
> +		 */
>  	}
>
>  	rcu_read_unlock();
> --
> 2.36.1.124.g0e6072fb45-goog
>
Paolo Bonzini May 26, 2022, 12:01 p.m. UTC | #3
On 5/26/22 01:09, Ben Gardon wrote:
> +		WARN_ON(max_mapping_level < iter.level);
> +
> +		/*
> +		 * If this page is already mapped at the highest
> +		 * viable level, there's nothing more to do.
> +		 */
> +		if (max_mapping_level == iter.level)
> +			continue;
> +
> +		/*
> +		 * The page can be remapped at a higher level, so step
> +		 * up to zap the parent SPTE.
> +		 */
> +		while (max_mapping_level > iter.level)
> +			tdp_iter_step_up(&iter);
> +
>   		/* Note, a successful atomic zap also does a remote TLB flush. */
> -		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
> -			goto retry;
> +		tdp_mmu_zap_spte_atomic(kvm, &iter);
> +

Can you make this a separate function (for example
tdp_mmu_zap_collapsible_spte_atomic)?  Otherwise looks great!

Paolo
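
For illustration, a minimal sketch of what such a helper could look like,
using the name Paolo suggests (a hypothetical refactoring, not part of the
posted patch):

    /*
     * Hypothetical helper: zap the SPTE at the highest level that could
     * instead be mapped by a huge page covering iter->gfn. Assumes the
     * caller has already checked that max_mapping_level > iter->level.
     */
    static void tdp_mmu_zap_collapsible_spte_atomic(struct kvm *kvm,
                                                    struct tdp_iter *iter,
                                                    int max_mapping_level)
    {
            /* Step the iterator up to the level the range can collapse to. */
            while (max_mapping_level > iter->level)
                    tdp_iter_step_up(iter);

            /*
             * Note, a successful atomic zap also does a remote TLB flush. If
             * the zap fails, the iterator recurses back into the same subtree
             * to retry.
             */
            tdp_mmu_zap_spte_atomic(kvm, iter);
    }

As the replies below note, the trade-off is that such a helper hides the
iterator step-up from its caller.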
Peter Xu May 26, 2022, 2:30 p.m. UTC | #4
On Thu, May 26, 2022 at 02:01:43PM +0200, Paolo Bonzini wrote:
> On 5/26/22 01:09, Ben Gardon wrote:
> > +		WARN_ON(max_mapping_level < iter.level);
> > +
> > +		/*
> > +		 * If this page is already mapped at the highest
> > +		 * viable level, there's nothing more to do.
> > +		 */
> > +		if (max_mapping_level == iter.level)
> > +			continue;
> > +
> > +		/*
> > +		 * The page can be remapped at a higher level, so step
> > +		 * up to zap the parent SPTE.
> > +		 */
> > +		while (max_mapping_level > iter.level)
> > +			tdp_iter_step_up(&iter);
> > +
> >   		/* Note, a successful atomic zap also does a remote TLB flush. */
> > -		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
> > -			goto retry;
> > +		tdp_mmu_zap_spte_atomic(kvm, &iter);
> > +
> 
> Can you make this a separate function (for example
> tdp_mmu_zap_collapsible_spte_atomic)?  Otherwise looks great!

There could be a tiny downside of using a helper in that it'll hide the
step-up of the iterator, which might not be as obvious as keeping it in the
loop?

Thanks,
Paolo Bonzini May 26, 2022, 3:52 p.m. UTC | #5
On 5/26/22 16:30, Peter Xu wrote:
> On Thu, May 26, 2022 at 02:01:43PM +0200, Paolo Bonzini wrote:
>> On 5/26/22 01:09, Ben Gardon wrote:
>>> +		WARN_ON(max_mapping_level < iter.level);
>>> +
>>> +		/*
>>> +		 * If this page is already mapped at the highest
>>> +		 * viable level, there's nothing more to do.
>>> +		 */
>>> +		if (max_mapping_level == iter.level)
>>> +			continue;
>>> +
>>> +		/*
>>> +		 * The page can be remapped at a higher level, so step
>>> +		 * up to zap the parent SPTE.
>>> +		 */
>>> +		while (max_mapping_level > iter.level)
>>> +			tdp_iter_step_up(&iter);
>>> +
>>>    		/* Note, a successful atomic zap also does a remote TLB flush. */
>>> -		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
>>> -			goto retry;
>>> +		tdp_mmu_zap_spte_atomic(kvm, &iter);
>>> +
>>
>> Can you make this a separate function (for example
>> tdp_mmu_zap_collapsible_spte_atomic)?  Otherwise looks great!
> 
> There could be a tiny downside of using a helper in that it'll hide the
> step-up of the iterator, which might not be as obvious as keeping it in the
> loop?

That's true, my reasoning is that zapping at a higher level can only be 
done by first moving the iterator up.  Maybe 
tdp_mmu_zap_at_level_atomic() is a better name.  Though, I can very well
apply this patch as is.

Paolo
Ben Gardon May 26, 2022, 3:57 p.m. UTC | #6
On Wed, May 25, 2022 at 6:30 PM Yuan Yao <yuan.yao@linux.intel.com> wrote:
>
> On Wed, May 25, 2022 at 11:09:04PM +0000, Ben Gardon wrote:
> > When disabling dirty logging, zap non-leaf parent entries to allow
> > replacement with huge pages instead of recursing and zapping all of the
> > child, leaf entries. This reduces the number of TLB flushes required.
> >
> > Currently disabling dirty logging with the TDP MMU is extremely slow.
> > On a 96 vCPU / 96G VM backed with gigabyte pages, it takes ~200 seconds
> > to disable dirty logging with the TDP MMU, as opposed to ~4 seconds with
> > the shadow MMU. This patch reduces the disable dirty log time with the
> > TDP MMU to ~3 seconds.
> >
> > Testing:
> > Ran KVM selftests and kvm-unit-tests on an Intel Haswell. This
> > patch introduced no new failures.
> >
> > Signed-off-by: Ben Gardon <bgardon@google.com>
> > ---
> >  arch/x86/kvm/mmu/tdp_iter.c |  9 +++++++++
> >  arch/x86/kvm/mmu/tdp_iter.h |  1 +
> >  arch/x86/kvm/mmu/tdp_mmu.c  | 38 +++++++++++++++++++++++++++++++------
> >  3 files changed, 42 insertions(+), 6 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> > index 6d3b3e5a5533..ee4802d7b36c 100644
> > --- a/arch/x86/kvm/mmu/tdp_iter.c
> > +++ b/arch/x86/kvm/mmu/tdp_iter.c
> > @@ -145,6 +145,15 @@ static bool try_step_up(struct tdp_iter *iter)
> >       return true;
> >  }
> >
> > +/*
> > + * Step the iterator back up a level in the paging structure. Should only be
> > + * used when the iterator is below the root level.
> > + */
> > +void tdp_iter_step_up(struct tdp_iter *iter)
> > +{
> > +     WARN_ON(!try_step_up(iter));
> > +}
> > +
> >  /*
> >   * Step to the next SPTE in a pre-order traversal of the paging structure.
> >   * To get to the next SPTE, the iterator either steps down towards the goal
> > diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> > index f0af385c56e0..adfca0cf94d3 100644
> > --- a/arch/x86/kvm/mmu/tdp_iter.h
> > +++ b/arch/x86/kvm/mmu/tdp_iter.h
> > @@ -114,5 +114,6 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
> >                   int min_level, gfn_t next_last_level_gfn);
> >  void tdp_iter_next(struct tdp_iter *iter);
> >  void tdp_iter_restart(struct tdp_iter *iter);
> > +void tdp_iter_step_up(struct tdp_iter *iter);
> >
> >  #endif /* __KVM_X86_MMU_TDP_ITER_H */
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 841feaa48be5..7b9265d67131 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -1742,12 +1742,12 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
> >       gfn_t start = slot->base_gfn;
> >       gfn_t end = start + slot->npages;
> >       struct tdp_iter iter;
> > +     int max_mapping_level;
> >       kvm_pfn_t pfn;
> >
> >       rcu_read_lock();
> >
> >       tdp_root_for_each_pte(iter, root, start, end) {
> > -retry:
> >               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
> >                       continue;
> >
> > @@ -1755,15 +1755,41 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
> >                   !is_last_spte(iter.old_spte, iter.level))
> >                       continue;
> >
> > +             /*
> > +              * This is a leaf SPTE. Check if the PFN it maps can
> > +              * be mapped at a higher level.
> > +              */
> >               pfn = spte_to_pfn(iter.old_spte);
> > -             if (kvm_is_reserved_pfn(pfn) ||
> > -                 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
> > -                                                         pfn, PG_LEVEL_NUM))
> > +
> > +             if (kvm_is_reserved_pfn(pfn))
> >                       continue;
> >
> > +             max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
> > +                             iter.gfn, pfn, PG_LEVEL_NUM);
> > +
> > +             WARN_ON(max_mapping_level < iter.level);
> > +
> > +             /*
> > +              * If this page is already mapped at the highest
> > +              * viable level, there's nothing more to do.
> > +              */
> > +             if (max_mapping_level == iter.level)
> > +                     continue;
> > +
> > +             /*
> > +              * The page can be remapped at a higher level, so step
> > +              * up to zap the parent SPTE.
> > +              */
> > +             while (max_mapping_level > iter.level)
> > +                     tdp_iter_step_up(&iter);
>
> So the benefit from this is:
> Before: Zapping the 512 PTEs in a 4K-level page table does 512 TLB flushes.
> Now: Zapping the single higher-level 2MB PTE does 1 TLB flush; it
>      still tears down all 512 lower-level 4K PTEs, but only with atomic
>      operations there, see handle_removed_pt().
>
> Is my understanding correct?

Yes, that's exactly right.
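
As a rough back-of-the-envelope check of those numbers (assuming one remote
TLB flush per successful tdp_mmu_zap_spte_atomic() call, as the comment in
the patch notes):

    /* Illustrative flush counts for fully split huge mappings, old vs. new. */
    #include <stdio.h>

    int main(void)
    {
            const long ptes_per_table = 512;

            /* 2MB region split to 4K: old zaps each leaf, new zaps the parent. */
            printf("2MB: old %ld flushes, new 1 flush\n", ptes_per_table);

            /* 1GB region split to 4K, as in the commit message's test setup. */
            printf("1GB: old %ld flushes, new 1 flush\n",
                   ptes_per_table * ptes_per_table);
            return 0;
    }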

>
> > +
> >               /* Note, a successful atomic zap also does a remote TLB flush. */
> > -             if (tdp_mmu_zap_spte_atomic(kvm, &iter))
> > -                     goto retry;
> > +             tdp_mmu_zap_spte_atomic(kvm, &iter);
> > +
> > +             /*
> > +              * If the atomic zap fails, the iter will recurse back into
> > +              * the same subtree to retry.
> > +              */
> >       }
> >
> >       rcu_read_unlock();
> > --
> > 2.36.1.124.g0e6072fb45-goog
> >
Ben Gardon May 26, 2022, 4 p.m. UTC | #7
On Thu, May 26, 2022 at 8:52 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 5/26/22 16:30, Peter Xu wrote:
> > On Thu, May 26, 2022 at 02:01:43PM +0200, Paolo Bonzini wrote:
> >> On 5/26/22 01:09, Ben Gardon wrote:
> >>> +           WARN_ON(max_mapping_level < iter.level);
> >>> +
> >>> +           /*
> >>> +            * If this page is already mapped at the highest
> >>> +            * viable level, there's nothing more to do.
> >>> +            */
> >>> +           if (max_mapping_level == iter.level)
> >>> +                   continue;
> >>> +
> >>> +           /*
> >>> +            * The page can be remapped at a higher level, so step
> >>> +            * up to zap the parent SPTE.
> >>> +            */
> >>> +           while (max_mapping_level > iter.level)
> >>> +                   tdp_iter_step_up(&iter);
> >>> +
> >>>             /* Note, a successful atomic zap also does a remote TLB flush. */
> >>> -           if (tdp_mmu_zap_spte_atomic(kvm, &iter))
> >>> -                   goto retry;
> >>> +           tdp_mmu_zap_spte_atomic(kvm, &iter);
> >>> +
> >>
> >> Can you make this a separate function (for example
> >> tdp_mmu_zap_collapsible_spte_atomic)?  Otherwise looks great!
> >
> > There could be a tiny downside of using a helper in that it'll hide the
> > step-up of the iterator, which might not be as obvious as keeping it in the
> > loop?
>
> That's true, my reasoning is that zapping at a higher level can only be
> done by first moving the iterator up.  Maybe
> > tdp_mmu_zap_at_level_atomic() is a better name.  Though, I can very well
> > apply this patch as is.

I'd be inclined to apply the patch as-is for a couple reasons:
1. As Peter said, hiding the step up could be confusing.
2. If we want to try the in-place promotion, we'll have to dismantle
that helper again anyway or else have a bunch of duplicate code.

>
> Paolo
>
David Matlack May 26, 2022, 4:40 p.m. UTC | #8
On Wed, May 25, 2022 at 11:09:04PM +0000, Ben Gardon wrote:
> When disabling dirty logging, zap non-leaf parent entries to allow
> replacement with huge pages instead of recursing and zapping all of the
> child, leaf entries. This reduces the number of TLB flushes required.
> 
> Currently disabling dirty logging with the TDP MMU is extremely slow.
> On a 96 vCPU / 96G VM backed with gigabyte pages, it takes ~200 seconds
> to disable dirty logging with the TDP MMU, as opposed to ~4 seconds with
> the shadow MMU. This patch reduces the disable dirty log time with the
> TDP MMU to ~3 seconds.

Nice!

It'd be good to also mention the new WARN. e.g.

  Opportunistically add a WARN() to catch GFNs that are mapped at a
  higher level than their max level.

> 
> Testing:
> Ran KVM selftests and kvm-unit-tests on an Intel Haswell. This
> patch introduced no new failures.
> 
> Signed-off-by: Ben Gardon <bgardon@google.com>

Reviewed-by: David Matlack <dmatlack@google.com>

> ---
>  arch/x86/kvm/mmu/tdp_iter.c |  9 +++++++++
>  arch/x86/kvm/mmu/tdp_iter.h |  1 +
>  arch/x86/kvm/mmu/tdp_mmu.c  | 38 +++++++++++++++++++++++++++++++------
>  3 files changed, 42 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> index 6d3b3e5a5533..ee4802d7b36c 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.c
> +++ b/arch/x86/kvm/mmu/tdp_iter.c
> @@ -145,6 +145,15 @@ static bool try_step_up(struct tdp_iter *iter)
>  	return true;
>  }
>  
> +/*
> + * Step the iterator back up a level in the paging structure. Should only be
> + * used when the iterator is below the root level.
> + */
> +void tdp_iter_step_up(struct tdp_iter *iter)
> +{
> +	WARN_ON(!try_step_up(iter));
> +}
> +
>  /*
>   * Step to the next SPTE in a pre-order traversal of the paging structure.
>   * To get to the next SPTE, the iterator either steps down towards the goal
> diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> index f0af385c56e0..adfca0cf94d3 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.h
> +++ b/arch/x86/kvm/mmu/tdp_iter.h
> @@ -114,5 +114,6 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
>  		    int min_level, gfn_t next_last_level_gfn);
>  void tdp_iter_next(struct tdp_iter *iter);
>  void tdp_iter_restart(struct tdp_iter *iter);
> +void tdp_iter_step_up(struct tdp_iter *iter);
>  
>  #endif /* __KVM_X86_MMU_TDP_ITER_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 841feaa48be5..7b9265d67131 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1742,12 +1742,12 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>  	gfn_t start = slot->base_gfn;
>  	gfn_t end = start + slot->npages;
>  	struct tdp_iter iter;
> +	int max_mapping_level;
>  	kvm_pfn_t pfn;
>  
>  	rcu_read_lock();
>  
>  	tdp_root_for_each_pte(iter, root, start, end) {
> -retry:
>  		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
>  			continue;
>  
> @@ -1755,15 +1755,41 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>  		    !is_last_spte(iter.old_spte, iter.level))
>  			continue;
>  
> +		/*
> +		 * This is a leaf SPTE. Check if the PFN it maps can
> +		 * be mapped at a higher level.
> +		 */
>  		pfn = spte_to_pfn(iter.old_spte);
> -		if (kvm_is_reserved_pfn(pfn) ||
> -		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
> -							    pfn, PG_LEVEL_NUM))
> +
> +		if (kvm_is_reserved_pfn(pfn))
>  			continue;
>  
> +		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
> +				iter.gfn, pfn, PG_LEVEL_NUM);
> +
> +		WARN_ON(max_mapping_level < iter.level);
> +
> +		/*
> +		 * If this page is already mapped at the highest
> +		 * viable level, there's nothing more to do.
> +		 */
> +		if (max_mapping_level == iter.level)
> +			continue;
> +
> +		/*
> +		 * The page can be remapped at a higher level, so step
> +		 * up to zap the parent SPTE.
> +		 */
> +		while (max_mapping_level > iter.level)
> +			tdp_iter_step_up(&iter);
> +
>  		/* Note, a successful atomic zap also does a remote TLB flush. */
> -		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
> -			goto retry;
> +		tdp_mmu_zap_spte_atomic(kvm, &iter);
> +
> +		/*
> +		 * If the atomic zap fails, the iter will recurse back into
> +		 * the same subtree to retry.
> +		 */
>  	}
>  
>  	rcu_read_unlock();
> -- 
> 2.36.1.124.g0e6072fb45-goog
>
Ben Gardon June 6, 2022, 11:11 p.m. UTC | #9
On Thu, May 26, 2022 at 9:00 AM Ben Gardon <bgardon@google.com> wrote:
>
> On Thu, May 26, 2022 at 8:52 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
> >
> > On 5/26/22 16:30, Peter Xu wrote:
> > > On Thu, May 26, 2022 at 02:01:43PM +0200, Paolo Bonzini wrote:
> > >> On 5/26/22 01:09, Ben Gardon wrote:
> > >>> +           WARN_ON(max_mapping_level < iter.level);
> > >>> +
> > >>> +           /*
> > >>> +            * If this page is already mapped at the highest
> > >>> +            * viable level, there's nothing more to do.
> > >>> +            */
> > >>> +           if (max_mapping_level == iter.level)
> > >>> +                   continue;
> > >>> +
> > >>> +           /*
> > >>> +            * The page can be remapped at a higher level, so step
> > >>> +            * up to zap the parent SPTE.
> > >>> +            */
> > >>> +           while (max_mapping_level > iter.level)
> > >>> +                   tdp_iter_step_up(&iter);
> > >>> +
> > >>>             /* Note, a successful atomic zap also does a remote TLB flush. */
> > >>> -           if (tdp_mmu_zap_spte_atomic(kvm, &iter))
> > >>> -                   goto retry;
> > >>> +           tdp_mmu_zap_spte_atomic(kvm, &iter);
> > >>> +
> > >>
> > >> Can you make this a separate function (for example
> > >> tdp_mmu_zap_collapsible_spte_atomic)?  Otherwise looks great!
> > >
> > > There could be a tiny downside of using a helper in that it'll hide the
> > > step-up of the iterator, which might not be as obvious as keeping it in the
> > > loop?
> >
> > That's true, my reasoning is that zapping at a higher level can only be
> > done by first moving the iterator up.  Maybe
> > > tdp_mmu_zap_at_level_atomic() is a better name.  Though, I can very well
> > > apply this patch as is.
>
> I'd be inclined to apply the patch as-is for a couple reasons:
> 1. As Peter said, hiding the step up could be confusing.
> 2. If we want to try the in-place promotion, we'll have to dismantle
> that helper again anyway or else have a bunch of duplicate code.

Circling back on this, Paolo, would you like me to send another version
of this patch, or do you think it's good to go?

>
> >
> > Paolo
> >

Patch

diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 6d3b3e5a5533..ee4802d7b36c 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -145,6 +145,15 @@  static bool try_step_up(struct tdp_iter *iter)
 	return true;
 }
 
+/*
+ * Step the iterator back up a level in the paging structure. Should only be
+ * used when the iterator is below the root level.
+ */
+void tdp_iter_step_up(struct tdp_iter *iter)
+{
+	WARN_ON(!try_step_up(iter));
+}
+
 /*
  * Step to the next SPTE in a pre-order traversal of the paging structure.
  * To get to the next SPTE, the iterator either steps down towards the goal
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index f0af385c56e0..adfca0cf94d3 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -114,5 +114,6 @@  void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
 		    int min_level, gfn_t next_last_level_gfn);
 void tdp_iter_next(struct tdp_iter *iter);
 void tdp_iter_restart(struct tdp_iter *iter);
+void tdp_iter_step_up(struct tdp_iter *iter);
 
 #endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 841feaa48be5..7b9265d67131 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1742,12 +1742,12 @@  static void zap_collapsible_spte_range(struct kvm *kvm,
 	gfn_t start = slot->base_gfn;
 	gfn_t end = start + slot->npages;
 	struct tdp_iter iter;
+	int max_mapping_level;
 	kvm_pfn_t pfn;
 
 	rcu_read_lock();
 
 	tdp_root_for_each_pte(iter, root, start, end) {
-retry:
 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
 			continue;
 
@@ -1755,15 +1755,41 @@  static void zap_collapsible_spte_range(struct kvm *kvm,
 		    !is_last_spte(iter.old_spte, iter.level))
 			continue;
 
+		/*
+		 * This is a leaf SPTE. Check if the PFN it maps can
+		 * be mapped at a higher level.
+		 */
 		pfn = spte_to_pfn(iter.old_spte);
-		if (kvm_is_reserved_pfn(pfn) ||
-		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
-							    pfn, PG_LEVEL_NUM))
+
+		if (kvm_is_reserved_pfn(pfn))
 			continue;
 
+		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
+				iter.gfn, pfn, PG_LEVEL_NUM);
+
+		WARN_ON(max_mapping_level < iter.level);
+
+		/*
+		 * If this page is already mapped at the highest
+		 * viable level, there's nothing more to do.
+		 */
+		if (max_mapping_level == iter.level)
+			continue;
+
+		/*
+		 * The page can be remapped at a higher level, so step
+		 * up to zap the parent SPTE.
+		 */
+		while (max_mapping_level > iter.level)
+			tdp_iter_step_up(&iter);
+
 		/* Note, a successful atomic zap also does a remote TLB flush. */
-		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
-			goto retry;
+		tdp_mmu_zap_spte_atomic(kvm, &iter);
+
+		/*
+		 * If the atomic zap fails, the iter will recurse back into
+		 * the same subtree to retry.
+		 */
 	}
 
 	rcu_read_unlock();