Message ID | 20210625153419.43671-1-peterx@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: X86: Some light optimizations on rmap logic | expand |
On Fri, Jun 25, 2021, Peter Xu wrote: > Using rmap_get_first() and rmap_remove() for zapping a huge rmap list could be > slow. The easy way is to travers the rmap list, collecting the a/d bits and > free the slots along the way. > > Provide a pte_list_destroy() and do exactly that. > > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > arch/x86/kvm/mmu/mmu.c | 45 +++++++++++++++++++++++++++++++----------- > 1 file changed, 33 insertions(+), 12 deletions(-) > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index ba0258bdebc4..45aac78dcabc 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -1014,6 +1014,38 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) > return count; > } > > +/* Return true if rmap existed and callback called, false otherwise */ > +static bool pte_list_destroy(struct kvm_rmap_head *rmap_head, > + int (*callback)(u64 *sptep)) > +{ > + struct pte_list_desc *desc, *next; > + int i; > + > + if (!rmap_head->val) > + return false; > + > + if (!(rmap_head->val & 1)) { > + if (callback) > + callback((u64 *)rmap_head->val); > + goto out; > + } > + > + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); > + > + while (desc) { > + if (callback) > + for (i = 0; i < desc->spte_count; i++) > + callback(desc->sptes[i]); > + next = desc->more; > + mmu_free_pte_list_desc(desc); > + desc = next; Alternatively, desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); for ( ; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) mmu_spte_clear_track_bits((u64 *)rmap_head->val); next = desc->more; mmu_free_pte_list_desc(desc); } > + } > +out: > + /* rmap_head is meaningless now, remember to reset it */ > + rmap_head->val = 0; > + return true; Why implement this as a generic method with a callback? gcc is suprisingly astute in optimizing callback(), but I don't see the point of adding a complex helper that has a single caller, and is extremely unlikely to gain new callers. 
Or is there another "zap everything" case I'm missing? E.g. why not this? static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { struct pte_list_desc *desc, *next; int i; if (!rmap_head->val) return false; if (!(rmap_head->val & 1)) { mmu_spte_clear_track_bits((u64 *)rmap_head->val); goto out; } desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); for ( ; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) mmu_spte_clear_track_bits(desc->sptes[i]); next = desc->more; mmu_free_pte_list_desc(desc); } out: /* rmap_head is meaningless now, remember to reset it */ rmap_head->val = 0; return true; } > +} > + > static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, > struct kvm_memory_slot *slot) > { > @@ -1403,18 +1435,7 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) > static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, > struct kvm_memory_slot *slot) > { > - u64 *sptep; > - struct rmap_iterator iter; > - bool flush = false; > - > - while ((sptep = rmap_get_first(rmap_head, &iter))) { > - rmap_printk("spte %p %llx.\n", sptep, *sptep); > - > - pte_list_remove(rmap_head, sptep); > - flush = true; > - } > - > - return flush; > + return pte_list_destroy(rmap_head, mmu_spte_clear_track_bits); > } > > static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, > -- > 2.31.1 >
On Wed, Jul 28, 2021 at 09:39:02PM +0000, Sean Christopherson wrote: > On Fri, Jun 25, 2021, Peter Xu wrote: > > Using rmap_get_first() and rmap_remove() for zapping a huge rmap list could be > > slow. The easy way is to travers the rmap list, collecting the a/d bits and > > free the slots along the way. > > > > Provide a pte_list_destroy() and do exactly that. > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > --- > > arch/x86/kvm/mmu/mmu.c | 45 +++++++++++++++++++++++++++++++----------- > > 1 file changed, 33 insertions(+), 12 deletions(-) > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > index ba0258bdebc4..45aac78dcabc 100644 > > --- a/arch/x86/kvm/mmu/mmu.c > > +++ b/arch/x86/kvm/mmu/mmu.c > > @@ -1014,6 +1014,38 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) > > return count; > > } > > > > +/* Return true if rmap existed and callback called, false otherwise */ > > +static bool pte_list_destroy(struct kvm_rmap_head *rmap_head, > > + int (*callback)(u64 *sptep)) > > +{ > > + struct pte_list_desc *desc, *next; > > + int i; > > + > > + if (!rmap_head->val) > > + return false; > > + > > + if (!(rmap_head->val & 1)) { > > + if (callback) > > + callback((u64 *)rmap_head->val); > > + goto out; > > + } > > + > > + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); > > + > > + while (desc) { > > + if (callback) > > + for (i = 0; i < desc->spte_count; i++) > > + callback(desc->sptes[i]); > > + next = desc->more; > > + mmu_free_pte_list_desc(desc); > > + desc = next; > > Alternatively, > > desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); > for ( ; desc; desc = next) { > for (i = 0; i < desc->spte_count; i++) > mmu_spte_clear_track_bits((u64 *)rmap_head->val); > next = desc->more; > mmu_free_pte_list_desc(desc); > } > > > + } > > +out: > > + /* rmap_head is meaningless now, remember to reset it */ > > + rmap_head->val = 0; > > + return true; > > Why implement this as a generic method with a callback? 
gcc is suprisingly > astute in optimizing callback(), but I don't see the point of adding a complex > helper that has a single caller, and is extremely unlikely to gain new callers. > Or is there another "zap everything" case I'm missing? No other case; it's just that pte_list_*() helpers will be more self-contained. If that'll be a performance concern, no objection to hard code it. > > E.g. why not this? > > static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, > const struct kvm_memory_slot *slot) > { > struct pte_list_desc *desc, *next; > int i; > > if (!rmap_head->val) > return false; > > if (!(rmap_head->val & 1)) { > mmu_spte_clear_track_bits((u64 *)rmap_head->val); > goto out; > } > > desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); > for ( ; desc; desc = next) { > for (i = 0; i < desc->spte_count; i++) > mmu_spte_clear_track_bits(desc->sptes[i]); > next = desc->more; > mmu_free_pte_list_desc(desc); > } > out: > /* rmap_head is meaningless now, remember to reset it */ > rmap_head->val = 0; > return true; > } Looks good, but so far I've no strong opinion on this. I'll leave it for Paolo to decide. Thanks!
On Wed, Jul 28, 2021, Peter Xu wrote: > On Wed, Jul 28, 2021 at 09:39:02PM +0000, Sean Christopherson wrote: > > On Fri, Jun 25, 2021, Peter Xu wrote: > > Why implement this as a generic method with a callback? gcc is suprisingly > > astute in optimizing callback(), but I don't see the point of adding a complex > > helper that has a single caller, and is extremely unlikely to gain new callers. > > Or is there another "zap everything" case I'm missing? > > No other case; it's just that pte_list_*() helpers will be more self-contained. Eh, but this flow is as much about rmaps as it is about pte_list. > If that'll be a performance concern, no objection to hard code it. It's more about unnecessary complexity than it is about performance, e.g. gcc-10 generates identical code for both version (which did surprise the heck out of me). If we really want to isolate pte_list_destroy(), I would vote for something like this (squashed in). pte_list_remove() already calls mmu_spte_clear_track_bits(), so that particular separation of concerns has already gone out the window. 
-/* Return true if rmap existed and callback called, false otherwise */ -static bool pte_list_destroy(struct kvm_rmap_head *rmap_head, - void (*callback)(u64 *sptep)) +static bool pte_list_destroy(struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; int i; @@ -1013,20 +1011,16 @@ static bool pte_list_destroy(struct kvm_rmap_head *rmap_head, return false; if (!(rmap_head->val & 1)) { - if (callback) - callback((u64 *)rmap_head->val); + mmu_spte_clear_track_bits((u64 *)rmap_head->val); goto out; } desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - - while (desc) { - if (callback) - for (i = 0; i < desc->spte_count; i++) - callback(desc->sptes[i]); + for ( ; desc; desc = next) { + for (i = 0; i < desc->spte_count; i++) + mmu_spte_clear_track_bits(desc->sptes[i]); next = desc->more; mmu_free_pte_list_desc(desc); - desc = next; } out: /* rmap_head is meaningless now, remember to reset it */ @@ -1422,22 +1416,17 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } -static void mmu_spte_clear_track_bits_cb(u64 *sptep) -{ - mmu_spte_clear_track_bits(sptep); -} - static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { - return pte_list_destroy(rmap_head, mmu_spte_clear_track_bits_cb); + return pte_list_destroy(rmap_head); } static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, pte_t unused) { - return kvm_zap_rmapp(kvm, rmap_head, slot); + return pte_list_destroy(rmap_head); } static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
On 29/07/21 00:31, Sean Christopherson wrote: >> If that'll be a performance concern, no objection to hard code it. > It's more about unnecessary complexity than it is about performance, e.g. gcc-10 > generates identical code for both version (which did surprise the heck out of me). If you think of what's needed to produce decent (as fast as C) code out of STL code, that's not surprising. :) Pretty cool that it lets people write nicer C code too, though. > If we really want to isolate pte_list_destroy(), I would vote for something like > this (squashed in). pte_list_remove() already calls mmu_spte_clear_track_bits(), > so that particular separation of concerns has already gone out the window. Yes, that's fair enough. Thanks for the review! Paolo
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ba0258bdebc4..45aac78dcabc 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1014,6 +1014,38 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) return count; } +/* Return true if rmap existed and callback called, false otherwise */ +static bool pte_list_destroy(struct kvm_rmap_head *rmap_head, + int (*callback)(u64 *sptep)) +{ + struct pte_list_desc *desc, *next; + int i; + + if (!rmap_head->val) + return false; + + if (!(rmap_head->val & 1)) { + if (callback) + callback((u64 *)rmap_head->val); + goto out; + } + + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + + while (desc) { + if (callback) + for (i = 0; i < desc->spte_count; i++) + callback(desc->sptes[i]); + next = desc->more; + mmu_free_pte_list_desc(desc); + desc = next; + } +out: + /* rmap_head is meaningless now, remember to reset it */ + rmap_head->val = 0; + return true; +} + static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, struct kvm_memory_slot *slot) { @@ -1403,18 +1435,7 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot) { - u64 *sptep; - struct rmap_iterator iter; - bool flush = false; - - while ((sptep = rmap_get_first(rmap_head, &iter))) { - rmap_printk("spte %p %llx.\n", sptep, *sptep); - - pte_list_remove(rmap_head, sptep); - flush = true; - } - - return flush; + return pte_list_destroy(rmap_head, mmu_spte_clear_track_bits); } static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
Using rmap_get_first() and rmap_remove() for zapping a huge rmap list could be slow. The easy way is to traverse the rmap list, collecting the a/d bits and freeing the slots along the way. Provide a pte_list_destroy() and do exactly that. Signed-off-by: Peter Xu <peterx@redhat.com> --- arch/x86/kvm/mmu/mmu.c | 45 +++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 12 deletions(-)