
[v2,9/9] KVM: X86: Optimize zapping rmap

Message ID 20210625153419.43671-1-peterx@redhat.com (mailing list archive)
State New, archived
Series KVM: X86: Some light optimizations on rmap logic

Commit Message

Peter Xu June 25, 2021, 3:34 p.m. UTC
Using rmap_get_first() and rmap_remove() for zapping a huge rmap list could be
slow.  The easy way is to traverse the rmap list, collecting the a/d bits and
freeing the slots along the way.

Provide a pte_list_destroy() and do exactly that.
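
To illustrate the difference, here is a rough, self-contained model (the
structure and function names below are made up for illustration; they are not
the kernel's): one-by-one removal re-walks the descriptor chain for every
spte, while a destroy pass touches each entry and each descriptor exactly once.

#include <stdlib.h>

#define SPTES_PER_DESC 14

struct desc {
	unsigned long *sptes[SPTES_PER_DESC];
	int count;
	struct desc *more;
};

/* One-at-a-time removal: each call re-walks the chain to find the entry. */
void remove_one(struct desc *head, unsigned long *sptep)
{
	struct desc *d;
	int i;

	for (d = head; d; d = d->more)
		for (i = 0; i < d->count; i++)
			if (d->sptes[i] == sptep) {
				d->sptes[i] = d->sptes[--d->count];
				return;
			}
}

/* Whole-list destroy: every entry and descriptor is visited exactly once. */
void destroy_all(struct desc **head)
{
	struct desc *d = *head, *next;
	int i;

	for (; d; d = next) {
		for (i = 0; i < d->count; i++)
			*d->sptes[i] = 0;	/* stand-in for clearing the spte */
		next = d->more;
		free(d);
	}
	*head = NULL;
}

Zapping N sptes through the first path is O(N^2) chain walking in the worst
case; the destroy path is a single O(N) pass.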

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 45 +++++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 12 deletions(-)

Comments

Sean Christopherson July 28, 2021, 9:39 p.m. UTC | #1
On Fri, Jun 25, 2021, Peter Xu wrote:
> Using rmap_get_first() and rmap_remove() for zapping a huge rmap list could be
> slow.  The easy way is to traverse the rmap list, collecting the a/d bits and
> freeing the slots along the way.
> 
> Provide a pte_list_destroy() and do exactly that.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  arch/x86/kvm/mmu/mmu.c | 45 +++++++++++++++++++++++++++++++-----------
>  1 file changed, 33 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index ba0258bdebc4..45aac78dcabc 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1014,6 +1014,38 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
>  	return count;
>  }
>  
> +/* Return true if rmap existed and callback called, false otherwise */
> +static bool pte_list_destroy(struct kvm_rmap_head *rmap_head,
> +			     int (*callback)(u64 *sptep))
> +{
> +	struct pte_list_desc *desc, *next;
> +	int i;
> +
> +	if (!rmap_head->val)
> +		return false;
> +
> +	if (!(rmap_head->val & 1)) {
> +		if (callback)
> +			callback((u64 *)rmap_head->val);
> +		goto out;
> +	}
> +
> +	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
> +
> +	while (desc) {
> +		if (callback)
> +			for (i = 0; i < desc->spte_count; i++)
> +				callback(desc->sptes[i]);
> +		next = desc->more;
> +		mmu_free_pte_list_desc(desc);
> +		desc = next;

Alternatively, 

	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	for ( ; desc; desc = next) {
		for (i = 0; i < desc->spte_count; i++)
			mmu_spte_clear_track_bits(desc->sptes[i]);
		next = desc->more;
		mmu_free_pte_list_desc(desc);
	}

> +	}
> +out:
> +	/* rmap_head is meaningless now, remember to reset it */
> +	rmap_head->val = 0;
> +	return true;

Why implement this as a generic method with a callback?  gcc is surprisingly
astute in optimizing callback(), but I don't see the point of adding a complex
helper that has a single caller, and is extremely unlikely to gain new callers.
Or is there another "zap everything" case I'm missing?

E.g. why not this?

static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			  const struct kvm_memory_slot *slot)
{
	struct pte_list_desc *desc, *next;
	int i;

	if (!rmap_head->val)
		return false;

	if (!(rmap_head->val & 1)) {
		mmu_spte_clear_track_bits((u64 *)rmap_head->val);
		goto out;
	}

	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	for ( ; desc; desc = next) {
		for (i = 0; i < desc->spte_count; i++)
			mmu_spte_clear_track_bits(desc->sptes[i]);
		next = desc->more;
		mmu_free_pte_list_desc(desc);
	}
out:
	/* rmap_head is meaningless now, remember to reset it */
	rmap_head->val = 0;
	return true;
}

> +}
> +
>  static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
>  					   struct kvm_memory_slot *slot)
>  {
> @@ -1403,18 +1435,7 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
>  static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
>  			  struct kvm_memory_slot *slot)
>  {
> -	u64 *sptep;
> -	struct rmap_iterator iter;
> -	bool flush = false;
> -
> -	while ((sptep = rmap_get_first(rmap_head, &iter))) {
> -		rmap_printk("spte %p %llx.\n", sptep, *sptep);
> -
> -		pte_list_remove(rmap_head, sptep);
> -		flush = true;
> -	}
> -
> -	return flush;
> +	return pte_list_destroy(rmap_head, mmu_spte_clear_track_bits);
>  }
>  
>  static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
> -- 
> 2.31.1
>
Peter Xu July 28, 2021, 10:01 p.m. UTC | #2
On Wed, Jul 28, 2021 at 09:39:02PM +0000, Sean Christopherson wrote:
> On Fri, Jun 25, 2021, Peter Xu wrote:
> > Using rmap_get_first() and rmap_remove() for zapping a huge rmap list could be
> > slow.  The easy way is to traverse the rmap list, collecting the a/d bits and
> > freeing the slots along the way.
> > 
> > Provide a pte_list_destroy() and do exactly that.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  arch/x86/kvm/mmu/mmu.c | 45 +++++++++++++++++++++++++++++++-----------
> >  1 file changed, 33 insertions(+), 12 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index ba0258bdebc4..45aac78dcabc 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -1014,6 +1014,38 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
> >  	return count;
> >  }
> >  
> > +/* Return true if rmap existed and callback called, false otherwise */
> > +static bool pte_list_destroy(struct kvm_rmap_head *rmap_head,
> > +			     int (*callback)(u64 *sptep))
> > +{
> > +	struct pte_list_desc *desc, *next;
> > +	int i;
> > +
> > +	if (!rmap_head->val)
> > +		return false;
> > +
> > +	if (!(rmap_head->val & 1)) {
> > +		if (callback)
> > +			callback((u64 *)rmap_head->val);
> > +		goto out;
> > +	}
> > +
> > +	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
> > +
> > +	while (desc) {
> > +		if (callback)
> > +			for (i = 0; i < desc->spte_count; i++)
> > +				callback(desc->sptes[i]);
> > +		next = desc->more;
> > +		mmu_free_pte_list_desc(desc);
> > +		desc = next;
> 
> Alternatively, 
> 
> 	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
> 	for ( ; desc; desc = next) {
> 		for (i = 0; i < desc->spte_count; i++)
> 			mmu_spte_clear_track_bits(desc->sptes[i]);
> 		next = desc->more;
> 		mmu_free_pte_list_desc(desc);
> 	}
> 
> > +	}
> > +out:
> > +	/* rmap_head is meaningless now, remember to reset it */
> > +	rmap_head->val = 0;
> > +	return true;
> 
> Why implement this as a generic method with a callback?  gcc is surprisingly
> astute in optimizing callback(), but I don't see the point of adding a complex
> helper that has a single caller, and is extremely unlikely to gain new callers.
> Or is there another "zap everything" case I'm missing?

No other case; it's just that pte_list_*() helpers will be more self-contained.
If that'll be a performance concern, no objection to hard-coding it.

> 
> E.g. why not this?
> 
> static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
> 			  const struct kvm_memory_slot *slot)
> {
> 	struct pte_list_desc *desc, *next;
> 	int i;
> 
> 	if (!rmap_head->val)
> 		return false;
> 
> 	if (!(rmap_head->val & 1)) {
> 		mmu_spte_clear_track_bits((u64 *)rmap_head->val);
> 		goto out;
> 	}
> 
> 	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
> 	for ( ; desc; desc = next) {
> 		for (i = 0; i < desc->spte_count; i++)
> 			mmu_spte_clear_track_bits(desc->sptes[i]);
> 		next = desc->more;
> 		mmu_free_pte_list_desc(desc);
> 	}
> out:
> 	/* rmap_head is meaningless now, remember to reset it */
> 	rmap_head->val = 0;
> 	return true;
> }

Looks good, but so far I've no strong opinion on this.  I'll leave it for Paolo
to decide.

Thanks!
Sean Christopherson July 28, 2021, 10:31 p.m. UTC | #3
On Wed, Jul 28, 2021, Peter Xu wrote:
> On Wed, Jul 28, 2021 at 09:39:02PM +0000, Sean Christopherson wrote:
> > On Fri, Jun 25, 2021, Peter Xu wrote:
> > Why implement this as a generic method with a callback?  gcc is surprisingly
> > astute in optimizing callback(), but I don't see the point of adding a complex
> > helper that has a single caller, and is extremely unlikely to gain new callers.
> > Or is there another "zap everything" case I'm missing?
> 
> No other case; it's just that pte_list_*() helpers will be more self-contained.

Eh, but this flow is as much about rmaps as it is about pte_list.

> If that'll be a performance concern, no objection to hard-coding it.

It's more about unnecessary complexity than it is about performance, e.g. gcc-10
generates identical code for both versions (which did surprise the heck out of me).
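
For instance, a stripped-down, standalone sketch (not kernel code; the names
are made up) of the effect: the function pointer is a compile-time constant at
the only call site, so gcc can propagate it and inline the callee, and the two
variants collapse to the same machine code.

typedef unsigned long long u64;

static void clear_track_bits(u64 *sptep)
{
	*sptep = 0;		/* stand-in for mmu_spte_clear_track_bits() */
}

static void destroy_with_callback(u64 *sptes, int n, void (*cb)(u64 *))
{
	int i;

	for (i = 0; i < n; i++)
		cb(&sptes[i]);
}

void zap_via_callback(u64 *sptes, int n)
{
	/* cb is statically known here, so gcc can devirtualize and inline it */
	destroy_with_callback(sptes, n, clear_track_bits);
}

void zap_open_coded(u64 *sptes, int n)
{
	int i;

	for (i = 0; i < n; i++)
		sptes[i] = 0;
}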

If we really want to isolate pte_list_destroy(), I would vote for something like
this (squashed in).   pte_list_remove() already calls mmu_spte_clear_track_bits(),
so that particular separation of concerns has already gone out the window.

 
-/* Return true if rmap existed and callback called, false otherwise */
-static bool pte_list_destroy(struct kvm_rmap_head *rmap_head,
-                            void (*callback)(u64 *sptep))
+static bool pte_list_destroy(struct kvm_rmap_head *rmap_head)
 {
        struct pte_list_desc *desc, *next;
        int i;
@@ -1013,20 +1011,16 @@ static bool pte_list_destroy(struct kvm_rmap_head *rmap_head,
                return false;
 
        if (!(rmap_head->val & 1)) {
-               if (callback)
-                       callback((u64 *)rmap_head->val);
+               mmu_spte_clear_track_bits((u64 *)rmap_head->val);
                goto out;
        }
 
        desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-
-       while (desc) {
-               if (callback)
-                       for (i = 0; i < desc->spte_count; i++)
-                               callback(desc->sptes[i]);
+       for ( ; desc; desc = next) {
+               for (i = 0; i < desc->spte_count; i++)
+                       mmu_spte_clear_track_bits(desc->sptes[i]);
                next = desc->more;
                mmu_free_pte_list_desc(desc);
-               desc = next;
        }
 out:
        /* rmap_head is meaningless now, remember to reset it */
@@ -1422,22 +1416,17 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
        return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
 }
 
-static void mmu_spte_clear_track_bits_cb(u64 *sptep)
-{
-       mmu_spte_clear_track_bits(sptep);
-}
-
 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                          const struct kvm_memory_slot *slot)
 {
-       return pte_list_destroy(rmap_head, mmu_spte_clear_track_bits_cb);
+       return pte_list_destroy(rmap_head);
 }
 
 static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                            struct kvm_memory_slot *slot, gfn_t gfn, int level,
                            pte_t unused)
 {
-       return kvm_zap_rmapp(kvm, rmap_head, slot);
+       return pte_list_destroy(rmap_head);
 }
 
 static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
Paolo Bonzini July 29, 2021, 9:35 a.m. UTC | #4
On 29/07/21 00:31, Sean Christopherson wrote:
>> If that'll be a performance concern, no objection to hard-coding it.
> It's more about unnecessary complexity than it is about performance, e.g. gcc-10
> generates identical code for both versions (which did surprise the heck out of me).

If you think of what's needed to produce decent (as fast as C) code out 
of STL code, that's not surprising. :)  Pretty cool that it lets people 
write nicer C code too, though.

> If we really want to isolate pte_list_destroy(), I would vote for something like
> this (squashed in).   pte_list_remove() already calls mmu_spte_clear_track_bits(),
> so that particular separation of concerns has already gone out the window.

Yes, that's fair enough.  Thanks for the review!

Paolo

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index ba0258bdebc4..45aac78dcabc 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1014,6 +1014,38 @@  unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
 	return count;
 }
 
+/* Return true if rmap existed and callback called, false otherwise */
+static bool pte_list_destroy(struct kvm_rmap_head *rmap_head,
+			     int (*callback)(u64 *sptep))
+{
+	struct pte_list_desc *desc, *next;
+	int i;
+
+	if (!rmap_head->val)
+		return false;
+
+	if (!(rmap_head->val & 1)) {
+		if (callback)
+			callback((u64 *)rmap_head->val);
+		goto out;
+	}
+
+	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+
+	while (desc) {
+		if (callback)
+			for (i = 0; i < desc->spte_count; i++)
+				callback(desc->sptes[i]);
+		next = desc->more;
+		mmu_free_pte_list_desc(desc);
+		desc = next;
+	}
+out:
+	/* rmap_head is meaningless now, remember to reset it */
+	rmap_head->val = 0;
+	return true;
+}
+
 static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
 					   struct kvm_memory_slot *slot)
 {
@@ -1403,18 +1435,7 @@  static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 			  struct kvm_memory_slot *slot)
 {
-	u64 *sptep;
-	struct rmap_iterator iter;
-	bool flush = false;
-
-	while ((sptep = rmap_get_first(rmap_head, &iter))) {
-		rmap_printk("spte %p %llx.\n", sptep, *sptep);
-
-		pte_list_remove(rmap_head, sptep);
-		flush = true;
-	}
-
-	return flush;
+	return pte_list_destroy(rmap_head, mmu_spte_clear_track_bits);
 }
 
 static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,