Message ID | 20210625153419.43671-1-peterx@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: X86: Some light optimizations on rmap logic | expand |
On Fri, Jun 25, 2021, Peter Xu wrote: > Using rmap_get_first() and rmap_remove() for zapping a huge rmap list could be > slow. The easy way is to travers the rmap list, collecting the a/d bits and > free the slots along the way. > > Provide a pte_list_destroy() and do exactly that. > > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > arch/x86/kvm/mmu/mmu.c | 45 +++++++++++++++++++++++++++++++----------- > 1 file changed, 33 insertions(+), 12 deletions(-) > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index ba0258bdebc4..45aac78dcabc 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -1014,6 +1014,38 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) > return count; > } > > +/* Return true if rmap existed and callback called, false otherwise */ > +static bool pte_list_destroy(struct kvm_rmap_head *rmap_head, > + int (*callback)(u64 *sptep)) > +{ > + struct pte_list_desc *desc, *next; > + int i; > + > + if (!rmap_head->val) > + return false; > + > + if (!(rmap_head->val & 1)) { > + if (callback) > + callback((u64 *)rmap_head->val); > + goto out; > + } > + > + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); > + > + while (desc) { > + if (callback) > + for (i = 0; i < desc->spte_count; i++) > + callback(desc->sptes[i]); > + next = desc->more; > + mmu_free_pte_list_desc(desc); > + desc = next; Alternatively, desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); for ( ; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) mmu_spte_clear_track_bits((u64 *)rmap_head->val); next = desc->more; mmu_free_pte_list_desc(desc); } > + } > +out: > + /* rmap_head is meaningless now, remember to reset it */ > + rmap_head->val = 0; > + return true; Why implement this as a generic method with a callback? gcc is suprisingly astute in optimizing callback(), but I don't see the point of adding a complex helper that has a single caller, and is extremely unlikely to gain new callers. 
Or is there another "zap everything" case I'm missing? E.g. why not this? static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { struct pte_list_desc *desc, *next; int i; if (!rmap_head->val) return false; if (!(rmap_head->val & 1)) { mmu_spte_clear_track_bits((u64 *)rmap_head->val); goto out; } desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); for ( ; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) mmu_spte_clear_track_bits(desc->sptes[i]); next = desc->more; mmu_free_pte_list_desc(desc); } out: /* rmap_head is meaningless now, remember to reset it */ rmap_head->val = 0; return true; } > +} > + > static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, > struct kvm_memory_slot *slot) > { > @@ -1403,18 +1435,7 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) > static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, > struct kvm_memory_slot *slot) > { > - u64 *sptep; > - struct rmap_iterator iter; > - bool flush = false; > - > - while ((sptep = rmap_get_first(rmap_head, &iter))) { > - rmap_printk("spte %p %llx.\n", sptep, *sptep); > - > - pte_list_remove(rmap_head, sptep); > - flush = true; > - } > - > - return flush; > + return pte_list_destroy(rmap_head, mmu_spte_clear_track_bits); > } > > static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, > -- > 2.31.1 >
On Wed, Jul 28, 2021 at 09:39:02PM +0000, Sean Christopherson wrote: > On Fri, Jun 25, 2021, Peter Xu wrote: > > Using rmap_get_first() and rmap_remove() for zapping a huge rmap list could be > > slow. The easy way is to travers the rmap list, collecting the a/d bits and > > free the slots along the way. > > > > Provide a pte_list_destroy() and do exactly that. > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > --- > > arch/x86/kvm/mmu/mmu.c | 45 +++++++++++++++++++++++++++++++----------- > > 1 file changed, 33 insertions(+), 12 deletions(-) > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > index ba0258bdebc4..45aac78dcabc 100644 > > --- a/arch/x86/kvm/mmu/mmu.c > > +++ b/arch/x86/kvm/mmu/mmu.c > > @@ -1014,6 +1014,38 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) > > return count; > > } > > > > +/* Return true if rmap existed and callback called, false otherwise */ > > +static bool pte_list_destroy(struct kvm_rmap_head *rmap_head, > > + int (*callback)(u64 *sptep)) > > +{ > > + struct pte_list_desc *desc, *next; > > + int i; > > + > > + if (!rmap_head->val) > > + return false; > > + > > + if (!(rmap_head->val & 1)) { > > + if (callback) > > + callback((u64 *)rmap_head->val); > > + goto out; > > + } > > + > > + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); > > + > > + while (desc) { > > + if (callback) > > + for (i = 0; i < desc->spte_count; i++) > > + callback(desc->sptes[i]); > > + next = desc->more; > > + mmu_free_pte_list_desc(desc); > > + desc = next; > > Alternatively, > > desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); > for ( ; desc; desc = next) { > for (i = 0; i < desc->spte_count; i++) > mmu_spte_clear_track_bits((u64 *)rmap_head->val); > next = desc->more; > mmu_free_pte_list_desc(desc); > } > > > + } > > +out: > > + /* rmap_head is meaningless now, remember to reset it */ > > + rmap_head->val = 0; > > + return true; > > Why implement this as a generic method with a callback? 
gcc is suprisingly > astute in optimizing callback(), but I don't see the point of adding a complex > helper that has a single caller, and is extremely unlikely to gain new callers. > Or is there another "zap everything" case I'm missing? No other case; it's just that pte_list_*() helpers will be more self-contained. If that'll be a performance concern, no objection to hard code it. > > E.g. why not this? > > static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, > const struct kvm_memory_slot *slot) > { > struct pte_list_desc *desc, *next; > int i; > > if (!rmap_head->val) > return false; > > if (!(rmap_head->val & 1)) { > mmu_spte_clear_track_bits((u64 *)rmap_head->val); > goto out; > } > > desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); > for ( ; desc; desc = next) { > for (i = 0; i < desc->spte_count; i++) > mmu_spte_clear_track_bits(desc->sptes[i]); > next = desc->more; > mmu_free_pte_list_desc(desc); > } > out: > /* rmap_head is meaningless now, remember to reset it */ > rmap_head->val = 0; > return true; > } Looks good, but so far I've no strong opinion on this. I'll leave it for Paolo to decide. Thanks!
On Wed, Jul 28, 2021, Peter Xu wrote: > On Wed, Jul 28, 2021 at 09:39:02PM +0000, Sean Christopherson wrote: > > On Fri, Jun 25, 2021, Peter Xu wrote: > > Why implement this as a generic method with a callback? gcc is suprisingly > > astute in optimizing callback(), but I don't see the point of adding a complex > > helper that has a single caller, and is extremely unlikely to gain new callers. > > Or is there another "zap everything" case I'm missing? > > No other case; it's just that pte_list_*() helpers will be more self-contained. Eh, but this flow is as much about rmaps as it is about pte_list. > If that'll be a performance concern, no objection to hard code it. It's more about unnecessary complexity than it is about performance, e.g. gcc-10 generates identical code for both version (which did surprise the heck out of me). If we really want to isolate pte_list_destroy(), I would vote for something like this (squashed in). pte_list_remove() already calls mmu_spte_clear_track_bits(), so that particular separation of concerns has already gone out the window. 
-/* Return true if rmap existed and callback called, false otherwise */ -static bool pte_list_destroy(struct kvm_rmap_head *rmap_head, - void (*callback)(u64 *sptep)) +static bool pte_list_destroy(struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; int i; @@ -1013,20 +1011,16 @@ static bool pte_list_destroy(struct kvm_rmap_head *rmap_head, return false; if (!(rmap_head->val & 1)) { - if (callback) - callback((u64 *)rmap_head->val); + mmu_spte_clear_track_bits((u64 *)rmap_head->val); goto out; } desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - - while (desc) { - if (callback) - for (i = 0; i < desc->spte_count; i++) - callback(desc->sptes[i]); + for ( ; desc; desc = next) { + for (i = 0; i < desc->spte_count; i++) + mmu_spte_clear_track_bits(desc->sptes[i]); next = desc->more; mmu_free_pte_list_desc(desc); - desc = next; } out: /* rmap_head is meaningless now, remember to reset it */ @@ -1422,22 +1416,17 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } -static void mmu_spte_clear_track_bits_cb(u64 *sptep) -{ - mmu_spte_clear_track_bits(sptep); -} - static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { - return pte_list_destroy(rmap_head, mmu_spte_clear_track_bits_cb); + return pte_list_destroy(rmap_head); } static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, pte_t unused) { - return kvm_zap_rmapp(kvm, rmap_head, slot); + return pte_list_destroy(rmap_head); } static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
On 29/07/21 00:31, Sean Christopherson wrote: >> If that'll be a performance concern, no objection to hard code it. > It's more about unnecessary complexity than it is about performance, e.g. gcc-10 > generates identical code for both version (which did surprise the heck out of me). If you think of what's needed to produce decent (as fast as C) code out of STL code, that's not surprising. :) Pretty cool that it lets people write nicer C code too, though. > If we really want to isolate pte_list_destroy(), I would vote for something like > this (squashed in). pte_list_remove() already calls mmu_spte_clear_track_bits(), > so that particular separation of concerns has already gone out the window. Yes, that's fair enough. Thanks for the review! Paolo
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ba0258bdebc4..45aac78dcabc 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1014,6 +1014,38 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) return count; } +/* Return true if rmap existed and callback called, false otherwise */ +static bool pte_list_destroy(struct kvm_rmap_head *rmap_head, + int (*callback)(u64 *sptep)) +{ + struct pte_list_desc *desc, *next; + int i; + + if (!rmap_head->val) + return false; + + if (!(rmap_head->val & 1)) { + if (callback) + callback((u64 *)rmap_head->val); + goto out; + } + + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + + while (desc) { + if (callback) + for (i = 0; i < desc->spte_count; i++) + callback(desc->sptes[i]); + next = desc->more; + mmu_free_pte_list_desc(desc); + desc = next; + } +out: + /* rmap_head is meaningless now, remember to reset it */ + rmap_head->val = 0; + return true; +} + static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, struct kvm_memory_slot *slot) { @@ -1403,18 +1435,7 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot) { - u64 *sptep; - struct rmap_iterator iter; - bool flush = false; - - while ((sptep = rmap_get_first(rmap_head, &iter))) { - rmap_printk("spte %p %llx.\n", sptep, *sptep); - - pte_list_remove(rmap_head, sptep); - flush = true; - } - - return flush; + return pte_list_destroy(rmap_head, mmu_spte_clear_track_bits); } static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
Using rmap_get_first() and rmap_remove() for zapping a huge rmap list could be slow. The easy way is to traverse the rmap list, collecting the a/d bits and freeing the slots along the way. Provide a pte_list_destroy() and do exactly that. Signed-off-by: Peter Xu <peterx@redhat.com> --- arch/x86/kvm/mmu/mmu.c | 45 +++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 12 deletions(-)