Message ID | cf1695b3e1ba495a4d23cbdc66e0fa9b7b535cc3.1621191551.git.maciej.szmigiero@oracle.com
---|---
State | New, archived
Series | KVM: Scalable memslots implementation
Nit: something like "KVM: Use interval tree to do fast hva lookup in memslots" would be more helpful when perusing the shortlogs. Stating that a tree is being added doesn't provide any hint as to why, or even the what is somewhat unclear. On Sun, May 16, 2021, Maciej S. Szmigiero wrote: > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> > > The current memslots implementation only allows quick binary search by gfn, > quick lookup by hva is not possible - the implementation has to do a linear > scan of the whole memslots array, even though the operation being performed > might apply just to a single memslot. > > This significantly hurts performance of per-hva operations with higher > memslot counts. > > Since hva ranges can overlap between memslots an interval tree is needed > for tracking them. > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> > --- ... > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index d3a35646dfd8..f59847b6e9b3 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -27,6 +27,7 @@ > #include <linux/rcuwait.h> > #include <linux/refcount.h> > #include <linux/nospec.h> > +#include <linux/interval_tree.h> > #include <linux/hashtable.h> > #include <asm/signal.h> > > @@ -358,6 +359,7 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) > > struct kvm_memory_slot { > struct hlist_node id_node; > + struct interval_tree_node hva_node; > gfn_t base_gfn; > unsigned long npages; > unsigned long *dirty_bitmap; > @@ -459,6 +461,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) > */ > struct kvm_memslots { > u64 generation; > + struct rb_root_cached hva_tree; > /* The mapping table from slot id to the index in memslots[]. */ > DECLARE_HASHTABLE(id_hash, 7); > atomic_t lru_slot; > @@ -679,6 +682,11 @@ static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu) > return __kvm_memslots(vcpu->kvm, as_id); > } > > +#define kvm_for_each_hva_range_memslot(node, slots, start, last) \ kvm_for_each_memslot_in_range()? Or kvm_for_each_memslot_in_hva_range()? Please add a comment about whether start is inclusive or exclusive. I'd also be in favor of hiding this in kvm_main.c, just above the MMU notifier usage. It'd be nice to discourage arch code from adding lookups that more than likely belong in generic code. > + for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \ > + node; \ > + node = interval_tree_iter_next(node, start, last)) \ > + > static inline > struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id) > { > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 50f9bc9bb1e0..a55309432c9a 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -488,6 +488,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, > struct kvm_memslots *slots; > int i, idx; > > + if (range->end == range->start || WARN_ON(range->end < range->staart)) I'm pretty sure both of these are WARNable offenses, i.e. they can be combined. It'd also be a good idea to use WARN_ON_ONCE(); if a caller does manage to trigger this, odds are good it will get spammed. Also, does interval_tree_iter_first() explode if given bad inputs? If not, I'd probably say just omit this entirely. If it does explode, it might be a good idea to work the sanity check into the macro, even if the macro is hidden here. > + return 0; > + > /* A null handler is allowed if and only if on_lock() is provided. 
*/ > if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) && > IS_KVM_NULL_FN(range->handler))) > @@ -507,15 +510,18 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, > } > > for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { > + struct interval_tree_node *node; > + > slots = __kvm_memslots(kvm, i); > - kvm_for_each_memslot(slot, slots) { > + kvm_for_each_hva_range_memslot(node, slots, > + range->start, range->end - 1) { > unsigned long hva_start, hva_end; > > + slot = container_of(node, struct kvm_memory_slot, > + hva_node); Eh, let that poke out. The 80 limit is more of a guideline. > hva_start = max(range->start, slot->userspace_addr); > hva_end = min(range->end, slot->userspace_addr + > (slot->npages << PAGE_SHIFT)); > - if (hva_start >= hva_end) > - continue; > > /* > * To optimize for the likely case where the address > @@ -787,6 +793,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void) > if (!slots) > return NULL; > > + slots->hva_tree = RB_ROOT_CACHED; > hash_init(slots->id_hash); > > return slots; > @@ -1113,10 +1120,14 @@ static inline void kvm_memslot_delete(struct kvm_memslots *slots, > atomic_set(&slots->lru_slot, 0); > > for (i = dmemslot - mslots; i < slots->used_slots; i++) { > + interval_tree_remove(&mslots[i].hva_node, &slots->hva_tree); > hash_del(&mslots[i].id_node); I think it would make sense to add helpers for these? Not sure I like the names, but it would certainly dedup the code a bit. static void kvm_memslot_remove(struct kvm_memslots *slots, struct kvm_memslot *memslot) { interval_tree_remove(&memslot->hva_node, &slots->hva_tree); hash_del(&memslot->id_node); } static void kvm_memslot_insert(struct kvm_memslots *slots, struct kvm_memslot *memslot) { interval_tree_insert(&memslot->hva_node, &slots->hva_tree); hash_add(slots->id_hash, &memslot->id_node, memslot->id); } > + > mslots[i] = mslots[i + 1]; > + interval_tree_insert(&mslots[i].hva_node, &slots->hva_tree); > hash_add(slots->id_hash, &mslots[i].id_node, mslots[i].id); > } > + interval_tree_remove(&mslots[i].hva_node, &slots->hva_tree); > hash_del(&mslots[i].id_node); > mslots[i] = *memslot; > }
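A minimal sketch of the combined sanity check discussed above, assuming WARN_ON_ONCE() is used and that the check stays at the top of __kvm_handle_hva_range(); both details are assumptions rather than part of the posted patch:

  	/*
  	 * Hypothetical combined check: an empty range (end == start) and an
  	 * inverted range (end < start) both reduce to end <= start, and
  	 * WARN_ON_ONCE() keeps a misbehaving caller from spamming the log.
  	 */
  	if (WARN_ON_ONCE(range->end <= range->start))
  		return 0;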
On 20.05.2021 01:07, Sean Christopherson wrote:
> Nit: something like "KVM: Use interval tree to do fast hva lookup in memslots"
> would be more helpful when perusing the shortlogs.  Stating that a tree is being
> added doesn't provide any hint as to why, or even the what is somewhat unclear.

Will do.

> On Sun, May 16, 2021, Maciej S. Szmigiero wrote:
>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>
>> The current memslots implementation only allows quick binary search by gfn,
>> quick lookup by hva is not possible - the implementation has to do a linear
>> scan of the whole memslots array, even though the operation being performed
>> might apply just to a single memslot.
>>
>> This significantly hurts performance of per-hva operations with higher
>> memslot counts.
>>
>> Since hva ranges can overlap between memslots an interval tree is needed
>> for tracking them.
>>
>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>> ---
>
> ...
>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index d3a35646dfd8..f59847b6e9b3 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -27,6 +27,7 @@
>>  #include <linux/rcuwait.h>
>>  #include <linux/refcount.h>
>>  #include <linux/nospec.h>
>> +#include <linux/interval_tree.h>
>>  #include <linux/hashtable.h>
>>  #include <asm/signal.h>
>>
>> @@ -358,6 +359,7 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
>>
>>  struct kvm_memory_slot {
>>  	struct hlist_node id_node;
>> +	struct interval_tree_node hva_node;
>>  	gfn_t base_gfn;
>>  	unsigned long npages;
>>  	unsigned long *dirty_bitmap;
>> @@ -459,6 +461,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
>>   */
>>  struct kvm_memslots {
>>  	u64 generation;
>> +	struct rb_root_cached hva_tree;
>>  	/* The mapping table from slot id to the index in memslots[]. */
>>  	DECLARE_HASHTABLE(id_hash, 7);
>>  	atomic_t lru_slot;
>> @@ -679,6 +682,11 @@ static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu)
>>  	return __kvm_memslots(vcpu->kvm, as_id);
>>  }
>>
>> +#define kvm_for_each_hva_range_memslot(node, slots, start, last) \
>
> kvm_for_each_memslot_in_range()?  Or kvm_for_each_memslot_in_hva_range()?

Will change the name to kvm_for_each_memslot_in_hva_range(), so it is
obvious it's the *hva* range this iterates over.

> Please add a comment about whether start is inclusive or exclusive.

Will do.

> I'd also be in favor of hiding this in kvm_main.c, just above the MMU notifier
> usage.  It'd be nice to discourage arch code from adding lookups that more than
> likely belong in generic code.

Will do.

>> +	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
>> +	     node; \
>> +	     node = interval_tree_iter_next(node, start, last)) \
>> +
>>  static inline
>>  struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id)
>>  {
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 50f9bc9bb1e0..a55309432c9a 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -488,6 +488,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
>>  	struct kvm_memslots *slots;
>>  	int i, idx;
>>
>> +	if (range->end == range->start || WARN_ON(range->end < range->start))
>
> I'm pretty sure both of these are WARNable offenses, i.e. they can be combined.
> It'd also be a good idea to use WARN_ON_ONCE(); if a caller does manage to
> trigger this, odds are good it will get spammed.

Will do.

> Also, does interval_tree_iter_first() explode if given bad inputs?  If not, I'd
> probably say just omit this entirely.

Looking at the interval tree code it seems it does not account for this
possibility.
But even if after a deeper analysis it turns out to be safe (as of now)
there is always a possibility that in the future somebody will optimize
how this data structure performs its operations.
After all, garbage in, garbage out.

> If it does explode, it might be a good idea
> to work the sanity check into the macro, even if the macro is hidden here.

Can be done, although this will make the macro a bit uglier.

>> +		return 0;
>> +
>>  	/* A null handler is allowed if and only if on_lock() is provided. */
>>  	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
>>  			 IS_KVM_NULL_FN(range->handler)))
>> @@ -507,15 +510,18 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
>>  	}
>>
>>  	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
>> +		struct interval_tree_node *node;
>> +
>>  		slots = __kvm_memslots(kvm, i);
>> -		kvm_for_each_memslot(slot, slots) {
>> +		kvm_for_each_hva_range_memslot(node, slots,
>> +					       range->start, range->end - 1) {
>>  			unsigned long hva_start, hva_end;
>>
>> +			slot = container_of(node, struct kvm_memory_slot,
>> +					    hva_node);
>
> Eh, let that poke out.  The 80 limit is more of a guideline.

Okay.

>>  			hva_start = max(range->start, slot->userspace_addr);
>>  			hva_end = min(range->end, slot->userspace_addr +
>>  				      (slot->npages << PAGE_SHIFT));
>> -			if (hva_start >= hva_end)
>> -				continue;
>>
>>  			/*
>>  			 * To optimize for the likely case where the address
>> @@ -787,6 +793,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void)
>>  	if (!slots)
>>  		return NULL;
>>
>> +	slots->hva_tree = RB_ROOT_CACHED;
>>  	hash_init(slots->id_hash);
>>
>>  	return slots;
>> @@ -1113,10 +1120,14 @@ static inline void kvm_memslot_delete(struct kvm_memslots *slots,
>>  		atomic_set(&slots->lru_slot, 0);
>>
>>  	for (i = dmemslot - mslots; i < slots->used_slots; i++) {
>> +		interval_tree_remove(&mslots[i].hva_node, &slots->hva_tree);
>>  		hash_del(&mslots[i].id_node);
>
> I think it would make sense to add helpers for these?  Not sure I like the names,
> but it would certainly dedup the code a bit.
>
>   static void kvm_memslot_remove(struct kvm_memslots *slots,
>                                  struct kvm_memslot *memslot)
>   {
>           interval_tree_remove(&memslot->hva_node, &slots->hva_tree);
>           hash_del(&memslot->id_node);
>   }
>
>   static void kvm_memslot_insert(struct kvm_memslots *slots,
>                                  struct kvm_memslot *memslot)
>   {
>           interval_tree_insert(&memslot->hva_node, &slots->hva_tree);
>           hash_add(slots->id_hash, &memslot->id_node, memslot->id);
>   }

This is possible, however patch 6 replaces the whole code anyway
(and it has kvm_memslot_gfn_insert() and kvm_replace_memslot() helpers).

>> +
>>  		mslots[i] = mslots[i + 1];
>> +		interval_tree_insert(&mslots[i].hva_node, &slots->hva_tree);
>>  		hash_add(slots->id_hash, &mslots[i].id_node, mslots[i].id);
>>  	}
>> +	interval_tree_remove(&mslots[i].hva_node, &slots->hva_tree);
>>  	hash_del(&mslots[i].id_node);
>>  	mslots[i] = *memslot;
>>  }

Thanks,
Maciej
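Based on the rename and the inclusive-range comment agreed above, a sketch of what the iterator could look like once it is moved into kvm_main.c; the macro body is taken from the posted patch, while the comment wording and final placement are assumptions:

  /*
   * Iterate over each memslot whose hva range intersects [start, last].
   * Both bounds are inclusive, matching the interval tree API.
   */
  #define kvm_for_each_memslot_in_hva_range(node, slots, start, last)          \
  	for (node = interval_tree_iter_first(&slots->hva_tree, start, last);  \
  	     node;                                                             \
  	     node = interval_tree_iter_next(node, start, last))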
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 3964acf5451e..f075e9939a2a 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -40,6 +40,7 @@ menuconfig KVM
 	select HAVE_KVM_VCPU_RUN_PID_CHANGE
 	select TASKSTATS
 	select TASK_DELAY_ACCT
+	select INTERVAL_TREE
 	help
 	  Support hosting virtualized guest machines.

diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index a77297480f56..91d197bee9c0 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -27,6 +27,7 @@ config KVM
 	select KVM_MMIO
 	select MMU_NOTIFIER
 	select SRCU
+	select INTERVAL_TREE
 	help
 	  Support for hosting Guest kernels.

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index e45644657d49..519d6d3642a5 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -26,6 +26,7 @@ config KVM
 	select KVM_VFIO
 	select IRQ_BYPASS_MANAGER
 	select HAVE_KVM_IRQ_BYPASS
+	select INTERVAL_TREE

 config KVM_BOOK3S_HANDLER
 	bool

diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 67a8e770e369..2e84d3922f7c 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -33,6 +33,7 @@ config KVM
 	select HAVE_KVM_NO_POLL
 	select SRCU
 	select KVM_VFIO
+	select INTERVAL_TREE
 	help
 	  Support hosting paravirtualized guest machines using the SIE
 	  virtualization capability on the mainframe. This should work

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index f6b93a35ce14..ee15f7e113c8 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -46,6 +46,7 @@ config KVM
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select KVM_VFIO
 	select SRCU
+	select INTERVAL_TREE
 	help
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions. You will need a fairly recent

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d3a35646dfd8..f59847b6e9b3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -27,6 +27,7 @@
 #include <linux/rcuwait.h>
 #include <linux/refcount.h>
 #include <linux/nospec.h>
+#include <linux/interval_tree.h>
 #include <linux/hashtable.h>
 #include <asm/signal.h>

@@ -358,6 +359,7 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)

 struct kvm_memory_slot {
 	struct hlist_node id_node;
+	struct interval_tree_node hva_node;
 	gfn_t base_gfn;
 	unsigned long npages;
 	unsigned long *dirty_bitmap;
@@ -459,6 +461,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
  */
 struct kvm_memslots {
 	u64 generation;
+	struct rb_root_cached hva_tree;
 	/* The mapping table from slot id to the index in memslots[]. */
 	DECLARE_HASHTABLE(id_hash, 7);
 	atomic_t lru_slot;
@@ -679,6 +682,11 @@ static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu)
 	return __kvm_memslots(vcpu->kvm, as_id);
 }

+#define kvm_for_each_hva_range_memslot(node, slots, start, last) \
+	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
+	     node; \
+	     node = interval_tree_iter_next(node, start, last)) \
+
 static inline
 struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 50f9bc9bb1e0..a55309432c9a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -488,6 +488,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 	struct kvm_memslots *slots;
 	int i, idx;

+	if (range->end == range->start || WARN_ON(range->end < range->start))
+		return 0;
+
 	/* A null handler is allowed if and only if on_lock() is provided. */
 	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
 			 IS_KVM_NULL_FN(range->handler)))
@@ -507,15 +510,18 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 	}

 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		struct interval_tree_node *node;
+
 		slots = __kvm_memslots(kvm, i);
-		kvm_for_each_memslot(slot, slots) {
+		kvm_for_each_hva_range_memslot(node, slots,
+					       range->start, range->end - 1) {
 			unsigned long hva_start, hva_end;

+			slot = container_of(node, struct kvm_memory_slot,
+					    hva_node);
 			hva_start = max(range->start, slot->userspace_addr);
 			hva_end = min(range->end, slot->userspace_addr +
 				      (slot->npages << PAGE_SHIFT));
-			if (hva_start >= hva_end)
-				continue;

 			/*
 			 * To optimize for the likely case where the address
@@ -787,6 +793,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void)
 	if (!slots)
 		return NULL;

+	slots->hva_tree = RB_ROOT_CACHED;
 	hash_init(slots->id_hash);

 	return slots;
@@ -1113,10 +1120,14 @@ static inline void kvm_memslot_delete(struct kvm_memslots *slots,
 		atomic_set(&slots->lru_slot, 0);

 	for (i = dmemslot - mslots; i < slots->used_slots; i++) {
+		interval_tree_remove(&mslots[i].hva_node, &slots->hva_tree);
 		hash_del(&mslots[i].id_node);
+
 		mslots[i] = mslots[i + 1];
+		interval_tree_insert(&mslots[i].hva_node, &slots->hva_tree);
 		hash_add(slots->id_hash, &mslots[i].id_node, mslots[i].id);
 	}
+	interval_tree_remove(&mslots[i].hva_node, &slots->hva_tree);
 	hash_del(&mslots[i].id_node);
 	mslots[i] = *memslot;
 }
@@ -1136,7 +1147,8 @@ static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
  * itself is not preserved in the array, i.e. not swapped at this time, only
  * its new index into the array is tracked. Returns the changed memslot's
  * current index into the memslots array.
- * The memslot at the returned index will not be in @slots->id_hash by then.
+ * The memslot at the returned index will not be in @slots->hva_tree or
+ * @slots->id_hash by then.
  * @memslot is a detached struct with desired final data of the changed slot.
  */
 static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
@@ -1154,6 +1166,7 @@ static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
 	 * update_memslots() will unconditionally overwrite and re-add the
 	 * target memslot so it has to be removed here first
 	 */
+	interval_tree_remove(&mmemslot->hva_node, &slots->hva_tree);
 	hash_del(&mmemslot->id_node);

 	/*
@@ -1168,8 +1181,11 @@ static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
 		WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);

 		/* Shift the next memslot forward one and update its index. */
+		interval_tree_remove(&mslots[i + 1].hva_node, &slots->hva_tree);
 		hash_del(&mslots[i + 1].id_node);
+
 		mslots[i] = mslots[i + 1];
+		interval_tree_insert(&mslots[i].hva_node, &slots->hva_tree);
 		hash_add(slots->id_hash, &mslots[i].id_node, mslots[i].id);
 	}
 	return i;
@@ -1181,10 +1197,12 @@ static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
  * is not preserved in the array, i.e. not swapped at this time, only its new
  * index into the array is tracked. Returns the changed memslot's final index
  * into the memslots array.
- * The memslot at the returned index will not be in @slots->id_hash by then.
+ * The memslot at the returned index will not be in @slots->hva_tree or
+ * @slots->id_hash by then.
  * @memslot is a detached struct with desired final data of the new or
  * changed slot.
- * Assumes that the memslot at @start index is not in @slots->id_hash.
+ * Assumes that the memslot at @start index is not in @slots->hva_tree or
+ * @slots->id_hash.
  */
 static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
 					   struct kvm_memory_slot *memslot,
@@ -1200,8 +1218,11 @@ static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
 		WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);

 		/* Shift the next memslot back one and update its index. */
+		interval_tree_remove(&mslots[i - 1].hva_node, &slots->hva_tree);
 		hash_del(&mslots[i - 1].id_node);
+
 		mslots[i] = mslots[i - 1];
+		interval_tree_insert(&mslots[i].hva_node, &slots->hva_tree);
 		hash_add(slots->id_hash, &mslots[i].id_node, mslots[i].id);
 	}
 	return i;
@@ -1274,6 +1295,11 @@ static void update_memslots(struct kvm_memslots *slots,
 	 * its index accordingly.
 	 */
 	slots->memslots[i] = *memslot;
+	slots->memslots[i].hva_node.start = memslot->userspace_addr;
+	slots->memslots[i].hva_node.last = memslot->userspace_addr +
+		(memslot->npages << PAGE_SHIFT) - 1;
+	interval_tree_insert(&slots->memslots[i].hva_node,
+			     &slots->hva_tree);
 	hash_add(slots->id_hash, &slots->memslots[i].id_node,
 		 memslot->id);
 }
@@ -1355,9 +1381,12 @@ static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,

 	memcpy(slots, old, old_size);

+	slots->hva_tree = RB_ROOT_CACHED;
 	hash_init(slots->id_hash);
-	kvm_for_each_memslot(memslot, slots)
+	kvm_for_each_memslot(memslot, slots) {
+		interval_tree_insert(&memslot->hva_node, &slots->hva_tree);
 		hash_add(slots->id_hash, &memslot->id_node, memslot->id);
+	}

 	return slots;
 }
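For illustration, a hypothetical caller-side lookup modeled on the __kvm_handle_hva_range() hunk above, showing how the hva interval tree replaces the old linear scan; the surrounding variable names are illustrative only:

  	struct interval_tree_node *node;
  	struct kvm_memory_slot *slot;

  	/*
  	 * The tree stores inclusive [start, last] intervals, so a half-open
  	 * hva range [start, end) is queried with end - 1 as the last key.
  	 */
  	kvm_for_each_hva_range_memslot(node, slots, start, end - 1) {
  		slot = container_of(node, struct kvm_memory_slot, hva_node);
  		/* Clamp to the actual hva overlap before operating on slot. */
  	}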