Message ID | 20220414132013.1588929-3-vkuznets@redhat.com (mailing list archive)
---|---
State | New, archived
Series | KVM: x86: hyper-v: Fine-grained TLB flush + L2 TLB flush feature
On Thu, 2022-04-14 at 15:19 +0200, Vitaly Kuznetsov wrote: > To allow flushing individual GVAs instead of always flushing the whole > VPID a per-vCPU structure to pass the requests is needed. Introduce a > simple ring write-locked structure to hold two types of entries: > individual GVA (GFN + up to 4095 following GFNs in the lower 12 bits) > and 'flush all'. > > The queuing rule is: if there's not enough space on the ring to put > the request and leave at least 1 entry for 'flush all' - put 'flush > all' entry. > > The size of the ring is arbitrary set to '16'. > > Note, kvm_hv_flush_tlb() only queues 'flush all' entries for now so > there's very small functional change but the infrastructure is > prepared to handle individual GVA flush requests. As I see from this patch, also the code doesn't process the requests from the ring buffer yet, but rather just ignores it completely, and resets the whole ring buffer (kvm_hv_vcpu_empty_flush_tlb) Maybe you should mention it here. > > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> > --- > arch/x86/include/asm/kvm_host.h | 16 +++++++ > arch/x86/kvm/hyperv.c | 83 +++++++++++++++++++++++++++++++++ > arch/x86/kvm/hyperv.h | 13 ++++++ > arch/x86/kvm/x86.c | 5 +- > arch/x86/kvm/x86.h | 1 + > 5 files changed, 116 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 1de3ad9308d8..b4dd2ff61658 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -578,6 +578,20 @@ struct kvm_vcpu_hv_synic { > bool dont_zero_synic_pages; > }; > > +#define KVM_HV_TLB_FLUSH_RING_SIZE (16) > + > +struct kvm_vcpu_hv_tlb_flush_entry { > + u64 addr; > + u64 flush_all:1; > + u64 pad:63; > +}; Have you considered using kfifo.h library instead? 
> + > +struct kvm_vcpu_hv_tlb_flush_ring { > + int read_idx, write_idx; > + spinlock_t write_lock; > + struct kvm_vcpu_hv_tlb_flush_entry entries[KVM_HV_TLB_FLUSH_RING_SIZE]; > +}; > + > /* Hyper-V per vcpu emulation context */ > struct kvm_vcpu_hv { > struct kvm_vcpu *vcpu; > @@ -597,6 +611,8 @@ struct kvm_vcpu_hv { > u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */ > u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ > } cpuid_cache; > + > + struct kvm_vcpu_hv_tlb_flush_ring tlb_flush_ring; > }; > > /* Xen HVM per vcpu emulation context */ > diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c > index b402ad059eb9..fb716cf919ed 100644 > --- a/arch/x86/kvm/hyperv.c > +++ b/arch/x86/kvm/hyperv.c > @@ -29,6 +29,7 @@ > #include <linux/kvm_host.h> > #include <linux/highmem.h> > #include <linux/sched/cputime.h> > +#include <linux/spinlock.h> > #include <linux/eventfd.h> > > #include <asm/apicdef.h> > @@ -954,6 +955,8 @@ static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) > > hv_vcpu->vp_index = vcpu->vcpu_idx; > > + spin_lock_init(&hv_vcpu->tlb_flush_ring.write_lock); > + > return 0; > } > > @@ -1789,6 +1792,74 @@ static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, > var_cnt * sizeof(*sparse_banks)); > } > > +static inline int hv_tlb_flush_ring_free(struct kvm_vcpu_hv *hv_vcpu, > + int read_idx, int write_idx) > +{ > + if (write_idx >= read_idx) > + return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx) - 1; > + > + return read_idx - write_idx - 1; > +} > + > +static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu) > +{ > + struct kvm_vcpu_hv_tlb_flush_ring *tlb_flush_ring; > + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); > + int ring_free, write_idx, read_idx; > + unsigned long flags; > + > + if (!hv_vcpu) > + return; > + > + tlb_flush_ring = &hv_vcpu->tlb_flush_ring; > + > + spin_lock_irqsave(&tlb_flush_ring->write_lock, flags); > + > + /* > + * 'read_idx' is updated by the vCPU which does the flush, this > + * happens without 'tlb_flush_ring->write_lock' being held; make > + * sure we read it once. > + */ > + read_idx = READ_ONCE(tlb_flush_ring->read_idx); > + /* > + * 'write_idx' is only updated here, under 'tlb_flush_ring->write_lock'. > + * allow the compiler to re-read it, it can't change. > + */ > + write_idx = tlb_flush_ring->write_idx; > + > + ring_free = hv_tlb_flush_ring_free(hv_vcpu, read_idx, write_idx); > + /* Full ring always contains 'flush all' entry */ > + if (!ring_free) > + goto out_unlock; > + > + tlb_flush_ring->entries[write_idx].addr = 0; > + tlb_flush_ring->entries[write_idx].flush_all = 1; > + /* > + * Advance write index only after filling in the entry to > + * synchronize with lockless reader. 
> + */ > + smp_wmb(); > + tlb_flush_ring->write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE; > + > +out_unlock: > + spin_unlock_irqrestore(&tlb_flush_ring->write_lock, flags); > +} > + > +void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) > +{ > + struct kvm_vcpu_hv_tlb_flush_ring *tlb_flush_ring; > + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); > + > + kvm_vcpu_flush_tlb_guest(vcpu); > + > + if (!hv_vcpu) > + return; > + > + tlb_flush_ring = &hv_vcpu->tlb_flush_ring; > + > + tlb_flush_ring->read_idx = tlb_flush_ring->write_idx; > +} > + > static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) > { > struct kvm *kvm = vcpu->kvm; > @@ -1797,6 +1868,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) > DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); > u64 valid_bank_mask; > u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; > + struct kvm_vcpu *v; > + unsigned long i; > bool all_cpus; > > /* > @@ -1876,10 +1949,20 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) > * analyze it here, flush TLB regardless of the specified address space. > */ > if (all_cpus) { > + kvm_for_each_vcpu(i, v, kvm) > + hv_tlb_flush_ring_enqueue(v); > + > kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH); > } else { > sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); > > + for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) { > + v = kvm_get_vcpu(kvm, i); > + if (!v) > + continue; > + hv_tlb_flush_ring_enqueue(v); > + } > + > kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); > } > > diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h > index da2737f2a956..6847caeaaf84 100644 > --- a/arch/x86/kvm/hyperv.h > +++ b/arch/x86/kvm/hyperv.h > @@ -147,4 +147,17 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); > int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, > struct kvm_cpuid_entry2 __user *entries); > > + > +static inline void kvm_hv_vcpu_empty_flush_tlb(struct kvm_vcpu *vcpu) > +{ > + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); > + > + if (!hv_vcpu) > + return; > + > + hv_vcpu->tlb_flush_ring.read_idx = hv_vcpu->tlb_flush_ring.write_idx; > +} > +void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu); > + > + > #endif > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index f633cff8cd7f..e5aec386d299 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -3324,7 +3324,7 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) > static_call(kvm_x86_flush_tlb_all)(vcpu); > } > > -static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) > +void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) > { > ++vcpu->stat.tlb_flush; > > @@ -3362,7 +3362,8 @@ void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu) > > if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) { > kvm_vcpu_flush_tlb_guest(vcpu); > - kvm_clear_request(KVM_REQ_HV_TLB_FLUSH, vcpu); > + if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) > + kvm_hv_vcpu_empty_flush_tlb(vcpu); > } else if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) { > kvm_vcpu_flush_tlb_guest(vcpu); > } > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h > index 588792f00334..2324f496c500 100644 > --- a/arch/x86/kvm/x86.h > +++ b/arch/x86/kvm/x86.h > @@ -58,6 +58,7 @@ static inline unsigned int __shrink_ple_window(unsigned int val, > > #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL > > +void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu); > void 
kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); > int kvm_check_nested_events(struct kvm_vcpu *vcpu); > Overall looks good to me. I might have missed something though. Best regards, Maxim Levitsky
Maxim Levitsky <mlevitsk@redhat.com> writes: > On Thu, 2022-04-14 at 15:19 +0200, Vitaly Kuznetsov wrote: >> To allow flushing individual GVAs instead of always flushing the whole >> VPID a per-vCPU structure to pass the requests is needed. Introduce a >> simple ring write-locked structure to hold two types of entries: >> individual GVA (GFN + up to 4095 following GFNs in the lower 12 bits) >> and 'flush all'. >> >> The queuing rule is: if there's not enough space on the ring to put >> the request and leave at least 1 entry for 'flush all' - put 'flush >> all' entry. >> >> The size of the ring is arbitrary set to '16'. >> >> Note, kvm_hv_flush_tlb() only queues 'flush all' entries for now so >> there's very small functional change but the infrastructure is >> prepared to handle individual GVA flush requests. > > As I see from this patch, also the code doesn't process the requests > from the ring buffer yet, but rather just ignores it completely, > and resets the whole ring buffer (kvm_hv_vcpu_empty_flush_tlb) > Maybe you should mention it here. > > >> >> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> >> --- >> arch/x86/include/asm/kvm_host.h | 16 +++++++ >> arch/x86/kvm/hyperv.c | 83 +++++++++++++++++++++++++++++++++ >> arch/x86/kvm/hyperv.h | 13 ++++++ >> arch/x86/kvm/x86.c | 5 +- >> arch/x86/kvm/x86.h | 1 + >> 5 files changed, 116 insertions(+), 2 deletions(-) >> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >> index 1de3ad9308d8..b4dd2ff61658 100644 >> --- a/arch/x86/include/asm/kvm_host.h >> +++ b/arch/x86/include/asm/kvm_host.h >> @@ -578,6 +578,20 @@ struct kvm_vcpu_hv_synic { >> bool dont_zero_synic_pages; >> }; >> >> +#define KVM_HV_TLB_FLUSH_RING_SIZE (16) >> + >> +struct kvm_vcpu_hv_tlb_flush_entry { >> + u64 addr; >> + u64 flush_all:1; >> + u64 pad:63; >> +}; > > Have you considered using kfifo.h library instead? > As a matter of fact I have not and this is a good suggestion, actually. Let me try to use it instead of my home-brewed ring. I'll address your other comments after that. Thanks!
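[Editorial note: a rough sketch of what the kfifo-based rework being agreed to above could look like. This is an illustration of how <linux/kfifo.h> replaces the hand-rolled indices and barriers, not the code that was eventually posted; the struct name, the helper names and the 'flush_all_pending' overflow fallback are invented here.]

/*
 * Sketch only: DECLARE_KFIFO() embeds the storage, INIT_KFIFO() replaces
 * the open-coded index setup, and kfifo_put()/kfifo_get() hide the
 * read_idx/write_idx arithmetic and the memory barriers.
 */
#include <linux/kfifo.h>
#include <linux/spinlock.h>

#define KVM_HV_TLB_FLUSH_FIFO_SIZE (16)	/* must stay a power of two for kfifo */

struct kvm_vcpu_hv_tlb_flush_fifo {
	spinlock_t write_lock;
	bool flush_all_pending;		/* hypothetical overflow fallback */
	DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE);
};

static void tlb_flush_fifo_init(struct kvm_vcpu_hv_tlb_flush_fifo *fifo)
{
	spin_lock_init(&fifo->write_lock);
	fifo->flush_all_pending = false;
	INIT_KFIFO(fifo->entries);
}

/* Producer side: any vCPU handling the hypercall, serialized by write_lock. */
static void tlb_flush_fifo_enqueue(struct kvm_vcpu_hv_tlb_flush_fifo *fifo,
				   u64 entry)
{
	unsigned long flags;

	spin_lock_irqsave(&fifo->write_lock, flags);
	/* kfifo_put() returns 0 when the fifo is full: degrade to 'flush all'. */
	if (!kfifo_put(&fifo->entries, entry))
		fifo->flush_all_pending = true;
	spin_unlock_irqrestore(&fifo->write_lock, flags);
}

/* Consumer side: the target vCPU; returns true if a full flush is needed. */
static bool tlb_flush_fifo_consume(struct kvm_vcpu_hv_tlb_flush_fifo *fifo)
{
	u64 entry;

	if (READ_ONCE(fifo->flush_all_pending)) {
		WRITE_ONCE(fifo->flush_all_pending, false);
		kfifo_reset_out(&fifo->entries);	/* reader-side reset is safe */
		return true;
	}

	/* Single consumer (the vCPU itself), so kfifo_get() needs no extra lock. */
	while (kfifo_get(&fifo->entries, &entry))
		; /* decode 'entry' and flush the GVAs it describes */

	return false;
}

The main win over the hand-rolled ring is that the producer/consumer index handling and the smp_wmb()/smp_rmb() pairing are owned by kfifo rather than open-coded in hyperv.c.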
On Thu, Apr 14, 2022, Vitaly Kuznetsov wrote:
> To allow flushing individual GVAs instead of always flushing the whole
> VPID a per-vCPU structure to pass the requests is needed. Introduce a
> simple ring write-locked structure to hold two types of entries:
> individual GVA (GFN + up to 4095 following GFNs in the lower 12 bits)
> and 'flush all'.
>
> The queuing rule is: if there's not enough space on the ring to put
> the request and leave at least 1 entry for 'flush all' - put 'flush
> all' entry.
>
> The size of the ring is arbitrary set to '16'.
>
> Note, kvm_hv_flush_tlb() only queues 'flush all' entries for now so
> there's very small functional change but the infrastructure is
> prepared to handle individual GVA flush requests.
>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h | 16 +++++++
>  arch/x86/kvm/hyperv.c           | 83 +++++++++++++++++++++++++++++++++
>  arch/x86/kvm/hyperv.h           | 13 ++++++
>  arch/x86/kvm/x86.c              |  5 +-
>  arch/x86/kvm/x86.h              |  1 +
>  5 files changed, 116 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 1de3ad9308d8..b4dd2ff61658 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -578,6 +578,20 @@ struct kvm_vcpu_hv_synic {
>  	bool dont_zero_synic_pages;
>  };
>
> +#define KVM_HV_TLB_FLUSH_RING_SIZE (16)
> +
> +struct kvm_vcpu_hv_tlb_flush_entry {
> +	u64 addr;

"addr" is misleading: it is overloaded to be both the virtual address and the
count. I think we can make it a moot point, but it led me astray in thinking we
could use the lower 12 bits for flags... until I realized those bits are already
in use.

> +	u64 flush_all:1;
> +	u64 pad:63;

This is rather odd; why not just use a bool?

But why even have a "flush_all" field? Can't we just use a magic value for
write_idx to indicate "flush_all", e.g. either an explicit #define or -1?

Writers set write_idx to -1 to indicate "flush all", vCPU/reader goes straight
to "flush all" if write_idx is -1/invalid. That way, future writes can simply
do nothing until read_idx == write_idx, and the vCPU/reader avoids unnecessary
flushes if there's a "flush all" pending and other valid entries in the ring.

And it allows deferring the "flush all" until the ring is truly full (unless
there's an off-by-one / wraparound edge case I'm missing, which is likely...).
--- arch/x86/include/asm/kvm_host.h | 8 +----- arch/x86/kvm/hyperv.c | 47 +++++++++++++-------------------- 2 files changed, 19 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b6b9a71a4591..bb45cc383ce4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -605,16 +605,10 @@ enum hv_tlb_flush_rings { HV_NR_TLB_FLUSH_RINGS, }; -struct kvm_vcpu_hv_tlb_flush_entry { - u64 addr; - u64 flush_all:1; - u64 pad:63; -}; - struct kvm_vcpu_hv_tlb_flush_ring { int read_idx, write_idx; spinlock_t write_lock; - struct kvm_vcpu_hv_tlb_flush_entry entries[KVM_HV_TLB_FLUSH_RING_SIZE]; + u64 entries[KVM_HV_TLB_FLUSH_RING_SIZE]; }; /* Hyper-V per vcpu emulation context */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1d6927538bc7..56f06cf85282 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1837,10 +1837,13 @@ static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc static inline int hv_tlb_flush_ring_free(struct kvm_vcpu_hv *hv_vcpu, int read_idx, int write_idx) { + if (write_idx < 0) + return 0; + if (write_idx >= read_idx) - return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx) - 1; + return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx); - return read_idx - write_idx - 1; + return read_idx - write_idx; } static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu, @@ -1869,6 +1872,9 @@ static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu, */ write_idx = tlb_flush_ring->write_idx; + if (write_idx < 0 && read_idx == write_idx) + read_idx = write_idx = 0; + ring_free = hv_tlb_flush_ring_free(hv_vcpu, read_idx, write_idx); /* Full ring always contains 'flush all' entry */ if (!ring_free) @@ -1879,21 +1885,13 @@ static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu, * entry in case another request comes in. In case there's not enough * space, just put 'flush all' entry there. */ - if (!count || count >= ring_free - 1 || !entries) { - tlb_flush_ring->entries[write_idx].addr = 0; - tlb_flush_ring->entries[write_idx].flush_all = 1; - /* - * Advance write index only after filling in the entry to - * synchronize with lockless reader. - */ - smp_wmb(); - tlb_flush_ring->write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE; + if (!count || count > ring_free - 1 || !entries) { + tlb_flush_ring->write_idx = -1; goto out_unlock; } for (i = 0; i < count; i++) { - tlb_flush_ring->entries[write_idx].addr = entries[i]; - tlb_flush_ring->entries[write_idx].flush_all = 0; + tlb_flush_ring->entries[write_idx] = entries[i]; write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE; } /* @@ -1911,7 +1909,6 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv_tlb_flush_ring *tlb_flush_ring; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); - struct kvm_vcpu_hv_tlb_flush_entry *entry; int read_idx, write_idx; u64 address; u32 count; @@ -1940,26 +1937,18 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) /* Pairs with smp_wmb() in hv_tlb_flush_ring_enqueue() */ smp_rmb(); + if (write_idx < 0) { + kvm_vcpu_flush_tlb_guest(vcpu); + goto out_empty_ring; + } + for (i = read_idx; i != write_idx; i = (i + 1) % KVM_HV_TLB_FLUSH_RING_SIZE) { - entry = &tlb_flush_ring->entries[i]; - - if (entry->flush_all) - goto out_flush_all; - - /* - * Lower 12 bits of 'address' encode the number of additional - * pages to flush. 
- */ - address = entry->addr & PAGE_MASK; - count = (entry->addr & ~PAGE_MASK) + 1; + address = tlb_flush_ring->entries[i] & PAGE_MASK; + count = (tlb_flush_ring->entries[i] & ~PAGE_MASK) + 1; for (j = 0; j < count; j++) static_call(kvm_x86_flush_tlb_gva)(vcpu, address + j * PAGE_SIZE); } ++vcpu->stat.tlb_flush; - goto out_empty_ring; - -out_flush_all: - kvm_vcpu_flush_tlb_guest(vcpu); out_empty_ring: tlb_flush_ring->read_idx = write_idx; base-commit: 62592c7c742ae78eb1f1005a63965ece19e6effe --
Sean Christopherson <seanjc@google.com> writes: > On Thu, Apr 14, 2022, Vitaly Kuznetsov wrote: >> To allow flushing individual GVAs instead of always flushing the whole >> VPID a per-vCPU structure to pass the requests is needed. Introduce a >> simple ring write-locked structure to hold two types of entries: >> individual GVA (GFN + up to 4095 following GFNs in the lower 12 bits) >> and 'flush all'. >> >> The queuing rule is: if there's not enough space on the ring to put >> the request and leave at least 1 entry for 'flush all' - put 'flush >> all' entry. >> >> The size of the ring is arbitrary set to '16'. >> >> Note, kvm_hv_flush_tlb() only queues 'flush all' entries for now so >> there's very small functional change but the infrastructure is >> prepared to handle individual GVA flush requests. >> >> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> >> --- >> arch/x86/include/asm/kvm_host.h | 16 +++++++ >> arch/x86/kvm/hyperv.c | 83 +++++++++++++++++++++++++++++++++ >> arch/x86/kvm/hyperv.h | 13 ++++++ >> arch/x86/kvm/x86.c | 5 +- >> arch/x86/kvm/x86.h | 1 + >> 5 files changed, 116 insertions(+), 2 deletions(-) >> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >> index 1de3ad9308d8..b4dd2ff61658 100644 >> --- a/arch/x86/include/asm/kvm_host.h >> +++ b/arch/x86/include/asm/kvm_host.h >> @@ -578,6 +578,20 @@ struct kvm_vcpu_hv_synic { >> bool dont_zero_synic_pages; >> }; >> >> +#define KVM_HV_TLB_FLUSH_RING_SIZE (16) >> + >> +struct kvm_vcpu_hv_tlb_flush_entry { >> + u64 addr; > > "addr" misleading, this is overloaded to be both the virtual address and the count. > I think we make it a moot point, but it led me astray in thinkin we could use the > lower 12 bits for flags... until I realized those bits are already in use. > >> + u64 flush_all:1; >> + u64 pad:63; > > This is rather odd, why not just use a bool? My initial plan was to eventually put more flags here, i.e. there are two additional flags which we don't currently handle: HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES (as we don't actually look at HV_ADDRESS_SPACE_ID) HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY > But why even have a "flush_all" field, can't we just use a magic value > for write_idx to indicate "flush_all"? E.g. either an explicit #define > or -1. Sure, a magic value would do too and will allow us to make 'struct kvm_vcpu_hv_tlb_flush_entry' 8 bytes instead of 16 (for the time being as if we are to add HV_ADDRESS_SPACE_ID/additional flags the net win is going to be zero). > > Writers set write_idx to -1 to indicate "flush all", vCPU/reader goes straight > to "flush all" if write_idx is -1/invalid. That way, future writes can simply do > nothing until read_idx == write_idx, and the vCPU/reader avoids unnecessary flushes > if there's a "flush all" pending and other valid entries in the ring. > > And it allows deferring the "flush all" until the ring is truly full (unless there's > an off-by-one / wraparound edge case I'm missing, which is likely...). Thanks for the patch! I am, however, going to look at Maxim's suggestion to use 'kfifo' to avoid all these uncertainties, funky locking etc. At first glance it has everything I need here. 
> > --- > arch/x86/include/asm/kvm_host.h | 8 +----- > arch/x86/kvm/hyperv.c | 47 +++++++++++++-------------------- > 2 files changed, 19 insertions(+), 36 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index b6b9a71a4591..bb45cc383ce4 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -605,16 +605,10 @@ enum hv_tlb_flush_rings { > HV_NR_TLB_FLUSH_RINGS, > }; > > -struct kvm_vcpu_hv_tlb_flush_entry { > - u64 addr; > - u64 flush_all:1; > - u64 pad:63; > -}; > - > struct kvm_vcpu_hv_tlb_flush_ring { > int read_idx, write_idx; > spinlock_t write_lock; > - struct kvm_vcpu_hv_tlb_flush_entry entries[KVM_HV_TLB_FLUSH_RING_SIZE]; > + u64 entries[KVM_HV_TLB_FLUSH_RING_SIZE]; > }; > > /* Hyper-V per vcpu emulation context */ > diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c > index 1d6927538bc7..56f06cf85282 100644 > --- a/arch/x86/kvm/hyperv.c > +++ b/arch/x86/kvm/hyperv.c > @@ -1837,10 +1837,13 @@ static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc > static inline int hv_tlb_flush_ring_free(struct kvm_vcpu_hv *hv_vcpu, > int read_idx, int write_idx) > { > + if (write_idx < 0) > + return 0; > + > if (write_idx >= read_idx) > - return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx) - 1; > + return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx); > > - return read_idx - write_idx - 1; > + return read_idx - write_idx; > } > > static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu, > @@ -1869,6 +1872,9 @@ static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu, > */ > write_idx = tlb_flush_ring->write_idx; > > + if (write_idx < 0 && read_idx == write_idx) > + read_idx = write_idx = 0; > + > ring_free = hv_tlb_flush_ring_free(hv_vcpu, read_idx, write_idx); > /* Full ring always contains 'flush all' entry */ > if (!ring_free) > @@ -1879,21 +1885,13 @@ static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu, > * entry in case another request comes in. In case there's not enough > * space, just put 'flush all' entry there. > */ > - if (!count || count >= ring_free - 1 || !entries) { > - tlb_flush_ring->entries[write_idx].addr = 0; > - tlb_flush_ring->entries[write_idx].flush_all = 1; > - /* > - * Advance write index only after filling in the entry to > - * synchronize with lockless reader. 
> - */ > - smp_wmb(); > - tlb_flush_ring->write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE; > + if (!count || count > ring_free - 1 || !entries) { > + tlb_flush_ring->write_idx = -1; > goto out_unlock; > } > > for (i = 0; i < count; i++) { > - tlb_flush_ring->entries[write_idx].addr = entries[i]; > - tlb_flush_ring->entries[write_idx].flush_all = 0; > + tlb_flush_ring->entries[write_idx] = entries[i]; > write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE; > } > /* > @@ -1911,7 +1909,6 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) > { > struct kvm_vcpu_hv_tlb_flush_ring *tlb_flush_ring; > struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); > - struct kvm_vcpu_hv_tlb_flush_entry *entry; > int read_idx, write_idx; > u64 address; > u32 count; > @@ -1940,26 +1937,18 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) > /* Pairs with smp_wmb() in hv_tlb_flush_ring_enqueue() */ > smp_rmb(); > > + if (write_idx < 0) { > + kvm_vcpu_flush_tlb_guest(vcpu); > + goto out_empty_ring; > + } > + > for (i = read_idx; i != write_idx; i = (i + 1) % KVM_HV_TLB_FLUSH_RING_SIZE) { > - entry = &tlb_flush_ring->entries[i]; > - > - if (entry->flush_all) > - goto out_flush_all; > - > - /* > - * Lower 12 bits of 'address' encode the number of additional > - * pages to flush. > - */ > - address = entry->addr & PAGE_MASK; > - count = (entry->addr & ~PAGE_MASK) + 1; > + address = tlb_flush_ring->entries[i] & PAGE_MASK; > + count = (tlb_flush_ring->entries[i] & ~PAGE_MASK) + 1; > for (j = 0; j < count; j++) > static_call(kvm_x86_flush_tlb_gva)(vcpu, address + j * PAGE_SIZE); > } > ++vcpu->stat.tlb_flush; > - goto out_empty_ring; > - > -out_flush_all: > - kvm_vcpu_flush_tlb_guest(vcpu); > > out_empty_ring: > tlb_flush_ring->read_idx = write_idx; > > base-commit: 62592c7c742ae78eb1f1005a63965ece19e6effe > -- >
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1de3ad9308d8..b4dd2ff61658 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -578,6 +578,20 @@ struct kvm_vcpu_hv_synic { bool dont_zero_synic_pages; }; +#define KVM_HV_TLB_FLUSH_RING_SIZE (16) + +struct kvm_vcpu_hv_tlb_flush_entry { + u64 addr; + u64 flush_all:1; + u64 pad:63; +}; + +struct kvm_vcpu_hv_tlb_flush_ring { + int read_idx, write_idx; + spinlock_t write_lock; + struct kvm_vcpu_hv_tlb_flush_entry entries[KVM_HV_TLB_FLUSH_RING_SIZE]; +}; + /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { struct kvm_vcpu *vcpu; @@ -597,6 +611,8 @@ struct kvm_vcpu_hv { u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */ u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ } cpuid_cache; + + struct kvm_vcpu_hv_tlb_flush_ring tlb_flush_ring; }; /* Xen HVM per vcpu emulation context */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index b402ad059eb9..fb716cf919ed 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -29,6 +29,7 @@ #include <linux/kvm_host.h> #include <linux/highmem.h> #include <linux/sched/cputime.h> +#include <linux/spinlock.h> #include <linux/eventfd.h> #include <asm/apicdef.h> @@ -954,6 +955,8 @@ static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) hv_vcpu->vp_index = vcpu->vcpu_idx; + spin_lock_init(&hv_vcpu->tlb_flush_ring.write_lock); + return 0; } @@ -1789,6 +1792,74 @@ static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, var_cnt * sizeof(*sparse_banks)); } +static inline int hv_tlb_flush_ring_free(struct kvm_vcpu_hv *hv_vcpu, + int read_idx, int write_idx) +{ + if (write_idx >= read_idx) + return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx) - 1; + + return read_idx - write_idx - 1; +} + +static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv_tlb_flush_ring *tlb_flush_ring; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + int ring_free, write_idx, read_idx; + unsigned long flags; + + if (!hv_vcpu) + return; + + tlb_flush_ring = &hv_vcpu->tlb_flush_ring; + + spin_lock_irqsave(&tlb_flush_ring->write_lock, flags); + + /* + * 'read_idx' is updated by the vCPU which does the flush, this + * happens without 'tlb_flush_ring->write_lock' being held; make + * sure we read it once. + */ + read_idx = READ_ONCE(tlb_flush_ring->read_idx); + /* + * 'write_idx' is only updated here, under 'tlb_flush_ring->write_lock'. + * allow the compiler to re-read it, it can't change. + */ + write_idx = tlb_flush_ring->write_idx; + + ring_free = hv_tlb_flush_ring_free(hv_vcpu, read_idx, write_idx); + /* Full ring always contains 'flush all' entry */ + if (!ring_free) + goto out_unlock; + + tlb_flush_ring->entries[write_idx].addr = 0; + tlb_flush_ring->entries[write_idx].flush_all = 1; + /* + * Advance write index only after filling in the entry to + * synchronize with lockless reader. 
+ */ + smp_wmb(); + tlb_flush_ring->write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE; + +out_unlock: + spin_unlock_irqrestore(&tlb_flush_ring->write_lock, flags); +} + +void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv_tlb_flush_ring *tlb_flush_ring; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + kvm_vcpu_flush_tlb_guest(vcpu); + + if (!hv_vcpu) + return; + + tlb_flush_ring = &hv_vcpu->tlb_flush_ring; + + tlb_flush_ring->read_idx = tlb_flush_ring->write_idx; +} + static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm *kvm = vcpu->kvm; @@ -1797,6 +1868,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); u64 valid_bank_mask; u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; + struct kvm_vcpu *v; + unsigned long i; bool all_cpus; /* @@ -1876,10 +1949,20 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) * analyze it here, flush TLB regardless of the specified address space. */ if (all_cpus) { + kvm_for_each_vcpu(i, v, kvm) + hv_tlb_flush_ring_enqueue(v); + kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH); } else { sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); + for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) { + v = kvm_get_vcpu(kvm, i); + if (!v) + continue; + hv_tlb_flush_ring_enqueue(v); + } + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); } diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index da2737f2a956..6847caeaaf84 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -147,4 +147,17 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); + +static inline void kvm_hv_vcpu_empty_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu) + return; + + hv_vcpu->tlb_flush_ring.read_idx = hv_vcpu->tlb_flush_ring.write_idx; +} +void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu); + + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f633cff8cd7f..e5aec386d299 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3324,7 +3324,7 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) static_call(kvm_x86_flush_tlb_all)(vcpu); } -static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) +void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; @@ -3362,7 +3362,8 @@ void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) { kvm_vcpu_flush_tlb_guest(vcpu); - kvm_clear_request(KVM_REQ_HV_TLB_FLUSH, vcpu); + if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) + kvm_hv_vcpu_empty_flush_tlb(vcpu); } else if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) { kvm_vcpu_flush_tlb_guest(vcpu); } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 588792f00334..2324f496c500 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -58,6 +58,7 @@ static inline unsigned int __shrink_ple_window(unsigned int val, #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL +void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu); void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu);
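[Editorial note: the enqueue path above publishes an entry and only then advances write_idx behind an smp_wmb(); the consumer that will eventually walk the ring (not implemented in this patch, but visible in Sean's diff earlier in the thread) needs the matching smp_rmb(). Below is a minimal sketch of that pairing; the function names are invented and only the barrier placement is the point.]

/* Producer, called with tlb_flush_ring->write_lock held (sketch only). */
static void ring_publish(struct kvm_vcpu_hv_tlb_flush_ring *ring, u64 addr)
{
	int write_idx = ring->write_idx;

	ring->entries[write_idx].addr = addr;
	/*
	 * Make the entry contents visible before the advanced index: a
	 * reader that observes the new write_idx must also see the entry.
	 */
	smp_wmb();
	ring->write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE;
}

/* Consumer, running on the target vCPU without taking write_lock. */
static void ring_consume(struct kvm_vcpu_hv_tlb_flush_ring *ring)
{
	int write_idx = READ_ONCE(ring->write_idx);
	int i;

	/* Pairs with the smp_wmb() above: entries up to write_idx are valid. */
	smp_rmb();

	for (i = ring->read_idx; i != write_idx;
	     i = (i + 1) % KVM_HV_TLB_FLUSH_RING_SIZE) {
		/* ... flush the GVAs described by ring->entries[i] ... */
	}

	/* Publish the new read index so producers can reuse the slots. */
	WRITE_ONCE(ring->read_idx, write_idx);
}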
To allow flushing individual GVAs instead of always flushing the whole
VPID, a per-vCPU structure to pass the requests is needed. Introduce a
simple write-locked ring structure to hold two types of entries:
individual GVA (GFN + up to 4095 following GFNs encoded in the lower
12 bits) and 'flush all'.

The queuing rule is: if there's not enough space on the ring to put the
request and still leave at least 1 entry for 'flush all', put a 'flush
all' entry instead.

The size of the ring is arbitrarily set to '16'.

Note, kvm_hv_flush_tlb() only queues 'flush all' entries for now, so
there's only a very small functional change, but the infrastructure is
prepared to handle individual GVA flush requests.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 16 +++++++
 arch/x86/kvm/hyperv.c           | 83 +++++++++++++++++++++++++++++++++
 arch/x86/kvm/hyperv.h           | 13 ++++++
 arch/x86/kvm/x86.c              |  5 +-
 arch/x86/kvm/x86.h              |  1 +
 5 files changed, 116 insertions(+), 2 deletions(-)
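[Editorial note: a worked illustration of the entry format described in the message above, i.e. a page-aligned GVA with a count of additional pages packed into the otherwise-unused low 12 bits. The helper names are invented; only the bit layout follows the series.]

#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1ULL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Encode a flush request for 'nr_pages' pages starting at 'gva'. */
static inline uint64_t tlb_flush_entry(uint64_t gva, uint64_t nr_pages)
{
	/* 1..4096 pages fit in one entry; larger ranges need more entries. */
	return (gva & PAGE_MASK) | (nr_pages - 1);
}

/* Decode: returns the base address, stores the page count in *nr_pages. */
static inline uint64_t tlb_flush_entry_decode(uint64_t entry, uint64_t *nr_pages)
{
	*nr_pages = (entry & ~PAGE_MASK) + 1;
	return entry & PAGE_MASK;
}

For example, a request to flush three pages starting at GVA 0x7fff00001000 would be encoded as 0x7fff00001002: the base page plus a count of two additional pages, which is exactly what the consumer recovers with PAGE_MASK in the follow-up diff discussed earlier in the thread.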