diff mbox series

[v3,02/34] KVM: x86: hyper-v: Introduce TLB flush ring

Message ID 20220414132013.1588929-3-vkuznets@redhat.com (mailing list archive)
State New, archived
Headers show
Series KVM: x86: hyper-v: Fine-grained TLB flush + L2 TLB flush feature | expand

Commit Message

Vitaly Kuznetsov April 14, 2022, 1:19 p.m. UTC
To allow flushing individual GVAs instead of always flushing the whole
VPID, a per-vCPU structure to pass the requests is needed. Introduce a
simple write-locked ring structure to hold two types of entries:
an individual GVA (a GFN plus a count of up to 4095 following GFNs encoded
in the lower 12 bits) and 'flush all'.

The queuing rule is: if there's not enough space on the ring to put
the request and leave at least 1 entry for 'flush all' - put 'flush
all' entry.

The size of the ring is arbitrarily set to '16'.

Note, kvm_hv_flush_tlb() only queues 'flush all' entries for now, so
there's very little functional change, but the infrastructure is
prepared to handle individual GVA flush requests.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 16 +++++++
 arch/x86/kvm/hyperv.c           | 83 +++++++++++++++++++++++++++++++++
 arch/x86/kvm/hyperv.h           | 13 ++++++
 arch/x86/kvm/x86.c              |  5 +-
 arch/x86/kvm/x86.h              |  1 +
 5 files changed, 116 insertions(+), 2 deletions(-)

Comments

Maxim Levitsky May 11, 2022, 11:19 a.m. UTC | #1
On Thu, 2022-04-14 at 15:19 +0200, Vitaly Kuznetsov wrote:
> To allow flushing individual GVAs instead of always flushing the whole
> VPID a per-vCPU structure to pass the requests is needed. Introduce a
> simple ring write-locked structure to hold two types of entries:
> individual GVA (GFN + up to 4095 following GFNs in the lower 12 bits)
> and 'flush all'.
> 
> The queuing rule is: if there's not enough space on the ring to put
> the request and leave at least 1 entry for 'flush all' - put 'flush
> all' entry.
> 
> The size of the ring is arbitrary set to '16'.
> 
> Note, kvm_hv_flush_tlb() only queues 'flush all' entries for now so
> there's very small functional change but the infrastructure is
> prepared to handle individual GVA flush requests.

As far as I can see from this patch, the code also doesn't process the
requests from the ring buffer yet, but rather ignores them completely
and resets the whole ring buffer (kvm_hv_vcpu_empty_flush_tlb).
Maybe you should mention it here.


> 
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h | 16 +++++++
>  arch/x86/kvm/hyperv.c           | 83 +++++++++++++++++++++++++++++++++
>  arch/x86/kvm/hyperv.h           | 13 ++++++
>  arch/x86/kvm/x86.c              |  5 +-
>  arch/x86/kvm/x86.h              |  1 +
>  5 files changed, 116 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 1de3ad9308d8..b4dd2ff61658 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -578,6 +578,20 @@ struct kvm_vcpu_hv_synic {
>  	bool dont_zero_synic_pages;
>  };
>  
> +#define KVM_HV_TLB_FLUSH_RING_SIZE (16)
> +
> +struct kvm_vcpu_hv_tlb_flush_entry {
> +	u64 addr;
> +	u64 flush_all:1;
> +	u64 pad:63;
> +};

Have you considered using kfifo.h library instead?

> +
> +struct kvm_vcpu_hv_tlb_flush_ring {
> +	int read_idx, write_idx;
> +	spinlock_t write_lock;
> +	struct kvm_vcpu_hv_tlb_flush_entry entries[KVM_HV_TLB_FLUSH_RING_SIZE];
> +};
> +
>  /* Hyper-V per vcpu emulation context */
>  struct kvm_vcpu_hv {
>  	struct kvm_vcpu *vcpu;
> @@ -597,6 +611,8 @@ struct kvm_vcpu_hv {
>  		u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */
>  		u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
>  	} cpuid_cache;
> +
> +	struct kvm_vcpu_hv_tlb_flush_ring tlb_flush_ring;
>  };
>  
>  /* Xen HVM per vcpu emulation context */
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index b402ad059eb9..fb716cf919ed 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -29,6 +29,7 @@
>  #include <linux/kvm_host.h>
>  #include <linux/highmem.h>
>  #include <linux/sched/cputime.h>
> +#include <linux/spinlock.h>
>  #include <linux/eventfd.h>
>  
>  #include <asm/apicdef.h>
> @@ -954,6 +955,8 @@ static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
>  
>  	hv_vcpu->vp_index = vcpu->vcpu_idx;
>  
> +	spin_lock_init(&hv_vcpu->tlb_flush_ring.write_lock);
> +
>  	return 0;
>  }
>  
> @@ -1789,6 +1792,74 @@ static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
>  			      var_cnt * sizeof(*sparse_banks));
>  }
>  
> +static inline int hv_tlb_flush_ring_free(struct kvm_vcpu_hv *hv_vcpu,
> +					 int read_idx, int write_idx)
> +{
> +	if (write_idx >= read_idx)
> +		return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx) - 1;
> +
> +	return read_idx - write_idx - 1;
> +}
> +
> +static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_vcpu_hv_tlb_flush_ring *tlb_flush_ring;
> +	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
> +	int ring_free, write_idx, read_idx;
> +	unsigned long flags;
> +
> +	if (!hv_vcpu)
> +		return;
> +
> +	tlb_flush_ring = &hv_vcpu->tlb_flush_ring;
> +
> +	spin_lock_irqsave(&tlb_flush_ring->write_lock, flags);
> +
> +	/*
> +	 * 'read_idx' is updated by the vCPU which does the flush, this
> +	 * happens without 'tlb_flush_ring->write_lock' being held; make
> +	 * sure we read it once.
> +	 */
> +	read_idx = READ_ONCE(tlb_flush_ring->read_idx);
> +	/*
> +	 * 'write_idx' is only updated here, under 'tlb_flush_ring->write_lock'.
> +	 * allow the compiler to re-read it, it can't change.
> +	 */
> +	write_idx = tlb_flush_ring->write_idx;
> +
> +	ring_free = hv_tlb_flush_ring_free(hv_vcpu, read_idx, write_idx);
> +	/* Full ring always contains 'flush all' entry */
> +	if (!ring_free)
> +		goto out_unlock;
> +
> +	tlb_flush_ring->entries[write_idx].addr = 0;
> +	tlb_flush_ring->entries[write_idx].flush_all = 1;
> +	/*
> +	 * Advance write index only after filling in the entry to
> +	 * synchronize with lockless reader.
> +	 */
> +	smp_wmb();
> +	tlb_flush_ring->write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE;
> +
> +out_unlock:
> +	spin_unlock_irqrestore(&tlb_flush_ring->write_lock, flags);
> +}
> +
> +void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_vcpu_hv_tlb_flush_ring *tlb_flush_ring;
> +	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
> +
> +	kvm_vcpu_flush_tlb_guest(vcpu);
> +
> +	if (!hv_vcpu)
> +		return;
> +
> +	tlb_flush_ring = &hv_vcpu->tlb_flush_ring;
> +
> +	tlb_flush_ring->read_idx = tlb_flush_ring->write_idx;
> +}
> +
>  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>  {
>  	struct kvm *kvm = vcpu->kvm;
> @@ -1797,6 +1868,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>  	DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
>  	u64 valid_bank_mask;
>  	u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
> +	struct kvm_vcpu *v;
> +	unsigned long i;
>  	bool all_cpus;
>  
>  	/*
> @@ -1876,10 +1949,20 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>  	 * analyze it here, flush TLB regardless of the specified address space.
>  	 */
>  	if (all_cpus) {
> +		kvm_for_each_vcpu(i, v, kvm)
> +			hv_tlb_flush_ring_enqueue(v);
> +
>  		kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
>  	} else {
>  		sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
>  
> +		for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) {
> +			v = kvm_get_vcpu(kvm, i);
> +			if (!v)
> +				continue;
> +			hv_tlb_flush_ring_enqueue(v);
> +		}
> +
>  		kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
>  	}
>  
> diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
> index da2737f2a956..6847caeaaf84 100644
> --- a/arch/x86/kvm/hyperv.h
> +++ b/arch/x86/kvm/hyperv.h
> @@ -147,4 +147,17 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
>  int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
>  		     struct kvm_cpuid_entry2 __user *entries);
>  
> +
> +static inline void kvm_hv_vcpu_empty_flush_tlb(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
> +
> +	if (!hv_vcpu)
> +		return;
> +
> +	hv_vcpu->tlb_flush_ring.read_idx = hv_vcpu->tlb_flush_ring.write_idx;
> +}
> +void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu);
> +
> +
>  #endif
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index f633cff8cd7f..e5aec386d299 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3324,7 +3324,7 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
>  	static_call(kvm_x86_flush_tlb_all)(vcpu);
>  }
>  
> -static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
> +void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
>  {
>  	++vcpu->stat.tlb_flush;
>  
> @@ -3362,7 +3362,8 @@ void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
>  
>  	if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) {
>  		kvm_vcpu_flush_tlb_guest(vcpu);
> -		kvm_clear_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
> +		if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
> +			kvm_hv_vcpu_empty_flush_tlb(vcpu);
>  	} else if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) {
>  		kvm_vcpu_flush_tlb_guest(vcpu);
>  	}
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 588792f00334..2324f496c500 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -58,6 +58,7 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
>  
>  #define MSR_IA32_CR_PAT_DEFAULT  0x0007040600070406ULL
>  
> +void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu);
>  void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
>  int kvm_check_nested_events(struct kvm_vcpu *vcpu);
>  


Overall looks good to me. I might have missed something though.

Best regards,
	Maxim Levitsky
Vitaly Kuznetsov May 16, 2022, 2:29 p.m. UTC | #2
Maxim Levitsky <mlevitsk@redhat.com> writes:

> On Thu, 2022-04-14 at 15:19 +0200, Vitaly Kuznetsov wrote:
>> To allow flushing individual GVAs instead of always flushing the whole
>> VPID a per-vCPU structure to pass the requests is needed. Introduce a
>> simple ring write-locked structure to hold two types of entries:
>> individual GVA (GFN + up to 4095 following GFNs in the lower 12 bits)
>> and 'flush all'.
>> 
>> The queuing rule is: if there's not enough space on the ring to put
>> the request and leave at least 1 entry for 'flush all' - put 'flush
>> all' entry.
>> 
>> The size of the ring is arbitrary set to '16'.
>> 
>> Note, kvm_hv_flush_tlb() only queues 'flush all' entries for now so
>> there's very small functional change but the infrastructure is
>> prepared to handle individual GVA flush requests.
>
> As I see from this patch, also the code doesn't process the requests
> from the ring buffer yet, but rather just ignores it completely,
> and resets the whole ring buffer (kvm_hv_vcpu_empty_flush_tlb)
> Maybe you should mention it here.
>
>
>> 
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h | 16 +++++++
>>  arch/x86/kvm/hyperv.c           | 83 +++++++++++++++++++++++++++++++++
>>  arch/x86/kvm/hyperv.h           | 13 ++++++
>>  arch/x86/kvm/x86.c              |  5 +-
>>  arch/x86/kvm/x86.h              |  1 +
>>  5 files changed, 116 insertions(+), 2 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 1de3ad9308d8..b4dd2ff61658 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -578,6 +578,20 @@ struct kvm_vcpu_hv_synic {
>>  	bool dont_zero_synic_pages;
>>  };
>>  
>> +#define KVM_HV_TLB_FLUSH_RING_SIZE (16)
>> +
>> +struct kvm_vcpu_hv_tlb_flush_entry {
>> +	u64 addr;
>> +	u64 flush_all:1;
>> +	u64 pad:63;
>> +};
>
> Have you considered using kfifo.h library instead?
>

As a matter of fact I have not and this is a good suggestion,
actually. Let me try to use it instead of my home-brewed ring. I'll
address your other comments after that. Thanks!
Sean Christopherson May 16, 2022, 7:34 p.m. UTC | #3
On Thu, Apr 14, 2022, Vitaly Kuznetsov wrote:
> To allow flushing individual GVAs instead of always flushing the whole
> VPID a per-vCPU structure to pass the requests is needed. Introduce a
> simple ring write-locked structure to hold two types of entries:
> individual GVA (GFN + up to 4095 following GFNs in the lower 12 bits)
> and 'flush all'.
> 
> The queuing rule is: if there's not enough space on the ring to put
> the request and leave at least 1 entry for 'flush all' - put 'flush
> all' entry.
> 
> The size of the ring is arbitrary set to '16'.
> 
> Note, kvm_hv_flush_tlb() only queues 'flush all' entries for now so
> there's very small functional change but the infrastructure is
> prepared to handle individual GVA flush requests.
> 
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h | 16 +++++++
>  arch/x86/kvm/hyperv.c           | 83 +++++++++++++++++++++++++++++++++
>  arch/x86/kvm/hyperv.h           | 13 ++++++
>  arch/x86/kvm/x86.c              |  5 +-
>  arch/x86/kvm/x86.h              |  1 +
>  5 files changed, 116 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 1de3ad9308d8..b4dd2ff61658 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -578,6 +578,20 @@ struct kvm_vcpu_hv_synic {
>  	bool dont_zero_synic_pages;
>  };
>  
> +#define KVM_HV_TLB_FLUSH_RING_SIZE (16)
> +
> +struct kvm_vcpu_hv_tlb_flush_entry {
> +	u64 addr;

"addr" is misleading; this is overloaded to be both the virtual address and the
count. I think we make it a moot point, but it led me astray in thinking we could
use the lower 12 bits for flags... until I realized those bits are already in use.

> +	u64 flush_all:1;
> +	u64 pad:63;

This is rather odd, why not just use a bool?  But why even have a "flush_all"
field, can't we just use a magic value for write_idx to indicate "flush_all"?
E.g. either an explicit #define or -1.

Writers set write_idx to -1 to indicate "flush all", vCPU/reader goes straight
to "flush all" if write_idx is -1/invalid.  That way, future writes can simply do
nothing until read_idx == write_idx, and the vCPU/reader avoids unnecessary flushes
if there's a "flush all" pending and other valid entries in the ring.

And it allows deferring the "flush all" until the ring is truly full (unless there's
an off-by-one / wraparound edge case I'm missing, which is likely...).

---
 arch/x86/include/asm/kvm_host.h |  8 +-----
 arch/x86/kvm/hyperv.c           | 47 +++++++++++++--------------------
 2 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b6b9a71a4591..bb45cc383ce4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -605,16 +605,10 @@ enum hv_tlb_flush_rings {
 	HV_NR_TLB_FLUSH_RINGS,
 };

-struct kvm_vcpu_hv_tlb_flush_entry {
-	u64 addr;
-	u64 flush_all:1;
-	u64 pad:63;
-};
-
 struct kvm_vcpu_hv_tlb_flush_ring {
 	int read_idx, write_idx;
 	spinlock_t write_lock;
-	struct kvm_vcpu_hv_tlb_flush_entry entries[KVM_HV_TLB_FLUSH_RING_SIZE];
+	u64 entries[KVM_HV_TLB_FLUSH_RING_SIZE];
 };

 /* Hyper-V per vcpu emulation context */
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 1d6927538bc7..56f06cf85282 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1837,10 +1837,13 @@ static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc
 static inline int hv_tlb_flush_ring_free(struct kvm_vcpu_hv *hv_vcpu,
 					 int read_idx, int write_idx)
 {
+	if (write_idx < 0)
+		return 0;
+
 	if (write_idx >= read_idx)
-		return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx) - 1;
+		return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx);

-	return read_idx - write_idx - 1;
+	return read_idx - write_idx;
 }

 static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu,
@@ -1869,6 +1872,9 @@ static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu,
 	 */
 	write_idx = tlb_flush_ring->write_idx;

+	if (write_idx < 0 && read_idx == write_idx)
+		read_idx = write_idx = 0;
+
 	ring_free = hv_tlb_flush_ring_free(hv_vcpu, read_idx, write_idx);
 	/* Full ring always contains 'flush all' entry */
 	if (!ring_free)
@@ -1879,21 +1885,13 @@ static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu,
 	 * entry in case another request comes in. In case there's not enough
 	 * space, just put 'flush all' entry there.
 	 */
-	if (!count || count >= ring_free - 1 || !entries) {
-		tlb_flush_ring->entries[write_idx].addr = 0;
-		tlb_flush_ring->entries[write_idx].flush_all = 1;
-		/*
-		 * Advance write index only after filling in the entry to
-		 * synchronize with lockless reader.
-		 */
-		smp_wmb();
-		tlb_flush_ring->write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE;
+	if (!count || count > ring_free - 1 || !entries) {
+		tlb_flush_ring->write_idx = -1;
 		goto out_unlock;
 	}

 	for (i = 0; i < count; i++) {
-		tlb_flush_ring->entries[write_idx].addr = entries[i];
-		tlb_flush_ring->entries[write_idx].flush_all = 0;
+		tlb_flush_ring->entries[write_idx] = entries[i];
 		write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE;
 	}
 	/*
@@ -1911,7 +1909,6 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	struct kvm_vcpu_hv_tlb_flush_ring *tlb_flush_ring;
 	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
-	struct kvm_vcpu_hv_tlb_flush_entry *entry;
 	int read_idx, write_idx;
 	u64 address;
 	u32 count;
@@ -1940,26 +1937,18 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
 	/* Pairs with smp_wmb() in hv_tlb_flush_ring_enqueue() */
 	smp_rmb();

+	if (write_idx < 0) {
+		kvm_vcpu_flush_tlb_guest(vcpu);
+		goto out_empty_ring;
+	}
+
 	for (i = read_idx; i != write_idx; i = (i + 1) % KVM_HV_TLB_FLUSH_RING_SIZE) {
-		entry = &tlb_flush_ring->entries[i];
-
-		if (entry->flush_all)
-			goto out_flush_all;
-
-		/*
-		 * Lower 12 bits of 'address' encode the number of additional
-		 * pages to flush.
-		 */
-		address = entry->addr & PAGE_MASK;
-		count = (entry->addr & ~PAGE_MASK) + 1;
+		address = tlb_flush_ring->entries[i] & PAGE_MASK;
+		count = (tlb_flush_ring->entries[i] & ~PAGE_MASK) + 1;
 		for (j = 0; j < count; j++)
 			static_call(kvm_x86_flush_tlb_gva)(vcpu, address + j * PAGE_SIZE);
 	}
 	++vcpu->stat.tlb_flush;
-	goto out_empty_ring;
-
-out_flush_all:
-	kvm_vcpu_flush_tlb_guest(vcpu);

 out_empty_ring:
 	tlb_flush_ring->read_idx = write_idx;

base-commit: 62592c7c742ae78eb1f1005a63965ece19e6effe
--
Vitaly Kuznetsov May 17, 2022, 1:31 p.m. UTC | #4
Sean Christopherson <seanjc@google.com> writes:

> On Thu, Apr 14, 2022, Vitaly Kuznetsov wrote:
>> To allow flushing individual GVAs instead of always flushing the whole
>> VPID a per-vCPU structure to pass the requests is needed. Introduce a
>> simple ring write-locked structure to hold two types of entries:
>> individual GVA (GFN + up to 4095 following GFNs in the lower 12 bits)
>> and 'flush all'.
>> 
>> The queuing rule is: if there's not enough space on the ring to put
>> the request and leave at least 1 entry for 'flush all' - put 'flush
>> all' entry.
>> 
>> The size of the ring is arbitrary set to '16'.
>> 
>> Note, kvm_hv_flush_tlb() only queues 'flush all' entries for now so
>> there's very small functional change but the infrastructure is
>> prepared to handle individual GVA flush requests.
>> 
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h | 16 +++++++
>>  arch/x86/kvm/hyperv.c           | 83 +++++++++++++++++++++++++++++++++
>>  arch/x86/kvm/hyperv.h           | 13 ++++++
>>  arch/x86/kvm/x86.c              |  5 +-
>>  arch/x86/kvm/x86.h              |  1 +
>>  5 files changed, 116 insertions(+), 2 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 1de3ad9308d8..b4dd2ff61658 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -578,6 +578,20 @@ struct kvm_vcpu_hv_synic {
>>  	bool dont_zero_synic_pages;
>>  };
>>  
>> +#define KVM_HV_TLB_FLUSH_RING_SIZE (16)
>> +
>> +struct kvm_vcpu_hv_tlb_flush_entry {
>> +	u64 addr;
>
> "addr" misleading, this is overloaded to be both the virtual address and the count.
> I think we make it a moot point, but it led me astray in thinkin we could use the
> lower 12 bits for flags... until I realized those bits are already in use.
>
>> +	u64 flush_all:1;
>> +	u64 pad:63;
>
> This is rather odd, why not just use a bool?  

My initial plan was to eventually put more flags here, i.e. there are
two additional flags which we don't currently handle:

HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES (as we don't actually look at
 HV_ADDRESS_SPACE_ID)
HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY

> But why even have a "flush_all" field, can't we just use a magic value
> for write_idx to indicate "flush_all"? E.g. either an explicit #define
> or -1.

Sure, a magic value would do too and will allow us to make 'struct
kvm_vcpu_hv_tlb_flush_entry' 8 bytes instead of 16 (for the time being,
as if we are to add HV_ADDRESS_SPACE_ID/additional flags, the net win is
going to be zero).

>
> Writers set write_idx to -1 to indicate "flush all", vCPU/reader goes straight
> to "flush all" if write_idx is -1/invalid.  That way, future writes can simply do
> nothing until read_idx == write_idx, and the vCPU/reader avoids unnecessary flushes
> if there's a "flush all" pending and other valid entries in the ring.
>
> And it allows deferring the "flush all" until the ring is truly full (unless there's
> an off-by-one / wraparound edge case I'm missing, which is likely...).

Thanks for the patch! I am, however, going to look at Maxim's suggestion
to use 'kfifo' to avoid all these uncertainties, funky locking etc. At
first glance it has everything I need here.

>
> ---
>  arch/x86/include/asm/kvm_host.h |  8 +-----
>  arch/x86/kvm/hyperv.c           | 47 +++++++++++++--------------------
>  2 files changed, 19 insertions(+), 36 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index b6b9a71a4591..bb45cc383ce4 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -605,16 +605,10 @@ enum hv_tlb_flush_rings {
>  	HV_NR_TLB_FLUSH_RINGS,
>  };
>
> -struct kvm_vcpu_hv_tlb_flush_entry {
> -	u64 addr;
> -	u64 flush_all:1;
> -	u64 pad:63;
> -};
> -
>  struct kvm_vcpu_hv_tlb_flush_ring {
>  	int read_idx, write_idx;
>  	spinlock_t write_lock;
> -	struct kvm_vcpu_hv_tlb_flush_entry entries[KVM_HV_TLB_FLUSH_RING_SIZE];
> +	u64 entries[KVM_HV_TLB_FLUSH_RING_SIZE];
>  };
>
>  /* Hyper-V per vcpu emulation context */
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index 1d6927538bc7..56f06cf85282 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1837,10 +1837,13 @@ static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc
>  static inline int hv_tlb_flush_ring_free(struct kvm_vcpu_hv *hv_vcpu,
>  					 int read_idx, int write_idx)
>  {
> +	if (write_idx < 0)
> +		return 0;
> +
>  	if (write_idx >= read_idx)
> -		return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx) - 1;
> +		return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx);
>
> -	return read_idx - write_idx - 1;
> +	return read_idx - write_idx;
>  }
>
>  static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu,
> @@ -1869,6 +1872,9 @@ static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu,
>  	 */
>  	write_idx = tlb_flush_ring->write_idx;
>
> +	if (write_idx < 0 && read_idx == write_idx)
> +		read_idx = write_idx = 0;
> +
>  	ring_free = hv_tlb_flush_ring_free(hv_vcpu, read_idx, write_idx);
>  	/* Full ring always contains 'flush all' entry */
>  	if (!ring_free)
> @@ -1879,21 +1885,13 @@ static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu,
>  	 * entry in case another request comes in. In case there's not enough
>  	 * space, just put 'flush all' entry there.
>  	 */
> -	if (!count || count >= ring_free - 1 || !entries) {
> -		tlb_flush_ring->entries[write_idx].addr = 0;
> -		tlb_flush_ring->entries[write_idx].flush_all = 1;
> -		/*
> -		 * Advance write index only after filling in the entry to
> -		 * synchronize with lockless reader.
> -		 */
> -		smp_wmb();
> -		tlb_flush_ring->write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE;
> +	if (!count || count > ring_free - 1 || !entries) {
> +		tlb_flush_ring->write_idx = -1;
>  		goto out_unlock;
>  	}
>
>  	for (i = 0; i < count; i++) {
> -		tlb_flush_ring->entries[write_idx].addr = entries[i];
> -		tlb_flush_ring->entries[write_idx].flush_all = 0;
> +		tlb_flush_ring->entries[write_idx] = entries[i];
>  		write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE;
>  	}
>  	/*
> @@ -1911,7 +1909,6 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_vcpu_hv_tlb_flush_ring *tlb_flush_ring;
>  	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
> -	struct kvm_vcpu_hv_tlb_flush_entry *entry;
>  	int read_idx, write_idx;
>  	u64 address;
>  	u32 count;
> @@ -1940,26 +1937,18 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
>  	/* Pairs with smp_wmb() in hv_tlb_flush_ring_enqueue() */
>  	smp_rmb();
>
> +	if (write_idx < 0) {
> +		kvm_vcpu_flush_tlb_guest(vcpu);
> +		goto out_empty_ring;
> +	}
> +
>  	for (i = read_idx; i != write_idx; i = (i + 1) % KVM_HV_TLB_FLUSH_RING_SIZE) {
> -		entry = &tlb_flush_ring->entries[i];
> -
> -		if (entry->flush_all)
> -			goto out_flush_all;
> -
> -		/*
> -		 * Lower 12 bits of 'address' encode the number of additional
> -		 * pages to flush.
> -		 */
> -		address = entry->addr & PAGE_MASK;
> -		count = (entry->addr & ~PAGE_MASK) + 1;
> +		address = tlb_flush_ring->entries[i] & PAGE_MASK;
> +		count = (tlb_flush_ring->entries[i] & ~PAGE_MASK) + 1;
>  		for (j = 0; j < count; j++)
>  			static_call(kvm_x86_flush_tlb_gva)(vcpu, address + j * PAGE_SIZE);
>  	}
>  	++vcpu->stat.tlb_flush;
> -	goto out_empty_ring;
> -
> -out_flush_all:
> -	kvm_vcpu_flush_tlb_guest(vcpu);
>
>  out_empty_ring:
>  	tlb_flush_ring->read_idx = write_idx;
>
> base-commit: 62592c7c742ae78eb1f1005a63965ece19e6effe
> --
>
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1de3ad9308d8..b4dd2ff61658 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -578,6 +578,20 @@  struct kvm_vcpu_hv_synic {
 	bool dont_zero_synic_pages;
 };
 
+#define KVM_HV_TLB_FLUSH_RING_SIZE (16)
+
+struct kvm_vcpu_hv_tlb_flush_entry {
+	u64 addr;
+	u64 flush_all:1;
+	u64 pad:63;
+};
+
+struct kvm_vcpu_hv_tlb_flush_ring {
+	int read_idx, write_idx;
+	spinlock_t write_lock;
+	struct kvm_vcpu_hv_tlb_flush_entry entries[KVM_HV_TLB_FLUSH_RING_SIZE];
+};
+
 /* Hyper-V per vcpu emulation context */
 struct kvm_vcpu_hv {
 	struct kvm_vcpu *vcpu;
@@ -597,6 +611,8 @@  struct kvm_vcpu_hv {
 		u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */
 		u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
 	} cpuid_cache;
+
+	struct kvm_vcpu_hv_tlb_flush_ring tlb_flush_ring;
 };
 
 /* Xen HVM per vcpu emulation context */
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index b402ad059eb9..fb716cf919ed 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -29,6 +29,7 @@ 
 #include <linux/kvm_host.h>
 #include <linux/highmem.h>
 #include <linux/sched/cputime.h>
+#include <linux/spinlock.h>
 #include <linux/eventfd.h>
 
 #include <asm/apicdef.h>
@@ -954,6 +955,8 @@  static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
 
 	hv_vcpu->vp_index = vcpu->vcpu_idx;
 
+	spin_lock_init(&hv_vcpu->tlb_flush_ring.write_lock);
+
 	return 0;
 }
 
@@ -1789,6 +1792,74 @@  static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
 			      var_cnt * sizeof(*sparse_banks));
 }
 
+static inline int hv_tlb_flush_ring_free(struct kvm_vcpu_hv *hv_vcpu,
+					 int read_idx, int write_idx)
+{
+	if (write_idx >= read_idx)
+		return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx) - 1;
+
+	return read_idx - write_idx - 1;
+}
+
+static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_hv_tlb_flush_ring *tlb_flush_ring;
+	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+	int ring_free, write_idx, read_idx;
+	unsigned long flags;
+
+	if (!hv_vcpu)
+		return;
+
+	tlb_flush_ring = &hv_vcpu->tlb_flush_ring;
+
+	spin_lock_irqsave(&tlb_flush_ring->write_lock, flags);
+
+	/*
+	 * 'read_idx' is updated by the vCPU which does the flush, this
+	 * happens without 'tlb_flush_ring->write_lock' being held; make
+	 * sure we read it once.
+	 */
+	read_idx = READ_ONCE(tlb_flush_ring->read_idx);
+	/*
+	 * 'write_idx' is only updated here, under 'tlb_flush_ring->write_lock'.
+	 * allow the compiler to re-read it, it can't change.
+	 */
+	write_idx = tlb_flush_ring->write_idx;
+
+	ring_free = hv_tlb_flush_ring_free(hv_vcpu, read_idx, write_idx);
+	/* Full ring always contains 'flush all' entry */
+	if (!ring_free)
+		goto out_unlock;
+
+	tlb_flush_ring->entries[write_idx].addr = 0;
+	tlb_flush_ring->entries[write_idx].flush_all = 1;
+	/*
+	 * Advance write index only after filling in the entry to
+	 * synchronize with lockless reader.
+	 */
+	smp_wmb();
+	tlb_flush_ring->write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE;
+
+out_unlock:
+	spin_unlock_irqrestore(&tlb_flush_ring->write_lock, flags);
+}
+
+void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_hv_tlb_flush_ring *tlb_flush_ring;
+	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+	kvm_vcpu_flush_tlb_guest(vcpu);
+
+	if (!hv_vcpu)
+		return;
+
+	tlb_flush_ring = &hv_vcpu->tlb_flush_ring;
+
+	tlb_flush_ring->read_idx = tlb_flush_ring->write_idx;
+}
+
 static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 {
 	struct kvm *kvm = vcpu->kvm;
@@ -1797,6 +1868,8 @@  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 	DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
 	u64 valid_bank_mask;
 	u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
+	struct kvm_vcpu *v;
+	unsigned long i;
 	bool all_cpus;
 
 	/*
@@ -1876,10 +1949,20 @@  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 	 * analyze it here, flush TLB regardless of the specified address space.
 	 */
 	if (all_cpus) {
+		kvm_for_each_vcpu(i, v, kvm)
+			hv_tlb_flush_ring_enqueue(v);
+
 		kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
 	} else {
 		sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
 
+		for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) {
+			v = kvm_get_vcpu(kvm, i);
+			if (!v)
+				continue;
+			hv_tlb_flush_ring_enqueue(v);
+		}
+
 		kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
 	}
 
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index da2737f2a956..6847caeaaf84 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -147,4 +147,17 @@  int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
 int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
 		     struct kvm_cpuid_entry2 __user *entries);
 
+
+static inline void kvm_hv_vcpu_empty_flush_tlb(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+	if (!hv_vcpu)
+		return;
+
+	hv_vcpu->tlb_flush_ring.read_idx = hv_vcpu->tlb_flush_ring.write_idx;
+}
+void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu);
+
+
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f633cff8cd7f..e5aec386d299 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3324,7 +3324,7 @@  static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
 	static_call(kvm_x86_flush_tlb_all)(vcpu);
 }
 
-static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
+void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.tlb_flush;
 
@@ -3362,7 +3362,8 @@  void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
 
 	if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) {
 		kvm_vcpu_flush_tlb_guest(vcpu);
-		kvm_clear_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+		if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+			kvm_hv_vcpu_empty_flush_tlb(vcpu);
 	} else if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) {
 		kvm_vcpu_flush_tlb_guest(vcpu);
 	}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 588792f00334..2324f496c500 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -58,6 +58,7 @@  static inline unsigned int __shrink_ple_window(unsigned int val,
 
 #define MSR_IA32_CR_PAT_DEFAULT  0x0007040600070406ULL
 
+void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu);
 void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
 int kvm_check_nested_events(struct kvm_vcpu *vcpu);