
[v2,03/31] KVM: x86: hyper-v: Handle HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST{,EX} calls gently

Message ID 20220407155645.940890-4-vkuznets@redhat.com (mailing list archive)
State New, archived
Series KVM: x86: hyper-v: Fine-grained TLB flush + Direct TLB flush feature

Commit Message

Vitaly Kuznetsov April 7, 2022, 3:56 p.m. UTC
Currently, HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST{,EX} calls are handled
the exact same way as HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE{,EX}: by
flushing the whole VPID and this is sub-optimal. Switch to handling
these requests with 'flush_tlb_gva()' hooks instead. Use the newly
introduced TLB flush ring to queue the requests.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 arch/x86/kvm/hyperv.c | 141 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 121 insertions(+), 20 deletions(-)

Comments

Sean Christopherson April 7, 2022, 5:33 p.m. UTC | #1
On Thu, Apr 07, 2022, Vitaly Kuznetsov wrote:
> Currently, HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST{,EX} calls are handled
> the exact same way as HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE{,EX}: by
> flushing the whole VPID and this is sub-optimal. Switch to handling
> these requests with 'flush_tlb_gva()' hooks instead. Use the newly
> introduced TLB flush ring to queue the requests.
> 
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> ---
>  arch/x86/kvm/hyperv.c | 141 ++++++++++++++++++++++++++++++++++++------
>  1 file changed, 121 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index 81c44e0eadf9..a54d41656f30 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1792,6 +1792,35 @@ static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
>  			      var_cnt * sizeof(*sparse_banks));
>  }
>  
> +static int kvm_hv_get_tlbflush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[],
> +				       u32 data_offset, int consumed_xmm_halves)

data_offset should be gpa_t, and the order of params should be consistent between
this and kvm_get_sparse_vp_set().
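For illustration, a prototype consistent with kvm_get_sparse_vp_set() could be
(a sketch only; it matches the signature suggested further below):

	static int kvm_hv_get_tlbflush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc,
					       u64 entries[], int consumed_xmm_halves,
					       gpa_t offset);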

> +{
> +	int i;
> +
> +	if (hc->fast) {
> +		/*
> +		 * Each XMM holds two entries, but do not count halves that
> +		 * have already been consumed.
> +		 */
> +		if (hc->rep_cnt > (2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves))
> +			return -EINVAL;
> +
> +		for (i = 0; i < hc->rep_cnt; i++) {
> +			int j = i + consumed_xmm_halves;
> +
> +			if (j % 2)
> +				entries[i] = sse128_hi(hc->xmm[j / 2]);
> +			else
> +				entries[i] = sse128_lo(hc->xmm[j / 2]);
> +		}
> +
> +		return 0;
> +	}
> +
> +	return kvm_read_guest(kvm, hc->ingpa + data_offset,
> +			      entries, hc->rep_cnt * sizeof(entries[0]));

This is almost verbatim copy+pasted from kvm_get_sparse_vp_set().  If you slot in
the attached patched before this, then this function becomes:

static int kvm_hv_get_tlbflush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[],
				       int consumed_xmm_halves, gpa_t offset)
{
	return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt,
				  entries, consumed_xmm_halves, offset);
}


> +}

...

> @@ -1840,15 +1891,47 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_vcpu_hv_tlbflush_ring *tlb_flush_ring;
>  	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
> -
> -	kvm_vcpu_flush_tlb_guest(vcpu);
> -
> -	if (!hv_vcpu)
> +	struct kvm_vcpu_hv_tlbflush_entry *entry;
> +	int read_idx, write_idx;
> +	u64 address;
> +	u32 count;
> +	int i, j;
> +
> +	if (!tdp_enabled || !hv_vcpu) {
> +		kvm_vcpu_flush_tlb_guest(vcpu);
>  		return;
> +	}
>  
>  	tlb_flush_ring = &hv_vcpu->tlb_flush_ring;
> +	read_idx = READ_ONCE(tlb_flush_ring->read_idx);
> +	write_idx = READ_ONCE(tlb_flush_ring->write_idx);
> +
> +	/* Pairs with smp_wmb() in hv_tlb_flush_ring_enqueue() */
> +	smp_rmb();
>  
> -	tlb_flush_ring->read_idx = tlb_flush_ring->write_idx;
> +	for (i = read_idx; i != write_idx; i = (i + 1) % KVM_HV_TLB_FLUSH_RING_SIZE) {
> +		entry = &tlb_flush_ring->entries[i];
> +
> +		if (entry->flush_all)
> +			goto out_flush_all;
> +
> +		/*
> +		 * Lower 12 bits of 'address' encode the number of additional
> +		 * pages to flush.
> +		 */
> +		address = entry->addr & PAGE_MASK;
> +		count = (entry->addr & ~PAGE_MASK) + 1;
> +		for (j = 0; j < count; j++)
> +			static_call(kvm_x86_flush_tlb_gva)(vcpu, address + j * PAGE_SIZE);
> +	}
> +	++vcpu->stat.tlb_flush;
> +	goto out_empty_ring;
> +
> +out_flush_all:
> +	kvm_vcpu_flush_tlb_guest(vcpu);
> +
> +out_empty_ring:
> +	tlb_flush_ring->read_idx = write_idx;
>  }
>  
>  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
> @@ -1857,12 +1940,13 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>  	struct hv_tlb_flush_ex flush_ex;
>  	struct hv_tlb_flush flush;
>  	DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
> +	u64 entries[KVM_HV_TLB_FLUSH_RING_SIZE - 2];

What's up with the -2?  And given the multitude of things going on in this code,
I'd strongly prefer this be tlbflush_entries.

Actually, if you do:

	u64 __tlbflush_entries[KVM_HV_TLB_FLUSH_RING_SIZE - 2];
	u64 *tlbflush_entries;

and drop all_addr, the code to get entries can be

	if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
	    hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
	    hc->rep_cnt > ARRAY_SIZE(tlbflush_entries)) {
		tlbfluish_entries = NULL;
	} else {
		if (kvm_hv_get_tlbflush_entries(kvm, hc, __tlbflush_entries,
						consumed_xmm_halves, data_offset))
			return HV_STATUS_INVALID_HYPERCALL_INPUT;
		tlbfluish_entries = __tlbflush_entries;
	}

and the calls to queue flushes becomes

			hv_tlb_flush_ring_enqueue(v, tlbflush_entries, hc->rep_cnt);

That way a bug will "just" be a NULL pointer dereference and not consumption of
uninitialized data (though such a bug might be caught by the compiler).

>  	u64 valid_bank_mask;
>  	u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
>  	struct kvm_vcpu *v;
>  	unsigned long i;
> -	bool all_cpus;
> -
> +	bool all_cpus, all_addr;
> +	int data_offset = 0, consumed_xmm_halves = 0;

data_offset should be a gpa_t.

>  	/*
>  	 * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the
>  	 * valid mask is a u64.  Fail the build if KVM's max allowed number of

...

> +read_flush_entries:
> +	if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
> +	    hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
> +	    hc->rep_cnt > (KVM_HV_TLB_FLUSH_RING_SIZE - 2)) {

Rather than duplicate the -2 magic, it's far better to check hc->rep_cnt
against ARRAY_SIZE() of the local array, as in the snippet above.

> +		all_addr = true;
> +	} else {
> +		if (kvm_hv_get_tlbflush_entries(kvm, hc, entries,
> +						data_offset, consumed_xmm_halves))

As mentioned, the order for this call should match kvm_get_sparse_vp_set().

>  			return HV_STATUS_INVALID_HYPERCALL_INPUT;
> +		all_addr = false;
>  	}
>  
> -do_flush:
> +
>  	/*
>  	 * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
>  	 * analyze it here, flush TLB regardless of the specified address space.
>  	 */
>  	if (all_cpus) {
>  		kvm_for_each_vcpu(i, v, kvm)
> -			hv_tlb_flush_ring_enqueue(v);
> +			hv_tlb_flush_ring_enqueue(v, all_addr, entries, hc->rep_cnt);
>  
>  		kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
>  	} else {
> @@ -1951,7 +2052,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>  			v = kvm_get_vcpu(kvm, i);
>  			if (!v)
>  				continue;
> -			hv_tlb_flush_ring_enqueue(v);
> +			hv_tlb_flush_ring_enqueue(v, all_addr, entries, hc->rep_cnt);
>  		}
>  
>  		kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
> -- 
> 2.35.1
>
Sean Christopherson April 7, 2022, 5:44 p.m. UTC | #2
On Thu, Apr 07, 2022, Vitaly Kuznetsov wrote:
> @@ -1840,15 +1891,47 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_vcpu_hv_tlbflush_ring *tlb_flush_ring;
>  	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
> -
> -	kvm_vcpu_flush_tlb_guest(vcpu);
> -
> -	if (!hv_vcpu)
> +	struct kvm_vcpu_hv_tlbflush_entry *entry;
> +	int read_idx, write_idx;
> +	u64 address;
> +	u32 count;
> +	int i, j;
> +
> +	if (!tdp_enabled || !hv_vcpu) {
> +		kvm_vcpu_flush_tlb_guest(vcpu);
>  		return;
> +	}
>  
>  	tlb_flush_ring = &hv_vcpu->tlb_flush_ring;
> +	read_idx = READ_ONCE(tlb_flush_ring->read_idx);
> +	write_idx = READ_ONCE(tlb_flush_ring->write_idx);
> +
> +	/* Pairs with smp_wmb() in hv_tlb_flush_ring_enqueue() */
> +	smp_rmb();
>  
> -	tlb_flush_ring->read_idx = tlb_flush_ring->write_idx;
> +	for (i = read_idx; i != write_idx; i = (i + 1) % KVM_HV_TLB_FLUSH_RING_SIZE) {
> +		entry = &tlb_flush_ring->entries[i];
> +
> +		if (entry->flush_all)
> +			goto out_flush_all;
> +
> +		/*
> +		 * Lower 12 bits of 'address' encode the number of additional
> +		 * pages to flush.
> +		 */
> +		address = entry->addr & PAGE_MASK;
> +		count = (entry->addr & ~PAGE_MASK) + 1;
> +		for (j = 0; j < count; j++)
> +			static_call(kvm_x86_flush_tlb_gva)(vcpu, address + j * PAGE_SIZE);
> +	}
> +	++vcpu->stat.tlb_flush;
> +	goto out_empty_ring;
> +
> +out_flush_all:
> +	kvm_vcpu_flush_tlb_guest(vcpu);
> +
> +out_empty_ring:
> +	tlb_flush_ring->read_idx = write_idx;

Does this need WRITE_ONCE?  My usual "I suck at memory ordering" disclaimer applies.
Sean Christopherson April 7, 2022, 5:47 p.m. UTC | #3
On Thu, Apr 07, 2022, Sean Christopherson wrote:
> > @@ -1857,12 +1940,13 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
> >  	struct hv_tlb_flush_ex flush_ex;
> >  	struct hv_tlb_flush flush;
> >  	DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
> > +	u64 entries[KVM_HV_TLB_FLUSH_RING_SIZE - 2];
> 
> What's up with the -2?  And given the multitude of things going on in this code,
> I'd strongly prefer this be tlbflush_entries.
> 
> Actually, if you do:
> 
> 	u64 __tlbflush_entries[KVM_HV_TLB_FLUSH_RING_SIZE - 2];
> 	u64 *tlbflush_entries;

Looking at future patches, tlb_flush_entries is better for consistency (apply everywhere).

> and drop all_addr, the code to get entries can be
> 
> 	if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
> 	    hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
> 	    hc->rep_cnt > ARRAY_SIZE(tlbflush_entries)) {
> 		tlbfluish_entries = NULL;
> 	} else {
> 		if (kvm_hv_get_tlbflush_entries(kvm, hc, __tlbflush_entries,
> 						consumed_xmm_halves, data_offset))
> 			return HV_STATUS_INVALID_HYPERCALL_INPUT;
> 		tlbfluish_entries = __tlbflush_entries;

Heh, fluish, because TLB entries are somewhat fluid?

> 	}
> 
> and the calls to queue flushes becomes
> 
> 			hv_tlb_flush_ring_enqueue(v, tlbflush_entries, hc->rep_cnt);
> 
> That way a bug will "just" be a NULL pointer dereference and not consumption of
> uninitialized data (though such a bug might be caught by the compiler).
Vitaly Kuznetsov April 11, 2022, 11:15 a.m. UTC | #4
Sean Christopherson <seanjc@google.com> writes:

> On Thu, Apr 07, 2022, Vitaly Kuznetsov wrote:

...

Thanks a lot for the review! I'll incorporate your feedback into v3.

>>  
>>  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>> @@ -1857,12 +1940,13 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>>  	struct hv_tlb_flush_ex flush_ex;
>>  	struct hv_tlb_flush flush;
>>  	DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
>> +	u64 entries[KVM_HV_TLB_FLUSH_RING_SIZE - 2];
>
> What's up with the -2?

(This should probably be a define or at least a comment somewhere)

Normally, we can only put 'KVM_HV_TLB_FLUSH_RING_SIZE - 1' entries on
the ring as when read_idx == write_idx we perceive this as 'ring is
empty' and not as 'ring is full'. For the TLB flush ring we must always
leave one free entry to put "flush all" request when we run out of
free space to avoid blocking the writer. I.e. when a request flies in,
we check if we have enough space on the ring to put all the entries and
if not, we just put 'flush all' there. In case 'flush all' is already on
the ring, ignoring the request is safe.

So, long story short, there's no point in fetching more than
'KVM_HV_TLB_FLUSH_RING_SIZE - 2' entries from the guest as we can't
possibly put them all on the ring.
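
A minimal sketch of that arithmetic (a hypothetical standalone helper; in the
series the same computation is done by hv_tlb_flush_ring_free()):

	/*
	 * read_idx == write_idx means "empty", so a ring of 'size' slots holds
	 * at most size - 1 entries; reserving one more slot for a late
	 * 'flush all' entry leaves size - 2 slots for individual addresses.
	 */
	static inline int tlb_flush_ring_free_slots(int read_idx, int write_idx, int size)
	{
		if (write_idx >= read_idx)
			return size - (write_idx - read_idx) - 1;

		return read_idx - write_idx - 1;
	}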

[snip]
Vitaly Kuznetsov April 11, 2022, 11:31 a.m. UTC | #5
Sean Christopherson <seanjc@google.com> writes:

> On Thu, Apr 07, 2022, Vitaly Kuznetsov wrote:
>> @@ -1840,15 +1891,47 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
>>  {
>>  	struct kvm_vcpu_hv_tlbflush_ring *tlb_flush_ring;
>>  	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
>> -
>> -	kvm_vcpu_flush_tlb_guest(vcpu);
>> -
>> -	if (!hv_vcpu)
>> +	struct kvm_vcpu_hv_tlbflush_entry *entry;
>> +	int read_idx, write_idx;
>> +	u64 address;
>> +	u32 count;
>> +	int i, j;
>> +
>> +	if (!tdp_enabled || !hv_vcpu) {
>> +		kvm_vcpu_flush_tlb_guest(vcpu);
>>  		return;
>> +	}
>>  
>>  	tlb_flush_ring = &hv_vcpu->tlb_flush_ring;
>> +	read_idx = READ_ONCE(tlb_flush_ring->read_idx);
>> +	write_idx = READ_ONCE(tlb_flush_ring->write_idx);
>> +
>> +	/* Pairs with smp_wmb() in hv_tlb_flush_ring_enqueue() */
>> +	smp_rmb();
>>  
>> -	tlb_flush_ring->read_idx = tlb_flush_ring->write_idx;
>> +	for (i = read_idx; i != write_idx; i = (i + 1) % KVM_HV_TLB_FLUSH_RING_SIZE) {
>> +		entry = &tlb_flush_ring->entries[i];
>> +
>> +		if (entry->flush_all)
>> +			goto out_flush_all;
>> +
>> +		/*
>> +		 * Lower 12 bits of 'address' encode the number of additional
>> +		 * pages to flush.
>> +		 */
>> +		address = entry->addr & PAGE_MASK;
>> +		count = (entry->addr & ~PAGE_MASK) + 1;
>> +		for (j = 0; j < count; j++)
>> +			static_call(kvm_x86_flush_tlb_gva)(vcpu, address + j * PAGE_SIZE);
>> +	}
>> +	++vcpu->stat.tlb_flush;
>> +	goto out_empty_ring;
>> +
>> +out_flush_all:
>> +	kvm_vcpu_flush_tlb_guest(vcpu);
>> +
>> +out_empty_ring:
>> +	tlb_flush_ring->read_idx = write_idx;
>
> Does this need WRITE_ONCE?  My usual "I suck at memory ordering" disclaimer applies.
>

Same here :) I *think* we're fine for 'read_idx' as it shouldn't matter at
which point in this function 'tlb_flush_ring->read_idx' gets modified
(relative to other things, e.g. actual TLB flushes) and there's no
concurrency as we only have one reader (the vCPU which needs its TLB
flushed). On the other hand, I'm not against adding WRITE_ONCE() here
even if just to aid an unprepared reader (thinking of myself a couple of
years in the future).
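
For illustration, that option amounts to annotating the single store at the
end of kvm_hv_vcpu_flush_tlb() (a sketch, not something this patch does):

	/* Single writer (the vCPU itself); WRITE_ONCE() is documentation only. */
	WRITE_ONCE(tlb_flush_ring->read_idx, write_idx);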
Sean Christopherson April 11, 2022, 8:37 p.m. UTC | #6
On Mon, Apr 11, 2022, Vitaly Kuznetsov wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> > On Thu, Apr 07, 2022, Vitaly Kuznetsov wrote:
> >> @@ -1840,15 +1891,47 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
> >>  {
> >>  	struct kvm_vcpu_hv_tlbflush_ring *tlb_flush_ring;
> >>  	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
> >> -
> >> -	kvm_vcpu_flush_tlb_guest(vcpu);
> >> -
> >> -	if (!hv_vcpu)
> >> +	struct kvm_vcpu_hv_tlbflush_entry *entry;
> >> +	int read_idx, write_idx;
> >> +	u64 address;
> >> +	u32 count;
> >> +	int i, j;
> >> +
> >> +	if (!tdp_enabled || !hv_vcpu) {
> >> +		kvm_vcpu_flush_tlb_guest(vcpu);
> >>  		return;
> >> +	}
> >>  
> >>  	tlb_flush_ring = &hv_vcpu->tlb_flush_ring;
> >> +	read_idx = READ_ONCE(tlb_flush_ring->read_idx);
> >> +	write_idx = READ_ONCE(tlb_flush_ring->write_idx);
> >> +
> >> +	/* Pairs with smp_wmb() in hv_tlb_flush_ring_enqueue() */
> >> +	smp_rmb();
> >>  
> >> -	tlb_flush_ring->read_idx = tlb_flush_ring->write_idx;
> >> +	for (i = read_idx; i != write_idx; i = (i + 1) % KVM_HV_TLB_FLUSH_RING_SIZE) {
> >> +		entry = &tlb_flush_ring->entries[i];
> >> +
> >> +		if (entry->flush_all)
> >> +			goto out_flush_all;
> >> +
> >> +		/*
> >> +		 * Lower 12 bits of 'address' encode the number of additional
> >> +		 * pages to flush.
> >> +		 */
> >> +		address = entry->addr & PAGE_MASK;
> >> +		count = (entry->addr & ~PAGE_MASK) + 1;
> >> +		for (j = 0; j < count; j++)
> >> +			static_call(kvm_x86_flush_tlb_gva)(vcpu, address + j * PAGE_SIZE);
> >> +	}
> >> +	++vcpu->stat.tlb_flush;
> >> +	goto out_empty_ring;
> >> +
> >> +out_flush_all:
> >> +	kvm_vcpu_flush_tlb_guest(vcpu);
> >> +
> >> +out_empty_ring:
> >> +	tlb_flush_ring->read_idx = write_idx;
> >
> > Does this need WRITE_ONCE?  My usual "I suck at memory ordering" disclaimer applies.
> >
> 
> Same here :) I *think* we're fine for 'read_idx' as it shouldn't matter at
> which point in this function 'tlb_flush_ring->read_idx' gets modified
> (relative to other things, e.g. actual TLB flushes) and there's no
> concurrency as we only have one reader (the vCPU which needs its TLB
> flushed). On the other hand, I'm not against adding WRITE_ONCE() here
> even if just to aid an unprepared reader (thinking of myself a couple of
> years in the future).

Ah, read_idx == tail and write_idx == head.  I didn't look at the structure very
closely, or maybe not at all :-)  And IIUC, only the vCPU itself ever writes to
tail?  In that case, I would omit the READ_ONCE() from both the write to tail here
and the read above, and probably add a brief comment stating that the flush must
be performed on the target vCPU, i.e. must hold vcpu->mutex, and so it's safe for
the compiler to re-read tlb_flush_ring->read_idx in the loop because it cannot
change.
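
A sketch of that suggestion applied to kvm_hv_vcpu_flush_tlb() (assuming the
ring layout from this patch; not the code that was eventually merged):

	tlb_flush_ring = &hv_vcpu->tlb_flush_ring;

	/*
	 * TLB flushes run on the target vCPU with vcpu->mutex held, so this
	 * vCPU is the only writer of read_idx and plain accesses to it
	 * suffice; write_idx can be updated by other vCPUs (under write_lock)
	 * and keeps READ_ONCE() to pair with the enqueue side.
	 */
	read_idx = tlb_flush_ring->read_idx;
	write_idx = READ_ONCE(tlb_flush_ring->write_idx);

	/* Pairs with smp_wmb() in hv_tlb_flush_ring_enqueue() */
	smp_rmb();

	/* ... walk [read_idx, write_idx) and flush as above ... */

	tlb_flush_ring->read_idx = write_idx;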

Patch

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 81c44e0eadf9..a54d41656f30 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1792,6 +1792,35 @@  static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
 			      var_cnt * sizeof(*sparse_banks));
 }
 
+static int kvm_hv_get_tlbflush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[],
+				       u32 data_offset, int consumed_xmm_halves)
+{
+	int i;
+
+	if (hc->fast) {
+		/*
+		 * Each XMM holds two entries, but do not count halves that
+		 * have already been consumed.
+		 */
+		if (hc->rep_cnt > (2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves))
+			return -EINVAL;
+
+		for (i = 0; i < hc->rep_cnt; i++) {
+			int j = i + consumed_xmm_halves;
+
+			if (j % 2)
+				entries[i] = sse128_hi(hc->xmm[j / 2]);
+			else
+				entries[i] = sse128_lo(hc->xmm[j / 2]);
+		}
+
+		return 0;
+	}
+
+	return kvm_read_guest(kvm, hc->ingpa + data_offset,
+			      entries, hc->rep_cnt * sizeof(entries[0]));
+}
+
 static inline int hv_tlb_flush_ring_free(struct kvm_vcpu_hv *hv_vcpu,
 					 int read_idx, int write_idx)
 {
@@ -1801,12 +1830,14 @@  static inline int hv_tlb_flush_ring_free(struct kvm_vcpu_hv *hv_vcpu,
 	return read_idx - write_idx - 1;
 }
 
-static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu)
+static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu, bool flush_all,
+				      u64 *entries, int count)
 {
 	struct kvm_vcpu_hv_tlbflush_ring *tlb_flush_ring;
 	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
 	int ring_free, write_idx, read_idx;
 	unsigned long flags;
+	int i;
 
 	if (!hv_vcpu)
 		return;
@@ -1823,14 +1854,34 @@  static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu)
 	if (!ring_free)
 		goto out_unlock;
 
-	tlb_flush_ring->entries[write_idx].addr = 0;
-	tlb_flush_ring->entries[write_idx].flush_all = 1;
 	/*
-	 * Advance write index only after filling in the entry to
-	 * synchronize with lockless reader.
+	 * All entries should fit on the ring leaving one free for 'flush all'
+	 * entry in case another request comes in. In case there's not enough
+	 * space, just put 'flush all' entry there.
+	 */
+	if (!count || count >= ring_free - 1 || flush_all) {
+		tlb_flush_ring->entries[write_idx].addr = 0;
+		tlb_flush_ring->entries[write_idx].flush_all = 1;
+		/*
+		 * Advance write index only after filling in the entry to
+		 * synchronize with lockless reader.
+		 */
+		smp_wmb();
+		tlb_flush_ring->write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE;
+		goto out_unlock;
+	}
+
+	for (i = 0; i < count; i++) {
+		tlb_flush_ring->entries[write_idx].addr = entries[i];
+		tlb_flush_ring->entries[write_idx].flush_all = 0;
+		write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE;
+	}
+	/*
+	 * Advance write index only after filling in the entry to synchronize
+	 * with lockless reader.
 	 */
 	smp_wmb();
-	tlb_flush_ring->write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE;
+	tlb_flush_ring->write_idx = write_idx;
 
 out_unlock:
 	spin_unlock_irqrestore(&tlb_flush_ring->write_lock, flags);
@@ -1840,15 +1891,47 @@  void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	struct kvm_vcpu_hv_tlbflush_ring *tlb_flush_ring;
 	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
-
-	kvm_vcpu_flush_tlb_guest(vcpu);
-
-	if (!hv_vcpu)
+	struct kvm_vcpu_hv_tlbflush_entry *entry;
+	int read_idx, write_idx;
+	u64 address;
+	u32 count;
+	int i, j;
+
+	if (!tdp_enabled || !hv_vcpu) {
+		kvm_vcpu_flush_tlb_guest(vcpu);
 		return;
+	}
 
 	tlb_flush_ring = &hv_vcpu->tlb_flush_ring;
+	read_idx = READ_ONCE(tlb_flush_ring->read_idx);
+	write_idx = READ_ONCE(tlb_flush_ring->write_idx);
+
+	/* Pairs with smp_wmb() in hv_tlb_flush_ring_enqueue() */
+	smp_rmb();
 
-	tlb_flush_ring->read_idx = tlb_flush_ring->write_idx;
+	for (i = read_idx; i != write_idx; i = (i + 1) % KVM_HV_TLB_FLUSH_RING_SIZE) {
+		entry = &tlb_flush_ring->entries[i];
+
+		if (entry->flush_all)
+			goto out_flush_all;
+
+		/*
+		 * Lower 12 bits of 'address' encode the number of additional
+		 * pages to flush.
+		 */
+		address = entry->addr & PAGE_MASK;
+		count = (entry->addr & ~PAGE_MASK) + 1;
+		for (j = 0; j < count; j++)
+			static_call(kvm_x86_flush_tlb_gva)(vcpu, address + j * PAGE_SIZE);
+	}
+	++vcpu->stat.tlb_flush;
+	goto out_empty_ring;
+
+out_flush_all:
+	kvm_vcpu_flush_tlb_guest(vcpu);
+
+out_empty_ring:
+	tlb_flush_ring->read_idx = write_idx;
 }
 
 static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
@@ -1857,12 +1940,13 @@  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 	struct hv_tlb_flush_ex flush_ex;
 	struct hv_tlb_flush flush;
 	DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
+	u64 entries[KVM_HV_TLB_FLUSH_RING_SIZE - 2];
 	u64 valid_bank_mask;
 	u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
 	struct kvm_vcpu *v;
 	unsigned long i;
-	bool all_cpus;
-
+	bool all_cpus, all_addr;
+	int data_offset = 0, consumed_xmm_halves = 0;
 	/*
 	 * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the
 	 * valid mask is a u64.  Fail the build if KVM's max allowed number of
@@ -1877,10 +1961,12 @@  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 			flush.address_space = hc->ingpa;
 			flush.flags = hc->outgpa;
 			flush.processor_mask = sse128_lo(hc->xmm[0]);
+			consumed_xmm_halves = 1;
 		} else {
 			if (unlikely(kvm_read_guest(kvm, hc->ingpa,
 						    &flush, sizeof(flush))))
 				return HV_STATUS_INVALID_HYPERCALL_INPUT;
+			data_offset = sizeof(flush);
 		}
 
 		trace_kvm_hv_flush_tlb(flush.processor_mask,
@@ -1904,10 +1990,12 @@  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 			flush_ex.flags = hc->outgpa;
 			memcpy(&flush_ex.hv_vp_set,
 			       &hc->xmm[0], sizeof(hc->xmm[0]));
+			consumed_xmm_halves = 2;
 		} else {
 			if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex,
 						    sizeof(flush_ex))))
 				return HV_STATUS_INVALID_HYPERCALL_INPUT;
+			data_offset = sizeof(flush_ex);
 		}
 
 		trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
@@ -1923,25 +2011,38 @@  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 			return HV_STATUS_INVALID_HYPERCALL_INPUT;
 
 		if (all_cpus)
-			goto do_flush;
+			goto read_flush_entries;
 
 		if (!hc->var_cnt)
 			goto ret_success;
 
-		if (kvm_get_sparse_vp_set(kvm, hc, 2, sparse_banks,
-					  offsetof(struct hv_tlb_flush_ex,
-						   hv_vp_set.bank_contents)))
+		if (kvm_get_sparse_vp_set(kvm, hc, consumed_xmm_halves,
+					  sparse_banks, data_offset))
+			return HV_STATUS_INVALID_HYPERCALL_INPUT;
+		data_offset += hc->var_cnt * sizeof(sparse_banks[0]);
+		consumed_xmm_halves += hc->var_cnt;
+	}
+
+read_flush_entries:
+	if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
+	    hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
+	    hc->rep_cnt > (KVM_HV_TLB_FLUSH_RING_SIZE - 2)) {
+		all_addr = true;
+	} else {
+		if (kvm_hv_get_tlbflush_entries(kvm, hc, entries,
+						data_offset, consumed_xmm_halves))
 			return HV_STATUS_INVALID_HYPERCALL_INPUT;
+		all_addr = false;
 	}
 
-do_flush:
+
 	/*
 	 * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
 	 * analyze it here, flush TLB regardless of the specified address space.
 	 */
 	if (all_cpus) {
 		kvm_for_each_vcpu(i, v, kvm)
-			hv_tlb_flush_ring_enqueue(v);
+			hv_tlb_flush_ring_enqueue(v, all_addr, entries, hc->rep_cnt);
 
 		kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
 	} else {
@@ -1951,7 +2052,7 @@  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 			v = kvm_get_vcpu(kvm, i);
 			if (!v)
 				continue;
-			hv_tlb_flush_ring_enqueue(v);
+			hv_tlb_flush_ring_enqueue(v, all_addr, entries, hc->rep_cnt);
 		}
 
 		kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);