[RFC] Para-virtualized TLB flush for PV-waiting vCPUs

Message ID 20250106155652.484001-1-kentaishiguro@sslab.ics.keio.ac.jp (mailing list archive)
State New

Commit Message

Kenta Ishiguro Jan. 6, 2025, 3:56 p.m. UTC
In oversubscribed environments, remote TLB flush latency can become
significant when the destination virtual CPU (vCPU) is a para-virtualized
queued spinlock waiter that has halted with interrupts disabled. Such a
waiter does not respond to remote function call requests until it releases
the spinlock, so the source vCPU wastes CPU time busy-waiting for the
destination vCPU to acknowledge the flush.
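
For reference, the pre-patch guest-side flush path only defers the flush
for vCPUs that KVM has already marked preempted; a halted PV qspinlock
waiter carries no such mark and therefore still receives a synchronous
flush IPI. Below is an abridged sketch of kvm_flush_tlb_multi() from
arch/x86/kernel/kvm.c before this patch, with comments added here for
illustration:

  static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
				  const struct flush_tlb_info *info)
  {
	u8 state;
	int cpu;
	struct kvm_steal_time *src;
	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);

	cpumask_copy(flushmask, cpumask);
	for_each_cpu(cpu, flushmask) {
		src = &per_cpu(steal_time, cpu);
		state = READ_ONCE(src->preempted);
		/* Defer the flush only for preempted vCPUs ... */
		if (state & KVM_VCPU_PREEMPTED) {
			if (try_cmpxchg(&src->preempted, &state,
					state | KVM_VCPU_FLUSH_TLB))
				__cpumask_clear_cpu(cpu, flushmask);
		}
	}
	/* ... every vCPU left in flushmask gets a synchronous IPI. */
	native_flush_tlb_multi(flushmask, info);
  }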

To mitigate this issue, this patch extends the set of vCPUs whose TLB
flush is deferred through the PV steal-time mechanism to include vCPUs
that have halted to wait on the PV qspinlock. Because PV qspinlock
waiters yield voluntarily rather than being preempted by KVM, they are
never marked KVM_VCPU_PREEMPTED, so the current PV TLB flush overlooks
them. With this change, a vCPU issuing a TLB shootdown no longer has to
wait for halted PV qspinlock waiters.
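
Deferring the flush this way is safe because the host consumes
KVM_VCPU_FLUSH_TLB before the target vCPU executes guest code again. A
minimal sketch of that host-side handling, simplified from
record_steal_time() in arch/x86/kvm/x86.c (the real code goes through a
guest-memory cache and checks KVM_FEATURE_PV_TLB_FLUSH), looks like:

	/*
	 * Host side, on the way back into the guest: atomically clear
	 * the preempted/flush bits and carry out any deferred flush, so
	 * a flush queued via KVM_VCPU_FLUSH_TLB completes before the
	 * waiter runs guest code again.
	 */
	if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
		kvm_vcpu_flush_tlb_guest(vcpu);

The flush-side check in the diff below then treats KVM_VCPU_IN_PVWAIT
exactly like KVM_VCPU_PREEMPTED.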

This change improves the throughput of the ebizzy workload, which
generates heavy spinlock contention and frequent TLB shootdowns, in an
oversubscribed environment. The following setup was used to evaluate the
performance impact:

Host Machine: Dell R760, Intel(R) Xeon(R) Platinum 8468 (48C/96T), 256GB
memory
VM0: ebizzy -M, 96 vCPUs, 32GB memory
VM1: busy-loop, 96 vCPUs, 32GB memory
Iterations: 10

Results:
- Without Patch: 7702.4 records/second (standard deviation: 295.5)
- With Patch: 9110.9 records/second (standard deviation: 528.6)

Signed-off-by: Kenta Ishiguro <kentaishiguro@sslab.ics.keio.ac.jp>
---
 arch/x86/include/uapi/asm/kvm_para.h |  1 +
 arch/x86/kernel/kvm.c                | 13 +++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

Patch

diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index a1efa7907a0b..db26e167a707 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -70,6 +70,7 @@  struct kvm_steal_time {
 
 #define KVM_VCPU_PREEMPTED          (1 << 0)
 #define KVM_VCPU_FLUSH_TLB          (1 << 1)
+#define KVM_VCPU_IN_PVWAIT          (1 << 2)
 
 #define KVM_CLOCK_PAIRING_WALLCLOCK 0
 struct kvm_clock_pairing {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 21e9e4845354..f17057b7d263 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -668,7 +668,8 @@  static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
 		 */
 		src = &per_cpu(steal_time, cpu);
 		state = READ_ONCE(src->preempted);
-		if ((state & KVM_VCPU_PREEMPTED)) {
+		if ((state & KVM_VCPU_PREEMPTED) ||
+		    (state & KVM_VCPU_IN_PVWAIT)) {
 			if (try_cmpxchg(&src->preempted, &state,
 					state | KVM_VCPU_FLUSH_TLB))
 				__cpumask_clear_cpu(cpu, flushmask);
@@ -1045,6 +1046,9 @@  static void kvm_kick_cpu(int cpu)
 
 static void kvm_wait(u8 *ptr, u8 val)
 {
+	u8 state;
+	struct kvm_steal_time *src;
+
 	if (in_nmi())
 		return;
 
@@ -1054,8 +1058,13 @@  static void kvm_wait(u8 *ptr, u8 val)
 	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
 	 */
 	if (irqs_disabled()) {
-		if (READ_ONCE(*ptr) == val)
+		if (READ_ONCE(*ptr) == val) {
+			src = this_cpu_ptr(&steal_time);
+			state = READ_ONCE(src->preempted);
+			try_cmpxchg(&src->preempted, &state,
+				    state | KVM_VCPU_IN_PVWAIT);
 			halt();
+		}
 	} else {
 		local_irq_disable();