diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -70,6 +70,7 @@ struct kvm_steal_time {
#define KVM_VCPU_PREEMPTED (1 << 0)
#define KVM_VCPU_FLUSH_TLB (1 << 1)
+#define KVM_VCPU_IN_PVWAIT (1 << 2)
#define KVM_CLOCK_PAIRING_WALLCLOCK 0
struct kvm_clock_pairing {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -668,7 +668,8 @@ static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
*/
src = &per_cpu(steal_time, cpu);
state = READ_ONCE(src->preempted);
- if ((state & KVM_VCPU_PREEMPTED)) {
+ if ((state & KVM_VCPU_PREEMPTED) ||
+ (state & KVM_VCPU_IN_PVWAIT)) {
if (try_cmpxchg(&src->preempted, &state,
state | KVM_VCPU_FLUSH_TLB))
__cpumask_clear_cpu(cpu, flushmask);
@@ -1045,6 +1046,9 @@ static void kvm_kick_cpu(int cpu)
static void kvm_wait(u8 *ptr, u8 val)
{
+ u8 state;
+ struct kvm_steal_time *src;
+
if (in_nmi())
return;
@@ -1054,8 +1058,13 @@ static void kvm_wait(u8 *ptr, u8 val)
* in irq spinlock slowpath and no spurious interrupt occur to save us.
*/
if (irqs_disabled()) {
- if (READ_ONCE(*ptr) == val)
+ if (READ_ONCE(*ptr) == val) {
+ src = this_cpu_ptr(&steal_time);
+ state = READ_ONCE(src->preempted);
+ try_cmpxchg(&src->preempted, &state,
+ state | KVM_VCPU_IN_PVWAIT);
halt();
+ }
} else {
local_irq_disable();
In oversubscribed environments, the latency of flushing the remote TLB
can become significant when the destination virtual CPU (vCPU) is a
waiter on a para-virtualized queued spinlock (PV qspinlock) that halts
with interrupts disabled. The waiter does not respond to remote
function call requests until it releases the spinlock, so the source
vCPU wastes CPU time busy-waiting for a response from the destination
vCPU.

To mitigate this issue, extend the targets of the PV TLB flush to
include vCPUs that are halted waiting on the PV qspinlock. Because PV
qspinlock waiters halt voluntarily before KVM preempts them,
KVM_VCPU_PREEMPTED is never set in their steal-time state and the
current PV TLB flush overlooks them. With this change, the source vCPU
defers the flush for PV qspinlock waiters by setting KVM_VCPU_FLUSH_TLB
instead of waiting for them to respond during a TLB shootdown.

This improves the throughput of the ebizzy workload, which generates
heavy spinlock contention and TLB shootdowns, in oversubscribed
environments.

Experimental setup:
  Host machine: Dell R760, Intel(R) Xeon(R) Platinum 8468 (48C/96T),
                256GB memory
  VM0: ebizzy -M, 96 vCPUs, 32GB memory
  VM1: busy-loop, 96 vCPUs, 32GB memory
  Iterations: 10

Results:
  Without patch: 7702.4 records/second (standard deviation: 295.5)
  With patch:    9110.9 records/second (standard deviation: 528.6)

Signed-off-by: Kenta Ishiguro <kentaishiguro@sslab.ics.keio.ac.jp>
---
 arch/x86/include/uapi/asm/kvm_para.h |  1 +
 arch/x86/kernel/kvm.c                | 13 +++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)
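
A note for reviewers: below is a minimal, self-contained user-space
model of the handshake this patch extends. It is a sketch only; struct
vcpu_state and the helpers request_tlb_flush(), pv_wait(), and
resume_vcpu() are invented for illustration and do not correspond to
kernel symbols.

/*
 * Illustrative user-space model of the PV flush handshake. The names
 * vcpu_state, request_tlb_flush(), pv_wait(), and resume_vcpu() are
 * made up for this sketch; only the flag values mirror the patch.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define VCPU_PREEMPTED	(1 << 0)
#define VCPU_FLUSH_TLB	(1 << 1)
#define VCPU_IN_PVWAIT	(1 << 2)	/* new state added by this patch */

struct vcpu_state {
	_Atomic unsigned char preempted;
};

/*
 * Flusher side, mirroring the kvm_flush_tlb_multi() change: if the
 * target vCPU is descheduled or pv-waiting, tag it for a deferred
 * flush instead of sending it an IPI. Returns true if deferred.
 */
static bool request_tlb_flush(struct vcpu_state *v)
{
	unsigned char state = atomic_load(&v->preempted);

	while (state & (VCPU_PREEMPTED | VCPU_IN_PVWAIT)) {
		if (atomic_compare_exchange_weak(&v->preempted, &state,
						 state | VCPU_FLUSH_TLB))
			return true;	/* flush deferred to wakeup */
	}
	return false;			/* vCPU is running: send IPI */
}

/*
 * Waiter side, mirroring the kvm_wait() change: advertise that we are
 * halting in the PV qspinlock slowpath so flushers can defer. As in
 * the patch, a failed compare-exchange is deliberately ignored.
 */
static void pv_wait(struct vcpu_state *self)
{
	unsigned char state = atomic_load(&self->preempted);

	atomic_compare_exchange_strong(&self->preempted, &state,
				       state | VCPU_IN_PVWAIT);
	/* ... halt() until kicked ... */
}

/*
 * Wakeup side: the host clears the state word when the vCPU runs
 * again, performing the flush first if one was deferred meanwhile.
 */
static void resume_vcpu(struct vcpu_state *v)
{
	unsigned char state = atomic_exchange(&v->preempted, 0);

	if (state & VCPU_FLUSH_TLB)
		puts("flush local TLB before re-entering the guest");
}

int main(void)
{
	struct vcpu_state v = { .preempted = 0 };

	pv_wait(&v);			/* vCPU halts in pv qspinlock  */
	if (request_tlb_flush(&v))	/* remote flusher defers flush */
		puts("IPI skipped");
	resume_vcpu(&v);		/* flush happens at wakeup     */
	return 0;
}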
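
One design observation, presumably inherited from the existing
KVM_VCPU_PREEMPTED path: the flusher uses try_cmpxchg() rather than an
unconditional OR so that KVM_VCPU_FLUSH_TLB is set only if the state it
observed (preempted or pv-waiting) is still current; if the vCPU has
resumed and the word was cleared in the meantime, the cmpxchg fails,
the CPU stays in flushmask, and the flusher falls back to the IPI path.
On the waiter side, kvm_wait() ignoring the try_cmpxchg() result looks
acceptable: a failure just means a concurrent state change, and the
worst case is an ordinary, non-deferred flush.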