===================================================================
@@ -204,6 +204,32 @@ void pi_post_block(struct kvm_vcpu *vcpu
}
/*
+ * Bail out of the block loop if the VM has an assigned
+ * device, but the blocking vCPU didn't reconfigure the
+ * PI.NV to the wakeup vector, i.e. the assigned device
+ * came along after the initial check in vcpu_block().
+ */
+
+int vmx_vcpu_check_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return 0;
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
+
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR)
+ return 0;
+
+ return 1;
+}
+
+/*
* Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
*/
void pi_wakeup_handler(void)
@@ -236,6 +262,25 @@ bool pi_has_pending_interrupt(struct kvm
(pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
}
+void vmx_pi_start_assignment(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ if (!irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ /*
+ * Wakeup will cause the vCPU to bail out of kvm_vcpu_block() and
+ * go back through vcpu_block().
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_vcpu_apicv_active(vcpu))
+ continue;
+
+ kvm_vcpu_wake_up(vcpu);
+ }
+}
/*
* pi_update_irte - set IRTE for Posted-Interrupts
===================================================================
@@ -95,5 +95,7 @@ void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
bool set);
+void vmx_pi_start_assignment(struct kvm *kvm);
+int vmx_vcpu_check_block(struct kvm_vcpu *vcpu);
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
===================================================================
@@ -7727,11 +7727,13 @@ static struct kvm_x86_ops vmx_x86_ops __
.pre_block = vmx_pre_block,
.post_block = vmx_post_block,
+ .vcpu_check_block = vmx_vcpu_check_block,
.pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops,
.update_pi_irte = pi_update_irte,
+ .start_assignment = vmx_pi_start_assignment,
#ifdef CONFIG_X86_64
.set_hv_timer = vmx_set_hv_timer,
For VMX, when a vcpu enters HLT emulation, pi_post_block will: 1) Add vcpu to per-cpu list of blocked vcpus. 2) Program the posted-interrupt descriptor "notification vector" to POSTED_INTR_WAKEUP_VECTOR With interrupt remapping, an interrupt will set the PIR bit for the vector programmed for the device on the CPU, test-and-set the ON bit on the posted interrupt descriptor, and if the ON bit is clear generate an interrupt for the notification vector. This way, the target CPU wakes upon a device interrupt and wakes up the target vcpu. Problem is that pi_post_block only programs the notification vector if kvm_arch_has_assigned_device() is true. Its possible for the following to happen: 1) vcpu V HLTs on pcpu P, kvm_arch_has_assigned_device is false, notification vector is not programmed 2) device is assigned to VM 3) device interrupts vcpu V, sets ON bit (notification vector not programmed, so pcpu P remains in idle) 4) vcpu 0 IPIs vcpu V (in guest), but since pi descriptor ON bit is set, kvm_vcpu_kick is skipped 5) vcpu 0 busy spins on vcpu V's response for several seconds, until RCU watchdog NMIs all vCPUs. To fix this, use the start_assignment kvm_x86_ops callback to kick vcpus out of the halt loop, so the notification vector is properly reprogrammed to the wakeup vector. Reported-by: Pei Zhang <pezhang@redhat.com> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>