
[v2] KVM: x86: Rework INIT and SIPI handling

Message ID 5140662A.7010209@siemens.com (mailing list archive)
State New, archived

Commit Message

Jan Kiszka March 13, 2013, 11:42 a.m. UTC
A VCPU sending INIT or SIPI to some other VCPU races for setting the
remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
was overwritten by kvm_emulate_halt and, thus, got lost.

This introduces APIC events for those two signals, keeping them in
kvm_apic until kvm_apic_accept_events is run over the target vcpu
context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there
are pending events, thus if vcpu blocking should end.

The patch comes with the side effect of effectively obsoleting
KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but
immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI.
The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED
state. That also means we no longer exit to user space after receiving a
SIPI event.

Furthermore, we now reset the VCPU already on INIT, only fixing up the
code segment later on when SIPI arrives. Moreover, we fix INIT handling
for the BSP: it never enters wait-for-SIPI but directly starts over on
INIT.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---

Changes in v2:
 - drop cmpxchg from INIT signaling
 - rmb on SIPI processing
 - reset sets all CPUs to 0xf000:0xfff0, RIP is fixed up on SIPI reception

 arch/x86/include/asm/kvm_host.h |    3 +-
 arch/x86/kvm/lapic.c            |   48 +++++++++++++++++++++++++++-----
 arch/x86/kvm/lapic.h            |   11 +++++++
 arch/x86/kvm/svm.c              |    6 ----
 arch/x86/kvm/vmx.c              |   12 +-------
 arch/x86/kvm/x86.c              |   58 +++++++++++++++++++++++++-------------
 6 files changed, 93 insertions(+), 45 deletions(-)
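
The wmb/rmb pair mentioned in the v2 changelog implements a plain
publish/consume handoff: the sender stores sipi_vector, issues a write
barrier, then sets the KVM_APIC_SIPI bit; the receiver test-and-clears
the bit, issues a read barrier, and only then reads the vector. Below is
a minimal, standalone userspace analogue of that ordering (C11 atomics
and illustrative names, not the kernel code), which also shows the CS
fixup arithmetic that kvm_vcpu_deliver_sipi_vector performs on the
vector. Build it with something like "gcc -std=c11 -pthread".

#include <stdatomic.h>
#include <stdio.h>
#include <threads.h>

static _Atomic unsigned long pending_events;
static unsigned int sipi_vector;

#define FAKE_APIC_SIPI 1	/* stands in for KVM_APIC_SIPI */

static int sender(void *arg)
{
	(void)arg;
	sipi_vector = 0x06;			/* vector chosen for the example */
	atomic_thread_fence(memory_order_release);	/* role of smp_wmb() */
	atomic_fetch_or(&pending_events, 1UL << FAKE_APIC_SIPI);
	return 0;
}

static int receiver(void *arg)
{
	(void)arg;
	for (;;) {
		unsigned long ev = atomic_fetch_and(&pending_events,
						    ~(1UL << FAKE_APIC_SIPI));
		if (!(ev & (1UL << FAKE_APIC_SIPI)))
			continue;		/* bit not published yet */
		atomic_thread_fence(memory_order_acquire);	/* role of smp_rmb() */
		/* same fixup kvm_vcpu_deliver_sipi_vector() does:
		 * CS.selector = vector << 8, CS.base = vector << 12, RIP = 0,
		 * so vector 0x06 starts execution at physical 0x6000 */
		printf("vector %#x -> CS sel %#x, CS base %#lx, RIP 0\n",
		       sipi_vector, sipi_vector << 8,
		       (unsigned long)sipi_vector << 12);
		return 0;
	}
}

int main(void)
{
	thrd_t s, r;

	thrd_create(&r, receiver, NULL);
	thrd_create(&s, sender, NULL);
	thrd_join(s, NULL);
	thrd_join(r, NULL);
	return 0;
}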

Comments

Gleb Natapov March 13, 2013, 2:08 p.m. UTC | #1
On Wed, Mar 13, 2013 at 12:42:34PM +0100, Jan Kiszka wrote:
> A VCPU sending INIT or SIPI to some other VCPU races for setting the
> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
> was overwritten by kvm_emulate_halt and, thus, got lost.
> 
> This introduces APIC events for those two signals, keeping them in
> kvm_apic until kvm_apic_accept_events is run over the target vcpu
> context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there
> are pending events, thus if vcpu blocking should end.
> 
> The patch comes with the side effect of effectively obsoleting
> KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but
> immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI.
> The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED
> state. That also means we no longer exit to user space after receiving a
> SIPI event.
> 
> Furthermore, we already reset the VCPU on INIT, only fixing up the code
> segment later on when SIPI arrives. Moreover, we fix INIT handling for
> the BSP: it never enter wait-for-SIPI but directly starts over on INIT.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Applied, thanks.

> ---
> 
> Changes in v2:
>  - drop cmpxchg from INIT signaling
>  - rmb on SIPI processing
>  - reset sets all CPUs to 0xf000:0xfff0, RIP is fixed up on SIPI reception
> 
>  arch/x86/include/asm/kvm_host.h |    3 +-
>  arch/x86/kvm/lapic.c            |   48 +++++++++++++++++++++++++++-----
>  arch/x86/kvm/lapic.h            |   11 +++++++
>  arch/x86/kvm/svm.c              |    6 ----
>  arch/x86/kvm/vmx.c              |   12 +-------
>  arch/x86/kvm/x86.c              |   58 +++++++++++++++++++++++++-------------
>  6 files changed, 93 insertions(+), 45 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 348d859..ef7f4a5 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -345,7 +345,6 @@ struct kvm_vcpu_arch {
>  	unsigned long apic_attention;
>  	int32_t apic_arb_prio;
>  	int mp_state;
> -	int sipi_vector;
>  	u64 ia32_misc_enable_msr;
>  	bool tpr_access_reporting;
>  
> @@ -819,6 +818,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
>  
>  void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
>  int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
> +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector);
>  
>  int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
>  		    int reason, bool has_error_code, u32 error_code);
> @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
>  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
>  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
>  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
> +void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
>  
>  void kvm_define_shared_msr(unsigned index, u32 msr);
>  void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 02b51dd..a8e9369 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -731,7 +731,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  	case APIC_DM_INIT:
>  		if (!trig_mode || level) {
>  			result = 1;
> -			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
> +			/* assumes that there are only KVM_APIC_INIT/SIPI */
> +			apic->pending_events = (1UL << KVM_APIC_INIT);
> +			/* make sure pending_events is visible before sending
> +			 * the request */
> +			smp_wmb();
>  			kvm_make_request(KVM_REQ_EVENT, vcpu);
>  			kvm_vcpu_kick(vcpu);
>  		} else {
> @@ -743,13 +747,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  	case APIC_DM_STARTUP:
>  		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
>  			   vcpu->vcpu_id, vector);
> -		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> -			result = 1;
> -			vcpu->arch.sipi_vector = vector;
> -			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
> -			kvm_vcpu_kick(vcpu);
> -		}
> +		result = 1;
> +		apic->sipi_vector = vector;
> +		/* make sure sipi_vector is visible for the receiver */
> +		smp_wmb();
> +		set_bit(KVM_APIC_SIPI, &apic->pending_events);
> +		kvm_make_request(KVM_REQ_EVENT, vcpu);
> +		kvm_vcpu_kick(vcpu);
>  		break;
>  
>  	case APIC_DM_EXTINT:
> @@ -1860,6 +1864,34 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
>  					 addr);
>  }
>  
> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +	unsigned int sipi_vector;
> +
> +	if (!kvm_vcpu_has_lapic(vcpu))
> +		return;
> +
> +	if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
> +		kvm_lapic_reset(vcpu);
> +		kvm_vcpu_reset(vcpu);
> +		if (kvm_vcpu_is_bsp(apic->vcpu))
> +			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> +		else
> +			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
> +	}
> +	if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) &&
> +	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> +		/* evaluate pending_events before reading the vector */
> +		smp_rmb();
> +		sipi_vector = apic->sipi_vector;
> +		pr_debug("vcpu %d received sipi with vector # %x\n",
> +			 vcpu->vcpu_id, sipi_vector);
> +		kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
> +		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> +	}
> +}
> +
>  void kvm_lapic_init(void)
>  {
>  	/* do not patch jump label more than once per second */
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 1676d34..2c721b9 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -5,6 +5,9 @@
>  
>  #include <linux/kvm_host.h>
>  
> +#define KVM_APIC_INIT		0
> +#define KVM_APIC_SIPI		1
> +
>  struct kvm_timer {
>  	struct hrtimer timer;
>  	s64 period; 				/* unit: ns */
> @@ -32,6 +35,8 @@ struct kvm_lapic {
>  	void *regs;
>  	gpa_t vapic_addr;
>  	struct page *vapic_page;
> +	unsigned long pending_events;
> +	unsigned int sipi_vector;
>  };
>  int kvm_create_lapic(struct kvm_vcpu *vcpu);
>  void kvm_free_lapic(struct kvm_vcpu *vcpu);
> @@ -39,6 +44,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
>  int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
>  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
>  int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
>  void kvm_lapic_reset(struct kvm_vcpu *vcpu);
>  u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
>  void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
> @@ -158,4 +164,9 @@ void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
>  				struct kvm_lapic_irq *irq,
>  				u64 *eoi_bitmap);
>  
> +static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.apic->pending_events;
> +}
> +
>  #endif
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 907e428..7219a40 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -1199,12 +1199,6 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
>  
>  	init_vmcb(svm);
>  
> -	if (!kvm_vcpu_is_bsp(vcpu)) {
> -		kvm_rip_write(vcpu, 0);
> -		svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
> -		svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
> -	}
> -
>  	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
>  	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
>  }
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index f17cd2a..b73989d 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -4119,12 +4119,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
>  	vmx_segment_cache_clear(vmx);
>  
>  	seg_setup(VCPU_SREG_CS);
> -	if (kvm_vcpu_is_bsp(&vmx->vcpu))
> -		vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
> -	else {
> -		vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
> -		vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
> -	}
> +	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
>  
>  	seg_setup(VCPU_SREG_DS);
>  	seg_setup(VCPU_SREG_ES);
> @@ -4147,10 +4142,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
>  	vmcs_writel(GUEST_SYSENTER_EIP, 0);
>  
>  	vmcs_writel(GUEST_RFLAGS, 0x02);
> -	if (kvm_vcpu_is_bsp(&vmx->vcpu))
> -		kvm_rip_write(vcpu, 0xfff0);
> -	else
> -		kvm_rip_write(vcpu, 0);
> +	kvm_rip_write(vcpu, 0xfff0);
>  
>  	vmcs_writel(GUEST_GDTR_BASE, 0);
>  	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b891ac3..a7e51ae 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0;
>  
>  static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
>  
> -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
> -
>  static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
>  {
>  	int i;
> @@ -2823,10 +2821,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
>  	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
>  	events->nmi.pad = 0;
>  
> -	events->sipi_vector = vcpu->arch.sipi_vector;
> +	events->sipi_vector = 0; /* never valid when reporting to user space */
>  
>  	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
> -			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
>  			 | KVM_VCPUEVENT_VALID_SHADOW);
>  	memset(&events->reserved, 0, sizeof(events->reserved));
>  }
> @@ -2857,8 +2854,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
>  		vcpu->arch.nmi_pending = events->nmi.pending;
>  	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
>  
> -	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
> -		vcpu->arch.sipi_vector = events->sipi_vector;
> +	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
> +	    kvm_vcpu_has_lapic(vcpu))
> +		vcpu->arch.apic->sipi_vector = events->sipi_vector;
>  
>  	kvm_make_request(KVM_REQ_EVENT, vcpu);
>  
> @@ -5713,6 +5711,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  	}
>  
>  	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> +		kvm_apic_accept_events(vcpu);
> +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> +			r = 1;
> +			goto out;
> +		}
> +
>  		inject_pending_event(vcpu);
>  
>  		/* enable NMI/IRQ window open exits if needed */
> @@ -5847,14 +5851,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
>  	int r;
>  	struct kvm *kvm = vcpu->kvm;
>  
> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
> -		pr_debug("vcpu %d received sipi with vector # %x\n",
> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
> -		kvm_lapic_reset(vcpu);
> -		kvm_vcpu_reset(vcpu);
> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> -	}
> -
>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>  	r = vapic_enter(vcpu);
>  	if (r) {
> @@ -5871,8 +5867,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
>  			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
>  			kvm_vcpu_block(vcpu);
>  			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> -			if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
> -			{
> +			if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
> +				kvm_apic_accept_events(vcpu);
>  				switch(vcpu->arch.mp_state) {
>  				case KVM_MP_STATE_HALTED:
>  					vcpu->arch.mp_state =
> @@ -5880,7 +5876,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
>  				case KVM_MP_STATE_RUNNABLE:
>  					vcpu->arch.apf.halted = false;
>  					break;
> -				case KVM_MP_STATE_SIPI_RECEIVED:
> +				case KVM_MP_STATE_INIT_RECEIVED:
> +					break;
>  				default:
>  					r = -EINTR;
>  					break;
> @@ -6015,6 +6012,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
>  
>  	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
>  		kvm_vcpu_block(vcpu);
> +		kvm_apic_accept_events(vcpu);
>  		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
>  		r = -EAGAIN;
>  		goto out;
> @@ -6171,6 +6169,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
>  int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
>  				    struct kvm_mp_state *mp_state)
>  {
> +	kvm_apic_accept_events(vcpu);
>  	mp_state->mp_state = vcpu->arch.mp_state;
>  	return 0;
>  }
> @@ -6178,7 +6177,15 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
>  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
>  				    struct kvm_mp_state *mp_state)
>  {
> -	vcpu->arch.mp_state = mp_state->mp_state;
> +	if (!kvm_vcpu_has_lapic(vcpu) &&
> +	    mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
> +		return -EINVAL;
> +
> +	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
> +		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
> +		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
> +	} else
> +		vcpu->arch.mp_state = mp_state->mp_state;
>  	kvm_make_request(KVM_REQ_EVENT, vcpu);
>  	return 0;
>  }
> @@ -6515,7 +6522,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
>  	kvm_x86_ops->vcpu_free(vcpu);
>  }
>  
> -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
> +void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
>  {
>  	atomic_set(&vcpu->arch.nmi_queued, 0);
>  	vcpu->arch.nmi_pending = 0;
> @@ -6545,6 +6552,17 @@ static void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
>  	kvm_x86_ops->vcpu_reset(vcpu);
>  }
>  
> +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
> +{
> +	struct kvm_segment cs;
> +
> +	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
> +	cs.selector = vector << 8;
> +	cs.base = vector << 12;
> +	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
> +	kvm_rip_write(vcpu, 0);
> +}
> +
>  int kvm_arch_hardware_enable(void *garbage)
>  {
>  	struct kvm *kvm;
> @@ -6988,7 +7006,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
>  	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
>  		!vcpu->arch.apf.halted)
>  		|| !list_empty_careful(&vcpu->async_pf.done)
> -		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
> +		|| kvm_apic_has_events(vcpu)
>  		|| atomic_read(&vcpu->arch.nmi_queued) ||
>  		(kvm_arch_interrupt_allowed(vcpu) &&
>  		 kvm_cpu_has_interrupt(vcpu));
> -- 
> 1.7.3.4

--
			Gleb.
Marcelo Tosatti March 14, 2013, 12:08 a.m. UTC | #2
On Wed, Mar 13, 2013 at 12:42:34PM +0100, Jan Kiszka wrote:
> A VCPU sending INIT or SIPI to some other VCPU races for setting the
> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
> was overwritten by kvm_emulate_halt and, thus, got lost.
> 
> This introduces APIC events for those two signals, keeping them in
> kvm_apic until kvm_apic_accept_events is run over the target vcpu
> context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there
> are pending events, thus if vcpu blocking should end.
> 
> The patch comes with the side effect of effectively obsoleting
> KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but
> immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI.
> The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED
> state. That also means we no longer exit to user space after receiving a
> SIPI event.
> 
> Furthermore, we already reset the VCPU on INIT, only fixing up the code
> segment later on when SIPI arrives. Moreover, we fix INIT handling for
> the BSP: it never enter wait-for-SIPI but directly starts over on INIT.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
> 
> Changes in v2:
>  - drop cmpxchg from INIT signaling
>  - rmb on SIPI processing
>  - reset sets all CPUs to 0xf000:0xfff0, RIP is fixed up on SIPI reception
> 
>  arch/x86/include/asm/kvm_host.h |    3 +-
>  arch/x86/kvm/lapic.c            |   48 +++++++++++++++++++++++++++-----
>  arch/x86/kvm/lapic.h            |   11 +++++++
>  arch/x86/kvm/svm.c              |    6 ----
>  arch/x86/kvm/vmx.c              |   12 +-------
>  arch/x86/kvm/x86.c              |   58 +++++++++++++++++++++++++-------------
>  6 files changed, 93 insertions(+), 45 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 348d859..ef7f4a5 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -345,7 +345,6 @@ struct kvm_vcpu_arch {
>  	unsigned long apic_attention;
>  	int32_t apic_arb_prio;
>  	int mp_state;
> -	int sipi_vector;
>  	u64 ia32_misc_enable_msr;
>  	bool tpr_access_reporting;
>  
> @@ -819,6 +818,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
>  
>  void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
>  int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
> +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector);
>  
>  int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
>  		    int reason, bool has_error_code, u32 error_code);
> @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
>  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
>  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
>  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
> +void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
>  
>  void kvm_define_shared_msr(unsigned index, u32 msr);
>  void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 02b51dd..a8e9369 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -731,7 +731,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  	case APIC_DM_INIT:
>  		if (!trig_mode || level) {
>  			result = 1;
> -			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
> +			/* assumes that there are only KVM_APIC_INIT/SIPI */
> +			apic->pending_events = (1UL << KVM_APIC_INIT);
> +			/* make sure pending_events is visible before sending
> +			 * the request */
> +			smp_wmb();
>  			kvm_make_request(KVM_REQ_EVENT, vcpu);
>  			kvm_vcpu_kick(vcpu);
>  		} else {
> @@ -743,13 +747,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  	case APIC_DM_STARTUP:
>  		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
>  			   vcpu->vcpu_id, vector);
> -		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> -			result = 1;
> -			vcpu->arch.sipi_vector = vector;
> -			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
> -			kvm_vcpu_kick(vcpu);
> -		}
> +		result = 1;
> +		apic->sipi_vector = vector;
> +		/* make sure sipi_vector is visible for the receiver */
> +		smp_wmb();
> +		set_bit(KVM_APIC_SIPI, &apic->pending_events);
> +		kvm_make_request(KVM_REQ_EVENT, vcpu);
> +		kvm_vcpu_kick(vcpu);

Why are APIC_DM_STARTUP / APIC_DM_INIT setting KVM_REQ_EVENT again?
Can't see any direct connection. See below.

> +static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.apic->pending_events;
> +}

vcpu->arch.apic = NULL?

>  	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> +		kvm_apic_accept_events(vcpu);
> +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> +			r = 1;
> +			goto out;
> +		}
> +

A separate request bit makes sense, because nothing in this
(KVM_REQ_EVENT conditional) code sequence handles the work from
APIC_DM_STARTUP / APIC_DM_INIT sites (well, before your patch). See
below.

>  
> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
> -		pr_debug("vcpu %d received sipi with vector # %x\n",
> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
> -		kvm_lapic_reset(vcpu);
> -		kvm_vcpu_reset(vcpu);
> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> -	}
> -
>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>  	r = vapic_enter(vcpu);

vmx_vcpu_reset overwrites vcpu->srcu_idx if ->vcpu_reset is called from
within srcu section.

Also, this is not part of the hotpath and therefore it could be farther
from vcpu_enter_guest. What about processing a new request bit here?
(KVM_REQ_EVENT <-> apic->pending_events relationship is cloudy).

Also the fact kvm_apic_accept_events() is called from sites is annoying,
why is that necessary again?
Marcelo Tosatti March 14, 2013, 2:10 a.m. UTC | #3
> vmx_vcpu_reset overwrites vcpu->srcu_idx if ->vcpu_reset is called from
> within srcu section.
> 
> Also, this is not part of the hotpath and therefore it could be farther
> from vcpu_enter_guest. What about processing a new request bit here?
> (KVM_REQ_EVENT <-> apic->pending_events relationship is cloudy).
> 
> Also the fact kvm_apic_accept_events() is called from  sites is annoying,
> why is that necessary again?

Multiple sites, I meant. It would be nicer if it were called from one site.

Jan Kiszka March 14, 2013, 10:15 a.m. UTC | #4
On 2013-03-14 01:08, Marcelo Tosatti wrote:
> On Wed, Mar 13, 2013 at 12:42:34PM +0100, Jan Kiszka wrote:
>> A VCPU sending INIT or SIPI to some other VCPU races for setting the
>> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
>> was overwritten by kvm_emulate_halt and, thus, got lost.
>>
>> This introduces APIC events for those two signals, keeping them in
>> kvm_apic until kvm_apic_accept_events is run over the target vcpu
>> context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there
>> are pending events, thus if vcpu blocking should end.
>>
>> The patch comes with the side effect of effectively obsoleting
>> KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but
>> immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI.
>> The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED
>> state. That also means we no longer exit to user space after receiving a
>> SIPI event.
>>
>> Furthermore, we already reset the VCPU on INIT, only fixing up the code
>> segment later on when SIPI arrives. Moreover, we fix INIT handling for
>> the BSP: it never enter wait-for-SIPI but directly starts over on INIT.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>>
>> Changes in v2:
>>  - drop cmpxchg from INIT signaling
>>  - rmb on SIPI processing
>>  - reset sets all CPUs to 0xf000:0xfff0, RIP is fixed up on SIPI reception
>>
>>  arch/x86/include/asm/kvm_host.h |    3 +-
>>  arch/x86/kvm/lapic.c            |   48 +++++++++++++++++++++++++++-----
>>  arch/x86/kvm/lapic.h            |   11 +++++++
>>  arch/x86/kvm/svm.c              |    6 ----
>>  arch/x86/kvm/vmx.c              |   12 +-------
>>  arch/x86/kvm/x86.c              |   58 +++++++++++++++++++++++++-------------
>>  6 files changed, 93 insertions(+), 45 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 348d859..ef7f4a5 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -345,7 +345,6 @@ struct kvm_vcpu_arch {
>>  	unsigned long apic_attention;
>>  	int32_t apic_arb_prio;
>>  	int mp_state;
>> -	int sipi_vector;
>>  	u64 ia32_misc_enable_msr;
>>  	bool tpr_access_reporting;
>>  
>> @@ -819,6 +818,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
>>  
>>  void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
>>  int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
>> +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector);
>>  
>>  int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
>>  		    int reason, bool has_error_code, u32 error_code);
>> @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
>>  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
>>  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
>>  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
>> +void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
>>  
>>  void kvm_define_shared_msr(unsigned index, u32 msr);
>>  void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index 02b51dd..a8e9369 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -731,7 +731,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>  	case APIC_DM_INIT:
>>  		if (!trig_mode || level) {
>>  			result = 1;
>> -			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
>> +			/* assumes that there are only KVM_APIC_INIT/SIPI */
>> +			apic->pending_events = (1UL << KVM_APIC_INIT);
>> +			/* make sure pending_events is visible before sending
>> +			 * the request */
>> +			smp_wmb();
>>  			kvm_make_request(KVM_REQ_EVENT, vcpu);
>>  			kvm_vcpu_kick(vcpu);
>>  		} else {
>> @@ -743,13 +747,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>  	case APIC_DM_STARTUP:
>>  		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
>>  			   vcpu->vcpu_id, vector);
>> -		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
>> -			result = 1;
>> -			vcpu->arch.sipi_vector = vector;
>> -			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
>> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
>> -			kvm_vcpu_kick(vcpu);
>> -		}
>> +		result = 1;
>> +		apic->sipi_vector = vector;
>> +		/* make sure sipi_vector is visible for the receiver */
>> +		smp_wmb();
>> +		set_bit(KVM_APIC_SIPI, &apic->pending_events);
>> +		kvm_make_request(KVM_REQ_EVENT, vcpu);
>> +		kvm_vcpu_kick(vcpu);
> 
> Why are APIC_DM_STARTUP / APIC_DM_INIT setting KVM_REQ_EVENT again?
> Can't see any direct connection. See below.

Changing the mp_state may unblock pending events (IRQs, NMIs), I think
that is the original reason.

> 
>> +static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
>> +{
>> +	return vcpu->arch.apic->pending_events;
>> +}
> 
> vcpu->arch.apic = NULL?

Not possible for the caller of this function.

> 
>>  	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
>> +		kvm_apic_accept_events(vcpu);
>> +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
>> +			r = 1;
>> +			goto out;
>> +		}
>> +
> 
> A separate request bit makes sense, because nothing in this
> (KVM_REQ_EVENT conditional) code sequence handles the work from
> APIC_DM_STARTUP / APIC_DM_INIT sites (well, before your patch). See
> below.

OK. Was told to avoid new request bits.

> 
>>  
>> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
>> -		pr_debug("vcpu %d received sipi with vector # %x\n",
>> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
>> -		kvm_lapic_reset(vcpu);
>> -		kvm_vcpu_reset(vcpu);
>> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>> -	}
>> -
>>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>>  	r = vapic_enter(vcpu);
> 
> vmx_vcpu_reset overwrites vcpu->srcu_idx if ->vcpu_reset is called from
> within srcu section.

Indeed.

Do you know what the lock over vmx_set_cr0 actually protects?

> 
> Also, this is not part of the hotpath and therefore it could be farther
> from vcpu_enter_guest. What about processing a new request bit here?
> (KVM_REQ_EVENT <-> apic->pending_events relationship is cloudy).

I can pull the check from vcpu_enter_guest out at this level again, but
then we will generate the user space exits again.

> 
> Also the fact kvm_apic_accept_events() is called from sites is annoying,
> why is that necessary again?

- to avoid having pending events in the APIC when delivering mp_state
  to user space
- to keep mp_state correct after waking up from kvm_vcpu_block (we
  could otherwise walk through code paths with the wrong state)
- to process pending events when the VCPU was running

Jan
Gleb Natapov March 14, 2013, 11:24 a.m. UTC | #5
On Thu, Mar 14, 2013 at 11:15:02AM +0100, Jan Kiszka wrote:
> On 2013-03-14 01:08, Marcelo Tosatti wrote:
> > On Wed, Mar 13, 2013 at 12:42:34PM +0100, Jan Kiszka wrote:
> >> A VCPU sending INIT or SIPI to some other VCPU races for setting the
> >> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
> >> was overwritten by kvm_emulate_halt and, thus, got lost.
> >>
> >> This introduces APIC events for those two signals, keeping them in
> >> kvm_apic until kvm_apic_accept_events is run over the target vcpu
> >> context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there
> >> are pending events, thus if vcpu blocking should end.
> >>
> >> The patch comes with the side effect of effectively obsoleting
> >> KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but
> >> immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI.
> >> The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED
> >> state. That also means we no longer exit to user space after receiving a
> >> SIPI event.
> >>
> >> Furthermore, we already reset the VCPU on INIT, only fixing up the code
> >> segment later on when SIPI arrives. Moreover, we fix INIT handling for
> >> the BSP: it never enter wait-for-SIPI but directly starts over on INIT.
> >>
> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >> ---
> >>
> >> Changes in v2:
> >>  - drop cmpxchg from INIT signaling
> >>  - rmb on SIPI processing
> >>  - reset sets all CPUs to 0xf000:0xfff0, RIP is fixed up on SIPI reception
> >>
> >>  arch/x86/include/asm/kvm_host.h |    3 +-
> >>  arch/x86/kvm/lapic.c            |   48 +++++++++++++++++++++++++++-----
> >>  arch/x86/kvm/lapic.h            |   11 +++++++
> >>  arch/x86/kvm/svm.c              |    6 ----
> >>  arch/x86/kvm/vmx.c              |   12 +-------
> >>  arch/x86/kvm/x86.c              |   58 +++++++++++++++++++++++++-------------
> >>  6 files changed, 93 insertions(+), 45 deletions(-)
> >>
> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >> index 348d859..ef7f4a5 100644
> >> --- a/arch/x86/include/asm/kvm_host.h
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -345,7 +345,6 @@ struct kvm_vcpu_arch {
> >>  	unsigned long apic_attention;
> >>  	int32_t apic_arb_prio;
> >>  	int mp_state;
> >> -	int sipi_vector;
> >>  	u64 ia32_misc_enable_msr;
> >>  	bool tpr_access_reporting;
> >>  
> >> @@ -819,6 +818,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
> >>  
> >>  void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
> >>  int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
> >> +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector);
> >>  
> >>  int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
> >>  		    int reason, bool has_error_code, u32 error_code);
> >> @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
> >>  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
> >>  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
> >>  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
> >> +void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
> >>  
> >>  void kvm_define_shared_msr(unsigned index, u32 msr);
> >>  void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
> >> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> >> index 02b51dd..a8e9369 100644
> >> --- a/arch/x86/kvm/lapic.c
> >> +++ b/arch/x86/kvm/lapic.c
> >> @@ -731,7 +731,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
> >>  	case APIC_DM_INIT:
> >>  		if (!trig_mode || level) {
> >>  			result = 1;
> >> -			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
> >> +			/* assumes that there are only KVM_APIC_INIT/SIPI */
> >> +			apic->pending_events = (1UL << KVM_APIC_INIT);
> >> +			/* make sure pending_events is visible before sending
> >> +			 * the request */
> >> +			smp_wmb();
> >>  			kvm_make_request(KVM_REQ_EVENT, vcpu);
> >>  			kvm_vcpu_kick(vcpu);
> >>  		} else {
> >> @@ -743,13 +747,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
> >>  	case APIC_DM_STARTUP:
> >>  		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
> >>  			   vcpu->vcpu_id, vector);
> >> -		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> >> -			result = 1;
> >> -			vcpu->arch.sipi_vector = vector;
> >> -			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
> >> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
> >> -			kvm_vcpu_kick(vcpu);
> >> -		}
> >> +		result = 1;
> >> +		apic->sipi_vector = vector;
> >> +		/* make sure sipi_vector is visible for the receiver */
> >> +		smp_wmb();
> >> +		set_bit(KVM_APIC_SIPI, &apic->pending_events);
> >> +		kvm_make_request(KVM_REQ_EVENT, vcpu);
> >> +		kvm_vcpu_kick(vcpu);
> > 
> > Why are APIC_DM_STARTUP / APIC_DM_INIT setting KVM_REQ_EVENT again?
> > Can't see any direct connection. See below.
> 
> Changing the mp_state may unblock pending events (IRQs, NMIs), I think
> that is the original reason.
> 
> > 
> >> +static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
> >> +{
> >> +	return vcpu->arch.apic->pending_events;
> >> +}
> > 
> > vcpu->arch.apic = NULL?
> 
> Not possible for the caller of this function.
> 
> > 
> >>  	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> >> +		kvm_apic_accept_events(vcpu);
> >> +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> >> +			r = 1;
> >> +			goto out;
> >> +		}
> >> +
> > 
> > A separate request bit makes sense, because nothing in this
> > (KVM_REQ_EVENT conditional) code sequence handles the work from
> > APIC_DM_STARTUP / APIC_DM_INIT sites (well, before your patch). See
> > below.
> 
> OK. Was told to avoid new request bits.
> 
KVM_REQ_EVENT guards processing of any external events. INIT/SIPI are
that kind of event. By using a separate request bit we would save one
if() and one function call during each event processing. Not sure if it
is worth it, but I do not mind.
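
To make the trade-off concrete, a rough sketch of the dedicated-bit
variant being discussed could look like the fragment below.
KVM_REQ_APIC_EVENTS is an invented name for illustration only; the
helpers are the ones already used in the patch, and this is not what
was applied.

/* __apic_accept_irq(), sender side -- hypothetical KVM_REQ_APIC_EVENTS
 * instead of KVM_REQ_EVENT: */
	apic->sipi_vector = vector;
	/* make sure sipi_vector is visible before the request bit */
	smp_wmb();
	set_bit(KVM_APIC_SIPI, &apic->pending_events);
	kvm_make_request(KVM_REQ_APIC_EVENTS, vcpu);
	kvm_vcpu_kick(vcpu);

/* vcpu_enter_guest(), receiver side -- checked before the KVM_REQ_EVENT
 * block, so plain event injection no longer pays for the extra call: */
	if (kvm_check_request(KVM_REQ_APIC_EVENTS, vcpu)) {
		kvm_apic_accept_events(vcpu);
		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
			r = 1;
			goto out;
		}
	}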

> > 
> >>  
> >> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
> >> -		pr_debug("vcpu %d received sipi with vector # %x\n",
> >> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
> >> -		kvm_lapic_reset(vcpu);
> >> -		kvm_vcpu_reset(vcpu);
> >> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> >> -	}
> >> -
> >>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> >>  	r = vapic_enter(vcpu);
> > 
> > vmx_vcpu_reset overwrites vcpu->srcu_idx if ->vcpu_reset is called from
> > within srcu section.
> 
> Indeed.
> 
> Do you know what the look over vmx_set_cr0 actually protects?
Setting up the real-mode TSS

> 
> > 
> > Also, this is not part of the hotpath and therefore it could be farther
> > from vcpu_enter_guest. What about processing a new request bit here?
> > (KVM_REQ_EVENT <-> apic->pending_events relationship is cloudy).
> 
> I can pull the check from vcpu_enter_guest out at this level again, but
> then we will generate the user space exits again.
> 
> > 
> > Also the fact kvm_apic_accept_events() is called from sites is annoying,
> > why is that necessary again?
> 
> - to avoid having pending events in the APIC when delivering mp_state
>   to user space
> - to keep mp_state correct after waking up from kvm_vcpu_block (we
>   could otherwise walk through code paths with the wrong state)
> - to process pending events when the VCPU was running
> 
> Jan
> 
> -- 
> Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
> Corporate Competence Center Embedded Linux

--
			Gleb.
Jan Kiszka March 14, 2013, 11:29 a.m. UTC | #6
On 2013-03-14 11:15, Jan Kiszka wrote:
>>>  
>>> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
>>> -		pr_debug("vcpu %d received sipi with vector # %x\n",
>>> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
>>> -		kvm_lapic_reset(vcpu);
>>> -		kvm_vcpu_reset(vcpu);
>>> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>>> -	}
>>> -
>>>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>>>  	r = vapic_enter(vcpu);
>>
>> vmx_vcpu_reset overwrites vcpu->srcu_idx if ->vcpu_reset is called from
>> within srcu section.
> 
> Indeed.
> 
> Do you know what the look over vmx_set_cr0 actually protects?

Found it: It's not actually protecting anything there. enter_rmode is
called, and it assumes that lock is held. If enter_rmode faces an
uninitialized TSS, it drops the lock before calling vmx_set_tss_addr.

Well, I wonder if that is a good place to fix the TSS issue. Why not
make that special case (lacking KVM_SET_TSS_ADDR before first KVM_RUN) a
static jump key and check for it on KVM_RUN?

Jan
Gleb Natapov March 14, 2013, 12:12 p.m. UTC | #7
On Thu, Mar 14, 2013 at 12:29:48PM +0100, Jan Kiszka wrote:
> On 2013-03-14 11:15, Jan Kiszka wrote:
> >>>  
> >>> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
> >>> -		pr_debug("vcpu %d received sipi with vector # %x\n",
> >>> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
> >>> -		kvm_lapic_reset(vcpu);
> >>> -		kvm_vcpu_reset(vcpu);
> >>> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> >>> -	}
> >>> -
> >>>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> >>>  	r = vapic_enter(vcpu);
> >>
> >> vmx_vcpu_reset overwrites vcpu->srcu_idx if ->vcpu_reset is called from
> >> within srcu section.
> > 
> > Indeed.
> > 
> > Do you know what the look over vmx_set_cr0 actually protects?
> 
> Found it: It's not actually protecting anything. enter_rmode is called,
> and that assumes that lock to be held. If enter_rmode faces an
> uninitialized tss, it drops the lock before calling vmx_set_tss_addr.
> 
> Well, I wonder if that is a good place to fix the TSS issue. Why not
> make that special case (lacking KVM_SET_TSS_ADDR before first KVM_RUN) a
> static jump key and check for it on KVM_RUN?
> 
Or finally break userspace that does not set it before calling kvm_run.
I haven't seen people complain about "kvm: KVM_SET_TSS_ADDR need to be
called before entering vcpu" warning in dmesg. Or create TSS mem slot at
0xfeffd000 during VM creation and destroy it if userspace overwrites it.

--
			Gleb.
Jan Kiszka March 14, 2013, 12:16 p.m. UTC | #8
On 2013-03-14 13:12, Gleb Natapov wrote:
> On Thu, Mar 14, 2013 at 12:29:48PM +0100, Jan Kiszka wrote:
>> On 2013-03-14 11:15, Jan Kiszka wrote:
>>>>>  
>>>>> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
>>>>> -		pr_debug("vcpu %d received sipi with vector # %x\n",
>>>>> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
>>>>> -		kvm_lapic_reset(vcpu);
>>>>> -		kvm_vcpu_reset(vcpu);
>>>>> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>>>>> -	}
>>>>> -
>>>>>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>>>>>  	r = vapic_enter(vcpu);
>>>>
>>>> vmx_vcpu_reset overwrites vcpu->srcu_idx if ->vcpu_reset is called from
>>>> within srcu section.
>>>
>>> Indeed.
>>>
>>> Do you know what the look over vmx_set_cr0 actually protects?
>>
>> Found it: It's not actually protecting anything. enter_rmode is called,
>> and that assumes that lock to be held. If enter_rmode faces an
>> uninitialized tss, it drops the lock before calling vmx_set_tss_addr.
>>
>> Well, I wonder if that is a good place to fix the TSS issue. Why not
>> make that special case (lacking KVM_SET_TSS_ADDR before first KVM_RUN) a
>> static jump key and check for it on KVM_RUN?
>>
> Or finally break userspace that does not set it before calling kvm_run.
> I haven't seen people complain about "kvm: KVM_SET_TSS_ADDR need to be
> called before entering vcpu" warning in dmesg. Or create TSS mem slot at
> 0xfeffd000 during VM creation and destroy it if userspace overwrites it.

Whatever is preferred, I'm not able to decide (about ABI "breakage"
specifically). I just think any of them would be better than pulling
vcpu_reset out of the inner loop again just to fulfill the locking
requirements.

Jan
Gleb Natapov March 14, 2013, 12:18 p.m. UTC | #9
On Thu, Mar 14, 2013 at 01:16:41PM +0100, Jan Kiszka wrote:
> On 2013-03-14 13:12, Gleb Natapov wrote:
> > On Thu, Mar 14, 2013 at 12:29:48PM +0100, Jan Kiszka wrote:
> >> On 2013-03-14 11:15, Jan Kiszka wrote:
> >>>>>  
> >>>>> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
> >>>>> -		pr_debug("vcpu %d received sipi with vector # %x\n",
> >>>>> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
> >>>>> -		kvm_lapic_reset(vcpu);
> >>>>> -		kvm_vcpu_reset(vcpu);
> >>>>> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> >>>>> -	}
> >>>>> -
> >>>>>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> >>>>>  	r = vapic_enter(vcpu);
> >>>>
> >>>> vmx_vcpu_reset overwrites vcpu->srcu_idx if ->vcpu_reset is called from
> >>>> within srcu section.
> >>>
> >>> Indeed.
> >>>
> >>> Do you know what the look over vmx_set_cr0 actually protects?
> >>
> >> Found it: It's not actually protecting anything. enter_rmode is called,
> >> and that assumes that lock to be held. If enter_rmode faces an
> >> uninitialized tss, it drops the lock before calling vmx_set_tss_addr.
> >>
> >> Well, I wonder if that is a good place to fix the TSS issue. Why not
> >> make that special case (lacking KVM_SET_TSS_ADDR before first KVM_RUN) a
> >> static jump key and check for it on KVM_RUN?
> >>
> > Or finally break userspace that does not set it before calling kvm_run.
> > I haven't seen people complain about "kvm: KVM_SET_TSS_ADDR need to be
> > called before entering vcpu" warning in dmesg. Or create TSS mem slot at
> > 0xfeffd000 during VM creation and destroy it if userspace overwrites it.
> 
> Whatever is preferred, I'm not able to decide (about ABI "breakage"
> specifically). I just think any of them would be better than pulling
> vcpu_reset out of the inner loop again just to fulfill the locking
> requirements.
> 
I agree. Let's try the second approach. Can you write a patch?

--
			Gleb.
Jan Kiszka March 14, 2013, 12:25 p.m. UTC | #10
On 2013-03-14 13:18, Gleb Natapov wrote:
> On Thu, Mar 14, 2013 at 01:16:41PM +0100, Jan Kiszka wrote:
>> On 2013-03-14 13:12, Gleb Natapov wrote:
>>> On Thu, Mar 14, 2013 at 12:29:48PM +0100, Jan Kiszka wrote:
>>>> On 2013-03-14 11:15, Jan Kiszka wrote:
>>>>>>>  
>>>>>>> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
>>>>>>> -		pr_debug("vcpu %d received sipi with vector # %x\n",
>>>>>>> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
>>>>>>> -		kvm_lapic_reset(vcpu);
>>>>>>> -		kvm_vcpu_reset(vcpu);
>>>>>>> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>>>>>>> -	}
>>>>>>> -
>>>>>>>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>>>>>>>  	r = vapic_enter(vcpu);
>>>>>>
>>>>>> vmx_vcpu_reset overwrites vcpu->srcu_idx if ->vcpu_reset is called from
>>>>>> within srcu section.
>>>>>
>>>>> Indeed.
>>>>>
>>>>> Do you know what the look over vmx_set_cr0 actually protects?
>>>>
>>>> Found it: It's not actually protecting anything. enter_rmode is called,
>>>> and that assumes that lock to be held. If enter_rmode faces an
>>>> uninitialized tss, it drops the lock before calling vmx_set_tss_addr.
>>>>
>>>> Well, I wonder if that is a good place to fix the TSS issue. Why not
>>>> make that special case (lacking KVM_SET_TSS_ADDR before first KVM_RUN) a
>>>> static jump key and check for it on KVM_RUN?
>>>>
>>> Or finally break userspace that does not set it before calling kvm_run.
>>> I haven't seen people complain about "kvm: KVM_SET_TSS_ADDR need to be
>>> called before entering vcpu" warning in dmesg. Or create TSS mem slot at
>>> 0xfeffd000 during VM creation and destroy it if userspace overwrites it.
>>
>> Whatever is preferred, I'm not able to decide (about ABI "breakage"
>> specifically). I just think any of them would be better than pulling
>> vcpu_reset out of the inner loop again just to fulfill the locking
>> requirements.
>>
> I agree. Lets try second approach. Can you write a patch?

Task queued for later today or tomorrow. I suppose you hold back this
patch here for now anyway.

Jan
Gleb Natapov March 14, 2013, 12:28 p.m. UTC | #11
On Thu, Mar 14, 2013 at 01:25:04PM +0100, Jan Kiszka wrote:
> On 2013-03-14 13:18, Gleb Natapov wrote:
> > On Thu, Mar 14, 2013 at 01:16:41PM +0100, Jan Kiszka wrote:
> >> On 2013-03-14 13:12, Gleb Natapov wrote:
> >>> On Thu, Mar 14, 2013 at 12:29:48PM +0100, Jan Kiszka wrote:
> >>>> On 2013-03-14 11:15, Jan Kiszka wrote:
> >>>>>>>  
> >>>>>>> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
> >>>>>>> -		pr_debug("vcpu %d received sipi with vector # %x\n",
> >>>>>>> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
> >>>>>>> -		kvm_lapic_reset(vcpu);
> >>>>>>> -		kvm_vcpu_reset(vcpu);
> >>>>>>> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> >>>>>>> -	}
> >>>>>>> -
> >>>>>>>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> >>>>>>>  	r = vapic_enter(vcpu);
> >>>>>>
> >>>>>> vmx_vcpu_reset overwrites vcpu->srcu_idx if ->vcpu_reset is called from
> >>>>>> within srcu section.
> >>>>>
> >>>>> Indeed.
> >>>>>
> >>>>> Do you know what the look over vmx_set_cr0 actually protects?
> >>>>
> >>>> Found it: It's not actually protecting anything. enter_rmode is called,
> >>>> and that assumes that lock to be held. If enter_rmode faces an
> >>>> uninitialized tss, it drops the lock before calling vmx_set_tss_addr.
> >>>>
> >>>> Well, I wonder if that is a good place to fix the TSS issue. Why not
> >>>> make that special case (lacking KVM_SET_TSS_ADDR before first KVM_RUN) a
> >>>> static jump key and check for it on KVM_RUN?
> >>>>
> >>> Or finally break userspace that does not set it before calling kvm_run.
> >>> I haven't seen people complain about "kvm: KVM_SET_TSS_ADDR need to be
> >>> called before entering vcpu" warning in dmesg. Or create TSS mem slot at
> >>> 0xfeffd000 during VM creation and destroy it if userspace overwrites it.
> >>
> >> Whatever is preferred, I'm not able to decide (about ABI "breakage"
> >> specifically). I just think any of them would be better than pulling
> >> vcpu_reset out of the inner loop again just to fulfill the locking
> >> requirements.
> >>
> > I agree. Lets try second approach. Can you write a patch?
> 
> Task queued for later today or tomorrow. I suppose you hold back this
> patch here for now anyway.
> 
The patch is already pushed to the queue. I prefer to apply a fix on
top, but if that will take time I can rebase the queue for now.

--
			Gleb.
Marcelo Tosatti March 14, 2013, 2:32 p.m. UTC | #12
On Thu, Mar 14, 2013 at 11:15:02AM +0100, Jan Kiszka wrote:
> On 2013-03-14 01:08, Marcelo Tosatti wrote:
> > On Wed, Mar 13, 2013 at 12:42:34PM +0100, Jan Kiszka wrote:
> >> A VCPU sending INIT or SIPI to some other VCPU races for setting the
> >> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
> >> was overwritten by kvm_emulate_halt and, thus, got lost.
> >>
> >> This introduces APIC events for those two signals, keeping them in
> >> kvm_apic until kvm_apic_accept_events is run over the target vcpu
> >> context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there
> >> are pending events, thus if vcpu blocking should end.
> >>
> >> The patch comes with the side effect of effectively obsoleting
> >> KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but
> >> immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI.
> >> The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED
> >> state. That also means we no longer exit to user space after receiving a
> >> SIPI event.
> >>
> >> Furthermore, we already reset the VCPU on INIT, only fixing up the code
> >> segment later on when SIPI arrives. Moreover, we fix INIT handling for
> >> the BSP: it never enter wait-for-SIPI but directly starts over on INIT.
> >>
> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >> ---
> >>
> >> Changes in v2:
> >>  - drop cmpxchg from INIT signaling
> >>  - rmb on SIPI processing
> >>  - reset sets all CPUs to 0xf000:0xfff0, RIP is fixed up on SIPI reception
> >>
> >>  arch/x86/include/asm/kvm_host.h |    3 +-
> >>  arch/x86/kvm/lapic.c            |   48 +++++++++++++++++++++++++++-----
> >>  arch/x86/kvm/lapic.h            |   11 +++++++
> >>  arch/x86/kvm/svm.c              |    6 ----
> >>  arch/x86/kvm/vmx.c              |   12 +-------
> >>  arch/x86/kvm/x86.c              |   58 +++++++++++++++++++++++++-------------
> >>  6 files changed, 93 insertions(+), 45 deletions(-)
> >>
> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >> index 348d859..ef7f4a5 100644
> >> --- a/arch/x86/include/asm/kvm_host.h
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -345,7 +345,6 @@ struct kvm_vcpu_arch {
> >>  	unsigned long apic_attention;
> >>  	int32_t apic_arb_prio;
> >>  	int mp_state;
> >> -	int sipi_vector;
> >>  	u64 ia32_misc_enable_msr;
> >>  	bool tpr_access_reporting;
> >>  
> >> @@ -819,6 +818,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
> >>  
> >>  void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
> >>  int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
> >> +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector);
> >>  
> >>  int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
> >>  		    int reason, bool has_error_code, u32 error_code);
> >> @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
> >>  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
> >>  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
> >>  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
> >> +void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
> >>  
> >>  void kvm_define_shared_msr(unsigned index, u32 msr);
> >>  void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
> >> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> >> index 02b51dd..a8e9369 100644
> >> --- a/arch/x86/kvm/lapic.c
> >> +++ b/arch/x86/kvm/lapic.c
> >> @@ -731,7 +731,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
> >>  	case APIC_DM_INIT:
> >>  		if (!trig_mode || level) {
> >>  			result = 1;
> >> -			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
> >> +			/* assumes that there are only KVM_APIC_INIT/SIPI */
> >> +			apic->pending_events = (1UL << KVM_APIC_INIT);
> >> +			/* make sure pending_events is visible before sending
> >> +			 * the request */
> >> +			smp_wmb();
> >>  			kvm_make_request(KVM_REQ_EVENT, vcpu);
> >>  			kvm_vcpu_kick(vcpu);
> >>  		} else {
> >> @@ -743,13 +747,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
> >>  	case APIC_DM_STARTUP:
> >>  		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
> >>  			   vcpu->vcpu_id, vector);
> >> -		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> >> -			result = 1;
> >> -			vcpu->arch.sipi_vector = vector;
> >> -			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
> >> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
> >> -			kvm_vcpu_kick(vcpu);
> >> -		}
> >> +		result = 1;
> >> +		apic->sipi_vector = vector;
> >> +		/* make sure sipi_vector is visible for the receiver */
> >> +		smp_wmb();
> >> +		set_bit(KVM_APIC_SIPI, &apic->pending_events);
> >> +		kvm_make_request(KVM_REQ_EVENT, vcpu);
> >> +		kvm_vcpu_kick(vcpu);
> > 
> > Why are APIC_DM_STARTUP / APIC_DM_INIT setting KVM_REQ_EVENT again?
> > Can't see any direct connection. See below.
> 
> Changing the mp_state may unblock pending events (IRQs, NMIs), I think
> that is the original reason.
>
> >> +static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
> >> +{
> >> +	return vcpu->arch.apic->pending_events;
> >> +}
> > 
> > vcpu->arch.apic = NULL?
> 
> Not possible for the caller of this function.
> 
> > 
> >>  	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> >> +		kvm_apic_accept_events(vcpu);
> >> +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> >> +			r = 1;
> >> +			goto out;
> >> +		}
> >> +
> > 
> > A separate request bit makes sense, because nothing in this
> > (KVM_REQ_EVENT conditional) code sequence handles the work from
> > APIC_DM_STARTUP / APIC_DM_INIT sites (well, before your patch). See
> > below.
> 
> OK. Was told to avoid new request bits.
> 
> > 
> >>  
> >> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
> >> -		pr_debug("vcpu %d received sipi with vector # %x\n",
> >> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
> >> -		kvm_lapic_reset(vcpu);
> >> -		kvm_vcpu_reset(vcpu);
> >> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> >> -	}
> >> -
> >>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> >>  	r = vapic_enter(vcpu);
> > 
> > vmx_vcpu_reset overwrites vcpu->srcu_idx if ->vcpu_reset is called from
> > within srcu section.
> 
> Indeed.
> 
> Do you know what the look over vmx_set_cr0 actually protects?

Memslot usage (enter_rmode), rmode_tss_base for example.

> > Also, this is not part of the hotpath and therefore it could be farther
> > from vcpu_enter_guest. What about processing a new request bit here?
> > (KVM_REQ_EVENT <-> apic->pending_events relationship is cloudy).
> 
> I can pull the check from vcpu_enter_guest out at this level again, but
> then we will generate the user space exits again.

It's a very slow path, right?

> > Also the fact kvm_apic_accept_events() is called from sites is annoying,
> > why is that necessary again?
> 
> - to avoid having pending events in the APIC when delivering mp_state
>   to user space
> - to keep mp_state correct after waking up from kvm_vcpu_block (we
>   could otherwise walk through code paths with the wrong state)
> - to process pending events when the VCPU was running
> 
> Jan

Ok, it's alright (multiple call sites). Perhaps the second and third
points can be collapsed into a single one if moved outside
vcpu_enter_guest.

Gleb Natapov March 14, 2013, 2:53 p.m. UTC | #13
On Thu, Mar 14, 2013 at 11:32:59AM -0300, Marcelo Tosatti wrote:
> > > Also the fact kvm_apic_accept_events() is called from sites is annoying,
> > > why is that necessary again?
> > 
> > - to avoid having pending events in the APIC when delivering mp_state
> >   to user space
> > - to keep mp_state correct after waking up from kvm_vcpu_block (we
> >   could otherwise walk through code paths with the wrong state)
> > - to process pending events when the VCPU was running
> > 
> > Jan
> 
> OK, it's alright (multiple call sites). Perhaps the second and third points
> could be collapsed into a single one if moved outside vcpu_enter_guest.

Let's not forget that the point of all these changes was to properly
handle INIT wrt nested guests. I wonder how implementing that will affect
all the call sites; I doubt we want to handle a nested vmexit at each one
of them.

--
			Gleb.
Jan Kiszka March 14, 2013, 3:32 p.m. UTC | #14
On 2013-03-14 13:28, Gleb Natapov wrote:
> On Thu, Mar 14, 2013 at 01:25:04PM +0100, Jan Kiszka wrote:
>> On 2013-03-14 13:18, Gleb Natapov wrote:
>>> On Thu, Mar 14, 2013 at 01:16:41PM +0100, Jan Kiszka wrote:
>>>> On 2013-03-14 13:12, Gleb Natapov wrote:
>>>>> On Thu, Mar 14, 2013 at 12:29:48PM +0100, Jan Kiszka wrote:
>>>>>> On 2013-03-14 11:15, Jan Kiszka wrote:
>>>>>>>>>  
>>>>>>>>> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
>>>>>>>>> -		pr_debug("vcpu %d received sipi with vector # %x\n",
>>>>>>>>> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
>>>>>>>>> -		kvm_lapic_reset(vcpu);
>>>>>>>>> -		kvm_vcpu_reset(vcpu);
>>>>>>>>> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>>>>>>>>> -	}
>>>>>>>>> -
>>>>>>>>>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>>>>>>>>>  	r = vapic_enter(vcpu);
>>>>>>>>
>>>>>>>> vmx_vcpu_reset overwrites vcpu->srcu_idx if ->vcpu_reset is called from
>>>>>>>> within srcu section.
>>>>>>>
>>>>>>> Indeed.
>>>>>>>
>>>>>>> Do you know what the lock over vmx_set_cr0 actually protects?
>>>>>>
>>>>>> Found it: It's not actually protecting anything. enter_rmode is called,
>>>>>> and that assumes the lock is held. If enter_rmode faces an
>>>>>> uninitialized TSS, it drops the lock before calling vmx_set_tss_addr.
>>>>>>
>>>>>> Well, I wonder if that is a good place to fix the TSS issue. Why not
>>>>>> make that special case (lacking KVM_SET_TSS_ADDR before first KVM_RUN) a
>>>>>> static jump key and check for it on KVM_RUN?
>>>>>>
>>>>> Or finally break userspace that does not set it before calling kvm_run.
>>>>> I haven't seen people complain about "kvm: KVM_SET_TSS_ADDR need to be
>>>>> called before entering vcpu" warning in dmesg. Or create TSS mem slot at
>>>>> 0xfeffd000 during VM creation and destroy it if userspace overwrites it.
>>>>
>>>> Whatever is preferred, I'm not able to decide (about ABI "breakage"
>>>> specifically). I just think any of them would be better than pulling
>>>> vcpu_reset out of the inner loop again just to fulfill the locking
>>>> requirements.
>>>>
>>> I agree. Let's try the second approach. Can you write a patch?
>>
>> Task queued for later today or tomorrow. I suppose you hold back this
>> patch here for now anyway.
>>
> The patch is pushed to the queue already; I prefer to apply the fix on top,
> but if it will take time I can rebase the queue for now.

OK, tried this approach, but I don't think it easily works: If we
unconditionally set the TSS, we can create conflicts with userland's
mapping if that is different, no?

That leaves us with dropping the fixup, just keeping the warning and letting
the guest crash. Or my static key idea.

Jan
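
For illustration only, here is a rough sketch of what "check for it on
KVM_RUN" could look like: resolve a missing KVM_SET_TSS_ADDR once on the
KVM_RUN entry path, before the srcu read-side section is taken, so
enter_rmode() never has to drop and re-take the lock. kvm->arch.tss_addr,
kvm_vm_ioctl_set_tss_addr() and the 0xfeffd000 default exist today; the
helper name, its placement and the plain flag test (standing in for the
static jump key mentioned above) are assumptions, not actual code.

/*
 * Hypothetical sketch: call this early in kvm_arch_vcpu_ioctl_run(),
 * before __vcpu_run() enters the srcu read-side critical section.
 * This is a VMX-only concern; the kvm_x86_ops indirection is omitted
 * for brevity.
 */
static int kvm_fixup_missing_tss(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;

	if (likely(kvm->arch.tss_addr))
		return 0;

	printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
		    "called before entering vcpu\n");
	/* fall back to the address today's in-kernel fixup uses */
	return kvm_vm_ioctl_set_tss_addr(kvm, 0xfeffd000);
}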
Gleb Natapov March 14, 2013, 3:46 p.m. UTC | #15
On Thu, Mar 14, 2013 at 04:32:16PM +0100, Jan Kiszka wrote:
> On 2013-03-14 13:28, Gleb Natapov wrote:
> > On Thu, Mar 14, 2013 at 01:25:04PM +0100, Jan Kiszka wrote:
> >> On 2013-03-14 13:18, Gleb Natapov wrote:
> >>> On Thu, Mar 14, 2013 at 01:16:41PM +0100, Jan Kiszka wrote:
> >>>> On 2013-03-14 13:12, Gleb Natapov wrote:
> >>>>> On Thu, Mar 14, 2013 at 12:29:48PM +0100, Jan Kiszka wrote:
> >>>>>> On 2013-03-14 11:15, Jan Kiszka wrote:
> >>>>>>>>>  
> >>>>>>>>> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
> >>>>>>>>> -		pr_debug("vcpu %d received sipi with vector # %x\n",
> >>>>>>>>> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
> >>>>>>>>> -		kvm_lapic_reset(vcpu);
> >>>>>>>>> -		kvm_vcpu_reset(vcpu);
> >>>>>>>>> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> >>>>>>>>> -	}
> >>>>>>>>> -
> >>>>>>>>>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> >>>>>>>>>  	r = vapic_enter(vcpu);
> >>>>>>>>
> >>>>>>>> vmx_vcpu_reset overwrites vcpu->srcu_idx if ->vcpu_reset is called from
> >>>>>>>> within srcu section.
> >>>>>>>
> >>>>>>> Indeed.
> >>>>>>>
> >>>>>>> Do you know what the lock over vmx_set_cr0 actually protects?
> >>>>>>
> >>>>>> Found it: It's not actually protecting anything. enter_rmode is called,
> >>>>>> and that assumes the lock is held. If enter_rmode faces an
> >>>>>> uninitialized TSS, it drops the lock before calling vmx_set_tss_addr.
> >>>>>>
> >>>>>> Well, I wonder if that is a good place to fix the TSS issue. Why not
> >>>>>> make that special case (lacking KVM_SET_TSS_ADDR before first KVM_RUN) a
> >>>>>> static jump key and check for it on KVM_RUN?
> >>>>>>
> >>>>> Or finally break userspace that does not set it before calling kvm_run.
> >>>>> I haven't seen people complain about "kvm: KVM_SET_TSS_ADDR need to be
> >>>>> called before entering vcpu" warning in dmesg. Or create TSS mem slot at
> >>>>> 0xfeffd000 during VM creation and destroy it if userspace overwrites it.
> >>>>
> >>>> Whatever is preferred, I'm not able to decide (about ABI "breakage"
> >>>> specifically). I just think any of them would be better than pulling
> >>>> vcpu_reset out of the inner loop again just to fulfill the locking
> >>>> requirements.
> >>>>
> >>> I agree. Let's try the second approach. Can you write a patch?
> >>
> >> Task queued for later today or tomorrow. I suppose you hold back this
> >> patch here for now anyway.
> >>
> > The patch is pushed to the queue already; I prefer to apply the fix on top,
> > but if it will take time I can rebase the queue for now.
> 
> OK, tried this approach, but I don't think it easily works: If we
> unconditionally set the TSS, we can create conflicts with userland's
> mapping if that is different, no?
> 
Yes, you are right. It is possible to add a hack that removes the TSS slot
if userspace adds an overlapping slot, but that would push the hack too far
into the common code.

> That leaves us with dropping the fixup, just keeping the warning and letting
> the guest crash. Or my static key idea.
> 
The warning has been there for a long time. I say let's drop the fixup and
keep the warning. The static key will be the backup plan. Marcelo, Paolo,
what do you think?

--
			Gleb.
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 348d859..ef7f4a5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -345,7 +345,6 @@  struct kvm_vcpu_arch {
 	unsigned long apic_attention;
 	int32_t apic_arb_prio;
 	int mp_state;
-	int sipi_vector;
 	u64 ia32_misc_enable_msr;
 	bool tpr_access_reporting;
 
@@ -819,6 +818,7 @@  int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
 
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector);
 
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
 		    int reason, bool has_error_code, u32 error_code);
@@ -1002,6 +1002,7 @@  int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
 
 void kvm_define_shared_msr(unsigned index, u32 msr);
 void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 02b51dd..a8e9369 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -731,7 +731,11 @@  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	case APIC_DM_INIT:
 		if (!trig_mode || level) {
 			result = 1;
-			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+			/* assumes that there are only KVM_APIC_INIT/SIPI */
+			apic->pending_events = (1UL << KVM_APIC_INIT);
+			/* make sure pending_events is visible before sending
+			 * the request */
+			smp_wmb();
 			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_vcpu_kick(vcpu);
 		} else {
@@ -743,13 +747,13 @@  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	case APIC_DM_STARTUP:
 		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
 			   vcpu->vcpu_id, vector);
-		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
-			result = 1;
-			vcpu->arch.sipi_vector = vector;
-			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
-			kvm_make_request(KVM_REQ_EVENT, vcpu);
-			kvm_vcpu_kick(vcpu);
-		}
+		result = 1;
+		apic->sipi_vector = vector;
+		/* make sure sipi_vector is visible for the receiver */
+		smp_wmb();
+		set_bit(KVM_APIC_SIPI, &apic->pending_events);
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		kvm_vcpu_kick(vcpu);
 		break;
 
 	case APIC_DM_EXTINT:
@@ -1860,6 +1864,34 @@  int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
 					 addr);
 }
 
+void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	unsigned int sipi_vector;
+
+	if (!kvm_vcpu_has_lapic(vcpu))
+		return;
+
+	if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
+		kvm_lapic_reset(vcpu);
+		kvm_vcpu_reset(vcpu);
+		if (kvm_vcpu_is_bsp(apic->vcpu))
+			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+		else
+			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+	}
+	if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) &&
+	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+		/* evaluate pending_events before reading the vector */
+		smp_rmb();
+		sipi_vector = apic->sipi_vector;
+		pr_debug("vcpu %d received sipi with vector # %x\n",
+			 vcpu->vcpu_id, sipi_vector);
+		kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
+		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	}
+}
+
 void kvm_lapic_init(void)
 {
 	/* do not patch jump label more than once per second */
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1676d34..2c721b9 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -5,6 +5,9 @@ 
 
 #include <linux/kvm_host.h>
 
+#define KVM_APIC_INIT		0
+#define KVM_APIC_SIPI		1
+
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period; 				/* unit: ns */
@@ -32,6 +35,8 @@  struct kvm_lapic {
 	void *regs;
 	gpa_t vapic_addr;
 	struct page *vapic_page;
+	unsigned long pending_events;
+	unsigned int sipi_vector;
 };
 int kvm_create_lapic(struct kvm_vcpu *vcpu);
 void kvm_free_lapic(struct kvm_vcpu *vcpu);
@@ -39,6 +44,7 @@  void kvm_free_lapic(struct kvm_vcpu *vcpu);
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
@@ -158,4 +164,9 @@  void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
 				struct kvm_lapic_irq *irq,
 				u64 *eoi_bitmap);
 
+static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.apic->pending_events;
+}
+
 #endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 907e428..7219a40 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1199,12 +1199,6 @@  static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	init_vmcb(svm);
 
-	if (!kvm_vcpu_is_bsp(vcpu)) {
-		kvm_rip_write(vcpu, 0);
-		svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
-		svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
-	}
-
 	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
 	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f17cd2a..b73989d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4119,12 +4119,7 @@  static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmx_segment_cache_clear(vmx);
 
 	seg_setup(VCPU_SREG_CS);
-	if (kvm_vcpu_is_bsp(&vmx->vcpu))
-		vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
-	else {
-		vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
-		vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
-	}
+	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
 
 	seg_setup(VCPU_SREG_DS);
 	seg_setup(VCPU_SREG_ES);
@@ -4147,10 +4142,7 @@  static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmcs_writel(GUEST_SYSENTER_EIP, 0);
 
 	vmcs_writel(GUEST_RFLAGS, 0x02);
-	if (kvm_vcpu_is_bsp(&vmx->vcpu))
-		kvm_rip_write(vcpu, 0xfff0);
-	else
-		kvm_rip_write(vcpu, 0);
+	kvm_rip_write(vcpu, 0xfff0);
 
 	vmcs_writel(GUEST_GDTR_BASE, 0);
 	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b891ac3..a7e51ae 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -162,8 +162,6 @@  u64 __read_mostly host_xcr0;
 
 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 
-static void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
-
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
 	int i;
@@ -2823,10 +2821,9 @@  static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
 	events->nmi.pad = 0;
 
-	events->sipi_vector = vcpu->arch.sipi_vector;
+	events->sipi_vector = 0; /* never valid when reporting to user space */
 
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
-			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 			 | KVM_VCPUEVENT_VALID_SHADOW);
 	memset(&events->reserved, 0, sizeof(events->reserved));
 }
@@ -2857,8 +2854,9 @@  static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 		vcpu->arch.nmi_pending = events->nmi.pending;
 	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
 
-	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
-		vcpu->arch.sipi_vector = events->sipi_vector;
+	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
+	    kvm_vcpu_has_lapic(vcpu))
+		vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
@@ -5713,6 +5711,12 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+		kvm_apic_accept_events(vcpu);
+		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+			r = 1;
+			goto out;
+		}
+
 		inject_pending_event(vcpu);
 
 		/* enable NMI/IRQ window open exits if needed */
@@ -5847,14 +5851,6 @@  static int __vcpu_run(struct kvm_vcpu *vcpu)
 	int r;
 	struct kvm *kvm = vcpu->kvm;
 
-	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
-		pr_debug("vcpu %d received sipi with vector # %x\n",
-			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
-		kvm_lapic_reset(vcpu);
-		kvm_vcpu_reset(vcpu);
-		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-	}
-
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 	r = vapic_enter(vcpu);
 	if (r) {
@@ -5871,8 +5867,8 @@  static int __vcpu_run(struct kvm_vcpu *vcpu)
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 			kvm_vcpu_block(vcpu);
 			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-			if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
-			{
+			if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
+				kvm_apic_accept_events(vcpu);
 				switch(vcpu->arch.mp_state) {
 				case KVM_MP_STATE_HALTED:
 					vcpu->arch.mp_state =
@@ -5880,7 +5876,8 @@  static int __vcpu_run(struct kvm_vcpu *vcpu)
 				case KVM_MP_STATE_RUNNABLE:
 					vcpu->arch.apf.halted = false;
 					break;
-				case KVM_MP_STATE_SIPI_RECEIVED:
+				case KVM_MP_STATE_INIT_RECEIVED:
+					break;
 				default:
 					r = -EINTR;
 					break;
@@ -6015,6 +6012,7 @@  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 		kvm_vcpu_block(vcpu);
+		kvm_apic_accept_events(vcpu);
 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		r = -EAGAIN;
 		goto out;
@@ -6171,6 +6169,7 @@  int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
+	kvm_apic_accept_events(vcpu);
 	mp_state->mp_state = vcpu->arch.mp_state;
 	return 0;
 }
@@ -6178,7 +6177,15 @@  int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	vcpu->arch.mp_state = mp_state->mp_state;
+	if (!kvm_vcpu_has_lapic(vcpu) &&
+	    mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
+		return -EINVAL;
+
+	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
+		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
+	} else
+		vcpu->arch.mp_state = mp_state->mp_state;
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return 0;
 }
@@ -6515,7 +6522,7 @@  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-static void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
@@ -6545,6 +6552,17 @@  static void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_reset(vcpu);
 }
 
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
+{
+	struct kvm_segment cs;
+
+	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+	cs.selector = vector << 8;
+	cs.base = vector << 12;
+	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+	kvm_rip_write(vcpu, 0);
+}
+
 int kvm_arch_hardware_enable(void *garbage)
 {
 	struct kvm *kvm;
@@ -6988,7 +7006,7 @@  int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 		!vcpu->arch.apf.halted)
 		|| !list_empty_careful(&vcpu->async_pf.done)
-		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+		|| kvm_apic_has_events(vcpu)
 		|| atomic_read(&vcpu->arch.nmi_queued) ||
 		(kvm_arch_interrupt_allowed(vcpu) &&
 		 kvm_cpu_has_interrupt(vcpu));
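
For reference, the new kvm_vcpu_deliver_sipi_vector() above implements the
usual SIPI start-address convention: a SIPI with vector 0x9A, for example,
yields CS.selector = 0x9A00, CS.base = 0x9A000 and RIP = 0, so the AP starts
fetching at physical address 0x9A000 (vector * 4 KiB), the same mapping the
removed per-vendor reset code produced via sipi_vector << 8 and << 12.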