diff mbox

[v2] KVM: x86: Convert INIT and SIPI signals into synchronously handled requests

Message ID 51351517.3090600@web.de (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Kiszka March 4, 2013, 9:41 p.m. UTC
From: Jan Kiszka <jan.kiszka@siemens.com>

A VCPU sending INIT or SIPI to some other VCPU races for setting the
remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
was overwritten by kvm_emulate_halt and, thus, got lost.

Fix this by raising requests on the sender side that will then be
handled synchronously over the target VCPU context.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---

Changes in v2:
 - check transition to INIT_RECEIVED in vcpu_enter_guest
 - removed return value of kvm_check_init_and_sipi - caller has to
   check for relevant transition afterward
 - add write barrier after setting sipi_vector

 arch/x86/kvm/lapic.c     |   11 ++++++-----
 arch/x86/kvm/x86.c       |   15 +++++++++++++++
 include/linux/kvm_host.h |    2 ++
 3 files changed, 23 insertions(+), 5 deletions(-)

Comments

Jan Kiszka March 4, 2013, 11 p.m. UTC | #1
On 2013-03-04 22:41, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> A VCPU sending INIT or SIPI to some other VCPU races for setting the
> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
> was overwritten by kvm_emulate_halt and, thus, got lost.
> 
> Fix this by raising requests on the sender side that will then be
> handled synchronously over the target VCPU context.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
> 
> Changes in v2:
>  - check transition to INIT_RECEIVED in vcpu_enter_guest
>  - removed return value of kvm_check_init_and_sipi - caller has to
>    check for relevant transition afterward
>  - add write barrier after setting sipi_vector
> 
>  arch/x86/kvm/lapic.c     |   11 ++++++-----
>  arch/x86/kvm/x86.c       |   15 +++++++++++++++
>  include/linux/kvm_host.h |    2 ++
>  3 files changed, 23 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 02b51dd..7986c9f 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -731,8 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  	case APIC_DM_INIT:
>  		if (!trig_mode || level) {
>  			result = 1;
> -			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
> +			kvm_make_request(KVM_REQ_INIT, vcpu);
>  			kvm_vcpu_kick(vcpu);
>  		} else {
>  			apic_debug("Ignoring de-assert INIT to vcpu %d\n",
> @@ -743,11 +742,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  	case APIC_DM_STARTUP:
>  		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
>  			   vcpu->vcpu_id, vector);
> -		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
> +		    test_bit(KVM_REQ_INIT, &vcpu->requests)) {
>  			result = 1;
>  			vcpu->arch.sipi_vector = vector;
> -			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
> +			/* make sure sipi_vector is visible for the receiver */
> +			smp_wmb();
> +			kvm_make_request(KVM_REQ_SIPI, vcpu);
>  			kvm_vcpu_kick(vcpu);
>  		}
>  		break;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index d0cf737..0be04b9 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5641,6 +5641,15 @@ static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
>  	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
>  }
>  
> +static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu)
> +{
> +	if (kvm_check_request(KVM_REQ_INIT, vcpu))
> +		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;

And here is a small race between clearing REQ_INIT and setting
INIT_RECEIVED. It can make the LAPIC drop the SIPI incorrectly. Need to
break up test and clear, doing the clear after mp_state update. Yeah...

Jan

> +	if (kvm_check_request(KVM_REQ_SIPI, vcpu) &&
> +	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
> +		vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
> +}
> +
>  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  {
>  	int r;
> @@ -5649,6 +5658,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  	bool req_immediate_exit = 0;
>  
>  	if (vcpu->requests) {
> +		kvm_check_init_and_sipi(vcpu);
> +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> +			r = 1;
> +			goto out;
> +		}
>  		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
>  			kvm_mmu_unload(vcpu);
>  		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
> @@ -6977,6 +6991,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>  
>  int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
>  {
> +	kvm_check_init_and_sipi(vcpu);
>  	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
>  		!vcpu->arch.apf.halted)
>  		|| !list_empty_careful(&vcpu->async_pf.done)
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 722cae7..1a191c9 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -124,6 +124,8 @@ static inline bool is_error_page(struct page *page)
>  #define KVM_REQ_MCLOCK_INPROGRESS 20
>  #define KVM_REQ_EPR_EXIT          21
>  #define KVM_REQ_EOIBITMAP         22
> +#define KVM_REQ_INIT              23
> +#define KVM_REQ_SIPI              24
>  
>  #define KVM_USERSPACE_IRQ_SOURCE_ID		0
>  #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
>
Gleb Natapov March 5, 2013, 7:57 a.m. UTC | #2
On Tue, Mar 05, 2013 at 12:00:10AM +0100, Jan Kiszka wrote:
> On 2013-03-04 22:41, Jan Kiszka wrote:
> > From: Jan Kiszka <jan.kiszka@siemens.com>
> > 
> > A VCPU sending INIT or SIPI to some other VCPU races for setting the
> > remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
> > was overwritten by kvm_emulate_halt and, thus, got lost.
> > 
> > Fix this by raising requests on the sender side that will then be
> > handled synchronously over the target VCPU context.
> > 
> > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> > ---
> > 
> > Changes in v2:
> >  - check transition to INIT_RECEIVED in vcpu_enter_guest
> >  - removed return value of kvm_check_init_and_sipi - caller has to
> >    check for relevant transition afterward
> >  - add write barrier after setting sipi_vector
> > 
> >  arch/x86/kvm/lapic.c     |   11 ++++++-----
> >  arch/x86/kvm/x86.c       |   15 +++++++++++++++
> >  include/linux/kvm_host.h |    2 ++
> >  3 files changed, 23 insertions(+), 5 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 02b51dd..7986c9f 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -731,8 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
> >  	case APIC_DM_INIT:
> >  		if (!trig_mode || level) {
> >  			result = 1;
> > -			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
> > -			kvm_make_request(KVM_REQ_EVENT, vcpu);
> > +			kvm_make_request(KVM_REQ_INIT, vcpu);
> >  			kvm_vcpu_kick(vcpu);
> >  		} else {
> >  			apic_debug("Ignoring de-assert INIT to vcpu %d\n",
> > @@ -743,11 +742,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
> >  	case APIC_DM_STARTUP:
> >  		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
> >  			   vcpu->vcpu_id, vector);
> > -		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> > +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
> > +		    test_bit(KVM_REQ_INIT, &vcpu->requests)) {
> >  			result = 1;
> >  			vcpu->arch.sipi_vector = vector;
> > -			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
> > -			kvm_make_request(KVM_REQ_EVENT, vcpu);
> > +			/* make sure sipi_vector is visible for the receiver */
> > +			smp_wmb();
> > +			kvm_make_request(KVM_REQ_SIPI, vcpu);
> >  			kvm_vcpu_kick(vcpu);
> >  		}
> >  		break;
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index d0cf737..0be04b9 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -5641,6 +5641,15 @@ static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
> >  	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
> >  }
> >  
> > +static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu)
> > +{
> > +	if (kvm_check_request(KVM_REQ_INIT, vcpu))
> > +		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
> 
> And here is a small race between clearing REQ_INIT and setting
> INIT_RECEIVED. It can make the LAPIC drop the SIPI incorrectly. Need to
> break up test and clear, doing the clear after mp_state update. Yeah...
> 
You also need to call kvm_check_init_and_sipi() in
kvm_arch_vcpu_ioctl_get_mpstate(), which means you now have three places
where you transfer INIT/SIPI state from requests to mp_state. All the
problems arise from the fact that now you have two places where you
are storing current state. To overcome this we can either deprecated
KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state
(use it only for migration purposes) and use separate state in APIC
to hold those event, like with nmi, or why not go with Paolo's simple
cmpxchg one?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka March 5, 2013, 8:24 a.m. UTC | #3
On 2013-03-05 08:57, Gleb Natapov wrote:
> On Tue, Mar 05, 2013 at 12:00:10AM +0100, Jan Kiszka wrote:
>> On 2013-03-04 22:41, Jan Kiszka wrote:
>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>
>>> A VCPU sending INIT or SIPI to some other VCPU races for setting the
>>> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
>>> was overwritten by kvm_emulate_halt and, thus, got lost.
>>>
>>> Fix this by raising requests on the sender side that will then be
>>> handled synchronously over the target VCPU context.
>>>
>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>> ---
>>>
>>> Changes in v2:
>>>  - check transition to INIT_RECEIVED in vcpu_enter_guest
>>>  - removed return value of kvm_check_init_and_sipi - caller has to
>>>    check for relevant transition afterward
>>>  - add write barrier after setting sipi_vector
>>>
>>>  arch/x86/kvm/lapic.c     |   11 ++++++-----
>>>  arch/x86/kvm/x86.c       |   15 +++++++++++++++
>>>  include/linux/kvm_host.h |    2 ++
>>>  3 files changed, 23 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>>> index 02b51dd..7986c9f 100644
>>> --- a/arch/x86/kvm/lapic.c
>>> +++ b/arch/x86/kvm/lapic.c
>>> @@ -731,8 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>>  	case APIC_DM_INIT:
>>>  		if (!trig_mode || level) {
>>>  			result = 1;
>>> -			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
>>> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
>>> +			kvm_make_request(KVM_REQ_INIT, vcpu);
>>>  			kvm_vcpu_kick(vcpu);
>>>  		} else {
>>>  			apic_debug("Ignoring de-assert INIT to vcpu %d\n",
>>> @@ -743,11 +742,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>>  	case APIC_DM_STARTUP:
>>>  		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
>>>  			   vcpu->vcpu_id, vector);
>>> -		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
>>> +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
>>> +		    test_bit(KVM_REQ_INIT, &vcpu->requests)) {
>>>  			result = 1;
>>>  			vcpu->arch.sipi_vector = vector;
>>> -			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
>>> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
>>> +			/* make sure sipi_vector is visible for the receiver */
>>> +			smp_wmb();
>>> +			kvm_make_request(KVM_REQ_SIPI, vcpu);
>>>  			kvm_vcpu_kick(vcpu);
>>>  		}
>>>  		break;
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index d0cf737..0be04b9 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -5641,6 +5641,15 @@ static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
>>>  	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
>>>  }
>>>  
>>> +static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu)
>>> +{
>>> +	if (kvm_check_request(KVM_REQ_INIT, vcpu))
>>> +		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
>>
>> And here is a small race between clearing REQ_INIT and setting
>> INIT_RECEIVED. It can make the LAPIC drop the SIPI incorrectly. Need to
>> break up test and clear, doing the clear after mp_state update. Yeah...
>>
> You also need to call kvm_check_init_and_sipi() in
> kvm_arch_vcpu_ioctl_get_mpstate(), 

Indeed.

> which means you now have three places
> where you transfer INIT/SIPI state from requests to mp_state. All the
> problems arise from the fact that now you have two places where you
> are storing current state.

Not at all. I'm keeping the state in a single place, mp_state. I just
have to make sure that I do not loose asynchronous events - what INIT
and SIPI are.

> To overcome this we can either deprecated
> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state
> (use it only for migration purposes) and use separate state in APIC
> to hold those event, like with nmi, or why not go with Paolo's simple
> cmpxchg one?

We need to replace most, if not all, manipulations of mp_state with
cmpxchg, verifying the state transitions there. And the request-based
approach still looks cleaner to me when it comes to implementing INIT
handling for nested modes. That will just trivially hook into
kvm_check_init_and_sipi.

Jan
Gleb Natapov March 5, 2013, 8:46 a.m. UTC | #4
On Tue, Mar 05, 2013 at 09:24:40AM +0100, Jan Kiszka wrote:
> On 2013-03-05 08:57, Gleb Natapov wrote:
> > On Tue, Mar 05, 2013 at 12:00:10AM +0100, Jan Kiszka wrote:
> >> On 2013-03-04 22:41, Jan Kiszka wrote:
> >>> From: Jan Kiszka <jan.kiszka@siemens.com>
> >>>
> >>> A VCPU sending INIT or SIPI to some other VCPU races for setting the
> >>> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
> >>> was overwritten by kvm_emulate_halt and, thus, got lost.
> >>>
> >>> Fix this by raising requests on the sender side that will then be
> >>> handled synchronously over the target VCPU context.
> >>>
> >>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >>> ---
> >>>
> >>> Changes in v2:
> >>>  - check transition to INIT_RECEIVED in vcpu_enter_guest
> >>>  - removed return value of kvm_check_init_and_sipi - caller has to
> >>>    check for relevant transition afterward
> >>>  - add write barrier after setting sipi_vector
> >>>
> >>>  arch/x86/kvm/lapic.c     |   11 ++++++-----
> >>>  arch/x86/kvm/x86.c       |   15 +++++++++++++++
> >>>  include/linux/kvm_host.h |    2 ++
> >>>  3 files changed, 23 insertions(+), 5 deletions(-)
> >>>
> >>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> >>> index 02b51dd..7986c9f 100644
> >>> --- a/arch/x86/kvm/lapic.c
> >>> +++ b/arch/x86/kvm/lapic.c
> >>> @@ -731,8 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
> >>>  	case APIC_DM_INIT:
> >>>  		if (!trig_mode || level) {
> >>>  			result = 1;
> >>> -			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
> >>> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
> >>> +			kvm_make_request(KVM_REQ_INIT, vcpu);
> >>>  			kvm_vcpu_kick(vcpu);
> >>>  		} else {
> >>>  			apic_debug("Ignoring de-assert INIT to vcpu %d\n",
> >>> @@ -743,11 +742,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
> >>>  	case APIC_DM_STARTUP:
> >>>  		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
> >>>  			   vcpu->vcpu_id, vector);
> >>> -		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> >>> +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
> >>> +		    test_bit(KVM_REQ_INIT, &vcpu->requests)) {
> >>>  			result = 1;
> >>>  			vcpu->arch.sipi_vector = vector;
> >>> -			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
> >>> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
> >>> +			/* make sure sipi_vector is visible for the receiver */
> >>> +			smp_wmb();
> >>> +			kvm_make_request(KVM_REQ_SIPI, vcpu);
> >>>  			kvm_vcpu_kick(vcpu);
> >>>  		}
> >>>  		break;
> >>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >>> index d0cf737..0be04b9 100644
> >>> --- a/arch/x86/kvm/x86.c
> >>> +++ b/arch/x86/kvm/x86.c
> >>> @@ -5641,6 +5641,15 @@ static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
> >>>  	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
> >>>  }
> >>>  
> >>> +static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu)
> >>> +{
> >>> +	if (kvm_check_request(KVM_REQ_INIT, vcpu))
> >>> +		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
> >>
> >> And here is a small race between clearing REQ_INIT and setting
> >> INIT_RECEIVED. It can make the LAPIC drop the SIPI incorrectly. Need to
> >> break up test and clear, doing the clear after mp_state update. Yeah...
> >>
> > You also need to call kvm_check_init_and_sipi() in
> > kvm_arch_vcpu_ioctl_get_mpstate(), 
> 
> Indeed.
> 
> > which means you now have three places
> > where you transfer INIT/SIPI state from requests to mp_state. All the
> > problems arise from the fact that now you have two places where you
> > are storing current state.
> 
> Not at all. I'm keeping the state in a single place, mp_state. I just
> have to make sure that I do not loose asynchronous events - what INIT
> and SIPI are.
> 
As evident from this code:
 +           if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
 +               test_bit(KVM_REQ_INIT, &vcpu->requests)) {
the state is in two places.

> > To overcome this we can either deprecated
> > KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state
> > (use it only for migration purposes) and use separate state in APIC
> > to hold those event, like with nmi, or why not go with Paolo's simple
> > cmpxchg one?
> 
> We need to replace most, if not all, manipulations of mp_state with
> cmpxchg, verifying the state transitions there. And the request-based
> approach still looks cleaner to me when it comes to implementing INIT
> handling for nested modes. That will just trivially hook into
> kvm_check_init_and_sipi.
> 
The mp_state changes are rare, do not see the problem replacing all
state changes with cmpxchg. I do not like request-based approach as
implemented since we keep state in two places and constantly sync it
back.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka March 5, 2013, 9:12 a.m. UTC | #5
On 2013-03-05 09:46, Gleb Natapov wrote:
> On Tue, Mar 05, 2013 at 09:24:40AM +0100, Jan Kiszka wrote:
>> On 2013-03-05 08:57, Gleb Natapov wrote:
>>> On Tue, Mar 05, 2013 at 12:00:10AM +0100, Jan Kiszka wrote:
>>>> On 2013-03-04 22:41, Jan Kiszka wrote:
>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>
>>>>> A VCPU sending INIT or SIPI to some other VCPU races for setting the
>>>>> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
>>>>> was overwritten by kvm_emulate_halt and, thus, got lost.
>>>>>
>>>>> Fix this by raising requests on the sender side that will then be
>>>>> handled synchronously over the target VCPU context.
>>>>>
>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>> ---
>>>>>
>>>>> Changes in v2:
>>>>>  - check transition to INIT_RECEIVED in vcpu_enter_guest
>>>>>  - removed return value of kvm_check_init_and_sipi - caller has to
>>>>>    check for relevant transition afterward
>>>>>  - add write barrier after setting sipi_vector
>>>>>
>>>>>  arch/x86/kvm/lapic.c     |   11 ++++++-----
>>>>>  arch/x86/kvm/x86.c       |   15 +++++++++++++++
>>>>>  include/linux/kvm_host.h |    2 ++
>>>>>  3 files changed, 23 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>>>>> index 02b51dd..7986c9f 100644
>>>>> --- a/arch/x86/kvm/lapic.c
>>>>> +++ b/arch/x86/kvm/lapic.c
>>>>> @@ -731,8 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>>>>  	case APIC_DM_INIT:
>>>>>  		if (!trig_mode || level) {
>>>>>  			result = 1;
>>>>> -			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
>>>>> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
>>>>> +			kvm_make_request(KVM_REQ_INIT, vcpu);
>>>>>  			kvm_vcpu_kick(vcpu);
>>>>>  		} else {
>>>>>  			apic_debug("Ignoring de-assert INIT to vcpu %d\n",
>>>>> @@ -743,11 +742,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>>>>  	case APIC_DM_STARTUP:
>>>>>  		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
>>>>>  			   vcpu->vcpu_id, vector);
>>>>> -		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
>>>>> +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
>>>>> +		    test_bit(KVM_REQ_INIT, &vcpu->requests)) {
>>>>>  			result = 1;
>>>>>  			vcpu->arch.sipi_vector = vector;
>>>>> -			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
>>>>> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
>>>>> +			/* make sure sipi_vector is visible for the receiver */
>>>>> +			smp_wmb();
>>>>> +			kvm_make_request(KVM_REQ_SIPI, vcpu);
>>>>>  			kvm_vcpu_kick(vcpu);
>>>>>  		}
>>>>>  		break;
>>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>>> index d0cf737..0be04b9 100644
>>>>> --- a/arch/x86/kvm/x86.c
>>>>> +++ b/arch/x86/kvm/x86.c
>>>>> @@ -5641,6 +5641,15 @@ static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
>>>>>  	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
>>>>>  }
>>>>>  
>>>>> +static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +	if (kvm_check_request(KVM_REQ_INIT, vcpu))
>>>>> +		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
>>>>
>>>> And here is a small race between clearing REQ_INIT and setting
>>>> INIT_RECEIVED. It can make the LAPIC drop the SIPI incorrectly. Need to
>>>> break up test and clear, doing the clear after mp_state update. Yeah...
>>>>
>>> You also need to call kvm_check_init_and_sipi() in
>>> kvm_arch_vcpu_ioctl_get_mpstate(), 
>>
>> Indeed.
>>
>>> which means you now have three places
>>> where you transfer INIT/SIPI state from requests to mp_state. All the
>>> problems arise from the fact that now you have two places where you
>>> are storing current state.
>>
>> Not at all. I'm keeping the state in a single place, mp_state. I just
>> have to make sure that I do not loose asynchronous events - what INIT
>> and SIPI are.
>>
> As evident from this code:
>  +           if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
>  +               test_bit(KVM_REQ_INIT, &vcpu->requests)) {
> the state is in two places.

That's just to protect the content of sipi_vector during delivery. We
could drop the complete if clause if we protected that variable differently.

> 
>>> To overcome this we can either deprecated
>>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state
>>> (use it only for migration purposes) and use separate state in APIC
>>> to hold those event, like with nmi, or why not go with Paolo's simple
>>> cmpxchg one?
>>
>> We need to replace most, if not all, manipulations of mp_state with
>> cmpxchg, verifying the state transitions there. And the request-based
>> approach still looks cleaner to me when it comes to implementing INIT
>> handling for nested modes. That will just trivially hook into
>> kvm_check_init_and_sipi.
>>
> The mp_state changes are rare, do not see the problem replacing all
> state changes with cmpxchg. I do not like request-based approach as
> implemented since we keep state in two places and constantly sync it
> back.

And I like it more as it avoids spurious state changes toward INIT. That
will happen if we misuse mp_state for event signaling, like we do so
far, having to fix it up later again because the INIT event turned out
to become an INIT VM-exit.

Jan
Gleb Natapov March 5, 2013, 9:37 a.m. UTC | #6
On Tue, Mar 05, 2013 at 10:12:05AM +0100, Jan Kiszka wrote:
> On 2013-03-05 09:46, Gleb Natapov wrote:
> > On Tue, Mar 05, 2013 at 09:24:40AM +0100, Jan Kiszka wrote:
> >> On 2013-03-05 08:57, Gleb Natapov wrote:
> >>> On Tue, Mar 05, 2013 at 12:00:10AM +0100, Jan Kiszka wrote:
> >>>> On 2013-03-04 22:41, Jan Kiszka wrote:
> >>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
> >>>>>
> >>>>> A VCPU sending INIT or SIPI to some other VCPU races for setting the
> >>>>> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
> >>>>> was overwritten by kvm_emulate_halt and, thus, got lost.
> >>>>>
> >>>>> Fix this by raising requests on the sender side that will then be
> >>>>> handled synchronously over the target VCPU context.
> >>>>>
> >>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >>>>> ---
> >>>>>
> >>>>> Changes in v2:
> >>>>>  - check transition to INIT_RECEIVED in vcpu_enter_guest
> >>>>>  - removed return value of kvm_check_init_and_sipi - caller has to
> >>>>>    check for relevant transition afterward
> >>>>>  - add write barrier after setting sipi_vector
> >>>>>
> >>>>>  arch/x86/kvm/lapic.c     |   11 ++++++-----
> >>>>>  arch/x86/kvm/x86.c       |   15 +++++++++++++++
> >>>>>  include/linux/kvm_host.h |    2 ++
> >>>>>  3 files changed, 23 insertions(+), 5 deletions(-)
> >>>>>
> >>>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> >>>>> index 02b51dd..7986c9f 100644
> >>>>> --- a/arch/x86/kvm/lapic.c
> >>>>> +++ b/arch/x86/kvm/lapic.c
> >>>>> @@ -731,8 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
> >>>>>  	case APIC_DM_INIT:
> >>>>>  		if (!trig_mode || level) {
> >>>>>  			result = 1;
> >>>>> -			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
> >>>>> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
> >>>>> +			kvm_make_request(KVM_REQ_INIT, vcpu);
> >>>>>  			kvm_vcpu_kick(vcpu);
> >>>>>  		} else {
> >>>>>  			apic_debug("Ignoring de-assert INIT to vcpu %d\n",
> >>>>> @@ -743,11 +742,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
> >>>>>  	case APIC_DM_STARTUP:
> >>>>>  		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
> >>>>>  			   vcpu->vcpu_id, vector);
> >>>>> -		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> >>>>> +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
> >>>>> +		    test_bit(KVM_REQ_INIT, &vcpu->requests)) {
> >>>>>  			result = 1;
> >>>>>  			vcpu->arch.sipi_vector = vector;
> >>>>> -			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
> >>>>> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
> >>>>> +			/* make sure sipi_vector is visible for the receiver */
> >>>>> +			smp_wmb();
> >>>>> +			kvm_make_request(KVM_REQ_SIPI, vcpu);
> >>>>>  			kvm_vcpu_kick(vcpu);
> >>>>>  		}
> >>>>>  		break;
> >>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >>>>> index d0cf737..0be04b9 100644
> >>>>> --- a/arch/x86/kvm/x86.c
> >>>>> +++ b/arch/x86/kvm/x86.c
> >>>>> @@ -5641,6 +5641,15 @@ static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
> >>>>>  	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
> >>>>>  }
> >>>>>  
> >>>>> +static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu)
> >>>>> +{
> >>>>> +	if (kvm_check_request(KVM_REQ_INIT, vcpu))
> >>>>> +		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
> >>>>
> >>>> And here is a small race between clearing REQ_INIT and setting
> >>>> INIT_RECEIVED. It can make the LAPIC drop the SIPI incorrectly. Need to
> >>>> break up test and clear, doing the clear after mp_state update. Yeah...
> >>>>
> >>> You also need to call kvm_check_init_and_sipi() in
> >>> kvm_arch_vcpu_ioctl_get_mpstate(), 
> >>
> >> Indeed.
> >>
> >>> which means you now have three places
> >>> where you transfer INIT/SIPI state from requests to mp_state. All the
> >>> problems arise from the fact that now you have two places where you
> >>> are storing current state.
> >>
> >> Not at all. I'm keeping the state in a single place, mp_state. I just
> >> have to make sure that I do not loose asynchronous events - what INIT
> >> and SIPI are.
> >>
> > As evident from this code:
> >  +           if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
> >  +               test_bit(KVM_REQ_INIT, &vcpu->requests)) {
> > the state is in two places.
> 
> That's just to protect the content of sipi_vector during delivery. We
> could drop the complete if clause if we protected that variable differently.
> 
I understand why the code is here. I am saying that this is the evidence
that the state is in two places.

> > 
> >>> To overcome this we can either deprecated
> >>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state
> >>> (use it only for migration purposes) and use separate state in APIC
> >>> to hold those event, like with nmi, or why not go with Paolo's simple
> >>> cmpxchg one?
> >>
> >> We need to replace most, if not all, manipulations of mp_state with
> >> cmpxchg, verifying the state transitions there. And the request-based
> >> approach still looks cleaner to me when it comes to implementing INIT
> >> handling for nested modes. That will just trivially hook into
> >> kvm_check_init_and_sipi.
> >>
> > The mp_state changes are rare, do not see the problem replacing all
> > state changes with cmpxchg. I do not like request-based approach as
> > implemented since we keep state in two places and constantly sync it
> > back.
> 
> And I like it more as it avoids spurious state changes toward INIT. That
> will happen if we misuse mp_state for event signaling, like we do so
> far, having to fix it up later again because the INIT event turned out
> to become an INIT VM-exit.
> 
Yes, I agree we abuse mp_state for signaling. I do not want to abuse
->request for that too. So what about other idea about treating init/sipi
just like any other APIC event (that's what they are) and add state to
lapic to track init/sipi signaling?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini March 5, 2013, 10:50 a.m. UTC | #7
Il 05/03/2013 10:37, Gleb Natapov ha scritto:
> > > > Not at all. I'm keeping the state in a single place, mp_state. I just
> > > > have to make sure that I do not loose asynchronous events - what INIT
> > > > and SIPI are.
> > >
> > > As evident from this code:
> > >  +           if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
> > >  +               test_bit(KVM_REQ_INIT, &vcpu->requests)) {
> > > the state is in two places.
> > That's just to protect the content of sipi_vector during delivery.

Is that even needed?  Can we just do an unconditional:

	vcpu->arch.sipi_vector = vector;
	/* make sure sipi_vector is visible for the receiver */
	smp_wmb();
	kvm_make_request(KVM_REQ_SIPI, vcpu);
	kvm_vcpu_kick(vcpu);

and check in the request handler that we did get an INIT?

> > We could drop the complete if clause if we protected that variable differently.
> 
> I understand why the code is here. I am saying that this is the evidence
> that the state is in two places.

I agree with Gleb here.  The state is in two places.  I'm not saying that
using requests is wrong, it sounds nice especially for nested INIT.  But
there is something nasty in the patches so far.

BTW checking the requests in kvm_apic_has_interrupt seems nicer than
synchronizing in kvm_arch_cpu_runnable; it is a bit ugly to have side
effects in kvm_arch_cpu_runnable.  Then you can actually _process_
the requests only in vcpu_enter_guest and kvm_arch_vcpu_ioctl_get_mpstate().
In kvm_arch_vcpu_ioctl_put_mpstate(), KVM_MP_STATE_SIPI_RECEIVED has to
become KVM_MP_STATE_INIT_RECEIVED with KVM_REQ_SIPI set.

>>> > > 
>>>>> > >>> To overcome this we can either deprecated
>>>>> > >>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state
>>>>> > >>> (use it only for migration purposes) and use separate state in APIC
>>>>> > >>> to hold those event, like with nmi, or why not go with Paolo's simple
>>>>> > >>> cmpxchg one?
>>>> > >>
>>>> > >> We need to replace most, if not all, manipulations of mp_state with
>>>> > >> cmpxchg, verifying the state transitions there. 

Let's take again the matrix I sent yesterday, fixed to make UNINIT
unreachable (that transition can only be done by userspace):

from \ to    RUNNABLE     UNINIT      INIT     HALTED       SIPI
RUNNABLE       n/a          NO        yes       yes         NO
UNINIT         NO           n/a       yes       NO          NO
INIT           NO           NO        yes       NO          yes
HALTED         yes          NO        yes       n/a         NO
SIPI           yes          NO        yes       NO          n/a

We have:

- anything -> INIT: this is asynchronous, and INIT always wins.  No need
for a cmpxchg.

- RUNNABLE -> HALTED: cmpxchg needed

- INIT -> SIPI: we can have a race between the last two interrupts in an
INIT-INIT-SIPI (or INIT-SIPI-INIT) sequence, with these two interrupts sent
by two different processors.  The same race should also be present in real
hardware, and should never happen (the BSP should send SIPIs to all other
processors).  No need for a cmpxchg.

- HALTED -> RUNNABLE: racy vs. INIT too, cmpxchg needed

- SIPI -> RUNNABLE: there is the same race with going back to INIT; no need
for a cmpxchg.


But it probably will not scale well with nested virt.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka March 5, 2013, 1:25 p.m. UTC | #8
On 2013-03-05 11:50, Paolo Bonzini wrote:
> Il 05/03/2013 10:37, Gleb Natapov ha scritto:
>>>>> Not at all. I'm keeping the state in a single place, mp_state. I just
>>>>> have to make sure that I do not loose asynchronous events - what INIT
>>>>> and SIPI are.
>>>>
>>>> As evident from this code:
>>>>  +           if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
>>>>  +               test_bit(KVM_REQ_INIT, &vcpu->requests)) {
>>>> the state is in two places.
>>> That's just to protect the content of sipi_vector during delivery.
> 
> Is that even needed?  Can we just do an unconditional:
> 
> 	vcpu->arch.sipi_vector = vector;
> 	/* make sure sipi_vector is visible for the receiver */
> 	smp_wmb();
> 	kvm_make_request(KVM_REQ_SIPI, vcpu);
> 	kvm_vcpu_kick(vcpu);
> 
> and check in the request handler that we did get an INIT?
> 
>>> We could drop the complete if clause if we protected that variable differently.
>>
>> I understand why the code is here. I am saying that this is the evidence
>> that the state is in two places.
> 
> I agree with Gleb here.  The state is in two places.  I'm not saying that
> using requests is wrong, it sounds nice especially for nested INIT.  But
> there is something nasty in the patches so far.
> 
> BTW checking the requests in kvm_apic_has_interrupt seems nicer than
> synchronizing in kvm_arch_cpu_runnable; it is a bit ugly to have side
> effects in kvm_arch_cpu_runnable.  Then you can actually _process_
> the requests only in vcpu_enter_guest and kvm_arch_vcpu_ioctl_get_mpstate().
> In kvm_arch_vcpu_ioctl_put_mpstate(), KVM_MP_STATE_SIPI_RECEIVED has to
> become KVM_MP_STATE_INIT_RECEIVED with KVM_REQ_SIPI set.

An INIT or SIPI signal is not really an interrupt in the sense that
kvm_cpu_has_interrupt or kvm_get_apic_interrupt expect (the current
users of kvm_apic_has_interrupt). I think we would move some ugliness
around this way, not yet resolve it.

But it probably makes sense to refactor the kvm_check_init_and_sipi
service to something that the APIC provide (is that what you are
thinking of, Gleb?). That will not reduce the number of places where we
have to check, but it should encapsulate things a bit cleaner.

Jan
Gleb Natapov March 5, 2013, 1:28 p.m. UTC | #9
On Tue, Mar 05, 2013 at 11:50:58AM +0100, Paolo Bonzini wrote:
> Il 05/03/2013 10:37, Gleb Natapov ha scritto:
> > > > > Not at all. I'm keeping the state in a single place, mp_state. I just
> > > > > have to make sure that I do not loose asynchronous events - what INIT
> > > > > and SIPI are.
> > > >
> > > > As evident from this code:
> > > >  +           if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
> > > >  +               test_bit(KVM_REQ_INIT, &vcpu->requests)) {
> > > > the state is in two places.
> > > That's just to protect the content of sipi_vector during delivery.
> 
> Is that even needed?  Can we just do an unconditional:
> 
> 	vcpu->arch.sipi_vector = vector;
> 	/* make sure sipi_vector is visible for the receiver */
> 	smp_wmb();
> 	kvm_make_request(KVM_REQ_SIPI, vcpu);
> 	kvm_vcpu_kick(vcpu);
> 
> and check in the request handler that we did get an INIT?
> 
INIT may come after SIPI and vcpu will start anyway.

> > > We could drop the complete if clause if we protected that variable differently.
> > 
> > I understand why the code is here. I am saying that this is the evidence
> > that the state is in two places.
> 
> I agree with Gleb here.  The state is in two places.  I'm not saying that
> using requests is wrong, it sounds nice especially for nested INIT.  But
> there is something nasty in the patches so far.
> 
> BTW checking the requests in kvm_apic_has_interrupt seems nicer than
> synchronizing in kvm_arch_cpu_runnable; it is a bit ugly to have side
> effects in kvm_arch_cpu_runnable.  Then you can actually _process_
I subscribe to the idea of kvm_arch_cpu_runnable() that does not change
state that it queries. Otherwise we do quantum programming: state
changes by examination. But kvm_apic_has_interrupt() is called by
kvm_arch_vcpu_runnable() only if interrupts are enabled.

> the requests only in vcpu_enter_guest and kvm_arch_vcpu_ioctl_get_mpstate().
> In kvm_arch_vcpu_ioctl_put_mpstate(), KVM_MP_STATE_SIPI_RECEIVED has to
> become KVM_MP_STATE_INIT_RECEIVED with KVM_REQ_SIPI set.
> 
> >>> > > 
> >>>>> > >>> To overcome this we can either deprecated
> >>>>> > >>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state
> >>>>> > >>> (use it only for migration purposes) and use separate state in APIC
> >>>>> > >>> to hold those event, like with nmi, or why not go with Paolo's simple
> >>>>> > >>> cmpxchg one?
> >>>> > >>
> >>>> > >> We need to replace most, if not all, manipulations of mp_state with
> >>>> > >> cmpxchg, verifying the state transitions there. 
> 
> Let's take again the matrix I sent yesterday, fixed to make UNINIT
> unreachable (that transition can only be done by userspace):
> 
> from \ to    RUNNABLE     UNINIT      INIT     HALTED       SIPI
> RUNNABLE       n/a          NO        yes       yes         NO
> UNINIT         NO           n/a       yes       NO          NO
> INIT           NO           NO        yes       NO          yes
> HALTED         yes          NO        yes       n/a         NO
> SIPI           yes          NO        yes       NO          n/a
> 
> We have:
> 
> - anything -> INIT: this is asynchronous, and INIT always wins.  No need
> for a cmpxchg.
> 
> - RUNNABLE -> HALTED: cmpxchg needed
> 
> - INIT -> SIPI: we can have a race between the last two interrupts in an
> INIT-INIT-SIPI (or INIT-SIPI-INIT) sequence, with these two interrupts sent
> by two different processors.  The same race should also be present in real
> hardware, and should never happen (the BSP should send SIPIs to all other
> processors).  No need for a cmpxchg.
> 
> - HALTED -> RUNNABLE: racy vs. INIT too, cmpxchg needed
> 
> - SIPI -> RUNNABLE: there is the same race with going back to INIT; no need
> for a cmpxchg.
> 
> 
> But it probably will not scale well with nested virt.
> 
I do not think the problem for nested virt with cmpxchg() approach is
scalability. The problem is that with nested virt INIT behaves like
regular even that may cause vmexit or be delayed. Changing vcpu state
during event generation does not work if vcpu it in a guest mode.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov March 5, 2013, 1:33 p.m. UTC | #10
On Tue, Mar 05, 2013 at 02:25:51PM +0100, Jan Kiszka wrote:
> On 2013-03-05 11:50, Paolo Bonzini wrote:
> > Il 05/03/2013 10:37, Gleb Natapov ha scritto:
> >>>>> Not at all. I'm keeping the state in a single place, mp_state. I just
> >>>>> have to make sure that I do not loose asynchronous events - what INIT
> >>>>> and SIPI are.
> >>>>
> >>>> As evident from this code:
> >>>>  +           if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
> >>>>  +               test_bit(KVM_REQ_INIT, &vcpu->requests)) {
> >>>> the state is in two places.
> >>> That's just to protect the content of sipi_vector during delivery.
> > 
> > Is that even needed?  Can we just do an unconditional:
> > 
> > 	vcpu->arch.sipi_vector = vector;
> > 	/* make sure sipi_vector is visible for the receiver */
> > 	smp_wmb();
> > 	kvm_make_request(KVM_REQ_SIPI, vcpu);
> > 	kvm_vcpu_kick(vcpu);
> > 
> > and check in the request handler that we did get an INIT?
> > 
> >>> We could drop the complete if clause if we protected that variable differently.
> >>
> >> I understand why the code is here. I am saying that this is the evidence
> >> that the state is in two places.
> > 
> > I agree with Gleb here.  The state is in two places.  I'm not saying that
> > using requests is wrong, it sounds nice especially for nested INIT.  But
> > there is something nasty in the patches so far.
> > 
> > BTW checking the requests in kvm_apic_has_interrupt seems nicer than
> > synchronizing in kvm_arch_cpu_runnable; it is a bit ugly to have side
> > effects in kvm_arch_cpu_runnable.  Then you can actually _process_
> > the requests only in vcpu_enter_guest and kvm_arch_vcpu_ioctl_get_mpstate().
> > In kvm_arch_vcpu_ioctl_put_mpstate(), KVM_MP_STATE_SIPI_RECEIVED has to
> > become KVM_MP_STATE_INIT_RECEIVED with KVM_REQ_SIPI set.
> 
> An INIT or SIPI signal is not really an interrupt in the sense that
> kvm_cpu_has_interrupt or kvm_get_apic_interrupt expect (the current
> users of kvm_apic_has_interrupt). I think we would move some ugliness
> around this way, not yet resolve it.
> 
> But it probably makes sense to refactor the kvm_check_init_and_sipi
> service to something that the APIC provide (is that what you are
> thinking of, Gleb?). That will not reduce the number of places where we
> have to check, but it should encapsulate things a bit cleaner.
> 
Yes, that what I am thinking. And yes I do not think it will reduce
number of places we need to check it much (or at all) unfortunately.
But if we want to hold INIT signal pending (until vmxoff for instance)
we cannot do that in ->requests anyway.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti March 5, 2013, 11:16 p.m. UTC | #11
On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> A VCPU sending INIT or SIPI to some other VCPU races for setting the
> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
> was overwritten by kvm_emulate_halt and, thus, got lost.
> 
> Fix this by raising requests on the sender side that will then be
> handled synchronously over the target VCPU context.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>

Why is kvm_emulate_halt being executed from
KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again?

Why is it not true that the only valid transition from
KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE?

It would be good for KVM_MP_STATE_HALTED to indicate 
"guest executed HLT instruction" (which is impossible without INIT/SIPI
being received).

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti March 6, 2013, 12:01 a.m. UTC | #12
On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote:
> On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote:
> > From: Jan Kiszka <jan.kiszka@siemens.com>
> > 
> > A VCPU sending INIT or SIPI to some other VCPU races for setting the
> > remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
> > was overwritten by kvm_emulate_halt and, thus, got lost.
> > 
> > Fix this by raising requests on the sender side that will then be
> > handled synchronously over the target VCPU context.
> > 
> > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> 
> Why is kvm_emulate_halt being executed from
> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again?
> 
> Why is it not true that the only valid transition from
s/from/to/
> KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE?
> 
> It would be good for KVM_MP_STATE_HALTED to indicate 
> "guest executed HLT instruction" (which is impossible without INIT/SIPI
> being received).

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti March 6, 2013, 12:06 a.m. UTC | #13
On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote:
> On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote:
> > From: Jan Kiszka <jan.kiszka@siemens.com>
> > 
> > A VCPU sending INIT or SIPI to some other VCPU races for setting the
> > remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
> > was overwritten by kvm_emulate_halt and, thus, got lost.
> > 
> > Fix this by raising requests on the sender side that will then be
> > handled synchronously over the target VCPU context.
> > 
> > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> 
> Why is kvm_emulate_halt being executed from
> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again?
> 
> Why is it not true that the only valid transition from
> KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE?

See Paolo's table, it is. So why fix a race which should not be
happening in the first place.

> It would be good for KVM_MP_STATE_HALTED to indicate 
> "guest executed HLT instruction" (which is impossible without INIT/SIPI
> being received).

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini March 6, 2013, 6:12 a.m. UTC | #14
> On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote:
> > On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote:
> > > From: Jan Kiszka <jan.kiszka@siemens.com>
> > > 
> > > A VCPU sending INIT or SIPI to some other VCPU races for setting
> > > the
> > > remote VCPU's mp_state. When we were unlucky,
> > > KVM_MP_STATE_INIT_RECEIVED
> > > was overwritten by kvm_emulate_halt and, thus, got lost.
> > > 
> > > Fix this by raising requests on the sender side that will then be
> > > handled synchronously over the target VCPU context.
> > > 
> > > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> > 
> > Why is kvm_emulate_halt being executed from
> > KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again?
> > 
> > Why is it not true that the only valid transition from
> > KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE?
> 
> See Paolo's table, it is. So why fix a race which should not be
> happening in the first place.

The bad transition happens exactly because of the race.
Are you saying you prefer the solution with cmpxchg?

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka March 6, 2013, 7:57 a.m. UTC | #15
On 2013-03-06 07:12, Paolo Bonzini wrote:
> 
>> On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote:
>>> On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote:
>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>
>>>> A VCPU sending INIT or SIPI to some other VCPU races for setting
>>>> the
>>>> remote VCPU's mp_state. When we were unlucky,
>>>> KVM_MP_STATE_INIT_RECEIVED
>>>> was overwritten by kvm_emulate_halt and, thus, got lost.
>>>>
>>>> Fix this by raising requests on the sender side that will then be
>>>> handled synchronously over the target VCPU context.
>>>>
>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>
>>> Why is kvm_emulate_halt being executed from
>>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again?
>>>
>>> Why is it not true that the only valid transition from
>>> KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE?
>>
>> See Paolo's table, it is. So why fix a race which should not be
>> happening in the first place.
> 
> The bad transition happens exactly because of the race.
> Are you saying you prefer the solution with cmpxchg?

I think we are past that point in our discussion and should really
separate signal (INIT/SIPI) from state (INIT/SIPI_RECEIVED etc.).

Jan
Marcelo Tosatti March 6, 2013, 9:19 p.m. UTC | #16
On Wed, Mar 06, 2013 at 01:12:52AM -0500, Paolo Bonzini wrote:
> 
> > On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote:
> > > On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote:
> > > > From: Jan Kiszka <jan.kiszka@siemens.com>
> > > > 
> > > > A VCPU sending INIT or SIPI to some other VCPU races for setting
> > > > the
> > > > remote VCPU's mp_state. When we were unlucky,
> > > > KVM_MP_STATE_INIT_RECEIVED
> > > > was overwritten by kvm_emulate_halt and, thus, got lost.
> > > > 
> > > > Fix this by raising requests on the sender side that will then be
> > > > handled synchronously over the target VCPU context.
> > > > 
> > > > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> > > 
> > > Why is kvm_emulate_halt being executed from
> > > KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again?
> > > 
> > > Why is it not true that the only valid transition from
> > > KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE?
> > 
> > See Paolo's table, it is. So why fix a race which should not be
> > happening in the first place.
> 
> The bad transition happens exactly because of the race.
> Are you saying you prefer the solution with cmpxchg?
> 
> Paolo

Vcpu should only invoke kvm_emulate_halt if it has been through a
KVM_MP_STATE_UNINITIALIZED ->  KVM_MP_STATE_INIT_RECEIVED ->
KVM_MP_STATE_SIPI_RECEIVED -> KVM_MP_STATE_RUNNABLE transition.

If it has been through that, how can a KVM_MP_STATE_INIT_RECEIVED be
overwritten? 

That is, KVM_MP_STATE_RUNNABLE (precondition for kvm_emulate_halt) is
only reachable if KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED
have not been overwritten. 

The point i'm trying to make is it appears the symptom is being fixed
with the proposed patch, not the root cause (which, if the reasoning
above is correct, is somewhere in nVMX code).


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti March 6, 2013, 9:30 p.m. UTC | #17
On Wed, Mar 06, 2013 at 08:57:54AM +0100, Jan Kiszka wrote:
> On 2013-03-06 07:12, Paolo Bonzini wrote:
> > 
> >> On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote:
> >>> On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote:
> >>>> From: Jan Kiszka <jan.kiszka@siemens.com>
> >>>>
> >>>> A VCPU sending INIT or SIPI to some other VCPU races for setting
> >>>> the
> >>>> remote VCPU's mp_state. When we were unlucky,
> >>>> KVM_MP_STATE_INIT_RECEIVED
> >>>> was overwritten by kvm_emulate_halt and, thus, got lost.
> >>>>
> >>>> Fix this by raising requests on the sender side that will then be
> >>>> handled synchronously over the target VCPU context.
> >>>>
> >>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >>>
> >>> Why is kvm_emulate_halt being executed from
> >>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again?
> >>>
> >>> Why is it not true that the only valid transition from
> >>> KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE?
> >>
> >> See Paolo's table, it is. So why fix a race which should not be
> >> happening in the first place.
> > 
> > The bad transition happens exactly because of the race.
> > Are you saying you prefer the solution with cmpxchg?
> 
> I think we are past that point in our discussion and should really
> separate signal (INIT/SIPI) from state (INIT/SIPI_RECEIVED etc.).
> 
> Jan

The sentence "KVM_MP_STATE_INIT_RECEIVED overwritten by
kvm_emulate_halt" is contradictory, unless i miss something.

The KVM_REQ_ solution is messy, should avoid introducing new request
bits if possible.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka March 6, 2013, 9:39 p.m. UTC | #18
On 2013-03-06 22:30, Marcelo Tosatti wrote:
> On Wed, Mar 06, 2013 at 08:57:54AM +0100, Jan Kiszka wrote:
>> On 2013-03-06 07:12, Paolo Bonzini wrote:
>>>
>>>> On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote:
>>>>> On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote:
>>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>
>>>>>> A VCPU sending INIT or SIPI to some other VCPU races for setting
>>>>>> the
>>>>>> remote VCPU's mp_state. When we were unlucky,
>>>>>> KVM_MP_STATE_INIT_RECEIVED
>>>>>> was overwritten by kvm_emulate_halt and, thus, got lost.
>>>>>>
>>>>>> Fix this by raising requests on the sender side that will then be
>>>>>> handled synchronously over the target VCPU context.
>>>>>>
>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>
>>>>> Why is kvm_emulate_halt being executed from
>>>>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again?
>>>>>
>>>>> Why is it not true that the only valid transition from
>>>>> KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE?
>>>>
>>>> See Paolo's table, it is. So why fix a race which should not be
>>>> happening in the first place.
>>>
>>> The bad transition happens exactly because of the race.
>>> Are you saying you prefer the solution with cmpxchg?
>>
>> I think we are past that point in our discussion and should really
>> separate signal (INIT/SIPI) from state (INIT/SIPI_RECEIVED etc.).
>>
>> Jan
> 
> The sentence "KVM_MP_STATE_INIT_RECEIVED overwritten by
> kvm_emulate_halt" is contradictory, unless i miss something.

http://thread.gmane.org/gmane.comp.emulators.kvm.devel/105638

Jan
Marcelo Tosatti March 6, 2013, 9:50 p.m. UTC | #19
On Wed, Mar 06, 2013 at 10:39:27PM +0100, Jan Kiszka wrote:
> On 2013-03-06 22:30, Marcelo Tosatti wrote:
> > On Wed, Mar 06, 2013 at 08:57:54AM +0100, Jan Kiszka wrote:
> >> On 2013-03-06 07:12, Paolo Bonzini wrote:
> >>>
> >>>> On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote:
> >>>>> On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote:
> >>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
> >>>>>>
> >>>>>> A VCPU sending INIT or SIPI to some other VCPU races for setting
> >>>>>> the
> >>>>>> remote VCPU's mp_state. When we were unlucky,
> >>>>>> KVM_MP_STATE_INIT_RECEIVED
> >>>>>> was overwritten by kvm_emulate_halt and, thus, got lost.
> >>>>>>
> >>>>>> Fix this by raising requests on the sender side that will then be
> >>>>>> handled synchronously over the target VCPU context.
> >>>>>>
> >>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >>>>>
> >>>>> Why is kvm_emulate_halt being executed from
> >>>>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again?
> >>>>>
> >>>>> Why is it not true that the only valid transition from
> >>>>> KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE?
> >>>>
> >>>> See Paolo's table, it is. So why fix a race which should not be
> >>>> happening in the first place.
> >>>
> >>> The bad transition happens exactly because of the race.
> >>> Are you saying you prefer the solution with cmpxchg?
> >>
> >> I think we are past that point in our discussion and should really
> >> separate signal (INIT/SIPI) from state (INIT/SIPI_RECEIVED etc.).
> >>
> >> Jan
> > 
> > The sentence "KVM_MP_STATE_INIT_RECEIVED overwritten by
> > kvm_emulate_halt" is contradictory, unless i miss something.
> 
> http://thread.gmane.org/gmane.comp.emulators.kvm.devel/105638
> 
> Jan

"A VCPU sending INIT or SIPI to some other VCPU races for setting the
remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
was overwritten by kvm_emulate_halt and, thus, got lost.


Fix this by raising requests on the sender side that will then be
handled synchronously over the target VCPU context."


The scenario you describe is:

vcpu0,bsp						vcpu1

vcpu0->mp_state=KVM_MP_STATE_RUNNABLE
							vcpu1->mp_state=KVM_MP_STATE_UNINIT

at __accept_apic_irq()
vcpu1->mp_state=KVM_MP_STATE_INIT_RECEIVED
							kvm_emulate_halt
							vcpu1->mp_state=
							KVM_MP_STATE_HALTED


This is what the first sentence from the patch refers to, correct?


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka March 6, 2013, 9:58 p.m. UTC | #20
On 2013-03-06 22:50, Marcelo Tosatti wrote:
> On Wed, Mar 06, 2013 at 10:39:27PM +0100, Jan Kiszka wrote:
>> On 2013-03-06 22:30, Marcelo Tosatti wrote:
>>> On Wed, Mar 06, 2013 at 08:57:54AM +0100, Jan Kiszka wrote:
>>>> On 2013-03-06 07:12, Paolo Bonzini wrote:
>>>>>
>>>>>> On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote:
>>>>>>> On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote:
>>>>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>>>
>>>>>>>> A VCPU sending INIT or SIPI to some other VCPU races for setting
>>>>>>>> the
>>>>>>>> remote VCPU's mp_state. When we were unlucky,
>>>>>>>> KVM_MP_STATE_INIT_RECEIVED
>>>>>>>> was overwritten by kvm_emulate_halt and, thus, got lost.
>>>>>>>>
>>>>>>>> Fix this by raising requests on the sender side that will then be
>>>>>>>> handled synchronously over the target VCPU context.
>>>>>>>>
>>>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>>
>>>>>>> Why is kvm_emulate_halt being executed from
>>>>>>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again?
>>>>>>>
>>>>>>> Why is it not true that the only valid transition from
>>>>>>> KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE?
>>>>>>
>>>>>> See Paolo's table, it is. So why fix a race which should not be
>>>>>> happening in the first place.
>>>>>
>>>>> The bad transition happens exactly because of the race.
>>>>> Are you saying you prefer the solution with cmpxchg?
>>>>
>>>> I think we are past that point in our discussion and should really
>>>> separate signal (INIT/SIPI) from state (INIT/SIPI_RECEIVED etc.).
>>>>
>>>> Jan
>>>
>>> The sentence "KVM_MP_STATE_INIT_RECEIVED overwritten by
>>> kvm_emulate_halt" is contradictory, unless i miss something.
>>
>> http://thread.gmane.org/gmane.comp.emulators.kvm.devel/105638
>>
>> Jan
> 
> "A VCPU sending INIT or SIPI to some other VCPU races for setting the
> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
> was overwritten by kvm_emulate_halt and, thus, got lost.
> 
> 
> Fix this by raising requests on the sender side that will then be
> handled synchronously over the target VCPU context."
> 
> 
> The scenario you describe is:
> 
> vcpu0,bsp						vcpu1
> 
> vcpu0->mp_state=KVM_MP_STATE_RUNNABLE

This is not related to the race.

> 							vcpu1->mp_state=KVM_MP_STATE_UNINIT

Nor this.

> 
> at __accept_apic_irq()
> vcpu1->mp_state=KVM_MP_STATE_INIT_RECEIVED
> 							kvm_emulate_halt
> 							vcpu1->mp_state=
> 							KVM_MP_STATE_HALTED

Just these two sequences. vcpu0 need not be the BSP, any VCPU can send
INIT at any time.

> 
> 
> This is what the first sentence from the patch refers to, correct?

The last part, yes. But there are more races due to the unsynchronized
access to mp_state in lapic.c.

I'm going to refactor my patch according to Gleb's and Paolos's
suggestions, trying to keep the signals lapic-local, adding services to
check for them and translate them into mp_state transitions over the
target vcpu context.

Jan
Paolo Bonzini March 6, 2013, 10:43 p.m. UTC | #21
Il 06/03/2013 22:19, Marcelo Tosatti ha scritto:
> Vcpu should only invoke kvm_emulate_halt if it has been through a
> KVM_MP_STATE_UNINITIALIZED ->  KVM_MP_STATE_INIT_RECEIVED ->
> KVM_MP_STATE_SIPI_RECEIVED -> KVM_MP_STATE_RUNNABLE transition.
> 
> If it has been through that, how can a KVM_MP_STATE_INIT_RECEIVED be
> overwritten? 
> 
> That is, KVM_MP_STATE_RUNNABLE (precondition for kvm_emulate_halt) is
> only reachable if KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED
> have not been overwritten. 

You can always go back to the KVM_MP_STATE_INIT_RECEIVED state; either
by an APIC write or by various soft resets (port 92h, keyboard
controller, port cf9h) that aren't emulated correctly right now.

Paolo

> The point i'm trying to make is it appears the symptom is being fixed
> with the proposed patch, not the root cause (which, if the reasoning
> above is correct, is somewhere in nVMX code).

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti March 6, 2013, 11:04 p.m. UTC | #22
On Wed, Mar 06, 2013 at 11:43:30PM +0100, Paolo Bonzini wrote:
> Il 06/03/2013 22:19, Marcelo Tosatti ha scritto:
> > Vcpu should only invoke kvm_emulate_halt if it has been through a
> > KVM_MP_STATE_UNINITIALIZED ->  KVM_MP_STATE_INIT_RECEIVED ->
> > KVM_MP_STATE_SIPI_RECEIVED -> KVM_MP_STATE_RUNNABLE transition.
> > 
> > If it has been through that, how can a KVM_MP_STATE_INIT_RECEIVED be
> > overwritten? 
> > 
> > That is, KVM_MP_STATE_RUNNABLE (precondition for kvm_emulate_halt) is
> > only reachable if KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED
> > have not been overwritten. 
> 
> You can always go back to the KVM_MP_STATE_INIT_RECEIVED state; either
> by an APIC write or by various soft resets (port 92h, keyboard
> controller, port cf9h) that aren't emulated correctly right now.

Indeed (and BSP not ignoring INIT is also broken in KVM, as you pointed
our earlier).

So the stress test in case is guest using APIC INIT after initial MP
initialization protocol (therefore reproducible without nVMX).

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 02b51dd..7986c9f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -731,8 +731,7 @@  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	case APIC_DM_INIT:
 		if (!trig_mode || level) {
 			result = 1;
-			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
-			kvm_make_request(KVM_REQ_EVENT, vcpu);
+			kvm_make_request(KVM_REQ_INIT, vcpu);
 			kvm_vcpu_kick(vcpu);
 		} else {
 			apic_debug("Ignoring de-assert INIT to vcpu %d\n",
@@ -743,11 +742,13 @@  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	case APIC_DM_STARTUP:
 		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
 			   vcpu->vcpu_id, vector);
-		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED ||
+		    test_bit(KVM_REQ_INIT, &vcpu->requests)) {
 			result = 1;
 			vcpu->arch.sipi_vector = vector;
-			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
-			kvm_make_request(KVM_REQ_EVENT, vcpu);
+			/* make sure sipi_vector is visible for the receiver */
+			smp_wmb();
+			kvm_make_request(KVM_REQ_SIPI, vcpu);
 			kvm_vcpu_kick(vcpu);
 		}
 		break;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d0cf737..0be04b9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5641,6 +5641,15 @@  static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
 }
 
+static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu)
+{
+	if (kvm_check_request(KVM_REQ_INIT, vcpu))
+		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+	if (kvm_check_request(KVM_REQ_SIPI, vcpu) &&
+	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+		vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -5649,6 +5658,11 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	bool req_immediate_exit = 0;
 
 	if (vcpu->requests) {
+		kvm_check_init_and_sipi(vcpu);
+		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+			r = 1;
+			goto out;
+		}
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
@@ -6977,6 +6991,7 @@  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
+	kvm_check_init_and_sipi(vcpu);
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 		!vcpu->arch.apf.halted)
 		|| !list_empty_careful(&vcpu->async_pf.done)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 722cae7..1a191c9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -124,6 +124,8 @@  static inline bool is_error_page(struct page *page)
 #define KVM_REQ_MCLOCK_INPROGRESS 20
 #define KVM_REQ_EPR_EXIT          21
 #define KVM_REQ_EOIBITMAP         22
+#define KVM_REQ_INIT              23
+#define KVM_REQ_SIPI              24
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1