Message ID | 51351517.3090600@web.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 2013-03-04 22:41, Jan Kiszka wrote: > From: Jan Kiszka <jan.kiszka@siemens.com> > > A VCPU sending INIT or SIPI to some other VCPU races for setting the > remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED > was overwritten by kvm_emulate_halt and, thus, got lost. > > Fix this by raising requests on the sender side that will then be > handled synchronously over the target VCPU context. > > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > --- > > Changes in v2: > - check transition to INIT_RECEIVED in vcpu_enter_guest > - removed return value of kvm_check_init_and_sipi - caller has to > check for relevant transition afterward > - add write barrier after setting sipi_vector > > arch/x86/kvm/lapic.c | 11 ++++++----- > arch/x86/kvm/x86.c | 15 +++++++++++++++ > include/linux/kvm_host.h | 2 ++ > 3 files changed, 23 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 02b51dd..7986c9f 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -731,8 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > case APIC_DM_INIT: > if (!trig_mode || level) { > result = 1; > - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > - kvm_make_request(KVM_REQ_EVENT, vcpu); > + kvm_make_request(KVM_REQ_INIT, vcpu); > kvm_vcpu_kick(vcpu); > } else { > apic_debug("Ignoring de-assert INIT to vcpu %d\n", > @@ -743,11 +742,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > case APIC_DM_STARTUP: > apic_debug("SIPI to vcpu %d vector 0x%02x\n", > vcpu->vcpu_id, vector); > - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || > + test_bit(KVM_REQ_INIT, &vcpu->requests)) { > result = 1; > vcpu->arch.sipi_vector = vector; > - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; > - kvm_make_request(KVM_REQ_EVENT, vcpu); > + /* make sure sipi_vector is visible for the receiver */ > + smp_wmb(); > + kvm_make_request(KVM_REQ_SIPI, vcpu); > kvm_vcpu_kick(vcpu); > } > break; > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index d0cf737..0be04b9 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -5641,6 +5641,15 @@ static void update_eoi_exitmap(struct kvm_vcpu *vcpu) > kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); > } > > +static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu) > +{ > + if (kvm_check_request(KVM_REQ_INIT, vcpu)) > + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; And here is a small race between clearing REQ_INIT and setting INIT_RECEIVED. It can make the LAPIC drop the SIPI incorrectly. Need to break up test and clear, doing the clear after mp_state update. Yeah... Jan > + if (kvm_check_request(KVM_REQ_SIPI, vcpu) && > + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) > + vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; > +} > + > static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > { > int r; > @@ -5649,6 +5658,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > bool req_immediate_exit = 0; > > if (vcpu->requests) { > + kvm_check_init_and_sipi(vcpu); > + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > + r = 1; > + goto out; > + } > if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) > kvm_mmu_unload(vcpu); > if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) > @@ -6977,6 +6991,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, > > int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) > { > + kvm_check_init_and_sipi(vcpu); > return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && > !vcpu->arch.apf.halted) > || !list_empty_careful(&vcpu->async_pf.done) > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 722cae7..1a191c9 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -124,6 +124,8 @@ static inline bool is_error_page(struct page *page) > #define KVM_REQ_MCLOCK_INPROGRESS 20 > #define KVM_REQ_EPR_EXIT 21 > #define KVM_REQ_EOIBITMAP 22 > +#define KVM_REQ_INIT 23 > +#define KVM_REQ_SIPI 24 > > #define KVM_USERSPACE_IRQ_SOURCE_ID 0 > #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 >
On Tue, Mar 05, 2013 at 12:00:10AM +0100, Jan Kiszka wrote: > On 2013-03-04 22:41, Jan Kiszka wrote: > > From: Jan Kiszka <jan.kiszka@siemens.com> > > > > A VCPU sending INIT or SIPI to some other VCPU races for setting the > > remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED > > was overwritten by kvm_emulate_halt and, thus, got lost. > > > > Fix this by raising requests on the sender side that will then be > > handled synchronously over the target VCPU context. > > > > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > > --- > > > > Changes in v2: > > - check transition to INIT_RECEIVED in vcpu_enter_guest > > - removed return value of kvm_check_init_and_sipi - caller has to > > check for relevant transition afterward > > - add write barrier after setting sipi_vector > > > > arch/x86/kvm/lapic.c | 11 ++++++----- > > arch/x86/kvm/x86.c | 15 +++++++++++++++ > > include/linux/kvm_host.h | 2 ++ > > 3 files changed, 23 insertions(+), 5 deletions(-) > > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > > index 02b51dd..7986c9f 100644 > > --- a/arch/x86/kvm/lapic.c > > +++ b/arch/x86/kvm/lapic.c > > @@ -731,8 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > > case APIC_DM_INIT: > > if (!trig_mode || level) { > > result = 1; > > - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > > - kvm_make_request(KVM_REQ_EVENT, vcpu); > > + kvm_make_request(KVM_REQ_INIT, vcpu); > > kvm_vcpu_kick(vcpu); > > } else { > > apic_debug("Ignoring de-assert INIT to vcpu %d\n", > > @@ -743,11 +742,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > > case APIC_DM_STARTUP: > > apic_debug("SIPI to vcpu %d vector 0x%02x\n", > > vcpu->vcpu_id, vector); > > - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > > + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || > > + test_bit(KVM_REQ_INIT, &vcpu->requests)) { > > result = 1; > > vcpu->arch.sipi_vector = vector; > > - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; > > - kvm_make_request(KVM_REQ_EVENT, vcpu); > > + /* make sure sipi_vector is visible for the receiver */ > > + smp_wmb(); > > + kvm_make_request(KVM_REQ_SIPI, vcpu); > > kvm_vcpu_kick(vcpu); > > } > > break; > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index d0cf737..0be04b9 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -5641,6 +5641,15 @@ static void update_eoi_exitmap(struct kvm_vcpu *vcpu) > > kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); > > } > > > > +static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu) > > +{ > > + if (kvm_check_request(KVM_REQ_INIT, vcpu)) > > + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > > And here is a small race between clearing REQ_INIT and setting > INIT_RECEIVED. It can make the LAPIC drop the SIPI incorrectly. Need to > break up test and clear, doing the clear after mp_state update. Yeah... > You also need to call kvm_check_init_and_sipi() in kvm_arch_vcpu_ioctl_get_mpstate(), which means you now have three places where you transfer INIT/SIPI state from requests to mp_state. All the problems arise from the fact that now you have two places where you are storing current state. To overcome this we can either deprecated KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state (use it only for migration purposes) and use separate state in APIC to hold those event, like with nmi, or why not go with Paolo's simple cmpxchg one? -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2013-03-05 08:57, Gleb Natapov wrote: > On Tue, Mar 05, 2013 at 12:00:10AM +0100, Jan Kiszka wrote: >> On 2013-03-04 22:41, Jan Kiszka wrote: >>> From: Jan Kiszka <jan.kiszka@siemens.com> >>> >>> A VCPU sending INIT or SIPI to some other VCPU races for setting the >>> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED >>> was overwritten by kvm_emulate_halt and, thus, got lost. >>> >>> Fix this by raising requests on the sender side that will then be >>> handled synchronously over the target VCPU context. >>> >>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> >>> --- >>> >>> Changes in v2: >>> - check transition to INIT_RECEIVED in vcpu_enter_guest >>> - removed return value of kvm_check_init_and_sipi - caller has to >>> check for relevant transition afterward >>> - add write barrier after setting sipi_vector >>> >>> arch/x86/kvm/lapic.c | 11 ++++++----- >>> arch/x86/kvm/x86.c | 15 +++++++++++++++ >>> include/linux/kvm_host.h | 2 ++ >>> 3 files changed, 23 insertions(+), 5 deletions(-) >>> >>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c >>> index 02b51dd..7986c9f 100644 >>> --- a/arch/x86/kvm/lapic.c >>> +++ b/arch/x86/kvm/lapic.c >>> @@ -731,8 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, >>> case APIC_DM_INIT: >>> if (!trig_mode || level) { >>> result = 1; >>> - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >>> - kvm_make_request(KVM_REQ_EVENT, vcpu); >>> + kvm_make_request(KVM_REQ_INIT, vcpu); >>> kvm_vcpu_kick(vcpu); >>> } else { >>> apic_debug("Ignoring de-assert INIT to vcpu %d\n", >>> @@ -743,11 +742,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, >>> case APIC_DM_STARTUP: >>> apic_debug("SIPI to vcpu %d vector 0x%02x\n", >>> vcpu->vcpu_id, vector); >>> - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { >>> + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || >>> + test_bit(KVM_REQ_INIT, &vcpu->requests)) { >>> result = 1; >>> vcpu->arch.sipi_vector = vector; >>> - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; >>> - kvm_make_request(KVM_REQ_EVENT, vcpu); >>> + /* make sure sipi_vector is visible for the receiver */ >>> + smp_wmb(); >>> + kvm_make_request(KVM_REQ_SIPI, vcpu); >>> kvm_vcpu_kick(vcpu); >>> } >>> break; >>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >>> index d0cf737..0be04b9 100644 >>> --- a/arch/x86/kvm/x86.c >>> +++ b/arch/x86/kvm/x86.c >>> @@ -5641,6 +5641,15 @@ static void update_eoi_exitmap(struct kvm_vcpu *vcpu) >>> kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); >>> } >>> >>> +static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu) >>> +{ >>> + if (kvm_check_request(KVM_REQ_INIT, vcpu)) >>> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >> >> And here is a small race between clearing REQ_INIT and setting >> INIT_RECEIVED. It can make the LAPIC drop the SIPI incorrectly. Need to >> break up test and clear, doing the clear after mp_state update. Yeah... >> > You also need to call kvm_check_init_and_sipi() in > kvm_arch_vcpu_ioctl_get_mpstate(), Indeed. > which means you now have three places > where you transfer INIT/SIPI state from requests to mp_state. All the > problems arise from the fact that now you have two places where you > are storing current state. Not at all. I'm keeping the state in a single place, mp_state. I just have to make sure that I do not loose asynchronous events - what INIT and SIPI are. > To overcome this we can either deprecated > KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state > (use it only for migration purposes) and use separate state in APIC > to hold those event, like with nmi, or why not go with Paolo's simple > cmpxchg one? We need to replace most, if not all, manipulations of mp_state with cmpxchg, verifying the state transitions there. And the request-based approach still looks cleaner to me when it comes to implementing INIT handling for nested modes. That will just trivially hook into kvm_check_init_and_sipi. Jan
On Tue, Mar 05, 2013 at 09:24:40AM +0100, Jan Kiszka wrote: > On 2013-03-05 08:57, Gleb Natapov wrote: > > On Tue, Mar 05, 2013 at 12:00:10AM +0100, Jan Kiszka wrote: > >> On 2013-03-04 22:41, Jan Kiszka wrote: > >>> From: Jan Kiszka <jan.kiszka@siemens.com> > >>> > >>> A VCPU sending INIT or SIPI to some other VCPU races for setting the > >>> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED > >>> was overwritten by kvm_emulate_halt and, thus, got lost. > >>> > >>> Fix this by raising requests on the sender side that will then be > >>> handled synchronously over the target VCPU context. > >>> > >>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > >>> --- > >>> > >>> Changes in v2: > >>> - check transition to INIT_RECEIVED in vcpu_enter_guest > >>> - removed return value of kvm_check_init_and_sipi - caller has to > >>> check for relevant transition afterward > >>> - add write barrier after setting sipi_vector > >>> > >>> arch/x86/kvm/lapic.c | 11 ++++++----- > >>> arch/x86/kvm/x86.c | 15 +++++++++++++++ > >>> include/linux/kvm_host.h | 2 ++ > >>> 3 files changed, 23 insertions(+), 5 deletions(-) > >>> > >>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > >>> index 02b51dd..7986c9f 100644 > >>> --- a/arch/x86/kvm/lapic.c > >>> +++ b/arch/x86/kvm/lapic.c > >>> @@ -731,8 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > >>> case APIC_DM_INIT: > >>> if (!trig_mode || level) { > >>> result = 1; > >>> - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > >>> - kvm_make_request(KVM_REQ_EVENT, vcpu); > >>> + kvm_make_request(KVM_REQ_INIT, vcpu); > >>> kvm_vcpu_kick(vcpu); > >>> } else { > >>> apic_debug("Ignoring de-assert INIT to vcpu %d\n", > >>> @@ -743,11 +742,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > >>> case APIC_DM_STARTUP: > >>> apic_debug("SIPI to vcpu %d vector 0x%02x\n", > >>> vcpu->vcpu_id, vector); > >>> - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > >>> + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || > >>> + test_bit(KVM_REQ_INIT, &vcpu->requests)) { > >>> result = 1; > >>> vcpu->arch.sipi_vector = vector; > >>> - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; > >>> - kvm_make_request(KVM_REQ_EVENT, vcpu); > >>> + /* make sure sipi_vector is visible for the receiver */ > >>> + smp_wmb(); > >>> + kvm_make_request(KVM_REQ_SIPI, vcpu); > >>> kvm_vcpu_kick(vcpu); > >>> } > >>> break; > >>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > >>> index d0cf737..0be04b9 100644 > >>> --- a/arch/x86/kvm/x86.c > >>> +++ b/arch/x86/kvm/x86.c > >>> @@ -5641,6 +5641,15 @@ static void update_eoi_exitmap(struct kvm_vcpu *vcpu) > >>> kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); > >>> } > >>> > >>> +static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu) > >>> +{ > >>> + if (kvm_check_request(KVM_REQ_INIT, vcpu)) > >>> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > >> > >> And here is a small race between clearing REQ_INIT and setting > >> INIT_RECEIVED. It can make the LAPIC drop the SIPI incorrectly. Need to > >> break up test and clear, doing the clear after mp_state update. Yeah... > >> > > You also need to call kvm_check_init_and_sipi() in > > kvm_arch_vcpu_ioctl_get_mpstate(), > > Indeed. > > > which means you now have three places > > where you transfer INIT/SIPI state from requests to mp_state. All the > > problems arise from the fact that now you have two places where you > > are storing current state. > > Not at all. I'm keeping the state in a single place, mp_state. I just > have to make sure that I do not loose asynchronous events - what INIT > and SIPI are. > As evident from this code: + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || + test_bit(KVM_REQ_INIT, &vcpu->requests)) { the state is in two places. > > To overcome this we can either deprecated > > KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state > > (use it only for migration purposes) and use separate state in APIC > > to hold those event, like with nmi, or why not go with Paolo's simple > > cmpxchg one? > > We need to replace most, if not all, manipulations of mp_state with > cmpxchg, verifying the state transitions there. And the request-based > approach still looks cleaner to me when it comes to implementing INIT > handling for nested modes. That will just trivially hook into > kvm_check_init_and_sipi. > The mp_state changes are rare, do not see the problem replacing all state changes with cmpxchg. I do not like request-based approach as implemented since we keep state in two places and constantly sync it back. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2013-03-05 09:46, Gleb Natapov wrote: > On Tue, Mar 05, 2013 at 09:24:40AM +0100, Jan Kiszka wrote: >> On 2013-03-05 08:57, Gleb Natapov wrote: >>> On Tue, Mar 05, 2013 at 12:00:10AM +0100, Jan Kiszka wrote: >>>> On 2013-03-04 22:41, Jan Kiszka wrote: >>>>> From: Jan Kiszka <jan.kiszka@siemens.com> >>>>> >>>>> A VCPU sending INIT or SIPI to some other VCPU races for setting the >>>>> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED >>>>> was overwritten by kvm_emulate_halt and, thus, got lost. >>>>> >>>>> Fix this by raising requests on the sender side that will then be >>>>> handled synchronously over the target VCPU context. >>>>> >>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> >>>>> --- >>>>> >>>>> Changes in v2: >>>>> - check transition to INIT_RECEIVED in vcpu_enter_guest >>>>> - removed return value of kvm_check_init_and_sipi - caller has to >>>>> check for relevant transition afterward >>>>> - add write barrier after setting sipi_vector >>>>> >>>>> arch/x86/kvm/lapic.c | 11 ++++++----- >>>>> arch/x86/kvm/x86.c | 15 +++++++++++++++ >>>>> include/linux/kvm_host.h | 2 ++ >>>>> 3 files changed, 23 insertions(+), 5 deletions(-) >>>>> >>>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c >>>>> index 02b51dd..7986c9f 100644 >>>>> --- a/arch/x86/kvm/lapic.c >>>>> +++ b/arch/x86/kvm/lapic.c >>>>> @@ -731,8 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, >>>>> case APIC_DM_INIT: >>>>> if (!trig_mode || level) { >>>>> result = 1; >>>>> - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >>>>> - kvm_make_request(KVM_REQ_EVENT, vcpu); >>>>> + kvm_make_request(KVM_REQ_INIT, vcpu); >>>>> kvm_vcpu_kick(vcpu); >>>>> } else { >>>>> apic_debug("Ignoring de-assert INIT to vcpu %d\n", >>>>> @@ -743,11 +742,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, >>>>> case APIC_DM_STARTUP: >>>>> apic_debug("SIPI to vcpu %d vector 0x%02x\n", >>>>> vcpu->vcpu_id, vector); >>>>> - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { >>>>> + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || >>>>> + test_bit(KVM_REQ_INIT, &vcpu->requests)) { >>>>> result = 1; >>>>> vcpu->arch.sipi_vector = vector; >>>>> - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; >>>>> - kvm_make_request(KVM_REQ_EVENT, vcpu); >>>>> + /* make sure sipi_vector is visible for the receiver */ >>>>> + smp_wmb(); >>>>> + kvm_make_request(KVM_REQ_SIPI, vcpu); >>>>> kvm_vcpu_kick(vcpu); >>>>> } >>>>> break; >>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >>>>> index d0cf737..0be04b9 100644 >>>>> --- a/arch/x86/kvm/x86.c >>>>> +++ b/arch/x86/kvm/x86.c >>>>> @@ -5641,6 +5641,15 @@ static void update_eoi_exitmap(struct kvm_vcpu *vcpu) >>>>> kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); >>>>> } >>>>> >>>>> +static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu) >>>>> +{ >>>>> + if (kvm_check_request(KVM_REQ_INIT, vcpu)) >>>>> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >>>> >>>> And here is a small race between clearing REQ_INIT and setting >>>> INIT_RECEIVED. It can make the LAPIC drop the SIPI incorrectly. Need to >>>> break up test and clear, doing the clear after mp_state update. Yeah... >>>> >>> You also need to call kvm_check_init_and_sipi() in >>> kvm_arch_vcpu_ioctl_get_mpstate(), >> >> Indeed. >> >>> which means you now have three places >>> where you transfer INIT/SIPI state from requests to mp_state. All the >>> problems arise from the fact that now you have two places where you >>> are storing current state. >> >> Not at all. I'm keeping the state in a single place, mp_state. I just >> have to make sure that I do not loose asynchronous events - what INIT >> and SIPI are. >> > As evident from this code: > + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || > + test_bit(KVM_REQ_INIT, &vcpu->requests)) { > the state is in two places. That's just to protect the content of sipi_vector during delivery. We could drop the complete if clause if we protected that variable differently. > >>> To overcome this we can either deprecated >>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state >>> (use it only for migration purposes) and use separate state in APIC >>> to hold those event, like with nmi, or why not go with Paolo's simple >>> cmpxchg one? >> >> We need to replace most, if not all, manipulations of mp_state with >> cmpxchg, verifying the state transitions there. And the request-based >> approach still looks cleaner to me when it comes to implementing INIT >> handling for nested modes. That will just trivially hook into >> kvm_check_init_and_sipi. >> > The mp_state changes are rare, do not see the problem replacing all > state changes with cmpxchg. I do not like request-based approach as > implemented since we keep state in two places and constantly sync it > back. And I like it more as it avoids spurious state changes toward INIT. That will happen if we misuse mp_state for event signaling, like we do so far, having to fix it up later again because the INIT event turned out to become an INIT VM-exit. Jan
On Tue, Mar 05, 2013 at 10:12:05AM +0100, Jan Kiszka wrote: > On 2013-03-05 09:46, Gleb Natapov wrote: > > On Tue, Mar 05, 2013 at 09:24:40AM +0100, Jan Kiszka wrote: > >> On 2013-03-05 08:57, Gleb Natapov wrote: > >>> On Tue, Mar 05, 2013 at 12:00:10AM +0100, Jan Kiszka wrote: > >>>> On 2013-03-04 22:41, Jan Kiszka wrote: > >>>>> From: Jan Kiszka <jan.kiszka@siemens.com> > >>>>> > >>>>> A VCPU sending INIT or SIPI to some other VCPU races for setting the > >>>>> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED > >>>>> was overwritten by kvm_emulate_halt and, thus, got lost. > >>>>> > >>>>> Fix this by raising requests on the sender side that will then be > >>>>> handled synchronously over the target VCPU context. > >>>>> > >>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > >>>>> --- > >>>>> > >>>>> Changes in v2: > >>>>> - check transition to INIT_RECEIVED in vcpu_enter_guest > >>>>> - removed return value of kvm_check_init_and_sipi - caller has to > >>>>> check for relevant transition afterward > >>>>> - add write barrier after setting sipi_vector > >>>>> > >>>>> arch/x86/kvm/lapic.c | 11 ++++++----- > >>>>> arch/x86/kvm/x86.c | 15 +++++++++++++++ > >>>>> include/linux/kvm_host.h | 2 ++ > >>>>> 3 files changed, 23 insertions(+), 5 deletions(-) > >>>>> > >>>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > >>>>> index 02b51dd..7986c9f 100644 > >>>>> --- a/arch/x86/kvm/lapic.c > >>>>> +++ b/arch/x86/kvm/lapic.c > >>>>> @@ -731,8 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > >>>>> case APIC_DM_INIT: > >>>>> if (!trig_mode || level) { > >>>>> result = 1; > >>>>> - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > >>>>> - kvm_make_request(KVM_REQ_EVENT, vcpu); > >>>>> + kvm_make_request(KVM_REQ_INIT, vcpu); > >>>>> kvm_vcpu_kick(vcpu); > >>>>> } else { > >>>>> apic_debug("Ignoring de-assert INIT to vcpu %d\n", > >>>>> @@ -743,11 +742,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > >>>>> case APIC_DM_STARTUP: > >>>>> apic_debug("SIPI to vcpu %d vector 0x%02x\n", > >>>>> vcpu->vcpu_id, vector); > >>>>> - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > >>>>> + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || > >>>>> + test_bit(KVM_REQ_INIT, &vcpu->requests)) { > >>>>> result = 1; > >>>>> vcpu->arch.sipi_vector = vector; > >>>>> - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; > >>>>> - kvm_make_request(KVM_REQ_EVENT, vcpu); > >>>>> + /* make sure sipi_vector is visible for the receiver */ > >>>>> + smp_wmb(); > >>>>> + kvm_make_request(KVM_REQ_SIPI, vcpu); > >>>>> kvm_vcpu_kick(vcpu); > >>>>> } > >>>>> break; > >>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > >>>>> index d0cf737..0be04b9 100644 > >>>>> --- a/arch/x86/kvm/x86.c > >>>>> +++ b/arch/x86/kvm/x86.c > >>>>> @@ -5641,6 +5641,15 @@ static void update_eoi_exitmap(struct kvm_vcpu *vcpu) > >>>>> kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); > >>>>> } > >>>>> > >>>>> +static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu) > >>>>> +{ > >>>>> + if (kvm_check_request(KVM_REQ_INIT, vcpu)) > >>>>> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > >>>> > >>>> And here is a small race between clearing REQ_INIT and setting > >>>> INIT_RECEIVED. It can make the LAPIC drop the SIPI incorrectly. Need to > >>>> break up test and clear, doing the clear after mp_state update. Yeah... > >>>> > >>> You also need to call kvm_check_init_and_sipi() in > >>> kvm_arch_vcpu_ioctl_get_mpstate(), > >> > >> Indeed. > >> > >>> which means you now have three places > >>> where you transfer INIT/SIPI state from requests to mp_state. All the > >>> problems arise from the fact that now you have two places where you > >>> are storing current state. > >> > >> Not at all. I'm keeping the state in a single place, mp_state. I just > >> have to make sure that I do not loose asynchronous events - what INIT > >> and SIPI are. > >> > > As evident from this code: > > + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || > > + test_bit(KVM_REQ_INIT, &vcpu->requests)) { > > the state is in two places. > > That's just to protect the content of sipi_vector during delivery. We > could drop the complete if clause if we protected that variable differently. > I understand why the code is here. I am saying that this is the evidence that the state is in two places. > > > >>> To overcome this we can either deprecated > >>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state > >>> (use it only for migration purposes) and use separate state in APIC > >>> to hold those event, like with nmi, or why not go with Paolo's simple > >>> cmpxchg one? > >> > >> We need to replace most, if not all, manipulations of mp_state with > >> cmpxchg, verifying the state transitions there. And the request-based > >> approach still looks cleaner to me when it comes to implementing INIT > >> handling for nested modes. That will just trivially hook into > >> kvm_check_init_and_sipi. > >> > > The mp_state changes are rare, do not see the problem replacing all > > state changes with cmpxchg. I do not like request-based approach as > > implemented since we keep state in two places and constantly sync it > > back. > > And I like it more as it avoids spurious state changes toward INIT. That > will happen if we misuse mp_state for event signaling, like we do so > far, having to fix it up later again because the INIT event turned out > to become an INIT VM-exit. > Yes, I agree we abuse mp_state for signaling. I do not want to abuse ->request for that too. So what about other idea about treating init/sipi just like any other APIC event (that's what they are) and add state to lapic to track init/sipi signaling? -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Il 05/03/2013 10:37, Gleb Natapov ha scritto: > > > > Not at all. I'm keeping the state in a single place, mp_state. I just > > > > have to make sure that I do not loose asynchronous events - what INIT > > > > and SIPI are. > > > > > > As evident from this code: > > > + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || > > > + test_bit(KVM_REQ_INIT, &vcpu->requests)) { > > > the state is in two places. > > That's just to protect the content of sipi_vector during delivery. Is that even needed? Can we just do an unconditional: vcpu->arch.sipi_vector = vector; /* make sure sipi_vector is visible for the receiver */ smp_wmb(); kvm_make_request(KVM_REQ_SIPI, vcpu); kvm_vcpu_kick(vcpu); and check in the request handler that we did get an INIT? > > We could drop the complete if clause if we protected that variable differently. > > I understand why the code is here. I am saying that this is the evidence > that the state is in two places. I agree with Gleb here. The state is in two places. I'm not saying that using requests is wrong, it sounds nice especially for nested INIT. But there is something nasty in the patches so far. BTW checking the requests in kvm_apic_has_interrupt seems nicer than synchronizing in kvm_arch_cpu_runnable; it is a bit ugly to have side effects in kvm_arch_cpu_runnable. Then you can actually _process_ the requests only in vcpu_enter_guest and kvm_arch_vcpu_ioctl_get_mpstate(). In kvm_arch_vcpu_ioctl_put_mpstate(), KVM_MP_STATE_SIPI_RECEIVED has to become KVM_MP_STATE_INIT_RECEIVED with KVM_REQ_SIPI set. >>> > > >>>>> > >>> To overcome this we can either deprecated >>>>> > >>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state >>>>> > >>> (use it only for migration purposes) and use separate state in APIC >>>>> > >>> to hold those event, like with nmi, or why not go with Paolo's simple >>>>> > >>> cmpxchg one? >>>> > >> >>>> > >> We need to replace most, if not all, manipulations of mp_state with >>>> > >> cmpxchg, verifying the state transitions there. Let's take again the matrix I sent yesterday, fixed to make UNINIT unreachable (that transition can only be done by userspace): from \ to RUNNABLE UNINIT INIT HALTED SIPI RUNNABLE n/a NO yes yes NO UNINIT NO n/a yes NO NO INIT NO NO yes NO yes HALTED yes NO yes n/a NO SIPI yes NO yes NO n/a We have: - anything -> INIT: this is asynchronous, and INIT always wins. No need for a cmpxchg. - RUNNABLE -> HALTED: cmpxchg needed - INIT -> SIPI: we can have a race between the last two interrupts in an INIT-INIT-SIPI (or INIT-SIPI-INIT) sequence, with these two interrupts sent by two different processors. The same race should also be present in real hardware, and should never happen (the BSP should send SIPIs to all other processors). No need for a cmpxchg. - HALTED -> RUNNABLE: racy vs. INIT too, cmpxchg needed - SIPI -> RUNNABLE: there is the same race with going back to INIT; no need for a cmpxchg. But it probably will not scale well with nested virt. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2013-03-05 11:50, Paolo Bonzini wrote: > Il 05/03/2013 10:37, Gleb Natapov ha scritto: >>>>> Not at all. I'm keeping the state in a single place, mp_state. I just >>>>> have to make sure that I do not loose asynchronous events - what INIT >>>>> and SIPI are. >>>> >>>> As evident from this code: >>>> + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || >>>> + test_bit(KVM_REQ_INIT, &vcpu->requests)) { >>>> the state is in two places. >>> That's just to protect the content of sipi_vector during delivery. > > Is that even needed? Can we just do an unconditional: > > vcpu->arch.sipi_vector = vector; > /* make sure sipi_vector is visible for the receiver */ > smp_wmb(); > kvm_make_request(KVM_REQ_SIPI, vcpu); > kvm_vcpu_kick(vcpu); > > and check in the request handler that we did get an INIT? > >>> We could drop the complete if clause if we protected that variable differently. >> >> I understand why the code is here. I am saying that this is the evidence >> that the state is in two places. > > I agree with Gleb here. The state is in two places. I'm not saying that > using requests is wrong, it sounds nice especially for nested INIT. But > there is something nasty in the patches so far. > > BTW checking the requests in kvm_apic_has_interrupt seems nicer than > synchronizing in kvm_arch_cpu_runnable; it is a bit ugly to have side > effects in kvm_arch_cpu_runnable. Then you can actually _process_ > the requests only in vcpu_enter_guest and kvm_arch_vcpu_ioctl_get_mpstate(). > In kvm_arch_vcpu_ioctl_put_mpstate(), KVM_MP_STATE_SIPI_RECEIVED has to > become KVM_MP_STATE_INIT_RECEIVED with KVM_REQ_SIPI set. An INIT or SIPI signal is not really an interrupt in the sense that kvm_cpu_has_interrupt or kvm_get_apic_interrupt expect (the current users of kvm_apic_has_interrupt). I think we would move some ugliness around this way, not yet resolve it. But it probably makes sense to refactor the kvm_check_init_and_sipi service to something that the APIC provide (is that what you are thinking of, Gleb?). That will not reduce the number of places where we have to check, but it should encapsulate things a bit cleaner. Jan
On Tue, Mar 05, 2013 at 11:50:58AM +0100, Paolo Bonzini wrote: > Il 05/03/2013 10:37, Gleb Natapov ha scritto: > > > > > Not at all. I'm keeping the state in a single place, mp_state. I just > > > > > have to make sure that I do not loose asynchronous events - what INIT > > > > > and SIPI are. > > > > > > > > As evident from this code: > > > > + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || > > > > + test_bit(KVM_REQ_INIT, &vcpu->requests)) { > > > > the state is in two places. > > > That's just to protect the content of sipi_vector during delivery. > > Is that even needed? Can we just do an unconditional: > > vcpu->arch.sipi_vector = vector; > /* make sure sipi_vector is visible for the receiver */ > smp_wmb(); > kvm_make_request(KVM_REQ_SIPI, vcpu); > kvm_vcpu_kick(vcpu); > > and check in the request handler that we did get an INIT? > INIT may come after SIPI and vcpu will start anyway. > > > We could drop the complete if clause if we protected that variable differently. > > > > I understand why the code is here. I am saying that this is the evidence > > that the state is in two places. > > I agree with Gleb here. The state is in two places. I'm not saying that > using requests is wrong, it sounds nice especially for nested INIT. But > there is something nasty in the patches so far. > > BTW checking the requests in kvm_apic_has_interrupt seems nicer than > synchronizing in kvm_arch_cpu_runnable; it is a bit ugly to have side > effects in kvm_arch_cpu_runnable. Then you can actually _process_ I subscribe to the idea of kvm_arch_cpu_runnable() that does not change state that it queries. Otherwise we do quantum programming: state changes by examination. But kvm_apic_has_interrupt() is called by kvm_arch_vcpu_runnable() only if interrupts are enabled. > the requests only in vcpu_enter_guest and kvm_arch_vcpu_ioctl_get_mpstate(). > In kvm_arch_vcpu_ioctl_put_mpstate(), KVM_MP_STATE_SIPI_RECEIVED has to > become KVM_MP_STATE_INIT_RECEIVED with KVM_REQ_SIPI set. > > >>> > > > >>>>> > >>> To overcome this we can either deprecated > >>>>> > >>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED values for mp_state > >>>>> > >>> (use it only for migration purposes) and use separate state in APIC > >>>>> > >>> to hold those event, like with nmi, or why not go with Paolo's simple > >>>>> > >>> cmpxchg one? > >>>> > >> > >>>> > >> We need to replace most, if not all, manipulations of mp_state with > >>>> > >> cmpxchg, verifying the state transitions there. > > Let's take again the matrix I sent yesterday, fixed to make UNINIT > unreachable (that transition can only be done by userspace): > > from \ to RUNNABLE UNINIT INIT HALTED SIPI > RUNNABLE n/a NO yes yes NO > UNINIT NO n/a yes NO NO > INIT NO NO yes NO yes > HALTED yes NO yes n/a NO > SIPI yes NO yes NO n/a > > We have: > > - anything -> INIT: this is asynchronous, and INIT always wins. No need > for a cmpxchg. > > - RUNNABLE -> HALTED: cmpxchg needed > > - INIT -> SIPI: we can have a race between the last two interrupts in an > INIT-INIT-SIPI (or INIT-SIPI-INIT) sequence, with these two interrupts sent > by two different processors. The same race should also be present in real > hardware, and should never happen (the BSP should send SIPIs to all other > processors). No need for a cmpxchg. > > - HALTED -> RUNNABLE: racy vs. INIT too, cmpxchg needed > > - SIPI -> RUNNABLE: there is the same race with going back to INIT; no need > for a cmpxchg. > > > But it probably will not scale well with nested virt. > I do not think the problem for nested virt with cmpxchg() approach is scalability. The problem is that with nested virt INIT behaves like regular even that may cause vmexit or be delayed. Changing vcpu state during event generation does not work if vcpu it in a guest mode. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Mar 05, 2013 at 02:25:51PM +0100, Jan Kiszka wrote: > On 2013-03-05 11:50, Paolo Bonzini wrote: > > Il 05/03/2013 10:37, Gleb Natapov ha scritto: > >>>>> Not at all. I'm keeping the state in a single place, mp_state. I just > >>>>> have to make sure that I do not loose asynchronous events - what INIT > >>>>> and SIPI are. > >>>> > >>>> As evident from this code: > >>>> + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || > >>>> + test_bit(KVM_REQ_INIT, &vcpu->requests)) { > >>>> the state is in two places. > >>> That's just to protect the content of sipi_vector during delivery. > > > > Is that even needed? Can we just do an unconditional: > > > > vcpu->arch.sipi_vector = vector; > > /* make sure sipi_vector is visible for the receiver */ > > smp_wmb(); > > kvm_make_request(KVM_REQ_SIPI, vcpu); > > kvm_vcpu_kick(vcpu); > > > > and check in the request handler that we did get an INIT? > > > >>> We could drop the complete if clause if we protected that variable differently. > >> > >> I understand why the code is here. I am saying that this is the evidence > >> that the state is in two places. > > > > I agree with Gleb here. The state is in two places. I'm not saying that > > using requests is wrong, it sounds nice especially for nested INIT. But > > there is something nasty in the patches so far. > > > > BTW checking the requests in kvm_apic_has_interrupt seems nicer than > > synchronizing in kvm_arch_cpu_runnable; it is a bit ugly to have side > > effects in kvm_arch_cpu_runnable. Then you can actually _process_ > > the requests only in vcpu_enter_guest and kvm_arch_vcpu_ioctl_get_mpstate(). > > In kvm_arch_vcpu_ioctl_put_mpstate(), KVM_MP_STATE_SIPI_RECEIVED has to > > become KVM_MP_STATE_INIT_RECEIVED with KVM_REQ_SIPI set. > > An INIT or SIPI signal is not really an interrupt in the sense that > kvm_cpu_has_interrupt or kvm_get_apic_interrupt expect (the current > users of kvm_apic_has_interrupt). I think we would move some ugliness > around this way, not yet resolve it. > > But it probably makes sense to refactor the kvm_check_init_and_sipi > service to something that the APIC provide (is that what you are > thinking of, Gleb?). That will not reduce the number of places where we > have to check, but it should encapsulate things a bit cleaner. > Yes, that what I am thinking. And yes I do not think it will reduce number of places we need to check it much (or at all) unfortunately. But if we want to hold INIT signal pending (until vmxoff for instance) we cannot do that in ->requests anyway. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote: > From: Jan Kiszka <jan.kiszka@siemens.com> > > A VCPU sending INIT or SIPI to some other VCPU races for setting the > remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED > was overwritten by kvm_emulate_halt and, thus, got lost. > > Fix this by raising requests on the sender side that will then be > handled synchronously over the target VCPU context. > > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> Why is kvm_emulate_halt being executed from KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again? Why is it not true that the only valid transition from KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE? It would be good for KVM_MP_STATE_HALTED to indicate "guest executed HLT instruction" (which is impossible without INIT/SIPI being received). -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote: > On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote: > > From: Jan Kiszka <jan.kiszka@siemens.com> > > > > A VCPU sending INIT or SIPI to some other VCPU races for setting the > > remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED > > was overwritten by kvm_emulate_halt and, thus, got lost. > > > > Fix this by raising requests on the sender side that will then be > > handled synchronously over the target VCPU context. > > > > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > > Why is kvm_emulate_halt being executed from > KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again? > > Why is it not true that the only valid transition from s/from/to/ > KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE? > > It would be good for KVM_MP_STATE_HALTED to indicate > "guest executed HLT instruction" (which is impossible without INIT/SIPI > being received). -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote: > On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote: > > From: Jan Kiszka <jan.kiszka@siemens.com> > > > > A VCPU sending INIT or SIPI to some other VCPU races for setting the > > remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED > > was overwritten by kvm_emulate_halt and, thus, got lost. > > > > Fix this by raising requests on the sender side that will then be > > handled synchronously over the target VCPU context. > > > > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > > Why is kvm_emulate_halt being executed from > KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again? > > Why is it not true that the only valid transition from > KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE? See Paolo's table, it is. So why fix a race which should not be happening in the first place. > It would be good for KVM_MP_STATE_HALTED to indicate > "guest executed HLT instruction" (which is impossible without INIT/SIPI > being received). -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote: > > On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote: > > > From: Jan Kiszka <jan.kiszka@siemens.com> > > > > > > A VCPU sending INIT or SIPI to some other VCPU races for setting > > > the > > > remote VCPU's mp_state. When we were unlucky, > > > KVM_MP_STATE_INIT_RECEIVED > > > was overwritten by kvm_emulate_halt and, thus, got lost. > > > > > > Fix this by raising requests on the sender side that will then be > > > handled synchronously over the target VCPU context. > > > > > > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > > > > Why is kvm_emulate_halt being executed from > > KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again? > > > > Why is it not true that the only valid transition from > > KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE? > > See Paolo's table, it is. So why fix a race which should not be > happening in the first place. The bad transition happens exactly because of the race. Are you saying you prefer the solution with cmpxchg? Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2013-03-06 07:12, Paolo Bonzini wrote: > >> On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote: >>> On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote: >>>> From: Jan Kiszka <jan.kiszka@siemens.com> >>>> >>>> A VCPU sending INIT or SIPI to some other VCPU races for setting >>>> the >>>> remote VCPU's mp_state. When we were unlucky, >>>> KVM_MP_STATE_INIT_RECEIVED >>>> was overwritten by kvm_emulate_halt and, thus, got lost. >>>> >>>> Fix this by raising requests on the sender side that will then be >>>> handled synchronously over the target VCPU context. >>>> >>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> >>> >>> Why is kvm_emulate_halt being executed from >>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again? >>> >>> Why is it not true that the only valid transition from >>> KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE? >> >> See Paolo's table, it is. So why fix a race which should not be >> happening in the first place. > > The bad transition happens exactly because of the race. > Are you saying you prefer the solution with cmpxchg? I think we are past that point in our discussion and should really separate signal (INIT/SIPI) from state (INIT/SIPI_RECEIVED etc.). Jan
On Wed, Mar 06, 2013 at 01:12:52AM -0500, Paolo Bonzini wrote: > > > On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote: > > > On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote: > > > > From: Jan Kiszka <jan.kiszka@siemens.com> > > > > > > > > A VCPU sending INIT or SIPI to some other VCPU races for setting > > > > the > > > > remote VCPU's mp_state. When we were unlucky, > > > > KVM_MP_STATE_INIT_RECEIVED > > > > was overwritten by kvm_emulate_halt and, thus, got lost. > > > > > > > > Fix this by raising requests on the sender side that will then be > > > > handled synchronously over the target VCPU context. > > > > > > > > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > > > > > > Why is kvm_emulate_halt being executed from > > > KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again? > > > > > > Why is it not true that the only valid transition from > > > KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE? > > > > See Paolo's table, it is. So why fix a race which should not be > > happening in the first place. > > The bad transition happens exactly because of the race. > Are you saying you prefer the solution with cmpxchg? > > Paolo Vcpu should only invoke kvm_emulate_halt if it has been through a KVM_MP_STATE_UNINITIALIZED -> KVM_MP_STATE_INIT_RECEIVED -> KVM_MP_STATE_SIPI_RECEIVED -> KVM_MP_STATE_RUNNABLE transition. If it has been through that, how can a KVM_MP_STATE_INIT_RECEIVED be overwritten? That is, KVM_MP_STATE_RUNNABLE (precondition for kvm_emulate_halt) is only reachable if KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED have not been overwritten. The point i'm trying to make is it appears the symptom is being fixed with the proposed patch, not the root cause (which, if the reasoning above is correct, is somewhere in nVMX code). -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Mar 06, 2013 at 08:57:54AM +0100, Jan Kiszka wrote: > On 2013-03-06 07:12, Paolo Bonzini wrote: > > > >> On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote: > >>> On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote: > >>>> From: Jan Kiszka <jan.kiszka@siemens.com> > >>>> > >>>> A VCPU sending INIT or SIPI to some other VCPU races for setting > >>>> the > >>>> remote VCPU's mp_state. When we were unlucky, > >>>> KVM_MP_STATE_INIT_RECEIVED > >>>> was overwritten by kvm_emulate_halt and, thus, got lost. > >>>> > >>>> Fix this by raising requests on the sender side that will then be > >>>> handled synchronously over the target VCPU context. > >>>> > >>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > >>> > >>> Why is kvm_emulate_halt being executed from > >>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again? > >>> > >>> Why is it not true that the only valid transition from > >>> KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE? > >> > >> See Paolo's table, it is. So why fix a race which should not be > >> happening in the first place. > > > > The bad transition happens exactly because of the race. > > Are you saying you prefer the solution with cmpxchg? > > I think we are past that point in our discussion and should really > separate signal (INIT/SIPI) from state (INIT/SIPI_RECEIVED etc.). > > Jan The sentence "KVM_MP_STATE_INIT_RECEIVED overwritten by kvm_emulate_halt" is contradictory, unless i miss something. The KVM_REQ_ solution is messy, should avoid introducing new request bits if possible. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2013-03-06 22:30, Marcelo Tosatti wrote: > On Wed, Mar 06, 2013 at 08:57:54AM +0100, Jan Kiszka wrote: >> On 2013-03-06 07:12, Paolo Bonzini wrote: >>> >>>> On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote: >>>>> On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote: >>>>>> From: Jan Kiszka <jan.kiszka@siemens.com> >>>>>> >>>>>> A VCPU sending INIT or SIPI to some other VCPU races for setting >>>>>> the >>>>>> remote VCPU's mp_state. When we were unlucky, >>>>>> KVM_MP_STATE_INIT_RECEIVED >>>>>> was overwritten by kvm_emulate_halt and, thus, got lost. >>>>>> >>>>>> Fix this by raising requests on the sender side that will then be >>>>>> handled synchronously over the target VCPU context. >>>>>> >>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> >>>>> >>>>> Why is kvm_emulate_halt being executed from >>>>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again? >>>>> >>>>> Why is it not true that the only valid transition from >>>>> KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE? >>>> >>>> See Paolo's table, it is. So why fix a race which should not be >>>> happening in the first place. >>> >>> The bad transition happens exactly because of the race. >>> Are you saying you prefer the solution with cmpxchg? >> >> I think we are past that point in our discussion and should really >> separate signal (INIT/SIPI) from state (INIT/SIPI_RECEIVED etc.). >> >> Jan > > The sentence "KVM_MP_STATE_INIT_RECEIVED overwritten by > kvm_emulate_halt" is contradictory, unless i miss something. http://thread.gmane.org/gmane.comp.emulators.kvm.devel/105638 Jan
On Wed, Mar 06, 2013 at 10:39:27PM +0100, Jan Kiszka wrote: > On 2013-03-06 22:30, Marcelo Tosatti wrote: > > On Wed, Mar 06, 2013 at 08:57:54AM +0100, Jan Kiszka wrote: > >> On 2013-03-06 07:12, Paolo Bonzini wrote: > >>> > >>>> On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote: > >>>>> On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote: > >>>>>> From: Jan Kiszka <jan.kiszka@siemens.com> > >>>>>> > >>>>>> A VCPU sending INIT or SIPI to some other VCPU races for setting > >>>>>> the > >>>>>> remote VCPU's mp_state. When we were unlucky, > >>>>>> KVM_MP_STATE_INIT_RECEIVED > >>>>>> was overwritten by kvm_emulate_halt and, thus, got lost. > >>>>>> > >>>>>> Fix this by raising requests on the sender side that will then be > >>>>>> handled synchronously over the target VCPU context. > >>>>>> > >>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > >>>>> > >>>>> Why is kvm_emulate_halt being executed from > >>>>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again? > >>>>> > >>>>> Why is it not true that the only valid transition from > >>>>> KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE? > >>>> > >>>> See Paolo's table, it is. So why fix a race which should not be > >>>> happening in the first place. > >>> > >>> The bad transition happens exactly because of the race. > >>> Are you saying you prefer the solution with cmpxchg? > >> > >> I think we are past that point in our discussion and should really > >> separate signal (INIT/SIPI) from state (INIT/SIPI_RECEIVED etc.). > >> > >> Jan > > > > The sentence "KVM_MP_STATE_INIT_RECEIVED overwritten by > > kvm_emulate_halt" is contradictory, unless i miss something. > > http://thread.gmane.org/gmane.comp.emulators.kvm.devel/105638 > > Jan "A VCPU sending INIT or SIPI to some other VCPU races for setting the remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED was overwritten by kvm_emulate_halt and, thus, got lost. Fix this by raising requests on the sender side that will then be handled synchronously over the target VCPU context." The scenario you describe is: vcpu0,bsp vcpu1 vcpu0->mp_state=KVM_MP_STATE_RUNNABLE vcpu1->mp_state=KVM_MP_STATE_UNINIT at __accept_apic_irq() vcpu1->mp_state=KVM_MP_STATE_INIT_RECEIVED kvm_emulate_halt vcpu1->mp_state= KVM_MP_STATE_HALTED This is what the first sentence from the patch refers to, correct? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2013-03-06 22:50, Marcelo Tosatti wrote: > On Wed, Mar 06, 2013 at 10:39:27PM +0100, Jan Kiszka wrote: >> On 2013-03-06 22:30, Marcelo Tosatti wrote: >>> On Wed, Mar 06, 2013 at 08:57:54AM +0100, Jan Kiszka wrote: >>>> On 2013-03-06 07:12, Paolo Bonzini wrote: >>>>> >>>>>> On Tue, Mar 05, 2013 at 08:16:41PM -0300, Marcelo Tosatti wrote: >>>>>>> On Mon, Mar 04, 2013 at 10:41:43PM +0100, Jan Kiszka wrote: >>>>>>>> From: Jan Kiszka <jan.kiszka@siemens.com> >>>>>>>> >>>>>>>> A VCPU sending INIT or SIPI to some other VCPU races for setting >>>>>>>> the >>>>>>>> remote VCPU's mp_state. When we were unlucky, >>>>>>>> KVM_MP_STATE_INIT_RECEIVED >>>>>>>> was overwritten by kvm_emulate_halt and, thus, got lost. >>>>>>>> >>>>>>>> Fix this by raising requests on the sender side that will then be >>>>>>>> handled synchronously over the target VCPU context. >>>>>>>> >>>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> >>>>>>> >>>>>>> Why is kvm_emulate_halt being executed from >>>>>>> KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED again? >>>>>>> >>>>>>> Why is it not true that the only valid transition from >>>>>>> KVM_MP_STATE_HALTED is from KVM_MP_STATE_RUNNABLE? >>>>>> >>>>>> See Paolo's table, it is. So why fix a race which should not be >>>>>> happening in the first place. >>>>> >>>>> The bad transition happens exactly because of the race. >>>>> Are you saying you prefer the solution with cmpxchg? >>>> >>>> I think we are past that point in our discussion and should really >>>> separate signal (INIT/SIPI) from state (INIT/SIPI_RECEIVED etc.). >>>> >>>> Jan >>> >>> The sentence "KVM_MP_STATE_INIT_RECEIVED overwritten by >>> kvm_emulate_halt" is contradictory, unless i miss something. >> >> http://thread.gmane.org/gmane.comp.emulators.kvm.devel/105638 >> >> Jan > > "A VCPU sending INIT or SIPI to some other VCPU races for setting the > remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED > was overwritten by kvm_emulate_halt and, thus, got lost. > > > Fix this by raising requests on the sender side that will then be > handled synchronously over the target VCPU context." > > > The scenario you describe is: > > vcpu0,bsp vcpu1 > > vcpu0->mp_state=KVM_MP_STATE_RUNNABLE This is not related to the race. > vcpu1->mp_state=KVM_MP_STATE_UNINIT Nor this. > > at __accept_apic_irq() > vcpu1->mp_state=KVM_MP_STATE_INIT_RECEIVED > kvm_emulate_halt > vcpu1->mp_state= > KVM_MP_STATE_HALTED Just these two sequences. vcpu0 need not be the BSP, any VCPU can send INIT at any time. > > > This is what the first sentence from the patch refers to, correct? The last part, yes. But there are more races due to the unsynchronized access to mp_state in lapic.c. I'm going to refactor my patch according to Gleb's and Paolos's suggestions, trying to keep the signals lapic-local, adding services to check for them and translate them into mp_state transitions over the target vcpu context. Jan
Il 06/03/2013 22:19, Marcelo Tosatti ha scritto: > Vcpu should only invoke kvm_emulate_halt if it has been through a > KVM_MP_STATE_UNINITIALIZED -> KVM_MP_STATE_INIT_RECEIVED -> > KVM_MP_STATE_SIPI_RECEIVED -> KVM_MP_STATE_RUNNABLE transition. > > If it has been through that, how can a KVM_MP_STATE_INIT_RECEIVED be > overwritten? > > That is, KVM_MP_STATE_RUNNABLE (precondition for kvm_emulate_halt) is > only reachable if KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED > have not been overwritten. You can always go back to the KVM_MP_STATE_INIT_RECEIVED state; either by an APIC write or by various soft resets (port 92h, keyboard controller, port cf9h) that aren't emulated correctly right now. Paolo > The point i'm trying to make is it appears the symptom is being fixed > with the proposed patch, not the root cause (which, if the reasoning > above is correct, is somewhere in nVMX code). -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Mar 06, 2013 at 11:43:30PM +0100, Paolo Bonzini wrote: > Il 06/03/2013 22:19, Marcelo Tosatti ha scritto: > > Vcpu should only invoke kvm_emulate_halt if it has been through a > > KVM_MP_STATE_UNINITIALIZED -> KVM_MP_STATE_INIT_RECEIVED -> > > KVM_MP_STATE_SIPI_RECEIVED -> KVM_MP_STATE_RUNNABLE transition. > > > > If it has been through that, how can a KVM_MP_STATE_INIT_RECEIVED be > > overwritten? > > > > That is, KVM_MP_STATE_RUNNABLE (precondition for kvm_emulate_halt) is > > only reachable if KVM_MP_STATE_INIT_RECEIVED/KVM_MP_STATE_SIPI_RECEIVED > > have not been overwritten. > > You can always go back to the KVM_MP_STATE_INIT_RECEIVED state; either > by an APIC write or by various soft resets (port 92h, keyboard > controller, port cf9h) that aren't emulated correctly right now. Indeed (and BSP not ignoring INIT is also broken in KVM, as you pointed our earlier). So the stress test in case is guest using APIC INIT after initial MP initialization protocol (therefore reproducible without nVMX). -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 02b51dd..7986c9f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -731,8 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_INIT: if (!trig_mode || level) { result = 1; - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(KVM_REQ_INIT, vcpu); kvm_vcpu_kick(vcpu); } else { apic_debug("Ignoring de-assert INIT to vcpu %d\n", @@ -743,11 +742,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_STARTUP: apic_debug("SIPI to vcpu %d vector 0x%02x\n", vcpu->vcpu_id, vector); - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED || + test_bit(KVM_REQ_INIT, &vcpu->requests)) { result = 1; vcpu->arch.sipi_vector = vector; - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; - kvm_make_request(KVM_REQ_EVENT, vcpu); + /* make sure sipi_vector is visible for the receiver */ + smp_wmb(); + kvm_make_request(KVM_REQ_SIPI, vcpu); kvm_vcpu_kick(vcpu); } break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d0cf737..0be04b9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5641,6 +5641,15 @@ static void update_eoi_exitmap(struct kvm_vcpu *vcpu) kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } +static void kvm_check_init_and_sipi(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_INIT, vcpu)) + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + if (kvm_check_request(KVM_REQ_SIPI, vcpu) && + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) + vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; @@ -5649,6 +5658,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = 0; if (vcpu->requests) { + kvm_check_init_and_sipi(vcpu); + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + r = 1; + goto out; + } if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -6977,6 +6991,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { + kvm_check_init_and_sipi(vcpu); return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 722cae7..1a191c9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -124,6 +124,8 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_MCLOCK_INPROGRESS 20 #define KVM_REQ_EPR_EXIT 21 #define KVM_REQ_EOIBITMAP 22 +#define KVM_REQ_INIT 23 +#define KVM_REQ_SIPI 24 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1