diff mbox

[v5,3/4] kvm: Create kvm_clear_irq()

Message ID 20120718062742.GH6479@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Gleb Natapov July 18, 2012, 6:27 a.m. UTC
On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
> > _Seems_ racy, or _is_ racy?  Please identify the race.
> 
> Look at this:
> 
> static inline int kvm_irq_line_state(unsigned long *irq_state,
>                                      int irq_source_id, int level)
> {
>         /* Logical OR for level trig interrupt */
>         if (level)
>                 set_bit(irq_source_id, irq_state);
>         else
>                 clear_bit(irq_source_id, irq_state);
> 
>         return !!(*irq_state);
> }
> 
> 
> Now:
> If other CPU changes some other bit after the atomic change,
> it looks like !!(*irq_state) might return a stale value.
> 
> CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> If CPU 0 sees a stale value now it will return 0 here
> and interrupt will get cleared.
> 
This will hardly happen on x86 especially since bit is set with
serialized instruction. But there is actually a race here.
CPU 0 clears bit 0. CPU 0 read irq_state as 0. CPU 1 sets level to 1.
CPU 1 calls kvm_ioapic_set_irq(1). CPU 0 calls kvm_ioapic_set_irq(0).
Now ioapic thinks the level is 0 but irq_state is not 0.

This untested and un-compiled patch should fix it.


--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Michael S. Tsirkin July 18, 2012, 10:20 a.m. UTC | #1
On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
> On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
> > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > 
> > Look at this:
> > 
> > static inline int kvm_irq_line_state(unsigned long *irq_state,
> >                                      int irq_source_id, int level)
> > {
> >         /* Logical OR for level trig interrupt */
> >         if (level)
> >                 set_bit(irq_source_id, irq_state);
> >         else
> >                 clear_bit(irq_source_id, irq_state);
> > 
> >         return !!(*irq_state);
> > }
> > 
> > 
> > Now:
> > If other CPU changes some other bit after the atomic change,
> > it looks like !!(*irq_state) might return a stale value.
> > 
> > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > If CPU 0 sees a stale value now it will return 0 here
> > and interrupt will get cleared.
> > 
> This will hardly happen on x86 especially since bit is set with
> serialized instruction.

Probably. But it does make me a bit uneasy.  Why don't we pass
irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
__set_bit/__clear_bit in kvm_irq_line_state, making the ordering simpler
and saving an atomic op in the process.

> But there is actually a race here.
> CPU 0 clears bit 0. CPU 0 read irq_state as 0. CPU 1 sets level to 1.
> CPU 1 calls kvm_ioapic_set_irq(1). CPU 0 calls kvm_ioapic_set_irq(0).
> Now ioapic thinks the level is 0 but irq_state is not 0.
> 
> This untested and un-compiled patch should fix it.
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ef91d79..e22c78b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -825,7 +825,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>  void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
>  bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
>  
> -int kvm_pic_set_irq(void *opaque, int irq, int level);
> +int kvm_pic_set_irq(void *opaque, int irq);
>  
>  void kvm_inject_nmi(struct kvm_vcpu *vcpu);
>  
> diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
> index 81cf4fa..0d6988f 100644
> --- a/arch/x86/kvm/i8259.c
> +++ b/arch/x86/kvm/i8259.c
> @@ -188,12 +188,13 @@ void kvm_pic_update_irq(struct kvm_pic *s)
>  	pic_unlock(s);
>  }
>  
> -int kvm_pic_set_irq(void *opaque, int irq, int level)
> +int kvm_pic_set_irq(void *opaque, int irq)
>  {
>  	struct kvm_pic *s = opaque;
> -	int ret = -1;
> +	int ret = -1, level;
>  
>  	pic_lock(s);
> +	level = !!s->irq_states[irq];
>  	if (irq >= 0 && irq < PIC_NUM_PINS) {
>  		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
>  		pic_update_irq(s);
> diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
> index 26fd54d..6ad6a6b 100644
> --- a/virt/kvm/ioapic.c
> +++ b/virt/kvm/ioapic.c
> @@ -191,14 +191,15 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
>  	return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
>  }
>  
> -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
> +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq)
>  {
>  	u32 old_irr;
>  	u32 mask = 1 << irq;
>  	union kvm_ioapic_redirect_entry entry;
> -	int ret = 1;
> +	int ret = 1, level;
>  
>  	spin_lock(&ioapic->lock);
> +	level = !!ioapic->irq_states[irq];
>  	old_irr = ioapic->irr;
>  	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
>  		entry = ioapic->redirtbl[irq];
> diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
> index 32872a0..65894dd 100644
> --- a/virt/kvm/ioapic.h
> +++ b/virt/kvm/ioapic.h
> @@ -74,7 +74,7 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
>  bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
>  int kvm_ioapic_init(struct kvm *kvm);
>  void kvm_ioapic_destroy(struct kvm *kvm);
> -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
> +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq);
>  void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
>  int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
>  		struct kvm_lapic_irq *irq);
> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> index a6a0365..db0ccef 100644
> --- a/virt/kvm/irq_comm.c
> +++ b/virt/kvm/irq_comm.c
> @@ -33,7 +33,7 @@
>  
>  #include "ioapic.h"
>  
> -static inline int kvm_irq_line_state(unsigned long *irq_state,
> +static inline void kvm_irq_line_state(unsigned long *irq_state,
>  				     int irq_source_id, int level)
>  {
>  	/* Logical OR for level trig interrupt */
> @@ -41,8 +41,6 @@ static inline int kvm_irq_line_state(unsigned long *irq_state,
>  		set_bit(irq_source_id, irq_state);
>  	else
>  		clear_bit(irq_source_id, irq_state);
> -
> -	return !!(*irq_state);
>  }
>  
>  static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> @@ -50,9 +48,9 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
>  {
>  #ifdef CONFIG_X86
>  	struct kvm_pic *pic = pic_irqchip(kvm);
> -	level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
> +	kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
>  				   irq_source_id, level);
> -	return kvm_pic_set_irq(pic, e->irqchip.pin, level);
> +	return kvm_pic_set_irq(pic, e->irqchip.pin);
>  #else
>  	return -1;
>  #endif
> @@ -62,10 +60,10 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
>  			      struct kvm *kvm, int irq_source_id, int level)
>  {
>  	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> -	level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
> +	kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
>  				   irq_source_id, level);
>  
> -	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> +	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin);
>  }
>  
>  inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
> --
> 			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov July 18, 2012, 10:27 a.m. UTC | #2
On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
> On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
> > On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
> > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > 
> > > Look at this:
> > > 
> > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > >                                      int irq_source_id, int level)
> > > {
> > >         /* Logical OR for level trig interrupt */
> > >         if (level)
> > >                 set_bit(irq_source_id, irq_state);
> > >         else
> > >                 clear_bit(irq_source_id, irq_state);
> > > 
> > >         return !!(*irq_state);
> > > }
> > > 
> > > 
> > > Now:
> > > If other CPU changes some other bit after the atomic change,
> > > it looks like !!(*irq_state) might return a stale value.
> > > 
> > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > If CPU 0 sees a stale value now it will return 0 here
> > > and interrupt will get cleared.
> > > 
> > This will hardly happen on x86 especially since bit is set with
> > serialized instruction.
> 
> Probably. But it does make me a bit uneasy.  Why don't we pass
> irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
> kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
> __set_bit/__clear_bit in kvm_irq_line_state, making the ordering simpler
> and saving an atomic op in the process.
> 
With my patch I do not see why we can't change them to unlocked variant
without moving them anywhere. The only requirement is to not use RMW
sequence to set/clear bits. The ordering of setting does not matter. The
ordering of reading is.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin July 18, 2012, 10:33 a.m. UTC | #3
On Wed, Jul 18, 2012 at 01:27:39PM +0300, Gleb Natapov wrote:
> On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
> > On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
> > > On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
> > > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > > 
> > > > Look at this:
> > > > 
> > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > >                                      int irq_source_id, int level)
> > > > {
> > > >         /* Logical OR for level trig interrupt */
> > > >         if (level)
> > > >                 set_bit(irq_source_id, irq_state);
> > > >         else
> > > >                 clear_bit(irq_source_id, irq_state);
> > > > 
> > > >         return !!(*irq_state);
> > > > }
> > > > 
> > > > 
> > > > Now:
> > > > If other CPU changes some other bit after the atomic change,
> > > > it looks like !!(*irq_state) might return a stale value.
> > > > 
> > > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > > If CPU 0 sees a stale value now it will return 0 here
> > > > and interrupt will get cleared.
> > > > 
> > > This will hardly happen on x86 especially since bit is set with
> > > serialized instruction.
> > 
> > Probably. But it does make me a bit uneasy.  Why don't we pass
> > irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
> > kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
> > __set_bit/__clear_bit in kvm_irq_line_state, making the ordering simpler
> > and saving an atomic op in the process.
> > 
> With my patch I do not see why we can't change them to unlocked variant
> without moving them anywhere. The only requirement is to not use RMW
> sequence to set/clear bits. The ordering of setting does not matter. The
> ordering of reading is.

You want to use __set_bit/__clear_bit on the same word
from multiple CPUs, without locking?
Why won't this lose information?

In any case, it seems simpler and safer to do accesses under lock
than rely on specific use.

> --
> 			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov July 18, 2012, 10:36 a.m. UTC | #4
On Wed, Jul 18, 2012 at 01:33:35PM +0300, Michael S. Tsirkin wrote:
> On Wed, Jul 18, 2012 at 01:27:39PM +0300, Gleb Natapov wrote:
> > On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
> > > On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
> > > > On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
> > > > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > > > 
> > > > > Look at this:
> > > > > 
> > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > >                                      int irq_source_id, int level)
> > > > > {
> > > > >         /* Logical OR for level trig interrupt */
> > > > >         if (level)
> > > > >                 set_bit(irq_source_id, irq_state);
> > > > >         else
> > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > 
> > > > >         return !!(*irq_state);
> > > > > }
> > > > > 
> > > > > 
> > > > > Now:
> > > > > If other CPU changes some other bit after the atomic change,
> > > > > it looks like !!(*irq_state) might return a stale value.
> > > > > 
> > > > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > > > If CPU 0 sees a stale value now it will return 0 here
> > > > > and interrupt will get cleared.
> > > > > 
> > > > This will hardly happen on x86 especially since bit is set with
> > > > serialized instruction.
> > > 
> > > Probably. But it does make me a bit uneasy.  Why don't we pass
> > > irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
> > > kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
> > > __set_bit/__clear_bit in kvm_irq_line_state, making the ordering simpler
> > > and saving an atomic op in the process.
> > > 
> > With my patch I do not see why we can't change them to unlocked variant
> > without moving them anywhere. The only requirement is to not use RMW
> > sequence to set/clear bits. The ordering of setting does not matter. The
> > ordering of reading is.
> 
> You want to use __set_bit/__clear_bit on the same word
> from multiple CPUs, without locking?
> Why won't this lose information?
Because it is not RMW. If it is then yes, you can't do that.
> 
> In any case, it seems simpler and safer to do accesses under lock
> than rely on specific use.
> 
> > --
> > 			Gleb.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin July 18, 2012, 10:51 a.m. UTC | #5
On Wed, Jul 18, 2012 at 01:36:08PM +0300, Gleb Natapov wrote:
> On Wed, Jul 18, 2012 at 01:33:35PM +0300, Michael S. Tsirkin wrote:
> > On Wed, Jul 18, 2012 at 01:27:39PM +0300, Gleb Natapov wrote:
> > > On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
> > > > On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
> > > > > On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
> > > > > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > > > > 
> > > > > > Look at this:
> > > > > > 
> > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > >                                      int irq_source_id, int level)
> > > > > > {
> > > > > >         /* Logical OR for level trig interrupt */
> > > > > >         if (level)
> > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > >         else
> > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > 
> > > > > >         return !!(*irq_state);
> > > > > > }
> > > > > > 
> > > > > > 
> > > > > > Now:
> > > > > > If other CPU changes some other bit after the atomic change,
> > > > > > it looks like !!(*irq_state) might return a stale value.
> > > > > > 
> > > > > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > > > > If CPU 0 sees a stale value now it will return 0 here
> > > > > > and interrupt will get cleared.
> > > > > > 
> > > > > This will hardly happen on x86 especially since bit is set with
> > > > > serialized instruction.
> > > > 
> > > > Probably. But it does make me a bit uneasy.  Why don't we pass
> > > > irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
> > > > kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
> > > > __set_bit/__clear_bit in kvm_irq_line_state, making the ordering simpler
> > > > and saving an atomic op in the process.
> > > > 
> > > With my patch I do not see why we can't change them to unlocked variant
> > > without moving them anywhere. The only requirement is to not use RMW
> > > sequence to set/clear bits. The ordering of setting does not matter. The
> > > ordering of reading is.
> > 
> > You want to use __set_bit/__clear_bit on the same word
> > from multiple CPUs, without locking?
> > Why won't this lose information?
> Because it is not RMW. If it is then yes, you can't do that.

You are saying __set_bit does not do RMW on x86? Interesting.
It's probably not a good idea to rely on this I think.

> > 
> > In any case, it seems simpler and safer to do accesses under lock
> > than rely on specific use.
> > 
> > > --
> > > 			Gleb.
> 
> --
> 			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov July 18, 2012, 10:53 a.m. UTC | #6
On Wed, Jul 18, 2012 at 01:51:05PM +0300, Michael S. Tsirkin wrote:
> On Wed, Jul 18, 2012 at 01:36:08PM +0300, Gleb Natapov wrote:
> > On Wed, Jul 18, 2012 at 01:33:35PM +0300, Michael S. Tsirkin wrote:
> > > On Wed, Jul 18, 2012 at 01:27:39PM +0300, Gleb Natapov wrote:
> > > > On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
> > > > > On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
> > > > > > On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
> > > > > > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > > > > > 
> > > > > > > Look at this:
> > > > > > > 
> > > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > > >                                      int irq_source_id, int level)
> > > > > > > {
> > > > > > >         /* Logical OR for level trig interrupt */
> > > > > > >         if (level)
> > > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > > >         else
> > > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > > 
> > > > > > >         return !!(*irq_state);
> > > > > > > }
> > > > > > > 
> > > > > > > 
> > > > > > > Now:
> > > > > > > If other CPU changes some other bit after the atomic change,
> > > > > > > it looks like !!(*irq_state) might return a stale value.
> > > > > > > 
> > > > > > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > > > > > If CPU 0 sees a stale value now it will return 0 here
> > > > > > > and interrupt will get cleared.
> > > > > > > 
> > > > > > This will hardly happen on x86 especially since bit is set with
> > > > > > serialized instruction.
> > > > > 
> > > > > Probably. But it does make me a bit uneasy.  Why don't we pass
> > > > > irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
> > > > > kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
> > > > > __set_bit/__clear_bit in kvm_irq_line_state, making the ordering simpler
> > > > > and saving an atomic op in the process.
> > > > > 
> > > > With my patch I do not see why we can't change them to unlocked variant
> > > > without moving them anywhere. The only requirement is to not use RMW
> > > > sequence to set/clear bits. The ordering of setting does not matter. The
> > > > ordering of reading is.
> > > 
> > > You want to use __set_bit/__clear_bit on the same word
> > > from multiple CPUs, without locking?
> > > Why won't this lose information?
> > Because it is not RMW. If it is then yes, you can't do that.
> 
> You are saying __set_bit does not do RMW on x86? Interesting.
I think it doesn't.

> It's probably not a good idea to rely on this I think.
> 
The code is not in arch/x86 so probably not. Although it is used only on
x86 (and ia64 which has broken kvm anyway).

> > > 
> > > In any case, it seems simpler and safer to do accesses under lock
> > > than rely on specific use.
> > > 
> > > > --
> > > > 			Gleb.
> > 
> > --
> > 			Gleb.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin July 18, 2012, 11:08 a.m. UTC | #7
On Wed, Jul 18, 2012 at 01:53:15PM +0300, Gleb Natapov wrote:
> On Wed, Jul 18, 2012 at 01:51:05PM +0300, Michael S. Tsirkin wrote:
> > On Wed, Jul 18, 2012 at 01:36:08PM +0300, Gleb Natapov wrote:
> > > On Wed, Jul 18, 2012 at 01:33:35PM +0300, Michael S. Tsirkin wrote:
> > > > On Wed, Jul 18, 2012 at 01:27:39PM +0300, Gleb Natapov wrote:
> > > > > On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
> > > > > > On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
> > > > > > > On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
> > > > > > > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > > > > > > 
> > > > > > > > Look at this:
> > > > > > > > 
> > > > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > > > >                                      int irq_source_id, int level)
> > > > > > > > {
> > > > > > > >         /* Logical OR for level trig interrupt */
> > > > > > > >         if (level)
> > > > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > > > >         else
> > > > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > > > 
> > > > > > > >         return !!(*irq_state);
> > > > > > > > }
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Now:
> > > > > > > > If other CPU changes some other bit after the atomic change,
> > > > > > > > it looks like !!(*irq_state) might return a stale value.
> > > > > > > > 
> > > > > > > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > > > > > > If CPU 0 sees a stale value now it will return 0 here
> > > > > > > > and interrupt will get cleared.
> > > > > > > > 
> > > > > > > This will hardly happen on x86 especially since bit is set with
> > > > > > > serialized instruction.
> > > > > > 
> > > > > > Probably. But it does make me a bit uneasy.  Why don't we pass
> > > > > > irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
> > > > > > kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
> > > > > > __set_bit/__clear_bit in kvm_irq_line_state, making the ordering simpler
> > > > > > and saving an atomic op in the process.
> > > > > > 
> > > > > With my patch I do not see why we can't change them to unlocked variant
> > > > > without moving them anywhere. The only requirement is to not use RMW
> > > > > sequence to set/clear bits. The ordering of setting does not matter. The
> > > > > ordering of reading is.
> > > > 
> > > > You want to use __set_bit/__clear_bit on the same word
> > > > from multiple CPUs, without locking?
> > > > Why won't this lose information?
> > > Because it is not RMW. If it is then yes, you can't do that.
> > 
> > You are saying __set_bit does not do RMW on x86? Interesting.
> I think it doesn't.

Anywhere I can read about this?

> > It's probably not a good idea to rely on this I think.
> > 
> > The code is not in arch/x86 so probably not. Although it is used only on
> x86 (and ia64 which has broken kvm anyway).

Yes but exactly the reverse is documented.

/**
 * __set_bit - Set a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 *
 * Unlike set_bit(), this function is non-atomic and may be reordered.


>>>> pls note the below

 * If it's called on the same region of memory simultaneously, the effect
 * may be that only one operation succeeds.
>>>> until here

 */
static inline void __set_bit(int nr, volatile unsigned long *addr)
{
        asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
}




> > > > 
> > > > In any case, it seems simpler and safer to do accesses under lock
> > > > than rely on specific use.
> > > > 
> > > > > --
> > > > > 			Gleb.
> > > 
> > > --
> > > 			Gleb.
> 
> --
> 			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov July 18, 2012, 11:50 a.m. UTC | #8
On Wed, Jul 18, 2012 at 02:08:43PM +0300, Michael S. Tsirkin wrote:
> On Wed, Jul 18, 2012 at 01:53:15PM +0300, Gleb Natapov wrote:
> > On Wed, Jul 18, 2012 at 01:51:05PM +0300, Michael S. Tsirkin wrote:
> > > On Wed, Jul 18, 2012 at 01:36:08PM +0300, Gleb Natapov wrote:
> > > > On Wed, Jul 18, 2012 at 01:33:35PM +0300, Michael S. Tsirkin wrote:
> > > > > On Wed, Jul 18, 2012 at 01:27:39PM +0300, Gleb Natapov wrote:
> > > > > > On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
> > > > > > > On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
> > > > > > > > On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
> > > > > > > > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > > > > > > > 
> > > > > > > > > Look at this:
> > > > > > > > > 
> > > > > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > > > > >                                      int irq_source_id, int level)
> > > > > > > > > {
> > > > > > > > >         /* Logical OR for level trig interrupt */
> > > > > > > > >         if (level)
> > > > > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > > > > >         else
> > > > > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > > > > 
> > > > > > > > >         return !!(*irq_state);
> > > > > > > > > }
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Now:
> > > > > > > > > If other CPU changes some other bit after the atomic change,
> > > > > > > > > it looks like !!(*irq_state) might return a stale value.
> > > > > > > > > 
> > > > > > > > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > > > > > > > If CPU 0 sees a stale value now it will return 0 here
> > > > > > > > > and interrupt will get cleared.
> > > > > > > > > 
> > > > > > > > This will hardly happen on x86 especially since bit is set with
> > > > > > > > serialized instruction.
> > > > > > > 
> > > > > > > Probably. But it does make me a bit uneasy.  Why don't we pass
> > > > > > > irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
> > > > > > > kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
> > > > > > > __set_bit/__clear_bit in kvm_irq_line_state, making the ordering simpler
> > > > > > > and saving an atomic op in the process.
> > > > > > > 
> > > > > > With my patch I do not see why we can't change them to unlocked variant
> > > > > > without moving them anywhere. The only requirement is to not use RMW
> > > > > > sequence to set/clear bits. The ordering of setting does not matter. The
> > > > > > ordering of reading is.
> > > > > 
> > > > > You want to use __set_bit/__clear_bit on the same word
> > > > > from multiple CPUs, without locking?
> > > > > Why won't this lose information?
> > > > Because it is not RMW. If it is then yes, you can't do that.
> > > 
> > > You are saying __set_bit does not do RMW on x86? Interesting.
> > I think it doesn't.
> 
> Anywhere I can read about this?
> 
Well actually SDM says LOCK prefix is needed, so yes we cannot use
__set_bit/__clear_bit without moving it under lock.

> > > It's probably not a good idea to rely on this I think.
> > > 
> > The code is not in arch/x86 so probably not. Although it is used only on
> > x86 (and ia64 which has broken kvm anyway).
> 
> Yes but exactly the reverse is documented.
> 
> /**
>  * __set_bit - Set a bit in memory
>  * @nr: the bit to set
>  * @addr: the address to start counting from
>  *
>  * Unlike set_bit(), this function is non-atomic and may be reordered.
> 
> 
> >>>> pls note the below
> 
>  * If it's called on the same region of memory simultaneously, the effect
>  * may be that only one operation succeeds.
> >>>> until here
> 
>  */
> static inline void __set_bit(int nr, volatile unsigned long *addr)
> {
>         asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
> }
> 
> 
> 
> 
> > > > > 
> > > > > In any case, it seems simpler and safer to do accesses under lock
> > > > > than rely on specific use.
> > > > > 
> > > > > > --
> > > > > > 			Gleb.
> > > > 
> > > > --
> > > > 			Gleb.
> > 
> > --
> > 			Gleb.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin July 18, 2012, 9:55 p.m. UTC | #9
On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
> On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
> > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > 
> > Look at this:
> > 
> > static inline int kvm_irq_line_state(unsigned long *irq_state,
> >                                      int irq_source_id, int level)
> > {
> >         /* Logical OR for level trig interrupt */
> >         if (level)
> >                 set_bit(irq_source_id, irq_state);
> >         else
> >                 clear_bit(irq_source_id, irq_state);
> > 
> >         return !!(*irq_state);
> > }
> > 
> > 
> > Now:
> > If other CPU changes some other bit after the atomic change,
> > it looks like !!(*irq_state) might return a stale value.
> > 
> > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > If CPU 0 sees a stale value now it will return 0 here
> > and interrupt will get cleared.
> > 
> This will hardly happen on x86 especially since bit is set with
> serialized instruction. But there is actually a race here.
> CPU 0 clears bit 0. CPU 0 read irq_state as 0. CPU 1 sets level to 1.
> CPU 1 calls kvm_ioapic_set_irq(1). CPU 0 calls kvm_ioapic_set_irq(0).
> Now ioapic thinks the level is 0 but irq_state is not 0.
> 
> This untested and un-compiled patch should fix it.

Getting rid of atomics completely makes me more comfortable,
and by moving all bitmap handling to under pic/ioapic lock
we can do just that.
I just tested and posted a patch that fixes the race in this way.
Could you take a look pls?
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ef91d79..e22c78b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -825,7 +825,7 @@  int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
 
-int kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_set_irq(void *opaque, int irq);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 81cf4fa..0d6988f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -188,12 +188,13 @@  void kvm_pic_update_irq(struct kvm_pic *s)
 	pic_unlock(s);
 }
 
-int kvm_pic_set_irq(void *opaque, int irq, int level)
+int kvm_pic_set_irq(void *opaque, int irq)
 {
 	struct kvm_pic *s = opaque;
-	int ret = -1;
+	int ret = -1, level;
 
 	pic_lock(s);
+	level = !!s->irq_states[irq];
 	if (irq >= 0 && irq < PIC_NUM_PINS) {
 		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
 		pic_update_irq(s);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 26fd54d..6ad6a6b 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -191,14 +191,15 @@  static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 	return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
 }
 
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq)
 {
 	u32 old_irr;
 	u32 mask = 1 << irq;
 	union kvm_ioapic_redirect_entry entry;
-	int ret = 1;
+	int ret = 1, level;
 
 	spin_lock(&ioapic->lock);
+	level = !!ioapic->irq_states[irq];
 	old_irr = ioapic->irr;
 	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
 		entry = ioapic->redirtbl[irq];
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 32872a0..65894dd 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -74,7 +74,7 @@  void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_destroy(struct kvm *kvm);
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 		struct kvm_lapic_irq *irq);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index a6a0365..db0ccef 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -33,7 +33,7 @@ 
 
 #include "ioapic.h"
 
-static inline int kvm_irq_line_state(unsigned long *irq_state,
+static inline void kvm_irq_line_state(unsigned long *irq_state,
 				     int irq_source_id, int level)
 {
 	/* Logical OR for level trig interrupt */
@@ -41,8 +41,6 @@  static inline int kvm_irq_line_state(unsigned long *irq_state,
 		set_bit(irq_source_id, irq_state);
 	else
 		clear_bit(irq_source_id, irq_state);
-
-	return !!(*irq_state);
 }
 
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
@@ -50,9 +48,9 @@  static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 {
 #ifdef CONFIG_X86
 	struct kvm_pic *pic = pic_irqchip(kvm);
-	level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
+	kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
 				   irq_source_id, level);
-	return kvm_pic_set_irq(pic, e->irqchip.pin, level);
+	return kvm_pic_set_irq(pic, e->irqchip.pin);
 #else
 	return -1;
 #endif
@@ -62,10 +60,10 @@  static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
 			      struct kvm *kvm, int irq_source_id, int level)
 {
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-	level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
+	kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
 				   irq_source_id, level);
 
-	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
+	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin);
 }
 
 inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)