[v5,3/4] kvm: Create kvm_clear_irq()

Message ID	20120716203401.1752.78276.stgit@bling.home (mailing list archive)
State	New, archived
Headers	show Return-Path: <kvm-owner@vger.kernel.org> From: Alex Williamson <alex.williamson@redhat.com> Subject: [PATCH v5 3/4] kvm: Create kvm_clear_irq() To: avi@redhat.com, mst@redhat.com Cc: gleb@redhat.com, kvm@vger.kernel.org, linux-kernel@vger.kernel.org, jan.kiszka@siemens.com Date: Mon, 16 Jul 2012 14:34:03 -0600 Message-ID: <20120716203401.1752.78276.stgit@bling.home> In-Reply-To: <20120716202711.1752.71007.stgit@bling.home> References: <20120716202711.1752.71007.stgit@bling.home> User-Agent: StGIT/0.14.3 MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 7bit Sender: kvm-owner@vger.kernel.org Precedence: bulk

Alex Williamson July 16, 2012, 8:34 p.m. UTC

This is an alternative to kvm_set_irq(,,,0) which returns the previous
assertion state of the interrupt and does nothing if it isn't changed.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 include/linux/kvm_host.h |    3 ++
 virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+)


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 12:51 a.m. UTC | #1

On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> This is an alternative to kvm_set_irq(,,,0) which returns the previous
> assertion state of the interrupt and does nothing if it isn't changed.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
>  include/linux/kvm_host.h |    3 ++
>  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 81 insertions(+)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index a7661c0..6c168f1 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
>  	u32 type;
>  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
>  		   struct kvm *kvm, int irq_source_id, int level);
> +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> +		     struct kvm *kvm, int irq_source_id);
>  	union {
>  		struct {
>  			unsigned irqchip;
> @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
>  				   unsigned long *deliver_bitmask);
>  #endif
>  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
>  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
>  		int irq_source_id, int level);
>  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> index 5afb431..76e8f22 100644
> --- a/virt/kvm/irq_comm.c
> +++ b/virt/kvm/irq_comm.c
> @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
>  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
>  }
>  
> +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> +					    int irq_source_id)
> +{
> +	return !!test_and_clear_bit(irq_source_id, irq_state);
> +}
> +
> +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> +			     struct kvm *kvm, int irq_source_id)
> +{
> +#ifdef CONFIG_X86
> +	struct kvm_pic *pic = pic_irqchip(kvm);
> +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> +					     irq_source_id);
> +	if (level)
> +		kvm_pic_set_irq(pic, e->irqchip.pin,
> +				!!pic->irq_states[e->irqchip.pin]);
> +	return level;
> +#else
> +	return -1;
> +#endif

What does this ifdef exclude exactly?

> +}
> +
> +static int kvm_clear_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> +				struct kvm *kvm, int irq_source_id)
> +{
> +	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> +	int level;
> +
> +	level = kvm_clear_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
> +					 irq_source_id);
> +	if (level)
> +		kvm_ioapic_set_irq(ioapic, e->irqchip.pin,
> +				   !!ioapic->irq_states[e->irqchip.pin]);
> +	return level;
> +}
> +
>  inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
>  {
>  #ifdef CONFIG_IA64
> @@ -190,6 +226,45 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
>  	return ret;
>  }
>  
> +/*
> + * Return value:
> + *  < 0   Error
> + *  = 0   Interrupt was not set, did nothing
> + *  > 0   Interrupt was pending, cleared
> + */
> +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq)
> +{
> +	struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
> +	int ret = -EINVAL, i = 0;
> +	struct kvm_irq_routing_table *irq_rt;
> +	struct hlist_node *n;
> +
> +	/* Not possible to detect if the guest uses the PIC or the
> +	 * IOAPIC.  So clear the bit in both. The guest will ignore
> +	 * writes to the unused one.
> +	 */
> +	rcu_read_lock();
> +	irq_rt = rcu_dereference(kvm->irq_routing);
> +	if (irq < irq_rt->nr_rt_entries)
> +		hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
> +			irq_set[i++] = *e;
> +	rcu_read_unlock();
> +
> +	while (i--) {
> +		int r = -EINVAL;
> +
> +		if (irq_set[i].clear)
> +			r = irq_set[i].clear(&irq_set[i], kvm, irq_source_id);
> +
> +		if (r < 0)
> +			continue;
> +
> +		ret = r + ((ret < 0) ? 0 : ret);
> +	}
> +
> +	return ret;
> +}
> +
>  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
>  {
>  	struct kvm_irq_ack_notifier *kian;
> @@ -344,16 +419,19 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
>  		switch (ue->u.irqchip.irqchip) {
>  		case KVM_IRQCHIP_PIC_MASTER:
>  			e->set = kvm_set_pic_irq;
> +			e->clear = kvm_clear_pic_irq;
>  			max_pin = 16;
>  			break;
>  		case KVM_IRQCHIP_PIC_SLAVE:
>  			e->set = kvm_set_pic_irq;
> +			e->clear = kvm_clear_pic_irq;
>  			max_pin = 16;
>  			delta = 8;
>  			break;
>  		case KVM_IRQCHIP_IOAPIC:
>  			max_pin = KVM_IOAPIC_NUM_PINS;
>  			e->set = kvm_set_ioapic_irq;
> +			e->clear = kvm_clear_ioapic_irq;
>  			break;
>  		default:
>  			goto out;
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 12:55 a.m. UTC | #2

On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> This is an alternative to kvm_set_irq(,,,0) which returns the previous
> assertion state of the interrupt and does nothing if it isn't changed.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
>  include/linux/kvm_host.h |    3 ++
>  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 81 insertions(+)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index a7661c0..6c168f1 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
>  	u32 type;
>  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
>  		   struct kvm *kvm, int irq_source_id, int level);
> +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> +		     struct kvm *kvm, int irq_source_id);
>  	union {
>  		struct {
>  			unsigned irqchip;
> @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
>  				   unsigned long *deliver_bitmask);
>  #endif
>  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
>  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
>  		int irq_source_id, int level);
>  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> index 5afb431..76e8f22 100644
> --- a/virt/kvm/irq_comm.c
> +++ b/virt/kvm/irq_comm.c
> @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
>  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
>  }
>  
> +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> +					    int irq_source_id)
> +{
> +	return !!test_and_clear_bit(irq_source_id, irq_state);
> +}
> +
> +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> +			     struct kvm *kvm, int irq_source_id)
> +{
> +#ifdef CONFIG_X86
> +	struct kvm_pic *pic = pic_irqchip(kvm);
> +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> +					     irq_source_id);
> +	if (level)
> +		kvm_pic_set_irq(pic, e->irqchip.pin,
> +				!!pic->irq_states[e->irqchip.pin]);

This is a bit tricky: add a comment explaining the logic?

> +	return level;
> +#else
> +	return -1;
> +#endif
> +}
> +
> +static int kvm_clear_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> +				struct kvm *kvm, int irq_source_id)
> +{
> +	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> +	int level;
> +
> +	level = kvm_clear_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
> +					 irq_source_id);
> +	if (level)
> +		kvm_ioapic_set_irq(ioapic, e->irqchip.pin,
> +				   !!ioapic->irq_states[e->irqchip.pin]);

This is a bit tricky: add a comment explaining the logic?

> +	return level;
> +}
> +
>  inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
>  {
>  #ifdef CONFIG_IA64
> @@ -190,6 +226,45 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
>  	return ret;
>  }
>  
> +/*
> + * Return value:
> + *  < 0   Error
> + *  = 0   Interrupt was not set, did nothing
> + *  > 0   Interrupt was pending, cleared
> + */
> +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq)
> +{
> +	struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
> +	int ret = -EINVAL, i = 0;
> +	struct kvm_irq_routing_table *irq_rt;
> +	struct hlist_node *n;
> +
> +	/* Not possible to detect if the guest uses the PIC or the
> +	 * IOAPIC.  So clear the bit in both. The guest will ignore
> +	 * writes to the unused one.
> +	 */
> +	rcu_read_lock();
> +	irq_rt = rcu_dereference(kvm->irq_routing);
> +	if (irq < irq_rt->nr_rt_entries)
> +		hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
> +			irq_set[i++] = *e;
> +	rcu_read_unlock();
> +
> +	while (i--) {
> +		int r = -EINVAL;
> +
> +		if (irq_set[i].clear)
> +			r = irq_set[i].clear(&irq_set[i], kvm, irq_source_id);
> +
> +		if (r < 0)
> +			continue;
> +
> +		ret = r + ((ret < 0) ? 0 : ret);
> +	}
> +
> +	return ret;
> +}
> +
>  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
>  {
>  	struct kvm_irq_ack_notifier *kian;
> @@ -344,16 +419,19 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
>  		switch (ue->u.irqchip.irqchip) {
>  		case KVM_IRQCHIP_PIC_MASTER:
>  			e->set = kvm_set_pic_irq;
> +			e->clear = kvm_clear_pic_irq;
>  			max_pin = 16;
>  			break;
>  		case KVM_IRQCHIP_PIC_SLAVE:
>  			e->set = kvm_set_pic_irq;
> +			e->clear = kvm_clear_pic_irq;
>  			max_pin = 16;
>  			delta = 8;
>  			break;
>  		case KVM_IRQCHIP_IOAPIC:
>  			max_pin = KVM_IOAPIC_NUM_PINS;
>  			e->set = kvm_set_ioapic_irq;
> +			e->clear = kvm_clear_ioapic_irq;
>  			break;
>  		default:
>  			goto out;
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Alex Williamson July 17, 2012, 2:42 a.m. UTC | #3

On Tue, 2012-07-17 at 03:51 +0300, Michael S. Tsirkin wrote:
> On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > assertion state of the interrupt and does nothing if it isn't changed.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> > 
> >  include/linux/kvm_host.h |    3 ++
> >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 81 insertions(+)
> > 
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index a7661c0..6c168f1 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> >  	u32 type;
> >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> >  		   struct kvm *kvm, int irq_source_id, int level);
> > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > +		     struct kvm *kvm, int irq_source_id);
> >  	union {
> >  		struct {
> >  			unsigned irqchip;
> > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> >  				   unsigned long *deliver_bitmask);
> >  #endif
> >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> >  		int irq_source_id, int level);
> >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > index 5afb431..76e8f22 100644
> > --- a/virt/kvm/irq_comm.c
> > +++ b/virt/kvm/irq_comm.c
> > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> >  }
> >  
> > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > +					    int irq_source_id)
> > +{
> > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > +}
> > +
> > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > +			     struct kvm *kvm, int irq_source_id)
> > +{
> > +#ifdef CONFIG_X86
> > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > +					     irq_source_id);
> > +	if (level)
> > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > +				!!pic->irq_states[e->irqchip.pin]);
> > +	return level;
> > +#else
> > +	return -1;
> > +#endif
> 
> What does this ifdef exclude exactly?

No pic on ia64.  Not that it works, but I figured the consistency with
kvm_set_pic_irq would make more sense whenever someone goes through and
cleans out the code.  Thanks,

Alex

> > +}
> > +
> > +static int kvm_clear_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > +				struct kvm *kvm, int irq_source_id)
> > +{
> > +	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> > +	int level;
> > +
> > +	level = kvm_clear_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
> > +					 irq_source_id);
> > +	if (level)
> > +		kvm_ioapic_set_irq(ioapic, e->irqchip.pin,
> > +				   !!ioapic->irq_states[e->irqchip.pin]);
> > +	return level;
> > +}
> > +
> >  inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
> >  {
> >  #ifdef CONFIG_IA64
> > @@ -190,6 +226,45 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
> >  	return ret;
> >  }
> >  
> > +/*
> > + * Return value:
> > + *  < 0   Error
> > + *  = 0   Interrupt was not set, did nothing
> > + *  > 0   Interrupt was pending, cleared
> > + */
> > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq)
> > +{
> > +	struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
> > +	int ret = -EINVAL, i = 0;
> > +	struct kvm_irq_routing_table *irq_rt;
> > +	struct hlist_node *n;
> > +
> > +	/* Not possible to detect if the guest uses the PIC or the
> > +	 * IOAPIC.  So clear the bit in both. The guest will ignore
> > +	 * writes to the unused one.
> > +	 */
> > +	rcu_read_lock();
> > +	irq_rt = rcu_dereference(kvm->irq_routing);
> > +	if (irq < irq_rt->nr_rt_entries)
> > +		hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
> > +			irq_set[i++] = *e;
> > +	rcu_read_unlock();
> > +
> > +	while (i--) {
> > +		int r = -EINVAL;
> > +
> > +		if (irq_set[i].clear)
> > +			r = irq_set[i].clear(&irq_set[i], kvm, irq_source_id);
> > +
> > +		if (r < 0)
> > +			continue;
> > +
> > +		ret = r + ((ret < 0) ? 0 : ret);
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
> >  {
> >  	struct kvm_irq_ack_notifier *kian;
> > @@ -344,16 +419,19 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
> >  		switch (ue->u.irqchip.irqchip) {
> >  		case KVM_IRQCHIP_PIC_MASTER:
> >  			e->set = kvm_set_pic_irq;
> > +			e->clear = kvm_clear_pic_irq;
> >  			max_pin = 16;
> >  			break;
> >  		case KVM_IRQCHIP_PIC_SLAVE:
> >  			e->set = kvm_set_pic_irq;
> > +			e->clear = kvm_clear_pic_irq;
> >  			max_pin = 16;
> >  			delta = 8;
> >  			break;
> >  		case KVM_IRQCHIP_IOAPIC:
> >  			max_pin = KVM_IOAPIC_NUM_PINS;
> >  			e->set = kvm_set_ioapic_irq;
> > +			e->clear = kvm_clear_ioapic_irq;
> >  			break;
> >  		default:
> >  			goto out;



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 10:14 a.m. UTC | #4

On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> This is an alternative to kvm_set_irq(,,,0) which returns the previous
> assertion state of the interrupt and does nothing if it isn't changed.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
>  include/linux/kvm_host.h |    3 ++
>  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 81 insertions(+)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index a7661c0..6c168f1 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
>  	u32 type;
>  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
>  		   struct kvm *kvm, int irq_source_id, int level);
> +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> +		     struct kvm *kvm, int irq_source_id);
>  	union {
>  		struct {
>  			unsigned irqchip;
> @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
>  				   unsigned long *deliver_bitmask);
>  #endif
>  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
>  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
>  		int irq_source_id, int level);
>  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> index 5afb431..76e8f22 100644
> --- a/virt/kvm/irq_comm.c
> +++ b/virt/kvm/irq_comm.c
> @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
>  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
>  }
>  
> +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> +					    int irq_source_id)
> +{
> +	return !!test_and_clear_bit(irq_source_id, irq_state);
> +}
> +
> +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> +			     struct kvm *kvm, int irq_source_id)
> +{
> +#ifdef CONFIG_X86
> +	struct kvm_pic *pic = pic_irqchip(kvm);
> +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> +					     irq_source_id);
> +	if (level)
> +		kvm_pic_set_irq(pic, e->irqchip.pin,
> +				!!pic->irq_states[e->irqchip.pin]);
> +	return level;

I think I begin to understand: if (level) checks it was previously set,
and then we clear if needed? I think it's worthwhile to rename
level to orig_level and rewrite as:

	if (orig_level && !pic->irq_states[e->irqchip.pin])
		kvm_pic_set_irq(pic, e->irqchip.pin, 0);

This both makes the logic clear without need for comments and
saves some cycles on pic in case nothing actually changed.

> +#else
> +	return -1;
> +#endif
> +}
> +
> +static int kvm_clear_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> +				struct kvm *kvm, int irq_source_id)
> +{
> +	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> +	int level;
> +
> +	level = kvm_clear_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
> +					 irq_source_id);
> +	if (level)
> +		kvm_ioapic_set_irq(ioapic, e->irqchip.pin,
> +				   !!ioapic->irq_states[e->irqchip.pin]);
> +	return level;
> +}
> +
>  inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
>  {
>  #ifdef CONFIG_IA64
> @@ -190,6 +226,45 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
>  	return ret;
>  }
>  
> +/*
> + * Return value:
> + *  < 0   Error
> + *  = 0   Interrupt was not set, did nothing
> + *  > 0   Interrupt was pending, cleared
> + */
> +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq)
> +{
> +	struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
> +	int ret = -EINVAL, i = 0;
> +	struct kvm_irq_routing_table *irq_rt;
> +	struct hlist_node *n;
> +
> +	/* Not possible to detect if the guest uses the PIC or the
> +	 * IOAPIC.  So clear the bit in both. The guest will ignore
> +	 * writes to the unused one.
> +	 */
> +	rcu_read_lock();
> +	irq_rt = rcu_dereference(kvm->irq_routing);
> +	if (irq < irq_rt->nr_rt_entries)
> +		hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
> +			irq_set[i++] = *e;
> +	rcu_read_unlock();
> +
> +	while (i--) {
> +		int r = -EINVAL;
> +
> +		if (irq_set[i].clear)
> +			r = irq_set[i].clear(&irq_set[i], kvm, irq_source_id);
> +
> +		if (r < 0)
> +			continue;
> +
> +		ret = r + ((ret < 0) ? 0 : ret);
> +	}
> +
> +	return ret;
> +}
> +
>  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
>  {
>  	struct kvm_irq_ack_notifier *kian;
> @@ -344,16 +419,19 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
>  		switch (ue->u.irqchip.irqchip) {
>  		case KVM_IRQCHIP_PIC_MASTER:
>  			e->set = kvm_set_pic_irq;
> +			e->clear = kvm_clear_pic_irq;
>  			max_pin = 16;
>  			break;
>  		case KVM_IRQCHIP_PIC_SLAVE:
>  			e->set = kvm_set_pic_irq;
> +			e->clear = kvm_clear_pic_irq;
>  			max_pin = 16;
>  			delta = 8;
>  			break;
>  		case KVM_IRQCHIP_IOAPIC:
>  			max_pin = KVM_IOAPIC_NUM_PINS;
>  			e->set = kvm_set_ioapic_irq;
> +			e->clear = kvm_clear_ioapic_irq;
>  			break;
>  		default:
>  			goto out;
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 10:18 a.m. UTC | #5

On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> This is an alternative to kvm_set_irq(,,,0) which returns the previous
> assertion state of the interrupt and does nothing if it isn't changed.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
>  include/linux/kvm_host.h |    3 ++
>  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 81 insertions(+)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index a7661c0..6c168f1 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
>  	u32 type;
>  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
>  		   struct kvm *kvm, int irq_source_id, int level);
> +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> +		     struct kvm *kvm, int irq_source_id);
>  	union {
>  		struct {
>  			unsigned irqchip;
> @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
>  				   unsigned long *deliver_bitmask);
>  #endif
>  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
>  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
>  		int irq_source_id, int level);
>  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> index 5afb431..76e8f22 100644
> --- a/virt/kvm/irq_comm.c
> +++ b/virt/kvm/irq_comm.c
> @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
>  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
>  }
>  
> +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> +					    int irq_source_id)
> +{
> +	return !!test_and_clear_bit(irq_source_id, irq_state);
> +}
> +
> +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> +			     struct kvm *kvm, int irq_source_id)
> +{
> +#ifdef CONFIG_X86
> +	struct kvm_pic *pic = pic_irqchip(kvm);
> +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> +					     irq_source_id);
> +	if (level)
> +		kvm_pic_set_irq(pic, e->irqchip.pin,
> +				!!pic->irq_states[e->irqchip.pin]);
> +	return level;
> +#else
> +	return -1;
> +#endif
> +}
> +
> +static int kvm_clear_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> +				struct kvm *kvm, int irq_source_id)
> +{
> +	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> +	int level;
> +
> +	level = kvm_clear_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
> +					 irq_source_id);
> +	if (level)
> +		kvm_ioapic_set_irq(ioapic, e->irqchip.pin,
> +				   !!ioapic->irq_states[e->irqchip.pin]);
> +	return level;
> +}
> +
>  inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
>  {
>  #ifdef CONFIG_IA64
> @@ -190,6 +226,45 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
>  	return ret;
>  }
>  
> +/*
> + * Return value:
> + *  < 0   Error
> + *  = 0   Interrupt was not set, did nothing
> + *  > 0   Interrupt was pending, cleared
> + */
> +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq)
> +{
> +	struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
> +	int ret = -EINVAL, i = 0;
> +	struct kvm_irq_routing_table *irq_rt;
> +	struct hlist_node *n;
> +
> +	/* Not possible to detect if the guest uses the PIC or the
> +	 * IOAPIC.  So clear the bit in both. The guest will ignore
> +	 * writes to the unused one.
> +	 */
> +	rcu_read_lock();
> +	irq_rt = rcu_dereference(kvm->irq_routing);
> +	if (irq < irq_rt->nr_rt_entries)
> +		hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
> +			irq_set[i++] = *e;
> +	rcu_read_unlock();
> +
> +	while (i--) {
> +		int r = -EINVAL;
> +
> +		if (irq_set[i].clear)

I would normally suggest if (likely()) here but recently Marcelo
started pushing back against these tags. Maybe add in a separate patch
so it's easier to ignore ...

> +			r = irq_set[i].clear(&irq_set[i], kvm, irq_source_id);
> +
> +		if (r < 0)
> +			continue;
> +
> +		ret = r + ((ret < 0) ? 0 : ret);
> +	}
> +
> +	return ret;
> +}
> +
>  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
>  {
>  	struct kvm_irq_ack_notifier *kian;
> @@ -344,16 +419,19 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
>  		switch (ue->u.irqchip.irqchip) {
>  		case KVM_IRQCHIP_PIC_MASTER:
>  			e->set = kvm_set_pic_irq;
> +			e->clear = kvm_clear_pic_irq;
>  			max_pin = 16;
>  			break;
>  		case KVM_IRQCHIP_PIC_SLAVE:
>  			e->set = kvm_set_pic_irq;
> +			e->clear = kvm_clear_pic_irq;
>  			max_pin = 16;
>  			delta = 8;
>  			break;
>  		case KVM_IRQCHIP_IOAPIC:
>  			max_pin = KVM_IOAPIC_NUM_PINS;
>  			e->set = kvm_set_ioapic_irq;
> +			e->clear = kvm_clear_ioapic_irq;
>  			break;
>  		default:
>  			goto out;
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Alex Williamson July 17, 2012, 1:56 p.m. UTC | #6

On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > assertion state of the interrupt and does nothing if it isn't changed.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> > 
> >  include/linux/kvm_host.h |    3 ++
> >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 81 insertions(+)
> > 
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index a7661c0..6c168f1 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> >  	u32 type;
> >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> >  		   struct kvm *kvm, int irq_source_id, int level);
> > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > +		     struct kvm *kvm, int irq_source_id);
> >  	union {
> >  		struct {
> >  			unsigned irqchip;
> > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> >  				   unsigned long *deliver_bitmask);
> >  #endif
> >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> >  		int irq_source_id, int level);
> >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > index 5afb431..76e8f22 100644
> > --- a/virt/kvm/irq_comm.c
> > +++ b/virt/kvm/irq_comm.c
> > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> >  }
> >  
> > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > +					    int irq_source_id)
> > +{
> > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > +}
> > +
> > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > +			     struct kvm *kvm, int irq_source_id)
> > +{
> > +#ifdef CONFIG_X86
> > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > +					     irq_source_id);
> > +	if (level)
> > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > +				!!pic->irq_states[e->irqchip.pin]);
> > +	return level;
> 
> I think I begin to understand: if (level) checks it was previously set,
> and then we clear if needed?

It's actually very simple, if we change anything in irq_states, then
update via the chip specific set_irq function.

>  I think it's worthwhile to rename
> level to orig_level and rewrite as:
> 
> 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> 
> This both makes the logic clear without need for comments and
> saves some cycles on pic in case nothing actually changed.

That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
will clear the bit and call kvm_pic_set_irq with the new irq_states
value, whether it's 0 or 1.  The optimization I make is to only call
kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
step further to "changed and is now 0".  I don't know if that's correct
behavior.

> > +#else
> > +	return -1;
> > +#endif
> > +}
> > +
> > +static int kvm_clear_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > +				struct kvm *kvm, int irq_source_id)
> > +{
> > +	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> > +	int level;
> > +
> > +	level = kvm_clear_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
> > +					 irq_source_id);
> > +	if (level)
> > +		kvm_ioapic_set_irq(ioapic, e->irqchip.pin,
> > +				   !!ioapic->irq_states[e->irqchip.pin]);
> > +	return level;
> > +}
> > +
> >  inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
> >  {
> >  #ifdef CONFIG_IA64
> > @@ -190,6 +226,45 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
> >  	return ret;
> >  }
> >  
> > +/*
> > + * Return value:
> > + *  < 0   Error
> > + *  = 0   Interrupt was not set, did nothing
> > + *  > 0   Interrupt was pending, cleared
> > + */
> > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq)
> > +{
> > +	struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
> > +	int ret = -EINVAL, i = 0;
> > +	struct kvm_irq_routing_table *irq_rt;
> > +	struct hlist_node *n;
> > +
> > +	/* Not possible to detect if the guest uses the PIC or the
> > +	 * IOAPIC.  So clear the bit in both. The guest will ignore
> > +	 * writes to the unused one.
> > +	 */
> > +	rcu_read_lock();
> > +	irq_rt = rcu_dereference(kvm->irq_routing);
> > +	if (irq < irq_rt->nr_rt_entries)
> > +		hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
> > +			irq_set[i++] = *e;
> > +	rcu_read_unlock();
> > +
> > +	while (i--) {
> > +		int r = -EINVAL;
> > +
> > +		if (irq_set[i].clear)
> > +			r = irq_set[i].clear(&irq_set[i], kvm, irq_source_id);
> > +
> > +		if (r < 0)
> > +			continue;
> > +
> > +		ret = r + ((ret < 0) ? 0 : ret);
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
> >  {
> >  	struct kvm_irq_ack_notifier *kian;
> > @@ -344,16 +419,19 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
> >  		switch (ue->u.irqchip.irqchip) {
> >  		case KVM_IRQCHIP_PIC_MASTER:
> >  			e->set = kvm_set_pic_irq;
> > +			e->clear = kvm_clear_pic_irq;
> >  			max_pin = 16;
> >  			break;
> >  		case KVM_IRQCHIP_PIC_SLAVE:
> >  			e->set = kvm_set_pic_irq;
> > +			e->clear = kvm_clear_pic_irq;
> >  			max_pin = 16;
> >  			delta = 8;
> >  			break;
> >  		case KVM_IRQCHIP_IOAPIC:
> >  			max_pin = KVM_IOAPIC_NUM_PINS;
> >  			e->set = kvm_set_ioapic_irq;
> > +			e->clear = kvm_clear_ioapic_irq;
> >  			break;
> >  		default:
> >  			goto out;



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 2:08 p.m. UTC | #7

On Tue, Jul 17, 2012 at 07:56:09AM -0600, Alex Williamson wrote:
> On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> > On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > > assertion state of the interrupt and does nothing if it isn't changed.
> > > 
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > ---
> > > 
> > >  include/linux/kvm_host.h |    3 ++
> > >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 81 insertions(+)
> > > 
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index a7661c0..6c168f1 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> > >  	u32 type;
> > >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> > >  		   struct kvm *kvm, int irq_source_id, int level);
> > > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > > +		     struct kvm *kvm, int irq_source_id);
> > >  	union {
> > >  		struct {
> > >  			unsigned irqchip;
> > > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > >  				   unsigned long *deliver_bitmask);
> > >  #endif
> > >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> > >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> > >  		int irq_source_id, int level);
> > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > > index 5afb431..76e8f22 100644
> > > --- a/virt/kvm/irq_comm.c
> > > +++ b/virt/kvm/irq_comm.c
> > > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> > >  }
> > >  
> > > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > > +					    int irq_source_id)
> > > +{
> > > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > > +}
> > > +
> > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > +			     struct kvm *kvm, int irq_source_id)
> > > +{
> > > +#ifdef CONFIG_X86
> > > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > +					     irq_source_id);
> > > +	if (level)
> > > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > > +				!!pic->irq_states[e->irqchip.pin]);
> > > +	return level;
> > 
> > I think I begin to understand: if (level) checks it was previously set,
> > and then we clear if needed?
> 
> It's actually very simple, if we change anything in irq_states, then
> update via the chip specific set_irq function.
> 
> >  I think it's worthwhile to rename
> > level to orig_level and rewrite as:
> > 
> > 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> > 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> > 
> > This both makes the logic clear without need for comments and
> > saves some cycles on pic in case nothing actually changed.
> 
> That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
> will clear the bit and call kvm_pic_set_irq with the new irq_states
> value, whether it's 0 or 1.  The optimization I make is to only call
> kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
> step further to "changed and is now 0".  I don't know if that's correct
> behavior.

If not then I don't understand. You clear a bit
in a word. You never change it to 1, do you?

But this brings another question:

static inline int kvm_irq_line_state(unsigned long *irq_state,
                                     int irq_source_id, int level)
{
        /* Logical OR for level trig interrupt */
        if (level)
                set_bit(irq_source_id, irq_state);
        else
                clear_bit(irq_source_id, irq_state);


^^^^^^^^^^^
above uses locked instructions

        return !!(*irq_state);


above doesn't

}


why the insonsistency?

> > > +#else
> > > +	return -1;
> > > +#endif
> > > +}
> > > +
> > > +static int kvm_clear_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > +				struct kvm *kvm, int irq_source_id)
> > > +{
> > > +	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> > > +	int level;
> > > +
> > > +	level = kvm_clear_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
> > > +					 irq_source_id);
> > > +	if (level)
> > > +		kvm_ioapic_set_irq(ioapic, e->irqchip.pin,
> > > +				   !!ioapic->irq_states[e->irqchip.pin]);
> > > +	return level;
> > > +}
> > > +
> > >  inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
> > >  {
> > >  #ifdef CONFIG_IA64
> > > @@ -190,6 +226,45 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
> > >  	return ret;
> > >  }
> > >  
> > > +/*
> > > + * Return value:
> > > + *  < 0   Error
> > > + *  = 0   Interrupt was not set, did nothing
> > > + *  > 0   Interrupt was pending, cleared
> > > + */
> > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq)
> > > +{
> > > +	struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
> > > +	int ret = -EINVAL, i = 0;
> > > +	struct kvm_irq_routing_table *irq_rt;
> > > +	struct hlist_node *n;
> > > +
> > > +	/* Not possible to detect if the guest uses the PIC or the
> > > +	 * IOAPIC.  So clear the bit in both. The guest will ignore
> > > +	 * writes to the unused one.
> > > +	 */
> > > +	rcu_read_lock();
> > > +	irq_rt = rcu_dereference(kvm->irq_routing);
> > > +	if (irq < irq_rt->nr_rt_entries)
> > > +		hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
> > > +			irq_set[i++] = *e;
> > > +	rcu_read_unlock();
> > > +
> > > +	while (i--) {
> > > +		int r = -EINVAL;
> > > +
> > > +		if (irq_set[i].clear)
> > > +			r = irq_set[i].clear(&irq_set[i], kvm, irq_source_id);
> > > +
> > > +		if (r < 0)
> > > +			continue;
> > > +
> > > +		ret = r + ((ret < 0) ? 0 : ret);
> > > +	}
> > > +
> > > +	return ret;
> > > +}
> > > +
> > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
> > >  {
> > >  	struct kvm_irq_ack_notifier *kian;
> > > @@ -344,16 +419,19 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
> > >  		switch (ue->u.irqchip.irqchip) {
> > >  		case KVM_IRQCHIP_PIC_MASTER:
> > >  			e->set = kvm_set_pic_irq;
> > > +			e->clear = kvm_clear_pic_irq;
> > >  			max_pin = 16;
> > >  			break;
> > >  		case KVM_IRQCHIP_PIC_SLAVE:
> > >  			e->set = kvm_set_pic_irq;
> > > +			e->clear = kvm_clear_pic_irq;
> > >  			max_pin = 16;
> > >  			delta = 8;
> > >  			break;
> > >  		case KVM_IRQCHIP_IOAPIC:
> > >  			max_pin = KVM_IOAPIC_NUM_PINS;
> > >  			e->set = kvm_set_ioapic_irq;
> > > +			e->clear = kvm_clear_ioapic_irq;
> > >  			break;
> > >  		default:
> > >  			goto out;
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Alex Williamson July 17, 2012, 2:21 p.m. UTC | #8

On Tue, 2012-07-17 at 17:08 +0300, Michael S. Tsirkin wrote:
> On Tue, Jul 17, 2012 at 07:56:09AM -0600, Alex Williamson wrote:
> > On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> > > On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > > > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > > > assertion state of the interrupt and does nothing if it isn't changed.
> > > > 
> > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > ---
> > > > 
> > > >  include/linux/kvm_host.h |    3 ++
> > > >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> > > >  2 files changed, 81 insertions(+)
> > > > 
> > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > index a7661c0..6c168f1 100644
> > > > --- a/include/linux/kvm_host.h
> > > > +++ b/include/linux/kvm_host.h
> > > > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> > > >  	u32 type;
> > > >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> > > >  		   struct kvm *kvm, int irq_source_id, int level);
> > > > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > > > +		     struct kvm *kvm, int irq_source_id);
> > > >  	union {
> > > >  		struct {
> > > >  			unsigned irqchip;
> > > > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > > >  				   unsigned long *deliver_bitmask);
> > > >  #endif
> > > >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> > > >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> > > >  		int irq_source_id, int level);
> > > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > > > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > > > index 5afb431..76e8f22 100644
> > > > --- a/virt/kvm/irq_comm.c
> > > > +++ b/virt/kvm/irq_comm.c
> > > > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> > > >  }
> > > >  
> > > > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > > > +					    int irq_source_id)
> > > > +{
> > > > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > > > +}
> > > > +
> > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > +			     struct kvm *kvm, int irq_source_id)
> > > > +{
> > > > +#ifdef CONFIG_X86
> > > > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > > > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > +					     irq_source_id);
> > > > +	if (level)
> > > > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > +				!!pic->irq_states[e->irqchip.pin]);
> > > > +	return level;
> > > 
> > > I think I begin to understand: if (level) checks it was previously set,
> > > and then we clear if needed?
> > 
> > It's actually very simple, if we change anything in irq_states, then
> > update via the chip specific set_irq function.
> > 
> > >  I think it's worthwhile to rename
> > > level to orig_level and rewrite as:
> > > 
> > > 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> > > 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> > > 
> > > This both makes the logic clear without need for comments and
> > > saves some cycles on pic in case nothing actually changed.
> > 
> > That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
> > will clear the bit and call kvm_pic_set_irq with the new irq_states
> > value, whether it's 0 or 1.  The optimization I make is to only call
> > kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
> > step further to "changed and is now 0".  I don't know if that's correct
> > behavior.
> 
> If not then I don't understand. You clear a bit
> in a word. You never change it to 1, do you?

Correct, but kvm_set_irq(,,,0) may call kvm_pic_set_irq(,,1) if other
source IDs are still asserting the interrupt.  Your proposal assumes
that unless irq_states is also 0 we don't need to call kvm_pic_set_irq,
and I don't know if that's correct.

> 
> But this brings another question:
> 
> static inline int kvm_irq_line_state(unsigned long *irq_state,
>                                      int irq_source_id, int level)
> {
>         /* Logical OR for level trig interrupt */
>         if (level)
>                 set_bit(irq_source_id, irq_state);
>         else
>                 clear_bit(irq_source_id, irq_state);
> 
> 
> ^^^^^^^^^^^
> above uses locked instructions
> 
>         return !!(*irq_state);
> 
> 
> above doesn't
> 
> }
> 
> 
> why the insonsistency?

Note that set/clear_bit are not locked instructions, but atomic
instructions and it could be argued that reading the value is also
atomic.  At least that was my guess when I stumbled across the same
yesterday.  IMHO, we're going off into the weeds again with these last
two patches.  It may be a valid optimization, but it really has no
bearing on the meat of the series (and afaict, no significant
performance difference either).



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 2:53 p.m. UTC | #9

On Tue, Jul 17, 2012 at 08:21:51AM -0600, Alex Williamson wrote:
> On Tue, 2012-07-17 at 17:08 +0300, Michael S. Tsirkin wrote:
> > On Tue, Jul 17, 2012 at 07:56:09AM -0600, Alex Williamson wrote:
> > > On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> > > > On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > > > > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > > > > assertion state of the interrupt and does nothing if it isn't changed.
> > > > > 
> > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > ---
> > > > > 
> > > > >  include/linux/kvm_host.h |    3 ++
> > > > >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > >  2 files changed, 81 insertions(+)
> > > > > 
> > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > index a7661c0..6c168f1 100644
> > > > > --- a/include/linux/kvm_host.h
> > > > > +++ b/include/linux/kvm_host.h
> > > > > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> > > > >  	u32 type;
> > > > >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> > > > >  		   struct kvm *kvm, int irq_source_id, int level);
> > > > > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > > > > +		     struct kvm *kvm, int irq_source_id);
> > > > >  	union {
> > > > >  		struct {
> > > > >  			unsigned irqchip;
> > > > > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > > > >  				   unsigned long *deliver_bitmask);
> > > > >  #endif
> > > > >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > > > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> > > > >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> > > > >  		int irq_source_id, int level);
> > > > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > > > > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > > > > index 5afb431..76e8f22 100644
> > > > > --- a/virt/kvm/irq_comm.c
> > > > > +++ b/virt/kvm/irq_comm.c
> > > > > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> > > > >  }
> > > > >  
> > > > > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > > > > +					    int irq_source_id)
> > > > > +{
> > > > > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > > > > +}
> > > > > +
> > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > +			     struct kvm *kvm, int irq_source_id)
> > > > > +{
> > > > > +#ifdef CONFIG_X86
> > > > > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > +					     irq_source_id);
> > > > > +	if (level)
> > > > > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > +				!!pic->irq_states[e->irqchip.pin]);
> > > > > +	return level;
> > > > 
> > > > I think I begin to understand: if (level) checks it was previously set,
> > > > and then we clear if needed?
> > > 
> > > It's actually very simple, if we change anything in irq_states, then
> > > update via the chip specific set_irq function.
> > > 
> > > >  I think it's worthwhile to rename
> > > > level to orig_level and rewrite as:
> > > > 
> > > > 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> > > > 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> > > > 
> > > > This both makes the logic clear without need for comments and
> > > > saves some cycles on pic in case nothing actually changed.
> > > 
> > > That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
> > > will clear the bit and call kvm_pic_set_irq with the new irq_states
> > > value, whether it's 0 or 1.  The optimization I make is to only call
> > > kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
> > > step further to "changed and is now 0".  I don't know if that's correct
> > > behavior.
> > 
> > If not then I don't understand. You clear a bit
> > in a word. You never change it to 1, do you?
> 
> Correct, but kvm_set_irq(,,,0) may call kvm_pic_set_irq(,,1) if other
> source IDs are still asserting the interrupt.  Your proposal assumes
> that unless irq_states is also 0 we don't need to call kvm_pic_set_irq,
> and I don't know if that's correct.

Well you are asked to clear some id and level was 1. So we know
interrupt was asserted. Either we clear it or we don't. No?

> > 
> > But this brings another question:
> > 
> > static inline int kvm_irq_line_state(unsigned long *irq_state,
> >                                      int irq_source_id, int level)
> > {
> >         /* Logical OR for level trig interrupt */
> >         if (level)
> >                 set_bit(irq_source_id, irq_state);
> >         else
> >                 clear_bit(irq_source_id, irq_state);
> > 
> > 
> > ^^^^^^^^^^^
> > above uses locked instructions
> > 
> >         return !!(*irq_state);
> > 
> > 
> > above doesn't
> > 
> > }
> > 
> > 
> > why the insonsistency?
> 
> Note that set/clear_bit are not locked instructions,

On x86 they are:
static __always_inline void
set_bit(unsigned int nr, volatile unsigned long *addr)
{
        if (IS_IMMEDIATE(nr)) {
                asm volatile(LOCK_PREFIX "orb %1,%0"
                        : CONST_MASK_ADDR(nr, addr)
                        : "iq" ((u8)CONST_MASK(nr))
                        : "memory");
        } else {
                asm volatile(LOCK_PREFIX "bts %1,%0"
                        : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
        }
}

> but atomic
> instructions and it could be argued that reading the value is also
> atomic.  At least that was my guess when I stumbled across the same
> yesterday.  IMHO, we're going off into the weeds again with these last
> two patches.  It may be a valid optimization, but it really has no
> bearing on the meat of the series (and afaict, no significant
> performance difference either).

For me it's not a performance thing. IMO code is cleaner without this locking:
we add a lock but only use it in some cases, so the rules become really
complex.  And current code looks buggy if yes we need to fix it somehow.

Alex Williamson July 17, 2012, 3:20 p.m. UTC | #10

On Tue, 2012-07-17 at 17:53 +0300, Michael S. Tsirkin wrote:
> On Tue, Jul 17, 2012 at 08:21:51AM -0600, Alex Williamson wrote:
> > On Tue, 2012-07-17 at 17:08 +0300, Michael S. Tsirkin wrote:
> > > On Tue, Jul 17, 2012 at 07:56:09AM -0600, Alex Williamson wrote:
> > > > On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> > > > > On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > > > > > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > > > > > assertion state of the interrupt and does nothing if it isn't changed.
> > > > > > 
> > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > > ---
> > > > > > 
> > > > > >  include/linux/kvm_host.h |    3 ++
> > > > > >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > >  2 files changed, 81 insertions(+)
> > > > > > 
> > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > > index a7661c0..6c168f1 100644
> > > > > > --- a/include/linux/kvm_host.h
> > > > > > +++ b/include/linux/kvm_host.h
> > > > > > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> > > > > >  	u32 type;
> > > > > >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> > > > > >  		   struct kvm *kvm, int irq_source_id, int level);
> > > > > > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > +		     struct kvm *kvm, int irq_source_id);
> > > > > >  	union {
> > > > > >  		struct {
> > > > > >  			unsigned irqchip;
> > > > > > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > > > > >  				   unsigned long *deliver_bitmask);
> > > > > >  #endif
> > > > > >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > > > > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> > > > > >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> > > > > >  		int irq_source_id, int level);
> > > > > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > > > > > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > > > > > index 5afb431..76e8f22 100644
> > > > > > --- a/virt/kvm/irq_comm.c
> > > > > > +++ b/virt/kvm/irq_comm.c
> > > > > > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> > > > > >  }
> > > > > >  
> > > > > > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > > > > > +					    int irq_source_id)
> > > > > > +{
> > > > > > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > > > > > +}
> > > > > > +
> > > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > +			     struct kvm *kvm, int irq_source_id)
> > > > > > +{
> > > > > > +#ifdef CONFIG_X86
> > > > > > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > > +					     irq_source_id);
> > > > > > +	if (level)
> > > > > > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > > +				!!pic->irq_states[e->irqchip.pin]);
> > > > > > +	return level;
> > > > > 
> > > > > I think I begin to understand: if (level) checks it was previously set,
> > > > > and then we clear if needed?
> > > > 
> > > > It's actually very simple, if we change anything in irq_states, then
> > > > update via the chip specific set_irq function.
> > > > 
> > > > >  I think it's worthwhile to rename
> > > > > level to orig_level and rewrite as:
> > > > > 
> > > > > 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> > > > > 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> > > > > 
> > > > > This both makes the logic clear without need for comments and
> > > > > saves some cycles on pic in case nothing actually changed.
> > > > 
> > > > That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
> > > > will clear the bit and call kvm_pic_set_irq with the new irq_states
> > > > value, whether it's 0 or 1.  The optimization I make is to only call
> > > > kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
> > > > step further to "changed and is now 0".  I don't know if that's correct
> > > > behavior.
> > > 
> > > If not then I don't understand. You clear a bit
> > > in a word. You never change it to 1, do you?
> > 
> > Correct, but kvm_set_irq(,,,0) may call kvm_pic_set_irq(,,1) if other
> > source IDs are still asserting the interrupt.  Your proposal assumes
> > that unless irq_states is also 0 we don't need to call kvm_pic_set_irq,
> > and I don't know if that's correct.
> 
> Well you are asked to clear some id and level was 1. So we know
> interrupt was asserted. Either we clear it or we don't. No?
> 
> > > 
> > > But this brings another question:
> > > 
> > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > >                                      int irq_source_id, int level)
> > > {
> > >         /* Logical OR for level trig interrupt */
> > >         if (level)
> > >                 set_bit(irq_source_id, irq_state);
> > >         else
> > >                 clear_bit(irq_source_id, irq_state);
> > > 
> > > 
> > > ^^^^^^^^^^^
> > > above uses locked instructions
> > > 
> > >         return !!(*irq_state);
> > > 
> > > 
> > > above doesn't
> > > 
> > > }
> > > 
> > > 
> > > why the insonsistency?
> > 
> > Note that set/clear_bit are not locked instructions,
> 
> On x86 they are:
> static __always_inline void
> set_bit(unsigned int nr, volatile unsigned long *addr)
> {
>         if (IS_IMMEDIATE(nr)) {
>                 asm volatile(LOCK_PREFIX "orb %1,%0"
>                         : CONST_MASK_ADDR(nr, addr)
>                         : "iq" ((u8)CONST_MASK(nr))
>                         : "memory");
>         } else {
>                 asm volatile(LOCK_PREFIX "bts %1,%0"
>                         : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
>         }
> }
> 
> > but atomic
> > instructions and it could be argued that reading the value is also
> > atomic.  At least that was my guess when I stumbled across the same
> > yesterday.  IMHO, we're going off into the weeds again with these last
> > two patches.  It may be a valid optimization, but it really has no
> > bearing on the meat of the series (and afaict, no significant
> > performance difference either).
> 
> For me it's not a performance thing. IMO code is cleaner without this locking:
> we add a lock but only use it in some cases, so the rules become really
> complex.

Seriously?

        spin_lock(&irqfd->source->lock);
        if (!irqfd->source->level_asserted) {
                kvm_set_irq(irqfd->kvm, irqfd->source->id, irqfd->gsi, 1);
                irqfd->source->level_asserted = true;
        }
        spin_unlock(&irqfd->source->lock);

...

        spin_lock(&eoifd->source->lock);
        if (eoifd->source->level_asserted) {
                kvm_set_irq(eoifd->kvm,
                            eoifd->source->id, eoifd->notifier.gsi, 0);
                eoifd->source->level_asserted = false;
                eventfd_signal(eoifd->eventfd, 1);
        }
        spin_unlock(&eoifd->source->lock);


Locking doesn't get much more straightforward than that

>   And current code looks buggy if yes we need to fix it somehow.


Which to me seems to indicate this should be handled as a separate
effort.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 3:36 p.m. UTC | #11

On Tue, Jul 17, 2012 at 09:20:11AM -0600, Alex Williamson wrote:
> On Tue, 2012-07-17 at 17:53 +0300, Michael S. Tsirkin wrote:
> > On Tue, Jul 17, 2012 at 08:21:51AM -0600, Alex Williamson wrote:
> > > On Tue, 2012-07-17 at 17:08 +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Jul 17, 2012 at 07:56:09AM -0600, Alex Williamson wrote:
> > > > > On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> > > > > > On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > > > > > > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > > > > > > assertion state of the interrupt and does nothing if it isn't changed.
> > > > > > > 
> > > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > ---
> > > > > > > 
> > > > > > >  include/linux/kvm_host.h |    3 ++
> > > > > > >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > > >  2 files changed, 81 insertions(+)
> > > > > > > 
> > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > > > index a7661c0..6c168f1 100644
> > > > > > > --- a/include/linux/kvm_host.h
> > > > > > > +++ b/include/linux/kvm_host.h
> > > > > > > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> > > > > > >  	u32 type;
> > > > > > >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > >  		   struct kvm *kvm, int irq_source_id, int level);
> > > > > > > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > +		     struct kvm *kvm, int irq_source_id);
> > > > > > >  	union {
> > > > > > >  		struct {
> > > > > > >  			unsigned irqchip;
> > > > > > > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > > > > > >  				   unsigned long *deliver_bitmask);
> > > > > > >  #endif
> > > > > > >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > > > > > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> > > > > > >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> > > > > > >  		int irq_source_id, int level);
> > > > > > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > > > > > > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > > > > > > index 5afb431..76e8f22 100644
> > > > > > > --- a/virt/kvm/irq_comm.c
> > > > > > > +++ b/virt/kvm/irq_comm.c
> > > > > > > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> > > > > > >  }
> > > > > > >  
> > > > > > > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > > > > > > +					    int irq_source_id)
> > > > > > > +{
> > > > > > > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > +			     struct kvm *kvm, int irq_source_id)
> > > > > > > +{
> > > > > > > +#ifdef CONFIG_X86
> > > > > > > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > > > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > > > +					     irq_source_id);
> > > > > > > +	if (level)
> > > > > > > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > > > +				!!pic->irq_states[e->irqchip.pin]);
> > > > > > > +	return level;
> > > > > > 
> > > > > > I think I begin to understand: if (level) checks it was previously set,
> > > > > > and then we clear if needed?
> > > > > 
> > > > > It's actually very simple, if we change anything in irq_states, then
> > > > > update via the chip specific set_irq function.
> > > > > 
> > > > > >  I think it's worthwhile to rename
> > > > > > level to orig_level and rewrite as:
> > > > > > 
> > > > > > 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> > > > > > 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> > > > > > 
> > > > > > This both makes the logic clear without need for comments and
> > > > > > saves some cycles on pic in case nothing actually changed.
> > > > > 
> > > > > That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
> > > > > will clear the bit and call kvm_pic_set_irq with the new irq_states
> > > > > value, whether it's 0 or 1.  The optimization I make is to only call
> > > > > kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
> > > > > step further to "changed and is now 0".  I don't know if that's correct
> > > > > behavior.
> > > > 
> > > > If not then I don't understand. You clear a bit
> > > > in a word. You never change it to 1, do you?
> > > 
> > > Correct, but kvm_set_irq(,,,0) may call kvm_pic_set_irq(,,1) if other
> > > source IDs are still asserting the interrupt.  Your proposal assumes
> > > that unless irq_states is also 0 we don't need to call kvm_pic_set_irq,
> > > and I don't know if that's correct.
> > 
> > Well you are asked to clear some id and level was 1. So we know
> > interrupt was asserted. Either we clear it or we don't. No?
> > 
> > > > 
> > > > But this brings another question:
> > > > 
> > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > >                                      int irq_source_id, int level)
> > > > {
> > > >         /* Logical OR for level trig interrupt */
> > > >         if (level)
> > > >                 set_bit(irq_source_id, irq_state);
> > > >         else
> > > >                 clear_bit(irq_source_id, irq_state);
> > > > 
> > > > 
> > > > ^^^^^^^^^^^
> > > > above uses locked instructions
> > > > 
> > > >         return !!(*irq_state);
> > > > 
> > > > 
> > > > above doesn't
> > > > 
> > > > }
> > > > 
> > > > 
> > > > why the insonsistency?
> > > 
> > > Note that set/clear_bit are not locked instructions,
> > 
> > On x86 they are:
> > static __always_inline void
> > set_bit(unsigned int nr, volatile unsigned long *addr)
> > {
> >         if (IS_IMMEDIATE(nr)) {
> >                 asm volatile(LOCK_PREFIX "orb %1,%0"
> >                         : CONST_MASK_ADDR(nr, addr)
> >                         : "iq" ((u8)CONST_MASK(nr))
> >                         : "memory");
> >         } else {
> >                 asm volatile(LOCK_PREFIX "bts %1,%0"
> >                         : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
> >         }
> > }
> > 
> > > but atomic
> > > instructions and it could be argued that reading the value is also
> > > atomic.  At least that was my guess when I stumbled across the same
> > > yesterday.  IMHO, we're going off into the weeds again with these last
> > > two patches.  It may be a valid optimization, but it really has no
> > > bearing on the meat of the series (and afaict, no significant
> > > performance difference either).
> > 
> > For me it's not a performance thing. IMO code is cleaner without this locking:
> > we add a lock but only use it in some cases, so the rules become really
> > complex.
> 
> Seriously?
> 
>         spin_lock(&irqfd->source->lock);
>         if (!irqfd->source->level_asserted) {
>                 kvm_set_irq(irqfd->kvm, irqfd->source->id, irqfd->gsi, 1);
>                 irqfd->source->level_asserted = true;
>         }
>         spin_unlock(&irqfd->source->lock);
> 
> ...
> 
>         spin_lock(&eoifd->source->lock);
>         if (eoifd->source->level_asserted) {
>                 kvm_set_irq(eoifd->kvm,
>                             eoifd->source->id, eoifd->notifier.gsi, 0);
>                 eoifd->source->level_asserted = false;
>                 eventfd_signal(eoifd->eventfd, 1);
>         }
>         spin_unlock(&eoifd->source->lock);
> 
> 
> Locking doesn't get much more straightforward than that

Don't look at it in isolation. You are now calling kvm_set_irq
from under a spinlock. You are saying it is always safe but
this seems far from obvious. kvm_set_irq used to be
unsafe from an atomic context.

> >   And current code looks buggy if yes we need to fix it somehow.
> 
> 
> Which to me seems to indicate this should be handled as a separate
> effort.

A separate patchset, sure. But likely a prerequisite: we still need to
look at all the code. Let's not copy bugs, need to fix them.

Alex Williamson July 17, 2012, 3:51 p.m. UTC | #12

On Tue, 2012-07-17 at 18:36 +0300, Michael S. Tsirkin wrote:
> On Tue, Jul 17, 2012 at 09:20:11AM -0600, Alex Williamson wrote:
> > On Tue, 2012-07-17 at 17:53 +0300, Michael S. Tsirkin wrote:
> > > On Tue, Jul 17, 2012 at 08:21:51AM -0600, Alex Williamson wrote:
> > > > On Tue, 2012-07-17 at 17:08 +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Jul 17, 2012 at 07:56:09AM -0600, Alex Williamson wrote:
> > > > > > On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> > > > > > > On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > > > > > > > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > > > > > > > assertion state of the interrupt and does nothing if it isn't changed.
> > > > > > > > 
> > > > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > ---
> > > > > > > > 
> > > > > > > >  include/linux/kvm_host.h |    3 ++
> > > > > > > >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > > > >  2 files changed, 81 insertions(+)
> > > > > > > > 
> > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > > > > index a7661c0..6c168f1 100644
> > > > > > > > --- a/include/linux/kvm_host.h
> > > > > > > > +++ b/include/linux/kvm_host.h
> > > > > > > > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> > > > > > > >  	u32 type;
> > > > > > > >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > >  		   struct kvm *kvm, int irq_source_id, int level);
> > > > > > > > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > +		     struct kvm *kvm, int irq_source_id);
> > > > > > > >  	union {
> > > > > > > >  		struct {
> > > > > > > >  			unsigned irqchip;
> > > > > > > > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > > > > > > >  				   unsigned long *deliver_bitmask);
> > > > > > > >  #endif
> > > > > > > >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > > > > > > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> > > > > > > >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> > > > > > > >  		int irq_source_id, int level);
> > > > > > > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > > > > > > > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > > > > > > > index 5afb431..76e8f22 100644
> > > > > > > > --- a/virt/kvm/irq_comm.c
> > > > > > > > +++ b/virt/kvm/irq_comm.c
> > > > > > > > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> > > > > > > >  }
> > > > > > > >  
> > > > > > > > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > > > > > > > +					    int irq_source_id)
> > > > > > > > +{
> > > > > > > > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > +			     struct kvm *kvm, int irq_source_id)
> > > > > > > > +{
> > > > > > > > +#ifdef CONFIG_X86
> > > > > > > > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > > > > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > > > > +					     irq_source_id);
> > > > > > > > +	if (level)
> > > > > > > > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > > > > +				!!pic->irq_states[e->irqchip.pin]);
> > > > > > > > +	return level;
> > > > > > > 
> > > > > > > I think I begin to understand: if (level) checks it was previously set,
> > > > > > > and then we clear if needed?
> > > > > > 
> > > > > > It's actually very simple, if we change anything in irq_states, then
> > > > > > update via the chip specific set_irq function.
> > > > > > 
> > > > > > >  I think it's worthwhile to rename
> > > > > > > level to orig_level and rewrite as:
> > > > > > > 
> > > > > > > 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> > > > > > > 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> > > > > > > 
> > > > > > > This both makes the logic clear without need for comments and
> > > > > > > saves some cycles on pic in case nothing actually changed.
> > > > > > 
> > > > > > That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
> > > > > > will clear the bit and call kvm_pic_set_irq with the new irq_states
> > > > > > value, whether it's 0 or 1.  The optimization I make is to only call
> > > > > > kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
> > > > > > step further to "changed and is now 0".  I don't know if that's correct
> > > > > > behavior.
> > > > > 
> > > > > If not then I don't understand. You clear a bit
> > > > > in a word. You never change it to 1, do you?
> > > > 
> > > > Correct, but kvm_set_irq(,,,0) may call kvm_pic_set_irq(,,1) if other
> > > > source IDs are still asserting the interrupt.  Your proposal assumes
> > > > that unless irq_states is also 0 we don't need to call kvm_pic_set_irq,
> > > > and I don't know if that's correct.
> > > 
> > > Well you are asked to clear some id and level was 1. So we know
> > > interrupt was asserted. Either we clear it or we don't. No?
> > > 
> > > > > 
> > > > > But this brings another question:
> > > > > 
> > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > >                                      int irq_source_id, int level)
> > > > > {
> > > > >         /* Logical OR for level trig interrupt */
> > > > >         if (level)
> > > > >                 set_bit(irq_source_id, irq_state);
> > > > >         else
> > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > 
> > > > > 
> > > > > ^^^^^^^^^^^
> > > > > above uses locked instructions
> > > > > 
> > > > >         return !!(*irq_state);
> > > > > 
> > > > > 
> > > > > above doesn't
> > > > > 
> > > > > }
> > > > > 
> > > > > 
> > > > > why the insonsistency?
> > > > 
> > > > Note that set/clear_bit are not locked instructions,
> > > 
> > > On x86 they are:
> > > static __always_inline void
> > > set_bit(unsigned int nr, volatile unsigned long *addr)
> > > {
> > >         if (IS_IMMEDIATE(nr)) {
> > >                 asm volatile(LOCK_PREFIX "orb %1,%0"
> > >                         : CONST_MASK_ADDR(nr, addr)
> > >                         : "iq" ((u8)CONST_MASK(nr))
> > >                         : "memory");
> > >         } else {
> > >                 asm volatile(LOCK_PREFIX "bts %1,%0"
> > >                         : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
> > >         }
> > > }
> > > 
> > > > but atomic
> > > > instructions and it could be argued that reading the value is also
> > > > atomic.  At least that was my guess when I stumbled across the same
> > > > yesterday.  IMHO, we're going off into the weeds again with these last
> > > > two patches.  It may be a valid optimization, but it really has no
> > > > bearing on the meat of the series (and afaict, no significant
> > > > performance difference either).
> > > 
> > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > we add a lock but only use it in some cases, so the rules become really
> > > complex.
> > 
> > Seriously?
> > 
> >         spin_lock(&irqfd->source->lock);
> >         if (!irqfd->source->level_asserted) {
> >                 kvm_set_irq(irqfd->kvm, irqfd->source->id, irqfd->gsi, 1);
> >                 irqfd->source->level_asserted = true;
> >         }
> >         spin_unlock(&irqfd->source->lock);
> > 
> > ...
> > 
> >         spin_lock(&eoifd->source->lock);
> >         if (eoifd->source->level_asserted) {
> >                 kvm_set_irq(eoifd->kvm,
> >                             eoifd->source->id, eoifd->notifier.gsi, 0);
> >                 eoifd->source->level_asserted = false;
> >                 eventfd_signal(eoifd->eventfd, 1);
> >         }
> >         spin_unlock(&eoifd->source->lock);
> > 
> > 
> > Locking doesn't get much more straightforward than that
> 
> Don't look at it in isolation. You are now calling kvm_set_irq
> from under a spinlock. You are saying it is always safe but
> this seems far from obvious. kvm_set_irq used to be
> unsafe from an atomic context.

Device assignment has been calling kvm_set_irq from atomic context for
quite a long time.

> > >   And current code looks buggy if yes we need to fix it somehow.
> > 
> > 
> > Which to me seems to indicate this should be handled as a separate
> > effort.
> 
> A separate patchset, sure. But likely a prerequisite: we still need to
> look at all the code. Let's not copy bugs, need to fix them.

This looks tangential to me unless you can come up with an actual reason
the above spinlock usage is incorrect or insufficient.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 3:57 p.m. UTC | #13

On Tue, Jul 17, 2012 at 09:51:41AM -0600, Alex Williamson wrote:
> On Tue, 2012-07-17 at 18:36 +0300, Michael S. Tsirkin wrote:
> > On Tue, Jul 17, 2012 at 09:20:11AM -0600, Alex Williamson wrote:
> > > On Tue, 2012-07-17 at 17:53 +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Jul 17, 2012 at 08:21:51AM -0600, Alex Williamson wrote:
> > > > > On Tue, 2012-07-17 at 17:08 +0300, Michael S. Tsirkin wrote:
> > > > > > On Tue, Jul 17, 2012 at 07:56:09AM -0600, Alex Williamson wrote:
> > > > > > > On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> > > > > > > > On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > > > > > > > > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > > > > > > > > assertion state of the interrupt and does nothing if it isn't changed.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > ---
> > > > > > > > > 
> > > > > > > > >  include/linux/kvm_host.h |    3 ++
> > > > > > > > >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > >  2 files changed, 81 insertions(+)
> > > > > > > > > 
> > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > > > > > index a7661c0..6c168f1 100644
> > > > > > > > > --- a/include/linux/kvm_host.h
> > > > > > > > > +++ b/include/linux/kvm_host.h
> > > > > > > > > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> > > > > > > > >  	u32 type;
> > > > > > > > >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > >  		   struct kvm *kvm, int irq_source_id, int level);
> > > > > > > > > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > +		     struct kvm *kvm, int irq_source_id);
> > > > > > > > >  	union {
> > > > > > > > >  		struct {
> > > > > > > > >  			unsigned irqchip;
> > > > > > > > > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > > > > > > > >  				   unsigned long *deliver_bitmask);
> > > > > > > > >  #endif
> > > > > > > > >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > > > > > > > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> > > > > > > > >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> > > > > > > > >  		int irq_source_id, int level);
> > > > > > > > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > > > > > > > > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > > > > > > > > index 5afb431..76e8f22 100644
> > > > > > > > > --- a/virt/kvm/irq_comm.c
> > > > > > > > > +++ b/virt/kvm/irq_comm.c
> > > > > > > > > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> > > > > > > > >  }
> > > > > > > > >  
> > > > > > > > > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > > > > > > > > +					    int irq_source_id)
> > > > > > > > > +{
> > > > > > > > > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > +			     struct kvm *kvm, int irq_source_id)
> > > > > > > > > +{
> > > > > > > > > +#ifdef CONFIG_X86
> > > > > > > > > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > > > > > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > > > > > +					     irq_source_id);
> > > > > > > > > +	if (level)
> > > > > > > > > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > > > > > +				!!pic->irq_states[e->irqchip.pin]);
> > > > > > > > > +	return level;
> > > > > > > > 
> > > > > > > > I think I begin to understand: if (level) checks it was previously set,
> > > > > > > > and then we clear if needed?
> > > > > > > 
> > > > > > > It's actually very simple, if we change anything in irq_states, then
> > > > > > > update via the chip specific set_irq function.
> > > > > > > 
> > > > > > > >  I think it's worthwhile to rename
> > > > > > > > level to orig_level and rewrite as:
> > > > > > > > 
> > > > > > > > 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> > > > > > > > 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> > > > > > > > 
> > > > > > > > This both makes the logic clear without need for comments and
> > > > > > > > saves some cycles on pic in case nothing actually changed.
> > > > > > > 
> > > > > > > That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
> > > > > > > will clear the bit and call kvm_pic_set_irq with the new irq_states
> > > > > > > value, whether it's 0 or 1.  The optimization I make is to only call
> > > > > > > kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
> > > > > > > step further to "changed and is now 0".  I don't know if that's correct
> > > > > > > behavior.
> > > > > > 
> > > > > > If not then I don't understand. You clear a bit
> > > > > > in a word. You never change it to 1, do you?
> > > > > 
> > > > > Correct, but kvm_set_irq(,,,0) may call kvm_pic_set_irq(,,1) if other
> > > > > source IDs are still asserting the interrupt.  Your proposal assumes
> > > > > that unless irq_states is also 0 we don't need to call kvm_pic_set_irq,
> > > > > and I don't know if that's correct.
> > > > 
> > > > Well you are asked to clear some id and level was 1. So we know
> > > > interrupt was asserted. Either we clear it or we don't. No?
> > > > 
> > > > > > 
> > > > > > But this brings another question:
> > > > > > 
> > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > >                                      int irq_source_id, int level)
> > > > > > {
> > > > > >         /* Logical OR for level trig interrupt */
> > > > > >         if (level)
> > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > >         else
> > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > 
> > > > > > 
> > > > > > ^^^^^^^^^^^
> > > > > > above uses locked instructions
> > > > > > 
> > > > > >         return !!(*irq_state);
> > > > > > 
> > > > > > 
> > > > > > above doesn't
> > > > > > 
> > > > > > }
> > > > > > 
> > > > > > 
> > > > > > why the insonsistency?
> > > > > 
> > > > > Note that set/clear_bit are not locked instructions,
> > > > 
> > > > On x86 they are:
> > > > static __always_inline void
> > > > set_bit(unsigned int nr, volatile unsigned long *addr)
> > > > {
> > > >         if (IS_IMMEDIATE(nr)) {
> > > >                 asm volatile(LOCK_PREFIX "orb %1,%0"
> > > >                         : CONST_MASK_ADDR(nr, addr)
> > > >                         : "iq" ((u8)CONST_MASK(nr))
> > > >                         : "memory");
> > > >         } else {
> > > >                 asm volatile(LOCK_PREFIX "bts %1,%0"
> > > >                         : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
> > > >         }
> > > > }
> > > > 
> > > > > but atomic
> > > > > instructions and it could be argued that reading the value is also
> > > > > atomic.  At least that was my guess when I stumbled across the same
> > > > > yesterday.  IMHO, we're going off into the weeds again with these last
> > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > bearing on the meat of the series (and afaict, no significant
> > > > > performance difference either).
> > > > 
> > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > we add a lock but only use it in some cases, so the rules become really
> > > > complex.
> > > 
> > > Seriously?
> > > 
> > >         spin_lock(&irqfd->source->lock);
> > >         if (!irqfd->source->level_asserted) {
> > >                 kvm_set_irq(irqfd->kvm, irqfd->source->id, irqfd->gsi, 1);
> > >                 irqfd->source->level_asserted = true;
> > >         }
> > >         spin_unlock(&irqfd->source->lock);
> > > 
> > > ...
> > > 
> > >         spin_lock(&eoifd->source->lock);
> > >         if (eoifd->source->level_asserted) {
> > >                 kvm_set_irq(eoifd->kvm,
> > >                             eoifd->source->id, eoifd->notifier.gsi, 0);
> > >                 eoifd->source->level_asserted = false;
> > >                 eventfd_signal(eoifd->eventfd, 1);
> > >         }
> > >         spin_unlock(&eoifd->source->lock);
> > > 
> > > 
> > > Locking doesn't get much more straightforward than that
> > 
> > Don't look at it in isolation. You are now calling kvm_set_irq
> > from under a spinlock. You are saying it is always safe but
> > this seems far from obvious. kvm_set_irq used to be
> > unsafe from an atomic context.
> 
> Device assignment has been calling kvm_set_irq from atomic context for
> quite a long time.

Only for MSI. That's an exception (and it's also a messy one).

> > > >   And current code looks buggy if yes we need to fix it somehow.
> > > 
> > > 
> > > Which to me seems to indicate this should be handled as a separate
> > > effort.
> > 
> > A separate patchset, sure. But likely a prerequisite: we still need to
> > look at all the code. Let's not copy bugs, need to fix them.
> 
> This looks tangential to me unless you can come up with an actual reason
> the above spinlock usage is incorrect or insufficient.

You copy the same pattern that seems racy. So you double the
amount of code that woul need to be fixed.

Gleb Natapov July 17, 2012, 4:01 p.m. UTC | #14

On Tue, Jul 17, 2012 at 06:57:01PM +0300, Michael S. Tsirkin wrote:
> On Tue, Jul 17, 2012 at 09:51:41AM -0600, Alex Williamson wrote:
> > On Tue, 2012-07-17 at 18:36 +0300, Michael S. Tsirkin wrote:
> > > On Tue, Jul 17, 2012 at 09:20:11AM -0600, Alex Williamson wrote:
> > > > On Tue, 2012-07-17 at 17:53 +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Jul 17, 2012 at 08:21:51AM -0600, Alex Williamson wrote:
> > > > > > On Tue, 2012-07-17 at 17:08 +0300, Michael S. Tsirkin wrote:
> > > > > > > On Tue, Jul 17, 2012 at 07:56:09AM -0600, Alex Williamson wrote:
> > > > > > > > On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > > > > > > > > > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > > > > > > > > > assertion state of the interrupt and does nothing if it isn't changed.
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > > ---
> > > > > > > > > > 
> > > > > > > > > >  include/linux/kvm_host.h |    3 ++
> > > > > > > > > >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > > >  2 files changed, 81 insertions(+)
> > > > > > > > > > 
> > > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > > > > > > index a7661c0..6c168f1 100644
> > > > > > > > > > --- a/include/linux/kvm_host.h
> > > > > > > > > > +++ b/include/linux/kvm_host.h
> > > > > > > > > > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> > > > > > > > > >  	u32 type;
> > > > > > > > > >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > >  		   struct kvm *kvm, int irq_source_id, int level);
> > > > > > > > > > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > +		     struct kvm *kvm, int irq_source_id);
> > > > > > > > > >  	union {
> > > > > > > > > >  		struct {
> > > > > > > > > >  			unsigned irqchip;
> > > > > > > > > > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > > > > > > > > >  				   unsigned long *deliver_bitmask);
> > > > > > > > > >  #endif
> > > > > > > > > >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > > > > > > > > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> > > > > > > > > >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> > > > > > > > > >  		int irq_source_id, int level);
> > > > > > > > > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > > > > > > > > > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > > > > > > > > > index 5afb431..76e8f22 100644
> > > > > > > > > > --- a/virt/kvm/irq_comm.c
> > > > > > > > > > +++ b/virt/kvm/irq_comm.c
> > > > > > > > > > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> > > > > > > > > >  }
> > > > > > > > > >  
> > > > > > > > > > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > > > > > > > > > +					    int irq_source_id)
> > > > > > > > > > +{
> > > > > > > > > > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > +			     struct kvm *kvm, int irq_source_id)
> > > > > > > > > > +{
> > > > > > > > > > +#ifdef CONFIG_X86
> > > > > > > > > > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > > > > > > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > > > > > > +					     irq_source_id);
> > > > > > > > > > +	if (level)
> > > > > > > > > > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > > > > > > +				!!pic->irq_states[e->irqchip.pin]);
> > > > > > > > > > +	return level;
> > > > > > > > > 
> > > > > > > > > I think I begin to understand: if (level) checks it was previously set,
> > > > > > > > > and then we clear if needed?
> > > > > > > > 
> > > > > > > > It's actually very simple, if we change anything in irq_states, then
> > > > > > > > update via the chip specific set_irq function.
> > > > > > > > 
> > > > > > > > >  I think it's worthwhile to rename
> > > > > > > > > level to orig_level and rewrite as:
> > > > > > > > > 
> > > > > > > > > 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> > > > > > > > > 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> > > > > > > > > 
> > > > > > > > > This both makes the logic clear without need for comments and
> > > > > > > > > saves some cycles on pic in case nothing actually changed.
> > > > > > > > 
> > > > > > > > That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
> > > > > > > > will clear the bit and call kvm_pic_set_irq with the new irq_states
> > > > > > > > value, whether it's 0 or 1.  The optimization I make is to only call
> > > > > > > > kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
> > > > > > > > step further to "changed and is now 0".  I don't know if that's correct
> > > > > > > > behavior.
> > > > > > > 
> > > > > > > If not then I don't understand. You clear a bit
> > > > > > > in a word. You never change it to 1, do you?
> > > > > > 
> > > > > > Correct, but kvm_set_irq(,,,0) may call kvm_pic_set_irq(,,1) if other
> > > > > > source IDs are still asserting the interrupt.  Your proposal assumes
> > > > > > that unless irq_states is also 0 we don't need to call kvm_pic_set_irq,
> > > > > > and I don't know if that's correct.
> > > > > 
> > > > > Well you are asked to clear some id and level was 1. So we know
> > > > > interrupt was asserted. Either we clear it or we don't. No?
> > > > > 
> > > > > > > 
> > > > > > > But this brings another question:
> > > > > > > 
> > > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > > >                                      int irq_source_id, int level)
> > > > > > > {
> > > > > > >         /* Logical OR for level trig interrupt */
> > > > > > >         if (level)
> > > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > > >         else
> > > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > > 
> > > > > > > 
> > > > > > > ^^^^^^^^^^^
> > > > > > > above uses locked instructions
> > > > > > > 
> > > > > > >         return !!(*irq_state);
> > > > > > > 
> > > > > > > 
> > > > > > > above doesn't
> > > > > > > 
> > > > > > > }
> > > > > > > 
> > > > > > > 
> > > > > > > why the insonsistency?
> > > > > > 
> > > > > > Note that set/clear_bit are not locked instructions,
> > > > > 
> > > > > On x86 they are:
> > > > > static __always_inline void
> > > > > set_bit(unsigned int nr, volatile unsigned long *addr)
> > > > > {
> > > > >         if (IS_IMMEDIATE(nr)) {
> > > > >                 asm volatile(LOCK_PREFIX "orb %1,%0"
> > > > >                         : CONST_MASK_ADDR(nr, addr)
> > > > >                         : "iq" ((u8)CONST_MASK(nr))
> > > > >                         : "memory");
> > > > >         } else {
> > > > >                 asm volatile(LOCK_PREFIX "bts %1,%0"
> > > > >                         : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
> > > > >         }
> > > > > }
> > > > > 
> > > > > > but atomic
> > > > > > instructions and it could be argued that reading the value is also
> > > > > > atomic.  At least that was my guess when I stumbled across the same
> > > > > > yesterday.  IMHO, we're going off into the weeds again with these last
> > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > performance difference either).
> > > > > 
> > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > complex.
> > > > 
> > > > Seriously?
> > > > 
> > > >         spin_lock(&irqfd->source->lock);
> > > >         if (!irqfd->source->level_asserted) {
> > > >                 kvm_set_irq(irqfd->kvm, irqfd->source->id, irqfd->gsi, 1);
> > > >                 irqfd->source->level_asserted = true;
> > > >         }
> > > >         spin_unlock(&irqfd->source->lock);
> > > > 
> > > > ...
> > > > 
> > > >         spin_lock(&eoifd->source->lock);
> > > >         if (eoifd->source->level_asserted) {
> > > >                 kvm_set_irq(eoifd->kvm,
> > > >                             eoifd->source->id, eoifd->notifier.gsi, 0);
> > > >                 eoifd->source->level_asserted = false;
> > > >                 eventfd_signal(eoifd->eventfd, 1);
> > > >         }
> > > >         spin_unlock(&eoifd->source->lock);
> > > > 
> > > > 
> > > > Locking doesn't get much more straightforward than that
> > > 
> > > Don't look at it in isolation. You are now calling kvm_set_irq
> > > from under a spinlock. You are saying it is always safe but
> > > this seems far from obvious. kvm_set_irq used to be
> > > unsafe from an atomic context.
> > 
> > Device assignment has been calling kvm_set_irq from atomic context for
> > quite a long time.
> 
> Only for MSI. That's an exception (and it's also a messy one).
> 
ioapic/pic used to use mutexes for locking. But this is not longer the
case. See 46a47b1e for instance. I wasn't able to find the reason for
the commit.

> > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > 
> > > > 
> > > > Which to me seems to indicate this should be handled as a separate
> > > > effort.
> > > 
> > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > look at all the code. Let's not copy bugs, need to fix them.
> > 
> > This looks tangential to me unless you can come up with an actual reason
> > the above spinlock usage is incorrect or insufficient.
> 
> You copy the same pattern that seems racy. So you double the
> amount of code that woul need to be fixed.
> 
> -- 
> MST

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Alex Williamson July 17, 2012, 4:08 p.m. UTC | #15

On Tue, 2012-07-17 at 18:57 +0300, Michael S. Tsirkin wrote:
> On Tue, Jul 17, 2012 at 09:51:41AM -0600, Alex Williamson wrote:
> > On Tue, 2012-07-17 at 18:36 +0300, Michael S. Tsirkin wrote:
> > > On Tue, Jul 17, 2012 at 09:20:11AM -0600, Alex Williamson wrote:
> > > > On Tue, 2012-07-17 at 17:53 +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Jul 17, 2012 at 08:21:51AM -0600, Alex Williamson wrote:
> > > > > > On Tue, 2012-07-17 at 17:08 +0300, Michael S. Tsirkin wrote:
> > > > > > > On Tue, Jul 17, 2012 at 07:56:09AM -0600, Alex Williamson wrote:
> > > > > > > > On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > > > > > > > > > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > > > > > > > > > assertion state of the interrupt and does nothing if it isn't changed.
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > > ---
> > > > > > > > > > 
> > > > > > > > > >  include/linux/kvm_host.h |    3 ++
> > > > > > > > > >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > > >  2 files changed, 81 insertions(+)
> > > > > > > > > > 
> > > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > > > > > > index a7661c0..6c168f1 100644
> > > > > > > > > > --- a/include/linux/kvm_host.h
> > > > > > > > > > +++ b/include/linux/kvm_host.h
> > > > > > > > > > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> > > > > > > > > >  	u32 type;
> > > > > > > > > >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > >  		   struct kvm *kvm, int irq_source_id, int level);
> > > > > > > > > > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > +		     struct kvm *kvm, int irq_source_id);
> > > > > > > > > >  	union {
> > > > > > > > > >  		struct {
> > > > > > > > > >  			unsigned irqchip;
> > > > > > > > > > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > > > > > > > > >  				   unsigned long *deliver_bitmask);
> > > > > > > > > >  #endif
> > > > > > > > > >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > > > > > > > > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> > > > > > > > > >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> > > > > > > > > >  		int irq_source_id, int level);
> > > > > > > > > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > > > > > > > > > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > > > > > > > > > index 5afb431..76e8f22 100644
> > > > > > > > > > --- a/virt/kvm/irq_comm.c
> > > > > > > > > > +++ b/virt/kvm/irq_comm.c
> > > > > > > > > > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> > > > > > > > > >  }
> > > > > > > > > >  
> > > > > > > > > > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > > > > > > > > > +					    int irq_source_id)
> > > > > > > > > > +{
> > > > > > > > > > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > +			     struct kvm *kvm, int irq_source_id)
> > > > > > > > > > +{
> > > > > > > > > > +#ifdef CONFIG_X86
> > > > > > > > > > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > > > > > > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > > > > > > +					     irq_source_id);
> > > > > > > > > > +	if (level)
> > > > > > > > > > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > > > > > > +				!!pic->irq_states[e->irqchip.pin]);
> > > > > > > > > > +	return level;
> > > > > > > > > 
> > > > > > > > > I think I begin to understand: if (level) checks it was previously set,
> > > > > > > > > and then we clear if needed?
> > > > > > > > 
> > > > > > > > It's actually very simple, if we change anything in irq_states, then
> > > > > > > > update via the chip specific set_irq function.
> > > > > > > > 
> > > > > > > > >  I think it's worthwhile to rename
> > > > > > > > > level to orig_level and rewrite as:
> > > > > > > > > 
> > > > > > > > > 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> > > > > > > > > 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> > > > > > > > > 
> > > > > > > > > This both makes the logic clear without need for comments and
> > > > > > > > > saves some cycles on pic in case nothing actually changed.
> > > > > > > > 
> > > > > > > > That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
> > > > > > > > will clear the bit and call kvm_pic_set_irq with the new irq_states
> > > > > > > > value, whether it's 0 or 1.  The optimization I make is to only call
> > > > > > > > kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
> > > > > > > > step further to "changed and is now 0".  I don't know if that's correct
> > > > > > > > behavior.
> > > > > > > 
> > > > > > > If not then I don't understand. You clear a bit
> > > > > > > in a word. You never change it to 1, do you?
> > > > > > 
> > > > > > Correct, but kvm_set_irq(,,,0) may call kvm_pic_set_irq(,,1) if other
> > > > > > source IDs are still asserting the interrupt.  Your proposal assumes
> > > > > > that unless irq_states is also 0 we don't need to call kvm_pic_set_irq,
> > > > > > and I don't know if that's correct.
> > > > > 
> > > > > Well you are asked to clear some id and level was 1. So we know
> > > > > interrupt was asserted. Either we clear it or we don't. No?
> > > > > 
> > > > > > > 
> > > > > > > But this brings another question:
> > > > > > > 
> > > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > > >                                      int irq_source_id, int level)
> > > > > > > {
> > > > > > >         /* Logical OR for level trig interrupt */
> > > > > > >         if (level)
> > > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > > >         else
> > > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > > 
> > > > > > > 
> > > > > > > ^^^^^^^^^^^
> > > > > > > above uses locked instructions
> > > > > > > 
> > > > > > >         return !!(*irq_state);
> > > > > > > 
> > > > > > > 
> > > > > > > above doesn't
> > > > > > > 
> > > > > > > }
> > > > > > > 
> > > > > > > 
> > > > > > > why the insonsistency?
> > > > > > 
> > > > > > Note that set/clear_bit are not locked instructions,
> > > > > 
> > > > > On x86 they are:
> > > > > static __always_inline void
> > > > > set_bit(unsigned int nr, volatile unsigned long *addr)
> > > > > {
> > > > >         if (IS_IMMEDIATE(nr)) {
> > > > >                 asm volatile(LOCK_PREFIX "orb %1,%0"
> > > > >                         : CONST_MASK_ADDR(nr, addr)
> > > > >                         : "iq" ((u8)CONST_MASK(nr))
> > > > >                         : "memory");
> > > > >         } else {
> > > > >                 asm volatile(LOCK_PREFIX "bts %1,%0"
> > > > >                         : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
> > > > >         }
> > > > > }
> > > > > 
> > > > > > but atomic
> > > > > > instructions and it could be argued that reading the value is also
> > > > > > atomic.  At least that was my guess when I stumbled across the same
> > > > > > yesterday.  IMHO, we're going off into the weeds again with these last
> > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > performance difference either).
> > > > > 
> > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > complex.
> > > > 
> > > > Seriously?
> > > > 
> > > >         spin_lock(&irqfd->source->lock);
> > > >         if (!irqfd->source->level_asserted) {
> > > >                 kvm_set_irq(irqfd->kvm, irqfd->source->id, irqfd->gsi, 1);
> > > >                 irqfd->source->level_asserted = true;
> > > >         }
> > > >         spin_unlock(&irqfd->source->lock);
> > > > 
> > > > ...
> > > > 
> > > >         spin_lock(&eoifd->source->lock);
> > > >         if (eoifd->source->level_asserted) {
> > > >                 kvm_set_irq(eoifd->kvm,
> > > >                             eoifd->source->id, eoifd->notifier.gsi, 0);
> > > >                 eoifd->source->level_asserted = false;
> > > >                 eventfd_signal(eoifd->eventfd, 1);
> > > >         }
> > > >         spin_unlock(&eoifd->source->lock);
> > > > 
> > > > 
> > > > Locking doesn't get much more straightforward than that
> > > 
> > > Don't look at it in isolation. You are now calling kvm_set_irq
> > > from under a spinlock. You are saying it is always safe but
> > > this seems far from obvious. kvm_set_irq used to be
> > > unsafe from an atomic context.
> > 
> > Device assignment has been calling kvm_set_irq from atomic context for
> > quite a long time.
> 
> Only for MSI. That's an exception (and it's also a messy one).

Nope, I see past code that used it for INTx as well.

> > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > 
> > > > 
> > > > Which to me seems to indicate this should be handled as a separate
> > > > effort.
> > > 
> > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > look at all the code. Let's not copy bugs, need to fix them.
> > 
> > This looks tangential to me unless you can come up with an actual reason
> > the above spinlock usage is incorrect or insufficient.
> 
> You copy the same pattern that seems racy. So you double the
> amount of code that woul need to be fixed.


_Seems_ racy, or _is_ racy?  Please identify the race.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 4:14 p.m. UTC | #16

On Tue, Jul 17, 2012 at 10:08:21AM -0600, Alex Williamson wrote:
> On Tue, 2012-07-17 at 18:57 +0300, Michael S. Tsirkin wrote:
> > On Tue, Jul 17, 2012 at 09:51:41AM -0600, Alex Williamson wrote:
> > > On Tue, 2012-07-17 at 18:36 +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Jul 17, 2012 at 09:20:11AM -0600, Alex Williamson wrote:
> > > > > On Tue, 2012-07-17 at 17:53 +0300, Michael S. Tsirkin wrote:
> > > > > > On Tue, Jul 17, 2012 at 08:21:51AM -0600, Alex Williamson wrote:
> > > > > > > On Tue, 2012-07-17 at 17:08 +0300, Michael S. Tsirkin wrote:
> > > > > > > > On Tue, Jul 17, 2012 at 07:56:09AM -0600, Alex Williamson wrote:
> > > > > > > > > On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > > On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > > > > > > > > > > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > > > > > > > > > > assertion state of the interrupt and does nothing if it isn't changed.
> > > > > > > > > > > 
> > > > > > > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > > > ---
> > > > > > > > > > > 
> > > > > > > > > > >  include/linux/kvm_host.h |    3 ++
> > > > > > > > > > >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > > > >  2 files changed, 81 insertions(+)
> > > > > > > > > > > 
> > > > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > > > > > > > index a7661c0..6c168f1 100644
> > > > > > > > > > > --- a/include/linux/kvm_host.h
> > > > > > > > > > > +++ b/include/linux/kvm_host.h
> > > > > > > > > > > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> > > > > > > > > > >  	u32 type;
> > > > > > > > > > >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > >  		   struct kvm *kvm, int irq_source_id, int level);
> > > > > > > > > > > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > > +		     struct kvm *kvm, int irq_source_id);
> > > > > > > > > > >  	union {
> > > > > > > > > > >  		struct {
> > > > > > > > > > >  			unsigned irqchip;
> > > > > > > > > > > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > > > > > > > > > >  				   unsigned long *deliver_bitmask);
> > > > > > > > > > >  #endif
> > > > > > > > > > >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > > > > > > > > > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> > > > > > > > > > >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> > > > > > > > > > >  		int irq_source_id, int level);
> > > > > > > > > > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > > > > > > > > > > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > > > > > > > > > > index 5afb431..76e8f22 100644
> > > > > > > > > > > --- a/virt/kvm/irq_comm.c
> > > > > > > > > > > +++ b/virt/kvm/irq_comm.c
> > > > > > > > > > > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> > > > > > > > > > >  }
> > > > > > > > > > >  
> > > > > > > > > > > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > > > > > > > > > > +					    int irq_source_id)
> > > > > > > > > > > +{
> > > > > > > > > > > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > > +			     struct kvm *kvm, int irq_source_id)
> > > > > > > > > > > +{
> > > > > > > > > > > +#ifdef CONFIG_X86
> > > > > > > > > > > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > > > > > > > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > > > > > > > +					     irq_source_id);
> > > > > > > > > > > +	if (level)
> > > > > > > > > > > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > > > > > > > +				!!pic->irq_states[e->irqchip.pin]);
> > > > > > > > > > > +	return level;
> > > > > > > > > > 
> > > > > > > > > > I think I begin to understand: if (level) checks it was previously set,
> > > > > > > > > > and then we clear if needed?
> > > > > > > > > 
> > > > > > > > > It's actually very simple, if we change anything in irq_states, then
> > > > > > > > > update via the chip specific set_irq function.
> > > > > > > > > 
> > > > > > > > > >  I think it's worthwhile to rename
> > > > > > > > > > level to orig_level and rewrite as:
> > > > > > > > > > 
> > > > > > > > > > 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> > > > > > > > > > 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> > > > > > > > > > 
> > > > > > > > > > This both makes the logic clear without need for comments and
> > > > > > > > > > saves some cycles on pic in case nothing actually changed.
> > > > > > > > > 
> > > > > > > > > That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
> > > > > > > > > will clear the bit and call kvm_pic_set_irq with the new irq_states
> > > > > > > > > value, whether it's 0 or 1.  The optimization I make is to only call
> > > > > > > > > kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
> > > > > > > > > step further to "changed and is now 0".  I don't know if that's correct
> > > > > > > > > behavior.
> > > > > > > > 
> > > > > > > > If not then I don't understand. You clear a bit
> > > > > > > > in a word. You never change it to 1, do you?
> > > > > > > 
> > > > > > > Correct, but kvm_set_irq(,,,0) may call kvm_pic_set_irq(,,1) if other
> > > > > > > source IDs are still asserting the interrupt.  Your proposal assumes
> > > > > > > that unless irq_states is also 0 we don't need to call kvm_pic_set_irq,
> > > > > > > and I don't know if that's correct.
> > > > > > 
> > > > > > Well you are asked to clear some id and level was 1. So we know
> > > > > > interrupt was asserted. Either we clear it or we don't. No?
> > > > > > 
> > > > > > > > 
> > > > > > > > But this brings another question:
> > > > > > > > 
> > > > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > > > >                                      int irq_source_id, int level)
> > > > > > > > {
> > > > > > > >         /* Logical OR for level trig interrupt */
> > > > > > > >         if (level)
> > > > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > > > >         else
> > > > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > > > 
> > > > > > > > 
> > > > > > > > ^^^^^^^^^^^
> > > > > > > > above uses locked instructions
> > > > > > > > 
> > > > > > > >         return !!(*irq_state);
> > > > > > > > 
> > > > > > > > 
> > > > > > > > above doesn't
> > > > > > > > 
> > > > > > > > }
> > > > > > > > 
> > > > > > > > 
> > > > > > > > why the insonsistency?
> > > > > > > 
> > > > > > > Note that set/clear_bit are not locked instructions,
> > > > > > 
> > > > > > On x86 they are:
> > > > > > static __always_inline void
> > > > > > set_bit(unsigned int nr, volatile unsigned long *addr)
> > > > > > {
> > > > > >         if (IS_IMMEDIATE(nr)) {
> > > > > >                 asm volatile(LOCK_PREFIX "orb %1,%0"
> > > > > >                         : CONST_MASK_ADDR(nr, addr)
> > > > > >                         : "iq" ((u8)CONST_MASK(nr))
> > > > > >                         : "memory");
> > > > > >         } else {
> > > > > >                 asm volatile(LOCK_PREFIX "bts %1,%0"
> > > > > >                         : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
> > > > > >         }
> > > > > > }
> > > > > > 
> > > > > > > but atomic
> > > > > > > instructions and it could be argued that reading the value is also
> > > > > > > atomic.  At least that was my guess when I stumbled across the same
> > > > > > > yesterday.  IMHO, we're going off into the weeds again with these last
> > > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > > performance difference either).
> > > > > > 
> > > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > > complex.
> > > > > 
> > > > > Seriously?
> > > > > 
> > > > >         spin_lock(&irqfd->source->lock);
> > > > >         if (!irqfd->source->level_asserted) {
> > > > >                 kvm_set_irq(irqfd->kvm, irqfd->source->id, irqfd->gsi, 1);
> > > > >                 irqfd->source->level_asserted = true;
> > > > >         }
> > > > >         spin_unlock(&irqfd->source->lock);
> > > > > 
> > > > > ...
> > > > > 
> > > > >         spin_lock(&eoifd->source->lock);
> > > > >         if (eoifd->source->level_asserted) {
> > > > >                 kvm_set_irq(eoifd->kvm,
> > > > >                             eoifd->source->id, eoifd->notifier.gsi, 0);
> > > > >                 eoifd->source->level_asserted = false;
> > > > >                 eventfd_signal(eoifd->eventfd, 1);
> > > > >         }
> > > > >         spin_unlock(&eoifd->source->lock);
> > > > > 
> > > > > 
> > > > > Locking doesn't get much more straightforward than that
> > > > 
> > > > Don't look at it in isolation. You are now calling kvm_set_irq
> > > > from under a spinlock. You are saying it is always safe but
> > > > this seems far from obvious. kvm_set_irq used to be
> > > > unsafe from an atomic context.
> > > 
> > > Device assignment has been calling kvm_set_irq from atomic context for
> > > quite a long time.
> > 
> > Only for MSI. That's an exception (and it's also a messy one).
> 
> Nope, I see past code that used it for INTx as well.
> 
> > > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > > 
> > > > > 
> > > > > Which to me seems to indicate this should be handled as a separate
> > > > > effort.
> > > > 
> > > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > > look at all the code. Let's not copy bugs, need to fix them.
> > > 
> > > This looks tangential to me unless you can come up with an actual reason
> > > the above spinlock usage is incorrect or insufficient.
> > 
> > You copy the same pattern that seems racy. So you double the
> > amount of code that woul need to be fixed.
> 
> 
> _Seems_ racy, or _is_ racy?  Please identify the race.

Look at this:

static inline int kvm_irq_line_state(unsigned long *irq_state,
                                     int irq_source_id, int level)
{
        /* Logical OR for level trig interrupt */
        if (level)
                set_bit(irq_source_id, irq_state);
        else
                clear_bit(irq_source_id, irq_state);

        return !!(*irq_state);
}


Now:
If other CPU changes some other bit after the atomic change,
it looks like !!(*irq_state) might return a stale value.

CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
If CPU 0 sees a stale value now it will return 0 here
and interrupt will get cleared.


Maybe this is not a problem. But in that case IMO it needs
a comment explaining why and why it's not a problem in
your code.

Alex Williamson July 17, 2012, 4:17 p.m. UTC | #17

On Tue, 2012-07-17 at 19:14 +0300, Michael S. Tsirkin wrote:
> On Tue, Jul 17, 2012 at 10:08:21AM -0600, Alex Williamson wrote:
> > On Tue, 2012-07-17 at 18:57 +0300, Michael S. Tsirkin wrote:
> > > On Tue, Jul 17, 2012 at 09:51:41AM -0600, Alex Williamson wrote:
> > > > On Tue, 2012-07-17 at 18:36 +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Jul 17, 2012 at 09:20:11AM -0600, Alex Williamson wrote:
> > > > > > On Tue, 2012-07-17 at 17:53 +0300, Michael S. Tsirkin wrote:
> > > > > > > On Tue, Jul 17, 2012 at 08:21:51AM -0600, Alex Williamson wrote:
> > > > > > > > On Tue, 2012-07-17 at 17:08 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > On Tue, Jul 17, 2012 at 07:56:09AM -0600, Alex Williamson wrote:
> > > > > > > > > > On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > > > On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > > > > > > > > > > > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > > > > > > > > > > > assertion state of the interrupt and does nothing if it isn't changed.
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > > > > ---
> > > > > > > > > > > > 
> > > > > > > > > > > >  include/linux/kvm_host.h |    3 ++
> > > > > > > > > > > >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > > > > >  2 files changed, 81 insertions(+)
> > > > > > > > > > > > 
> > > > > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > > > > > > > > index a7661c0..6c168f1 100644
> > > > > > > > > > > > --- a/include/linux/kvm_host.h
> > > > > > > > > > > > +++ b/include/linux/kvm_host.h
> > > > > > > > > > > > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> > > > > > > > > > > >  	u32 type;
> > > > > > > > > > > >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > > >  		   struct kvm *kvm, int irq_source_id, int level);
> > > > > > > > > > > > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > > > +		     struct kvm *kvm, int irq_source_id);
> > > > > > > > > > > >  	union {
> > > > > > > > > > > >  		struct {
> > > > > > > > > > > >  			unsigned irqchip;
> > > > > > > > > > > > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > > > > > > > > > > >  				   unsigned long *deliver_bitmask);
> > > > > > > > > > > >  #endif
> > > > > > > > > > > >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > > > > > > > > > > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> > > > > > > > > > > >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> > > > > > > > > > > >  		int irq_source_id, int level);
> > > > > > > > > > > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > > > > > > > > > > > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > > > > > > > > > > > index 5afb431..76e8f22 100644
> > > > > > > > > > > > --- a/virt/kvm/irq_comm.c
> > > > > > > > > > > > +++ b/virt/kvm/irq_comm.c
> > > > > > > > > > > > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > > >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> > > > > > > > > > > >  }
> > > > > > > > > > > >  
> > > > > > > > > > > > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > > > > > > > > > > > +					    int irq_source_id)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > > > +			     struct kvm *kvm, int irq_source_id)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +#ifdef CONFIG_X86
> > > > > > > > > > > > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > > > > > > > > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > > > > > > > > +					     irq_source_id);
> > > > > > > > > > > > +	if (level)
> > > > > > > > > > > > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > > > > > > > > +				!!pic->irq_states[e->irqchip.pin]);
> > > > > > > > > > > > +	return level;
> > > > > > > > > > > 
> > > > > > > > > > > I think I begin to understand: if (level) checks it was previously set,
> > > > > > > > > > > and then we clear if needed?
> > > > > > > > > > 
> > > > > > > > > > It's actually very simple, if we change anything in irq_states, then
> > > > > > > > > > update via the chip specific set_irq function.
> > > > > > > > > > 
> > > > > > > > > > >  I think it's worthwhile to rename
> > > > > > > > > > > level to orig_level and rewrite as:
> > > > > > > > > > > 
> > > > > > > > > > > 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> > > > > > > > > > > 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> > > > > > > > > > > 
> > > > > > > > > > > This both makes the logic clear without need for comments and
> > > > > > > > > > > saves some cycles on pic in case nothing actually changed.
> > > > > > > > > > 
> > > > > > > > > > That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
> > > > > > > > > > will clear the bit and call kvm_pic_set_irq with the new irq_states
> > > > > > > > > > value, whether it's 0 or 1.  The optimization I make is to only call
> > > > > > > > > > kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
> > > > > > > > > > step further to "changed and is now 0".  I don't know if that's correct
> > > > > > > > > > behavior.
> > > > > > > > > 
> > > > > > > > > If not then I don't understand. You clear a bit
> > > > > > > > > in a word. You never change it to 1, do you?
> > > > > > > > 
> > > > > > > > Correct, but kvm_set_irq(,,,0) may call kvm_pic_set_irq(,,1) if other
> > > > > > > > source IDs are still asserting the interrupt.  Your proposal assumes
> > > > > > > > that unless irq_states is also 0 we don't need to call kvm_pic_set_irq,
> > > > > > > > and I don't know if that's correct.
> > > > > > > 
> > > > > > > Well you are asked to clear some id and level was 1. So we know
> > > > > > > interrupt was asserted. Either we clear it or we don't. No?
> > > > > > > 
> > > > > > > > > 
> > > > > > > > > But this brings another question:
> > > > > > > > > 
> > > > > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > > > > >                                      int irq_source_id, int level)
> > > > > > > > > {
> > > > > > > > >         /* Logical OR for level trig interrupt */
> > > > > > > > >         if (level)
> > > > > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > > > > >         else
> > > > > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > ^^^^^^^^^^^
> > > > > > > > > above uses locked instructions
> > > > > > > > > 
> > > > > > > > >         return !!(*irq_state);
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > above doesn't
> > > > > > > > > 
> > > > > > > > > }
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > why the insonsistency?
> > > > > > > > 
> > > > > > > > Note that set/clear_bit are not locked instructions,
> > > > > > > 
> > > > > > > On x86 they are:
> > > > > > > static __always_inline void
> > > > > > > set_bit(unsigned int nr, volatile unsigned long *addr)
> > > > > > > {
> > > > > > >         if (IS_IMMEDIATE(nr)) {
> > > > > > >                 asm volatile(LOCK_PREFIX "orb %1,%0"
> > > > > > >                         : CONST_MASK_ADDR(nr, addr)
> > > > > > >                         : "iq" ((u8)CONST_MASK(nr))
> > > > > > >                         : "memory");
> > > > > > >         } else {
> > > > > > >                 asm volatile(LOCK_PREFIX "bts %1,%0"
> > > > > > >                         : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
> > > > > > >         }
> > > > > > > }
> > > > > > > 
> > > > > > > > but atomic
> > > > > > > > instructions and it could be argued that reading the value is also
> > > > > > > > atomic.  At least that was my guess when I stumbled across the same
> > > > > > > > yesterday.  IMHO, we're going off into the weeds again with these last
> > > > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > > > performance difference either).
> > > > > > > 
> > > > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > > > complex.
> > > > > > 
> > > > > > Seriously?
> > > > > > 
> > > > > >         spin_lock(&irqfd->source->lock);
> > > > > >         if (!irqfd->source->level_asserted) {
> > > > > >                 kvm_set_irq(irqfd->kvm, irqfd->source->id, irqfd->gsi, 1);
> > > > > >                 irqfd->source->level_asserted = true;
> > > > > >         }
> > > > > >         spin_unlock(&irqfd->source->lock);
> > > > > > 
> > > > > > ...
> > > > > > 
> > > > > >         spin_lock(&eoifd->source->lock);
> > > > > >         if (eoifd->source->level_asserted) {
> > > > > >                 kvm_set_irq(eoifd->kvm,
> > > > > >                             eoifd->source->id, eoifd->notifier.gsi, 0);
> > > > > >                 eoifd->source->level_asserted = false;
> > > > > >                 eventfd_signal(eoifd->eventfd, 1);
> > > > > >         }
> > > > > >         spin_unlock(&eoifd->source->lock);
> > > > > > 
> > > > > > 
> > > > > > Locking doesn't get much more straightforward than that
> > > > > 
> > > > > Don't look at it in isolation. You are now calling kvm_set_irq
> > > > > from under a spinlock. You are saying it is always safe but
> > > > > this seems far from obvious. kvm_set_irq used to be
> > > > > unsafe from an atomic context.
> > > > 
> > > > Device assignment has been calling kvm_set_irq from atomic context for
> > > > quite a long time.
> > > 
> > > Only for MSI. That's an exception (and it's also a messy one).
> > 
> > Nope, I see past code that used it for INTx as well.
> > 
> > > > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > > > 
> > > > > > 
> > > > > > Which to me seems to indicate this should be handled as a separate
> > > > > > effort.
> > > > > 
> > > > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > > > look at all the code. Let's not copy bugs, need to fix them.
> > > > 
> > > > This looks tangential to me unless you can come up with an actual reason
> > > > the above spinlock usage is incorrect or insufficient.
> > > 
> > > You copy the same pattern that seems racy. So you double the
> > > amount of code that woul need to be fixed.
> > 
> > 
> > _Seems_ racy, or _is_ racy?  Please identify the race.
> 
> Look at this:
> 
> static inline int kvm_irq_line_state(unsigned long *irq_state,
>                                      int irq_source_id, int level)
> {
>         /* Logical OR for level trig interrupt */
>         if (level)
>                 set_bit(irq_source_id, irq_state);
>         else
>                 clear_bit(irq_source_id, irq_state);
> 
>         return !!(*irq_state);
> }
> 
> 
> Now:
> If other CPU changes some other bit after the atomic change,
> it looks like !!(*irq_state) might return a stale value.
> 
> CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> If CPU 0 sees a stale value now it will return 0 here
> and interrupt will get cleared.
> 
> 
> Maybe this is not a problem. But in that case IMO it needs
> a comment explaining why and why it's not a problem in
> your code.

So you want to close the door on anything that uses kvm_set_irq until
this gets fixed... that's insane.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 4:21 p.m. UTC | #18

On Tue, Jul 17, 2012 at 10:17:03AM -0600, Alex Williamson wrote:
> > > > > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > > > > 
> > > > > > > 
> > > > > > > Which to me seems to indicate this should be handled as a separate
> > > > > > > effort.
> > > > > > 
> > > > > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > > > > look at all the code. Let's not copy bugs, need to fix them.
> > > > > 
> > > > > This looks tangential to me unless you can come up with an actual reason
> > > > > the above spinlock usage is incorrect or insufficient.
> > > > 
> > > > You copy the same pattern that seems racy. So you double the
> > > > amount of code that woul need to be fixed.
> > > 
> > > 
> > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > 
> > Look at this:
> > 
> > static inline int kvm_irq_line_state(unsigned long *irq_state,
> >                                      int irq_source_id, int level)
> > {
> >         /* Logical OR for level trig interrupt */
> >         if (level)
> >                 set_bit(irq_source_id, irq_state);
> >         else
> >                 clear_bit(irq_source_id, irq_state);
> > 
> >         return !!(*irq_state);
> > }
> > 
> > 
> > Now:
> > If other CPU changes some other bit after the atomic change,
> > it looks like !!(*irq_state) might return a stale value.
> > 
> > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > If CPU 0 sees a stale value now it will return 0 here
> > and interrupt will get cleared.
> > 
> > 
> > Maybe this is not a problem. But in that case IMO it needs
> > a comment explaining why and why it's not a problem in
> > your code.
> 
> So you want to close the door on anything that uses kvm_set_irq until
> this gets fixed... that's insane.

What does kvm_set_irq use have to do with it?  You posted this patch:

+static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
+                            struct kvm *kvm, int irq_source_id)
+{
+#ifdef CONFIG_X86
+       struct kvm_pic *pic = pic_irqchip(kvm);
+       int level =
kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
+                                            irq_source_id);
+       if (level)
+               kvm_pic_set_irq(pic, e->irqchip.pin,
+                               !!pic->irq_states[e->irqchip.pin]);
+       return level;
+#else
+       return -1;
+#endif
+}
+

it seems racy in the same way.

Michael S. Tsirkin July 17, 2012, 4:36 p.m. UTC | #19

On Tue, Jul 17, 2012 at 10:08:21AM -0600, Alex Williamson wrote:
> On Tue, 2012-07-17 at 18:57 +0300, Michael S. Tsirkin wrote:
> > On Tue, Jul 17, 2012 at 09:51:41AM -0600, Alex Williamson wrote:
> > > On Tue, 2012-07-17 at 18:36 +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Jul 17, 2012 at 09:20:11AM -0600, Alex Williamson wrote:
> > > > > On Tue, 2012-07-17 at 17:53 +0300, Michael S. Tsirkin wrote:
> > > > > > On Tue, Jul 17, 2012 at 08:21:51AM -0600, Alex Williamson wrote:
> > > > > > > On Tue, 2012-07-17 at 17:08 +0300, Michael S. Tsirkin wrote:
> > > > > > > > On Tue, Jul 17, 2012 at 07:56:09AM -0600, Alex Williamson wrote:
> > > > > > > > > On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > > On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > > > > > > > > > > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > > > > > > > > > > assertion state of the interrupt and does nothing if it isn't changed.
> > > > > > > > > > > 
> > > > > > > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > > > ---
> > > > > > > > > > > 
> > > > > > > > > > >  include/linux/kvm_host.h |    3 ++
> > > > > > > > > > >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > > > >  2 files changed, 81 insertions(+)
> > > > > > > > > > > 
> > > > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > > > > > > > index a7661c0..6c168f1 100644
> > > > > > > > > > > --- a/include/linux/kvm_host.h
> > > > > > > > > > > +++ b/include/linux/kvm_host.h
> > > > > > > > > > > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> > > > > > > > > > >  	u32 type;
> > > > > > > > > > >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > >  		   struct kvm *kvm, int irq_source_id, int level);
> > > > > > > > > > > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > > +		     struct kvm *kvm, int irq_source_id);
> > > > > > > > > > >  	union {
> > > > > > > > > > >  		struct {
> > > > > > > > > > >  			unsigned irqchip;
> > > > > > > > > > > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > > > > > > > > > >  				   unsigned long *deliver_bitmask);
> > > > > > > > > > >  #endif
> > > > > > > > > > >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > > > > > > > > > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> > > > > > > > > > >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> > > > > > > > > > >  		int irq_source_id, int level);
> > > > > > > > > > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > > > > > > > > > > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > > > > > > > > > > index 5afb431..76e8f22 100644
> > > > > > > > > > > --- a/virt/kvm/irq_comm.c
> > > > > > > > > > > +++ b/virt/kvm/irq_comm.c
> > > > > > > > > > > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> > > > > > > > > > >  }
> > > > > > > > > > >  
> > > > > > > > > > > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > > > > > > > > > > +					    int irq_source_id)
> > > > > > > > > > > +{
> > > > > > > > > > > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > > +			     struct kvm *kvm, int irq_source_id)
> > > > > > > > > > > +{
> > > > > > > > > > > +#ifdef CONFIG_X86
> > > > > > > > > > > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > > > > > > > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > > > > > > > +					     irq_source_id);
> > > > > > > > > > > +	if (level)
> > > > > > > > > > > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > > > > > > > +				!!pic->irq_states[e->irqchip.pin]);
> > > > > > > > > > > +	return level;
> > > > > > > > > > 
> > > > > > > > > > I think I begin to understand: if (level) checks it was previously set,
> > > > > > > > > > and then we clear if needed?
> > > > > > > > > 
> > > > > > > > > It's actually very simple, if we change anything in irq_states, then
> > > > > > > > > update via the chip specific set_irq function.
> > > > > > > > > 
> > > > > > > > > >  I think it's worthwhile to rename
> > > > > > > > > > level to orig_level and rewrite as:
> > > > > > > > > > 
> > > > > > > > > > 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> > > > > > > > > > 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> > > > > > > > > > 
> > > > > > > > > > This both makes the logic clear without need for comments and
> > > > > > > > > > saves some cycles on pic in case nothing actually changed.
> > > > > > > > > 
> > > > > > > > > That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
> > > > > > > > > will clear the bit and call kvm_pic_set_irq with the new irq_states
> > > > > > > > > value, whether it's 0 or 1.  The optimization I make is to only call
> > > > > > > > > kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
> > > > > > > > > step further to "changed and is now 0".  I don't know if that's correct
> > > > > > > > > behavior.
> > > > > > > > 
> > > > > > > > If not then I don't understand. You clear a bit
> > > > > > > > in a word. You never change it to 1, do you?
> > > > > > > 
> > > > > > > Correct, but kvm_set_irq(,,,0) may call kvm_pic_set_irq(,,1) if other
> > > > > > > source IDs are still asserting the interrupt.  Your proposal assumes
> > > > > > > that unless irq_states is also 0 we don't need to call kvm_pic_set_irq,
> > > > > > > and I don't know if that's correct.
> > > > > > 
> > > > > > Well you are asked to clear some id and level was 1. So we know
> > > > > > interrupt was asserted. Either we clear it or we don't. No?
> > > > > > 
> > > > > > > > 
> > > > > > > > But this brings another question:
> > > > > > > > 
> > > > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > > > >                                      int irq_source_id, int level)
> > > > > > > > {
> > > > > > > >         /* Logical OR for level trig interrupt */
> > > > > > > >         if (level)
> > > > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > > > >         else
> > > > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > > > 
> > > > > > > > 
> > > > > > > > ^^^^^^^^^^^
> > > > > > > > above uses locked instructions
> > > > > > > > 
> > > > > > > >         return !!(*irq_state);
> > > > > > > > 
> > > > > > > > 
> > > > > > > > above doesn't
> > > > > > > > 
> > > > > > > > }
> > > > > > > > 
> > > > > > > > 
> > > > > > > > why the insonsistency?
> > > > > > > 
> > > > > > > Note that set/clear_bit are not locked instructions,
> > > > > > 
> > > > > > On x86 they are:
> > > > > > static __always_inline void
> > > > > > set_bit(unsigned int nr, volatile unsigned long *addr)
> > > > > > {
> > > > > >         if (IS_IMMEDIATE(nr)) {
> > > > > >                 asm volatile(LOCK_PREFIX "orb %1,%0"
> > > > > >                         : CONST_MASK_ADDR(nr, addr)
> > > > > >                         : "iq" ((u8)CONST_MASK(nr))
> > > > > >                         : "memory");
> > > > > >         } else {
> > > > > >                 asm volatile(LOCK_PREFIX "bts %1,%0"
> > > > > >                         : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
> > > > > >         }
> > > > > > }
> > > > > > 
> > > > > > > but atomic
> > > > > > > instructions and it could be argued that reading the value is also
> > > > > > > atomic.  At least that was my guess when I stumbled across the same
> > > > > > > yesterday.  IMHO, we're going off into the weeds again with these last
> > > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > > performance difference either).
> > > > > > 
> > > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > > complex.
> > > > > 
> > > > > Seriously?
> > > > > 
> > > > >         spin_lock(&irqfd->source->lock);
> > > > >         if (!irqfd->source->level_asserted) {
> > > > >                 kvm_set_irq(irqfd->kvm, irqfd->source->id, irqfd->gsi, 1);
> > > > >                 irqfd->source->level_asserted = true;
> > > > >         }
> > > > >         spin_unlock(&irqfd->source->lock);
> > > > > 
> > > > > ...
> > > > > 
> > > > >         spin_lock(&eoifd->source->lock);
> > > > >         if (eoifd->source->level_asserted) {
> > > > >                 kvm_set_irq(eoifd->kvm,
> > > > >                             eoifd->source->id, eoifd->notifier.gsi, 0);
> > > > >                 eoifd->source->level_asserted = false;
> > > > >                 eventfd_signal(eoifd->eventfd, 1);
> > > > >         }
> > > > >         spin_unlock(&eoifd->source->lock);
> > > > > 
> > > > > 
> > > > > Locking doesn't get much more straightforward than that
> > > > 
> > > > Don't look at it in isolation. You are now calling kvm_set_irq
> > > > from under a spinlock. You are saying it is always safe but
> > > > this seems far from obvious. kvm_set_irq used to be
> > > > unsafe from an atomic context.
> > > 
> > > Device assignment has been calling kvm_set_irq from atomic context for
> > > quite a long time.
> > 
> > Only for MSI. That's an exception (and it's also a messy one).
> 
> Nope, I see past code that used it for INTx as well.

While this looks like it will not crash, this scans all vcpus under a
spinlock. A problem for big VMs.
Again, yes we have such uses now but we are looking for ways
to fix them and not be adding more.


> > > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > > 
> > > > > 
> > > > > Which to me seems to indicate this should be handled as a separate
> > > > > effort.
> > > > 
> > > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > > look at all the code. Let's not copy bugs, need to fix them.
> > > 
> > > This looks tangential to me unless you can come up with an actual reason
> > > the above spinlock usage is incorrect or insufficient.
> > 
> > You copy the same pattern that seems racy. So you double the
> > amount of code that woul need to be fixed.
> 
> 
> _Seems_ racy, or _is_ racy?  Please identify the race.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Alex Williamson July 17, 2012, 4:45 p.m. UTC | #20

On Tue, 2012-07-17 at 19:21 +0300, Michael S. Tsirkin wrote:
> On Tue, Jul 17, 2012 at 10:17:03AM -0600, Alex Williamson wrote:
> > > > > > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Which to me seems to indicate this should be handled as a separate
> > > > > > > > effort.
> > > > > > > 
> > > > > > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > > > > > look at all the code. Let's not copy bugs, need to fix them.
> > > > > > 
> > > > > > This looks tangential to me unless you can come up with an actual reason
> > > > > > the above spinlock usage is incorrect or insufficient.
> > > > > 
> > > > > You copy the same pattern that seems racy. So you double the
> > > > > amount of code that woul need to be fixed.
> > > > 
> > > > 
> > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > 
> > > Look at this:
> > > 
> > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > >                                      int irq_source_id, int level)
> > > {
> > >         /* Logical OR for level trig interrupt */
> > >         if (level)
> > >                 set_bit(irq_source_id, irq_state);
> > >         else
> > >                 clear_bit(irq_source_id, irq_state);
> > > 
> > >         return !!(*irq_state);
> > > }
> > > 
> > > 
> > > Now:
> > > If other CPU changes some other bit after the atomic change,
> > > it looks like !!(*irq_state) might return a stale value.
> > > 
> > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > If CPU 0 sees a stale value now it will return 0 here
> > > and interrupt will get cleared.
> > > 
> > > 
> > > Maybe this is not a problem. But in that case IMO it needs
> > > a comment explaining why and why it's not a problem in
> > > your code.
> > 
> > So you want to close the door on anything that uses kvm_set_irq until
> > this gets fixed... that's insane.
> 
> What does kvm_set_irq use have to do with it?  You posted this patch:
> 
> +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> +                            struct kvm *kvm, int irq_source_id)
> +{
> +#ifdef CONFIG_X86
> +       struct kvm_pic *pic = pic_irqchip(kvm);
> +       int level =
> kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> +                                            irq_source_id);
> +       if (level)
> +               kvm_pic_set_irq(pic, e->irqchip.pin,
> +                               !!pic->irq_states[e->irqchip.pin]);
> +       return level;
> +#else
> +       return -1;
> +#endif
> +}
> +
> 
> it seems racy in the same way.

Now you're just misrepresenting how we got here, which was:

> > > > > > IMHO, we're going off into the weeds again with these last
> > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > performance difference either).
> > > > > 
> > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > complex.

So I'm happy to drop the last 2 patches, which were done at your request
anyway, but you've failed to show how the locking in patches 1&2 is
messy, inconsistent, or complex and now you're asking to block all
progress.  Those patches are just users of kvm_set_irq.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Gleb Natapov July 17, 2012, 5:09 p.m. UTC | #21

On Tue, Jul 17, 2012 at 07:36:49PM +0300, Michael S. Tsirkin wrote:
> On Tue, Jul 17, 2012 at 10:08:21AM -0600, Alex Williamson wrote:
> > On Tue, 2012-07-17 at 18:57 +0300, Michael S. Tsirkin wrote:
> > > On Tue, Jul 17, 2012 at 09:51:41AM -0600, Alex Williamson wrote:
> > > > On Tue, 2012-07-17 at 18:36 +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Jul 17, 2012 at 09:20:11AM -0600, Alex Williamson wrote:
> > > > > > On Tue, 2012-07-17 at 17:53 +0300, Michael S. Tsirkin wrote:
> > > > > > > On Tue, Jul 17, 2012 at 08:21:51AM -0600, Alex Williamson wrote:
> > > > > > > > On Tue, 2012-07-17 at 17:08 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > On Tue, Jul 17, 2012 at 07:56:09AM -0600, Alex Williamson wrote:
> > > > > > > > > > On Tue, 2012-07-17 at 13:14 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > > > On Mon, Jul 16, 2012 at 02:34:03PM -0600, Alex Williamson wrote:
> > > > > > > > > > > > This is an alternative to kvm_set_irq(,,,0) which returns the previous
> > > > > > > > > > > > assertion state of the interrupt and does nothing if it isn't changed.
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > > > > ---
> > > > > > > > > > > > 
> > > > > > > > > > > >  include/linux/kvm_host.h |    3 ++
> > > > > > > > > > > >  virt/kvm/irq_comm.c      |   78 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > > > > >  2 files changed, 81 insertions(+)
> > > > > > > > > > > > 
> > > > > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > > > > > > > > index a7661c0..6c168f1 100644
> > > > > > > > > > > > --- a/include/linux/kvm_host.h
> > > > > > > > > > > > +++ b/include/linux/kvm_host.h
> > > > > > > > > > > > @@ -219,6 +219,8 @@ struct kvm_kernel_irq_routing_entry {
> > > > > > > > > > > >  	u32 type;
> > > > > > > > > > > >  	int (*set)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > > >  		   struct kvm *kvm, int irq_source_id, int level);
> > > > > > > > > > > > +	int (*clear)(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > > > +		     struct kvm *kvm, int irq_source_id);
> > > > > > > > > > > >  	union {
> > > > > > > > > > > >  		struct {
> > > > > > > > > > > >  			unsigned irqchip;
> > > > > > > > > > > > @@ -629,6 +631,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > > > > > > > > > > >  				   unsigned long *deliver_bitmask);
> > > > > > > > > > > >  #endif
> > > > > > > > > > > >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
> > > > > > > > > > > > +int kvm_clear_irq(struct kvm *kvm, int irq_source_id, u32 irq);
> > > > > > > > > > > >  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> > > > > > > > > > > >  		int irq_source_id, int level);
> > > > > > > > > > > >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> > > > > > > > > > > > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > > > > > > > > > > > index 5afb431..76e8f22 100644
> > > > > > > > > > > > --- a/virt/kvm/irq_comm.c
> > > > > > > > > > > > +++ b/virt/kvm/irq_comm.c
> > > > > > > > > > > > @@ -68,6 +68,42 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > > >  	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
> > > > > > > > > > > >  }
> > > > > > > > > > > >  
> > > > > > > > > > > > +static inline int kvm_clear_irq_line_state(unsigned long *irq_state,
> > > > > > > > > > > > +					    int irq_source_id)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +	return !!test_and_clear_bit(irq_source_id, irq_state);
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > > > > > +			     struct kvm *kvm, int irq_source_id)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +#ifdef CONFIG_X86
> > > > > > > > > > > > +	struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > > > > > > > > +	int level = kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > > > > > > > > +					     irq_source_id);
> > > > > > > > > > > > +	if (level)
> > > > > > > > > > > > +		kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > > > > > > > > +				!!pic->irq_states[e->irqchip.pin]);
> > > > > > > > > > > > +	return level;
> > > > > > > > > > > 
> > > > > > > > > > > I think I begin to understand: if (level) checks it was previously set,
> > > > > > > > > > > and then we clear if needed?
> > > > > > > > > > 
> > > > > > > > > > It's actually very simple, if we change anything in irq_states, then
> > > > > > > > > > update via the chip specific set_irq function.
> > > > > > > > > > 
> > > > > > > > > > >  I think it's worthwhile to rename
> > > > > > > > > > > level to orig_level and rewrite as:
> > > > > > > > > > > 
> > > > > > > > > > > 	if (orig_level && !pic->irq_states[e->irqchip.pin])
> > > > > > > > > > > 		kvm_pic_set_irq(pic, e->irqchip.pin, 0);
> > > > > > > > > > > 
> > > > > > > > > > > This both makes the logic clear without need for comments and
> > > > > > > > > > > saves some cycles on pic in case nothing actually changed.
> > > > > > > > > > 
> > > > > > > > > > That may work, but it's not actually the same thing.  kvm_set_irq(,,,0)
> > > > > > > > > > will clear the bit and call kvm_pic_set_irq with the new irq_states
> > > > > > > > > > value, whether it's 0 or 1.  The optimization I make is to only call
> > > > > > > > > > kvm_pic_set_irq if we've "changed" irq_states.  You're taking that one
> > > > > > > > > > step further to "changed and is now 0".  I don't know if that's correct
> > > > > > > > > > behavior.
> > > > > > > > > 
> > > > > > > > > If not then I don't understand. You clear a bit
> > > > > > > > > in a word. You never change it to 1, do you?
> > > > > > > > 
> > > > > > > > Correct, but kvm_set_irq(,,,0) may call kvm_pic_set_irq(,,1) if other
> > > > > > > > source IDs are still asserting the interrupt.  Your proposal assumes
> > > > > > > > that unless irq_states is also 0 we don't need to call kvm_pic_set_irq,
> > > > > > > > and I don't know if that's correct.
> > > > > > > 
> > > > > > > Well you are asked to clear some id and level was 1. So we know
> > > > > > > interrupt was asserted. Either we clear it or we don't. No?
> > > > > > > 
> > > > > > > > > 
> > > > > > > > > But this brings another question:
> > > > > > > > > 
> > > > > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > > > > >                                      int irq_source_id, int level)
> > > > > > > > > {
> > > > > > > > >         /* Logical OR for level trig interrupt */
> > > > > > > > >         if (level)
> > > > > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > > > > >         else
> > > > > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > ^^^^^^^^^^^
> > > > > > > > > above uses locked instructions
> > > > > > > > > 
> > > > > > > > >         return !!(*irq_state);
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > above doesn't
> > > > > > > > > 
> > > > > > > > > }
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > why the insonsistency?
> > > > > > > > 
> > > > > > > > Note that set/clear_bit are not locked instructions,
> > > > > > > 
> > > > > > > On x86 they are:
> > > > > > > static __always_inline void
> > > > > > > set_bit(unsigned int nr, volatile unsigned long *addr)
> > > > > > > {
> > > > > > >         if (IS_IMMEDIATE(nr)) {
> > > > > > >                 asm volatile(LOCK_PREFIX "orb %1,%0"
> > > > > > >                         : CONST_MASK_ADDR(nr, addr)
> > > > > > >                         : "iq" ((u8)CONST_MASK(nr))
> > > > > > >                         : "memory");
> > > > > > >         } else {
> > > > > > >                 asm volatile(LOCK_PREFIX "bts %1,%0"
> > > > > > >                         : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
> > > > > > >         }
> > > > > > > }
> > > > > > > 
> > > > > > > > but atomic
> > > > > > > > instructions and it could be argued that reading the value is also
> > > > > > > > atomic.  At least that was my guess when I stumbled across the same
> > > > > > > > yesterday.  IMHO, we're going off into the weeds again with these last
> > > > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > > > performance difference either).
> > > > > > > 
> > > > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > > > complex.
> > > > > > 
> > > > > > Seriously?
> > > > > > 
> > > > > >         spin_lock(&irqfd->source->lock);
> > > > > >         if (!irqfd->source->level_asserted) {
> > > > > >                 kvm_set_irq(irqfd->kvm, irqfd->source->id, irqfd->gsi, 1);
> > > > > >                 irqfd->source->level_asserted = true;
> > > > > >         }
> > > > > >         spin_unlock(&irqfd->source->lock);
> > > > > > 
> > > > > > ...
> > > > > > 
> > > > > >         spin_lock(&eoifd->source->lock);
> > > > > >         if (eoifd->source->level_asserted) {
> > > > > >                 kvm_set_irq(eoifd->kvm,
> > > > > >                             eoifd->source->id, eoifd->notifier.gsi, 0);
> > > > > >                 eoifd->source->level_asserted = false;
> > > > > >                 eventfd_signal(eoifd->eventfd, 1);
> > > > > >         }
> > > > > >         spin_unlock(&eoifd->source->lock);
> > > > > > 
> > > > > > 
> > > > > > Locking doesn't get much more straightforward than that
> > > > > 
> > > > > Don't look at it in isolation. You are now calling kvm_set_irq
> > > > > from under a spinlock. You are saying it is always safe but
> > > > > this seems far from obvious. kvm_set_irq used to be
> > > > > unsafe from an atomic context.
> > > > 
> > > > Device assignment has been calling kvm_set_irq from atomic context for
> > > > quite a long time.
> > > 
> > > Only for MSI. That's an exception (and it's also a messy one).
> > 
> > Nope, I see past code that used it for INTx as well.
> 
> While this looks like it will not crash, this scans all vcpus under a
> spinlock. A problem for big VMs.
> Again, yes we have such uses now but we are looking for ways
> to fix them and not be adding more.
> 
> 
Same as with MSI.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 6:55 p.m. UTC | #22

On Tue, Jul 17, 2012 at 10:45:52AM -0600, Alex Williamson wrote:
> On Tue, 2012-07-17 at 19:21 +0300, Michael S. Tsirkin wrote:
> > On Tue, Jul 17, 2012 at 10:17:03AM -0600, Alex Williamson wrote:
> > > > > > > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Which to me seems to indicate this should be handled as a separate
> > > > > > > > > effort.
> > > > > > > > 
> > > > > > > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > > > > > > look at all the code. Let's not copy bugs, need to fix them.
> > > > > > > 
> > > > > > > This looks tangential to me unless you can come up with an actual reason
> > > > > > > the above spinlock usage is incorrect or insufficient.
> > > > > > 
> > > > > > You copy the same pattern that seems racy. So you double the
> > > > > > amount of code that woul need to be fixed.
> > > > > 
> > > > > 
> > > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > > 
> > > > Look at this:
> > > > 
> > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > >                                      int irq_source_id, int level)
> > > > {
> > > >         /* Logical OR for level trig interrupt */
> > > >         if (level)
> > > >                 set_bit(irq_source_id, irq_state);
> > > >         else
> > > >                 clear_bit(irq_source_id, irq_state);
> > > > 
> > > >         return !!(*irq_state);
> > > > }
> > > > 
> > > > 
> > > > Now:
> > > > If other CPU changes some other bit after the atomic change,
> > > > it looks like !!(*irq_state) might return a stale value.
> > > > 
> > > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > > If CPU 0 sees a stale value now it will return 0 here
> > > > and interrupt will get cleared.
> > > > 
> > > > 
> > > > Maybe this is not a problem. But in that case IMO it needs
> > > > a comment explaining why and why it's not a problem in
> > > > your code.
> > > 
> > > So you want to close the door on anything that uses kvm_set_irq until
> > > this gets fixed... that's insane.
> > 
> > What does kvm_set_irq use have to do with it?  You posted this patch:
> > 
> > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > +                            struct kvm *kvm, int irq_source_id)
> > +{
> > +#ifdef CONFIG_X86
> > +       struct kvm_pic *pic = pic_irqchip(kvm);
> > +       int level =
> > kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > +                                            irq_source_id);
> > +       if (level)
> > +               kvm_pic_set_irq(pic, e->irqchip.pin,
> > +                               !!pic->irq_states[e->irqchip.pin]);
> > +       return level;
> > +#else
> > +       return -1;
> > +#endif
> > +}
> > +
> > 
> > it seems racy in the same way.
> 
> Now you're just misrepresenting how we got here, which was:
> 
> > > > > > > IMHO, we're going off into the weeds again with these last
> > > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > > performance difference either).
> > > > > > 
> > > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > > complex.
> 
> So I'm happy to drop the last 2 patches, which were done at your request
> anyway, but you've failed to show how the locking in patches 1&2 is
> messy, inconsistent, or complex and now you're asking to block all
> progress.

I'm asking for bugs to get fixed and not duplicated. Adding more bugs is
not progress. Or maybe there is no bug. Let's see why and add a comment.

>  Those patches are just users of kvm_set_irq.


Well these add calls to kvm_set_irq which scans all vcpus under
spinlock. In the past Avi thought this is not a good idea too.
Maybe things changed.

Alex Williamson July 17, 2012, 7:51 p.m. UTC | #23

On Tue, 2012-07-17 at 21:55 +0300, Michael S. Tsirkin wrote:
> On Tue, Jul 17, 2012 at 10:45:52AM -0600, Alex Williamson wrote:
> > On Tue, 2012-07-17 at 19:21 +0300, Michael S. Tsirkin wrote:
> > > On Tue, Jul 17, 2012 at 10:17:03AM -0600, Alex Williamson wrote:
> > > > > > > > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Which to me seems to indicate this should be handled as a separate
> > > > > > > > > > effort.
> > > > > > > > > 
> > > > > > > > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > > > > > > > look at all the code. Let's not copy bugs, need to fix them.
> > > > > > > > 
> > > > > > > > This looks tangential to me unless you can come up with an actual reason
> > > > > > > > the above spinlock usage is incorrect or insufficient.
> > > > > > > 
> > > > > > > You copy the same pattern that seems racy. So you double the
> > > > > > > amount of code that woul need to be fixed.
> > > > > > 
> > > > > > 
> > > > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > > > 
> > > > > Look at this:
> > > > > 
> > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > >                                      int irq_source_id, int level)
> > > > > {
> > > > >         /* Logical OR for level trig interrupt */
> > > > >         if (level)
> > > > >                 set_bit(irq_source_id, irq_state);
> > > > >         else
> > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > 
> > > > >         return !!(*irq_state);
> > > > > }
> > > > > 
> > > > > 
> > > > > Now:
> > > > > If other CPU changes some other bit after the atomic change,
> > > > > it looks like !!(*irq_state) might return a stale value.
> > > > > 
> > > > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > > > If CPU 0 sees a stale value now it will return 0 here
> > > > > and interrupt will get cleared.
> > > > > 
> > > > > 
> > > > > Maybe this is not a problem. But in that case IMO it needs
> > > > > a comment explaining why and why it's not a problem in
> > > > > your code.
> > > > 
> > > > So you want to close the door on anything that uses kvm_set_irq until
> > > > this gets fixed... that's insane.
> > > 
> > > What does kvm_set_irq use have to do with it?  You posted this patch:
> > > 
> > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > +                            struct kvm *kvm, int irq_source_id)
> > > +{
> > > +#ifdef CONFIG_X86
> > > +       struct kvm_pic *pic = pic_irqchip(kvm);
> > > +       int level =
> > > kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > +                                            irq_source_id);
> > > +       if (level)
> > > +               kvm_pic_set_irq(pic, e->irqchip.pin,
> > > +                               !!pic->irq_states[e->irqchip.pin]);
> > > +       return level;
> > > +#else
> > > +       return -1;
> > > +#endif
> > > +}
> > > +
> > > 
> > > it seems racy in the same way.
> > 
> > Now you're just misrepresenting how we got here, which was:
> > 
> > > > > > > > IMHO, we're going off into the weeds again with these last
> > > > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > > > performance difference either).
> > > > > > > 
> > > > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > > > complex.
> > 
> > So I'm happy to drop the last 2 patches, which were done at your request
> > anyway, but you've failed to show how the locking in patches 1&2 is
> > messy, inconsistent, or complex and now you're asking to block all
> > progress.
> 
> I'm asking for bugs to get fixed and not duplicated. Adding more bugs is
> not progress. Or maybe there is no bug. Let's see why and add a comment.
> 
> >  Those patches are just users of kvm_set_irq.
> 
> 
> Well these add calls to kvm_set_irq which scans all vcpus under
> spinlock. In the past Avi thought this is not a good idea too.
> Maybe things changed.

We can drop the spinlock if we don't care about spurious EOIs, which is
only a theoretical scalability problem anyway.  We're talking about
level interrupts here, how scalable do we need to be?



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 9:05 p.m. UTC | #24

On Tue, Jul 17, 2012 at 01:51:27PM -0600, Alex Williamson wrote:
> On Tue, 2012-07-17 at 21:55 +0300, Michael S. Tsirkin wrote:
> > On Tue, Jul 17, 2012 at 10:45:52AM -0600, Alex Williamson wrote:
> > > On Tue, 2012-07-17 at 19:21 +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Jul 17, 2012 at 10:17:03AM -0600, Alex Williamson wrote:
> > > > > > > > > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > Which to me seems to indicate this should be handled as a separate
> > > > > > > > > > > effort.
> > > > > > > > > > 
> > > > > > > > > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > > > > > > > > look at all the code. Let's not copy bugs, need to fix them.
> > > > > > > > > 
> > > > > > > > > This looks tangential to me unless you can come up with an actual reason
> > > > > > > > > the above spinlock usage is incorrect or insufficient.
> > > > > > > > 
> > > > > > > > You copy the same pattern that seems racy. So you double the
> > > > > > > > amount of code that woul need to be fixed.
> > > > > > > 
> > > > > > > 
> > > > > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > > > > 
> > > > > > Look at this:
> > > > > > 
> > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > >                                      int irq_source_id, int level)
> > > > > > {
> > > > > >         /* Logical OR for level trig interrupt */
> > > > > >         if (level)
> > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > >         else
> > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > 
> > > > > >         return !!(*irq_state);
> > > > > > }
> > > > > > 
> > > > > > 
> > > > > > Now:
> > > > > > If other CPU changes some other bit after the atomic change,
> > > > > > it looks like !!(*irq_state) might return a stale value.
> > > > > > 
> > > > > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > > > > If CPU 0 sees a stale value now it will return 0 here
> > > > > > and interrupt will get cleared.
> > > > > > 
> > > > > > 
> > > > > > Maybe this is not a problem. But in that case IMO it needs
> > > > > > a comment explaining why and why it's not a problem in
> > > > > > your code.
> > > > > 
> > > > > So you want to close the door on anything that uses kvm_set_irq until
> > > > > this gets fixed... that's insane.
> > > > 
> > > > What does kvm_set_irq use have to do with it?  You posted this patch:
> > > > 
> > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > +                            struct kvm *kvm, int irq_source_id)
> > > > +{
> > > > +#ifdef CONFIG_X86
> > > > +       struct kvm_pic *pic = pic_irqchip(kvm);
> > > > +       int level =
> > > > kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > +                                            irq_source_id);
> > > > +       if (level)
> > > > +               kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > +                               !!pic->irq_states[e->irqchip.pin]);
> > > > +       return level;
> > > > +#else
> > > > +       return -1;
> > > > +#endif
> > > > +}
> > > > +
> > > > 
> > > > it seems racy in the same way.
> > > 
> > > Now you're just misrepresenting how we got here, which was:
> > > 
> > > > > > > > > IMHO, we're going off into the weeds again with these last
> > > > > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > > > > performance difference either).
> > > > > > > > 
> > > > > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > > > > complex.
> > > 
> > > So I'm happy to drop the last 2 patches, which were done at your request
> > > anyway, but you've failed to show how the locking in patches 1&2 is
> > > messy, inconsistent, or complex and now you're asking to block all
> > > progress.
> > 
> > I'm asking for bugs to get fixed and not duplicated. Adding more bugs is
> > not progress. Or maybe there is no bug. Let's see why and add a comment.
> > 
> > >  Those patches are just users of kvm_set_irq.
> > 
> > 
> > Well these add calls to kvm_set_irq which scans all vcpus under
> > spinlock. In the past Avi thought this is not a good idea too.
> > Maybe things changed.
> 
> We can drop the spinlock if we don't care about spurious EOIs, which is
> only a theoretical scalability problem anyway.

Not theoretical at all IMO. We see the problem with virtio with old
guests today.

> We're talking about
> level interrupts here, how scalable do we need to be?
> 

The reason we are moving them into kernel at all is for speed, no?

Alex Williamson July 17, 2012, 10:01 p.m. UTC | #25

On Wed, 2012-07-18 at 00:05 +0300, Michael S. Tsirkin wrote:
> On Tue, Jul 17, 2012 at 01:51:27PM -0600, Alex Williamson wrote:
> > On Tue, 2012-07-17 at 21:55 +0300, Michael S. Tsirkin wrote:
> > > On Tue, Jul 17, 2012 at 10:45:52AM -0600, Alex Williamson wrote:
> > > > On Tue, 2012-07-17 at 19:21 +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Jul 17, 2012 at 10:17:03AM -0600, Alex Williamson wrote:
> > > > > > > > > > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > > > > > > > > > 
> > > > > > > > > > > > 
> > > > > > > > > > > > Which to me seems to indicate this should be handled as a separate
> > > > > > > > > > > > effort.
> > > > > > > > > > > 
> > > > > > > > > > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > > > > > > > > > look at all the code. Let's not copy bugs, need to fix them.
> > > > > > > > > > 
> > > > > > > > > > This looks tangential to me unless you can come up with an actual reason
> > > > > > > > > > the above spinlock usage is incorrect or insufficient.
> > > > > > > > > 
> > > > > > > > > You copy the same pattern that seems racy. So you double the
> > > > > > > > > amount of code that woul need to be fixed.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > > > > > 
> > > > > > > Look at this:
> > > > > > > 
> > > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > > >                                      int irq_source_id, int level)
> > > > > > > {
> > > > > > >         /* Logical OR for level trig interrupt */
> > > > > > >         if (level)
> > > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > > >         else
> > > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > > 
> > > > > > >         return !!(*irq_state);
> > > > > > > }
> > > > > > > 
> > > > > > > 
> > > > > > > Now:
> > > > > > > If other CPU changes some other bit after the atomic change,
> > > > > > > it looks like !!(*irq_state) might return a stale value.
> > > > > > > 
> > > > > > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > > > > > If CPU 0 sees a stale value now it will return 0 here
> > > > > > > and interrupt will get cleared.
> > > > > > > 
> > > > > > > 
> > > > > > > Maybe this is not a problem. But in that case IMO it needs
> > > > > > > a comment explaining why and why it's not a problem in
> > > > > > > your code.
> > > > > > 
> > > > > > So you want to close the door on anything that uses kvm_set_irq until
> > > > > > this gets fixed... that's insane.
> > > > > 
> > > > > What does kvm_set_irq use have to do with it?  You posted this patch:
> > > > > 
> > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > +                            struct kvm *kvm, int irq_source_id)
> > > > > +{
> > > > > +#ifdef CONFIG_X86
> > > > > +       struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > +       int level =
> > > > > kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > +                                            irq_source_id);
> > > > > +       if (level)
> > > > > +               kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > +                               !!pic->irq_states[e->irqchip.pin]);
> > > > > +       return level;
> > > > > +#else
> > > > > +       return -1;
> > > > > +#endif
> > > > > +}
> > > > > +
> > > > > 
> > > > > it seems racy in the same way.
> > > > 
> > > > Now you're just misrepresenting how we got here, which was:
> > > > 
> > > > > > > > > > IMHO, we're going off into the weeds again with these last
> > > > > > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > > > > > performance difference either).
> > > > > > > > > 
> > > > > > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > > > > > complex.
> > > > 
> > > > So I'm happy to drop the last 2 patches, which were done at your request
> > > > anyway, but you've failed to show how the locking in patches 1&2 is
> > > > messy, inconsistent, or complex and now you're asking to block all
> > > > progress.
> > > 
> > > I'm asking for bugs to get fixed and not duplicated. Adding more bugs is
> > > not progress. Or maybe there is no bug. Let's see why and add a comment.
> > > 
> > > >  Those patches are just users of kvm_set_irq.
> > > 
> > > 
> > > Well these add calls to kvm_set_irq which scans all vcpus under
> > > spinlock. In the past Avi thought this is not a good idea too.
> > > Maybe things changed.
> > 
> > We can drop the spinlock if we don't care about spurious EOIs, which is
> > only a theoretical scalability problem anyway.
> 
> Not theoretical at all IMO. We see the problem with virtio with old
> guests today.

And how are you injecting level interrupts with virtio today w/o this
interface?

> > We're talking about
> > level interrupts here, how scalable do we need to be?
> > 
> 
> The reason we are moving them into kernel at all is for speed, no?

Come on, if we take that approach why aren't we writing all of this in
assembly for speed?!  All I'm suggesting is there's a limit to return on
investment at some point.  Maybe it's here.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 10:05 p.m. UTC | #26

On Tue, Jul 17, 2012 at 04:01:40PM -0600, Alex Williamson wrote:
> On Wed, 2012-07-18 at 00:05 +0300, Michael S. Tsirkin wrote:
> > On Tue, Jul 17, 2012 at 01:51:27PM -0600, Alex Williamson wrote:
> > > On Tue, 2012-07-17 at 21:55 +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Jul 17, 2012 at 10:45:52AM -0600, Alex Williamson wrote:
> > > > > On Tue, 2012-07-17 at 19:21 +0300, Michael S. Tsirkin wrote:
> > > > > > On Tue, Jul 17, 2012 at 10:17:03AM -0600, Alex Williamson wrote:
> > > > > > > > > > > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > > > > > > > > > > 
> > > > > > > > > > > > > 
> > > > > > > > > > > > > Which to me seems to indicate this should be handled as a separate
> > > > > > > > > > > > > effort.
> > > > > > > > > > > > 
> > > > > > > > > > > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > > > > > > > > > > look at all the code. Let's not copy bugs, need to fix them.
> > > > > > > > > > > 
> > > > > > > > > > > This looks tangential to me unless you can come up with an actual reason
> > > > > > > > > > > the above spinlock usage is incorrect or insufficient.
> > > > > > > > > > 
> > > > > > > > > > You copy the same pattern that seems racy. So you double the
> > > > > > > > > > amount of code that woul need to be fixed.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > > > > > > 
> > > > > > > > Look at this:
> > > > > > > > 
> > > > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > > > >                                      int irq_source_id, int level)
> > > > > > > > {
> > > > > > > >         /* Logical OR for level trig interrupt */
> > > > > > > >         if (level)
> > > > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > > > >         else
> > > > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > > > 
> > > > > > > >         return !!(*irq_state);
> > > > > > > > }
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Now:
> > > > > > > > If other CPU changes some other bit after the atomic change,
> > > > > > > > it looks like !!(*irq_state) might return a stale value.
> > > > > > > > 
> > > > > > > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > > > > > > If CPU 0 sees a stale value now it will return 0 here
> > > > > > > > and interrupt will get cleared.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Maybe this is not a problem. But in that case IMO it needs
> > > > > > > > a comment explaining why and why it's not a problem in
> > > > > > > > your code.
> > > > > > > 
> > > > > > > So you want to close the door on anything that uses kvm_set_irq until
> > > > > > > this gets fixed... that's insane.
> > > > > > 
> > > > > > What does kvm_set_irq use have to do with it?  You posted this patch:
> > > > > > 
> > > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > +                            struct kvm *kvm, int irq_source_id)
> > > > > > +{
> > > > > > +#ifdef CONFIG_X86
> > > > > > +       struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > > +       int level =
> > > > > > kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > > +                                            irq_source_id);
> > > > > > +       if (level)
> > > > > > +               kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > > +                               !!pic->irq_states[e->irqchip.pin]);
> > > > > > +       return level;
> > > > > > +#else
> > > > > > +       return -1;
> > > > > > +#endif
> > > > > > +}
> > > > > > +
> > > > > > 
> > > > > > it seems racy in the same way.
> > > > > 
> > > > > Now you're just misrepresenting how we got here, which was:
> > > > > 
> > > > > > > > > > > IMHO, we're going off into the weeds again with these last
> > > > > > > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > > > > > > performance difference either).
> > > > > > > > > > 
> > > > > > > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > > > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > > > > > > complex.
> > > > > 
> > > > > So I'm happy to drop the last 2 patches, which were done at your request
> > > > > anyway, but you've failed to show how the locking in patches 1&2 is
> > > > > messy, inconsistent, or complex and now you're asking to block all
> > > > > progress.
> > > > 
> > > > I'm asking for bugs to get fixed and not duplicated. Adding more bugs is
> > > > not progress. Or maybe there is no bug. Let's see why and add a comment.
> > > > 
> > > > >  Those patches are just users of kvm_set_irq.
> > > > 
> > > > 
> > > > Well these add calls to kvm_set_irq which scans all vcpus under
> > > > spinlock. In the past Avi thought this is not a good idea too.
> > > > Maybe things changed.
> > > 
> > > We can drop the spinlock if we don't care about spurious EOIs, which is
> > > only a theoretical scalability problem anyway.
> > 
> > Not theoretical at all IMO. We see the problem with virtio with old
> > guests today.
> 
> And how are you injecting level interrupts with virtio today w/o this
> interface?

Not well at all. Bad performance with interrupt sharing.

> > > We're talking about
> > > level interrupts here, how scalable do we need to be?
> > > 
> > 
> > The reason we are moving them into kernel at all is for speed, no?
> 
> Come on, if we take that approach why aren't we writing all of this in
> assembly for speed?!  All I'm suggesting is there's a limit to return on
> investment at some point.  Maybe it's here.

Well I am just warning about known problems: don't wake up (typically)
many eoifds on a single interrupt or you will have scalability
problems that we already see with emulated devices.

Alex Williamson July 17, 2012, 10:22 p.m. UTC | #27

On Wed, 2012-07-18 at 01:05 +0300, Michael S. Tsirkin wrote:
> On Tue, Jul 17, 2012 at 04:01:40PM -0600, Alex Williamson wrote:
> > On Wed, 2012-07-18 at 00:05 +0300, Michael S. Tsirkin wrote:
> > > On Tue, Jul 17, 2012 at 01:51:27PM -0600, Alex Williamson wrote:
> > > > On Tue, 2012-07-17 at 21:55 +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Jul 17, 2012 at 10:45:52AM -0600, Alex Williamson wrote:
> > > > > > On Tue, 2012-07-17 at 19:21 +0300, Michael S. Tsirkin wrote:
> > > > > > > On Tue, Jul 17, 2012 at 10:17:03AM -0600, Alex Williamson wrote:
> > > > > > > > > > > > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > Which to me seems to indicate this should be handled as a separate
> > > > > > > > > > > > > > effort.
> > > > > > > > > > > > > 
> > > > > > > > > > > > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > > > > > > > > > > > look at all the code. Let's not copy bugs, need to fix them.
> > > > > > > > > > > > 
> > > > > > > > > > > > This looks tangential to me unless you can come up with an actual reason
> > > > > > > > > > > > the above spinlock usage is incorrect or insufficient.
> > > > > > > > > > > 
> > > > > > > > > > > You copy the same pattern that seems racy. So you double the
> > > > > > > > > > > amount of code that woul need to be fixed.
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > > > > > > > 
> > > > > > > > > Look at this:
> > > > > > > > > 
> > > > > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > > > > >                                      int irq_source_id, int level)
> > > > > > > > > {
> > > > > > > > >         /* Logical OR for level trig interrupt */
> > > > > > > > >         if (level)
> > > > > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > > > > >         else
> > > > > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > > > > 
> > > > > > > > >         return !!(*irq_state);
> > > > > > > > > }
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Now:
> > > > > > > > > If other CPU changes some other bit after the atomic change,
> > > > > > > > > it looks like !!(*irq_state) might return a stale value.
> > > > > > > > > 
> > > > > > > > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > > > > > > > If CPU 0 sees a stale value now it will return 0 here
> > > > > > > > > and interrupt will get cleared.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Maybe this is not a problem. But in that case IMO it needs
> > > > > > > > > a comment explaining why and why it's not a problem in
> > > > > > > > > your code.
> > > > > > > > 
> > > > > > > > So you want to close the door on anything that uses kvm_set_irq until
> > > > > > > > this gets fixed... that's insane.
> > > > > > > 
> > > > > > > What does kvm_set_irq use have to do with it?  You posted this patch:
> > > > > > > 
> > > > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > +                            struct kvm *kvm, int irq_source_id)
> > > > > > > +{
> > > > > > > +#ifdef CONFIG_X86
> > > > > > > +       struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > > > +       int level =
> > > > > > > kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > > > +                                            irq_source_id);
> > > > > > > +       if (level)
> > > > > > > +               kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > > > +                               !!pic->irq_states[e->irqchip.pin]);
> > > > > > > +       return level;
> > > > > > > +#else
> > > > > > > +       return -1;
> > > > > > > +#endif
> > > > > > > +}
> > > > > > > +
> > > > > > > 
> > > > > > > it seems racy in the same way.
> > > > > > 
> > > > > > Now you're just misrepresenting how we got here, which was:
> > > > > > 
> > > > > > > > > > > > IMHO, we're going off into the weeds again with these last
> > > > > > > > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > > > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > > > > > > > performance difference either).
> > > > > > > > > > > 
> > > > > > > > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > > > > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > > > > > > > complex.
> > > > > > 
> > > > > > So I'm happy to drop the last 2 patches, which were done at your request
> > > > > > anyway, but you've failed to show how the locking in patches 1&2 is
> > > > > > messy, inconsistent, or complex and now you're asking to block all
> > > > > > progress.
> > > > > 
> > > > > I'm asking for bugs to get fixed and not duplicated. Adding more bugs is
> > > > > not progress. Or maybe there is no bug. Let's see why and add a comment.
> > > > > 
> > > > > >  Those patches are just users of kvm_set_irq.
> > > > > 
> > > > > 
> > > > > Well these add calls to kvm_set_irq which scans all vcpus under
> > > > > spinlock. In the past Avi thought this is not a good idea too.
> > > > > Maybe things changed.
> > > > 
> > > > We can drop the spinlock if we don't care about spurious EOIs, which is
> > > > only a theoretical scalability problem anyway.
> > > 
> > > Not theoretical at all IMO. We see the problem with virtio with old
> > > guests today.
> > 
> > And how are you injecting level interrupts with virtio today w/o this
> > interface?
> 
> Not well at all. Bad performance with interrupt sharing.
> 
> > > > We're talking about
> > > > level interrupts here, how scalable do we need to be?
> > > > 
> > > 
> > > The reason we are moving them into kernel at all is for speed, no?
> > 
> > Come on, if we take that approach why aren't we writing all of this in
> > assembly for speed?!  All I'm suggesting is there's a limit to return on
> > investment at some point.  Maybe it's here.
> 
> Well I am just warning about known problems: don't wake up (typically)
> many eoifds on a single interrupt or you will have scalability
> problems that we already see with emulated devices.

Well, that's why we don't want to bounce back to userspace.  All we need
to do in VFIO for each callback is note that the interrupt wasn't
previously masked and do nothing.  So we're talking about acquiring a
spinlock and a few data references.  Sure, I'd like to avoid doing that,
but not if it means blocking this patch series until some unknown number
of patches fixes all the tangential problems you find.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Michael S. Tsirkin July 17, 2012, 10:31 p.m. UTC | #28

On Tue, Jul 17, 2012 at 04:22:10PM -0600, Alex Williamson wrote:
> On Wed, 2012-07-18 at 01:05 +0300, Michael S. Tsirkin wrote:
> > On Tue, Jul 17, 2012 at 04:01:40PM -0600, Alex Williamson wrote:
> > > On Wed, 2012-07-18 at 00:05 +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Jul 17, 2012 at 01:51:27PM -0600, Alex Williamson wrote:
> > > > > On Tue, 2012-07-17 at 21:55 +0300, Michael S. Tsirkin wrote:
> > > > > > On Tue, Jul 17, 2012 at 10:45:52AM -0600, Alex Williamson wrote:
> > > > > > > On Tue, 2012-07-17 at 19:21 +0300, Michael S. Tsirkin wrote:
> > > > > > > > On Tue, Jul 17, 2012 at 10:17:03AM -0600, Alex Williamson wrote:
> > > > > > > > > > > > > > > >   And current code looks buggy if yes we need to fix it somehow.
> > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > Which to me seems to indicate this should be handled as a separate
> > > > > > > > > > > > > > > effort.
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > A separate patchset, sure. But likely a prerequisite: we still need to
> > > > > > > > > > > > > > look at all the code. Let's not copy bugs, need to fix them.
> > > > > > > > > > > > > 
> > > > > > > > > > > > > This looks tangential to me unless you can come up with an actual reason
> > > > > > > > > > > > > the above spinlock usage is incorrect or insufficient.
> > > > > > > > > > > > 
> > > > > > > > > > > > You copy the same pattern that seems racy. So you double the
> > > > > > > > > > > > amount of code that woul need to be fixed.
> > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > _Seems_ racy, or _is_ racy?  Please identify the race.
> > > > > > > > > > 
> > > > > > > > > > Look at this:
> > > > > > > > > > 
> > > > > > > > > > static inline int kvm_irq_line_state(unsigned long *irq_state,
> > > > > > > > > >                                      int irq_source_id, int level)
> > > > > > > > > > {
> > > > > > > > > >         /* Logical OR for level trig interrupt */
> > > > > > > > > >         if (level)
> > > > > > > > > >                 set_bit(irq_source_id, irq_state);
> > > > > > > > > >         else
> > > > > > > > > >                 clear_bit(irq_source_id, irq_state);
> > > > > > > > > > 
> > > > > > > > > >         return !!(*irq_state);
> > > > > > > > > > }
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Now:
> > > > > > > > > > If other CPU changes some other bit after the atomic change,
> > > > > > > > > > it looks like !!(*irq_state) might return a stale value.
> > > > > > > > > > 
> > > > > > > > > > CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
> > > > > > > > > > If CPU 0 sees a stale value now it will return 0 here
> > > > > > > > > > and interrupt will get cleared.
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Maybe this is not a problem. But in that case IMO it needs
> > > > > > > > > > a comment explaining why and why it's not a problem in
> > > > > > > > > > your code.
> > > > > > > > > 
> > > > > > > > > So you want to close the door on anything that uses kvm_set_irq until
> > > > > > > > > this gets fixed... that's insane.
> > > > > > > > 
> > > > > > > > What does kvm_set_irq use have to do with it?  You posted this patch:
> > > > > > > > 
> > > > > > > > +static int kvm_clear_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> > > > > > > > +                            struct kvm *kvm, int irq_source_id)
> > > > > > > > +{
> > > > > > > > +#ifdef CONFIG_X86
> > > > > > > > +       struct kvm_pic *pic = pic_irqchip(kvm);
> > > > > > > > +       int level =
> > > > > > > > kvm_clear_irq_line_state(&pic->irq_states[e->irqchip.pin],
> > > > > > > > +                                            irq_source_id);
> > > > > > > > +       if (level)
> > > > > > > > +               kvm_pic_set_irq(pic, e->irqchip.pin,
> > > > > > > > +                               !!pic->irq_states[e->irqchip.pin]);
> > > > > > > > +       return level;
> > > > > > > > +#else
> > > > > > > > +       return -1;
> > > > > > > > +#endif
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > 
> > > > > > > > it seems racy in the same way.
> > > > > > > 
> > > > > > > Now you're just misrepresenting how we got here, which was:
> > > > > > > 
> > > > > > > > > > > > > IMHO, we're going off into the weeds again with these last
> > > > > > > > > > > > > two patches.  It may be a valid optimization, but it really has no
> > > > > > > > > > > > > bearing on the meat of the series (and afaict, no significant
> > > > > > > > > > > > > performance difference either).
> > > > > > > > > > > > 
> > > > > > > > > > > > For me it's not a performance thing. IMO code is cleaner without this locking:
> > > > > > > > > > > > we add a lock but only use it in some cases, so the rules become really
> > > > > > > > > > > > complex.
> > > > > > > 
> > > > > > > So I'm happy to drop the last 2 patches, which were done at your request
> > > > > > > anyway, but you've failed to show how the locking in patches 1&2 is
> > > > > > > messy, inconsistent, or complex and now you're asking to block all
> > > > > > > progress.
> > > > > > 
> > > > > > I'm asking for bugs to get fixed and not duplicated. Adding more bugs is
> > > > > > not progress. Or maybe there is no bug. Let's see why and add a comment.
> > > > > > 
> > > > > > >  Those patches are just users of kvm_set_irq.
> > > > > > 
> > > > > > 
> > > > > > Well these add calls to kvm_set_irq which scans all vcpus under
> > > > > > spinlock. In the past Avi thought this is not a good idea too.
> > > > > > Maybe things changed.
> > > > > 
> > > > > We can drop the spinlock if we don't care about spurious EOIs, which is
> > > > > only a theoretical scalability problem anyway.
> > > > 
> > > > Not theoretical at all IMO. We see the problem with virtio with old
> > > > guests today.
> > > 
> > > And how are you injecting level interrupts with virtio today w/o this
> > > interface?
> > 
> > Not well at all. Bad performance with interrupt sharing.
> > 
> > > > > We're talking about
> > > > > level interrupts here, how scalable do we need to be?
> > > > > 
> > > > 
> > > > The reason we are moving them into kernel at all is for speed, no?
> > > 
> > > Come on, if we take that approach why aren't we writing all of this in
> > > assembly for speed?!  All I'm suggesting is there's a limit to return on
> > > investment at some point.  Maybe it's here.
> > 
> > Well I am just warning about known problems: don't wake up (typically)
> > many eoifds on a single interrupt or you will have scalability
> > problems that we already see with emulated devices.
> 
> Well, that's why we don't want to bounce back to userspace.  All we need
> to do in VFIO for each callback is note that the interrupt wasn't
> previously masked and do nothing.  So we're talking about acquiring a
> spinlock and a few data references.  Sure, I'd like to avoid doing that,
> but not if it means blocking this patch series until some unknown number
> of patches fixes all the tangential problems you find.

I'd like Avi's take on whether kvm_set_irq under a spinlock here
is OK. He nacked this in the past.

Meanwhile fixing the tangential problems is time well spent too.
If there are races in kvm irq handling it seems more important
to fix them than add an optimization of level interrupts.

[v5,3/4] kvm: Create kvm_clear_irq()

Commit Message

Comments

Patch