diff mbox

[PATCHv0,RFC] kvm: irqfd support for level interrupts

Message ID 20090726162225.GA21972@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Michael S. Tsirkin July 26, 2009, 4:22 p.m. UTC
Here's an untested patch with partial support for level triggered
interrupts in irqfd. What this patch has: support for clearing interrupt
on ack. What this patch does not have: support signalling eventfd on ack
so that userspace can take action and e.g. reenable interrupt.

Gleb, Marcelo, I'd like your input on the approach taken wrt locking.
Does it look sane?

Avi, how's the interface? I intend to also add an eventfd probably in
the padding in the irqfd struct.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

---

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Gleb Natapov July 27, 2009, 5:33 a.m. UTC | #1
On Sun, Jul 26, 2009 at 07:22:25PM +0300, Michael S. Tsirkin wrote:
> Here's an untested patch with partial support for level triggered
> interrupts in irqfd. What this patch has: support for clearing interrupt
> on ack. What this patch does not have: support signalling eventfd on ack
> so that userspace can take action and e.g. reenable interrupt.
> 
Do we want level triggered irqfd to be used only for device assignment?
Because if not it is not appropriate to zero irq level on ack. And you
have no other way to zero irq level?! Another note is that level
triggered irqfd shouldn't use KVM_USERSPACE_IRQ_SOURCE_ID since it may
be shared.

> Gleb, Marcelo, I'd like your input on the approach taken wrt locking.
> Does it look sane?
> 
> Avi, how's the interface? I intend to also add an eventfd probably in
> the padding in the irqfd struct.
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> ---
> 
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 230a91a..8bf16af 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -488,6 +488,7 @@ struct kvm_x86_mce {
>  #endif
>  
>  #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
> +#define KVM_IRQFD_FLAG_LEVEL (1 << 1)
>  
>  struct kvm_irqfd {
>  	__u32 fd;
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 99017e8..fcbf5b5 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -45,12 +45,14 @@ struct _irqfd {
>  	struct kvm               *kvm;
>  	struct eventfd_ctx       *eventfd;
>  	int                       gsi;
> +	int                       is_level;
>  	struct list_head          list;
>  	poll_table                pt;
>  	wait_queue_head_t        *wqh;
>  	wait_queue_t              wait;
>  	struct work_struct        inject;
>  	struct work_struct        shutdown;
> +	struct kvm_irq_ack_notifier kian;
>  };
>  
>  static struct workqueue_struct *irqfd_cleanup_wq;
> @@ -63,10 +65,15 @@ irqfd_inject(struct work_struct *work)
>  
>  	mutex_lock(&kvm->irq_lock);
>  	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> +	if (!irqfd->is_level)
> +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
>  	mutex_unlock(&kvm->irq_lock);
>  }
>  
> +static void irqfd_irq_acked(struct kvm_irq_ack_notifier *kian)
> +{
> +	kvm_set_irq(kian->kvm, KVM_USERSPACE_IRQ_SOURCE_ID, kian->gsi, 0);
> +}
>  /*
>   * Race-free decouple logic (ordering is critical)
>   */
> @@ -87,6 +94,9 @@ irqfd_shutdown(struct work_struct *work)
>  	 */
>  	flush_work(&irqfd->inject);
>  
> +	if (irqfd->is_level)
> +		kvm_unregister_irq_ack_notifier(&irqfd->kian);
> +
>  	/*
>  	 * It is now safe to release the object's resources
>  	 */
> @@ -166,7 +176,7 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
>  }
>  
>  static int
> -kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi, int is_level)
>  {
>  	struct _irqfd *irqfd;
>  	struct file *file = NULL;
> @@ -180,6 +190,7 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
>  
>  	irqfd->kvm = kvm;
>  	irqfd->gsi = gsi;
> +	irqfd->is_level = is_level;
>  	INIT_LIST_HEAD(&irqfd->list);
>  	INIT_WORK(&irqfd->inject, irqfd_inject);
>  	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
> @@ -198,6 +209,12 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
>  
>  	irqfd->eventfd = eventfd;
>  
> +	if (is_level) {
> +		irqfd->kian.gsi = gsi;
> +		irqfd->kian.irq_acked = irqfd_irq_acked;
> +		kvm_register_irq_ack_notifier(&irqfd->kian);
> +	}
> +
>  	/*
>  	 * Install our own custom wake-up handling so we are notified via
>  	 * a callback whenever someone signals the underlying eventfd
> @@ -281,10 +298,13 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
>  int
>  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
>  {
> +	if (flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_LEVEL))
> +		return -EINVAL;
> +
>  	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
>  		return kvm_irqfd_deassign(kvm, fd, gsi);
>  
> -	return kvm_irqfd_assign(kvm, fd, gsi);
> +	return kvm_irqfd_assign(kvm, fd, gsi, !!(flags & KVM_IRQFD_FLAG_LEVEL));
>  }
>  
>  /*

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin July 27, 2009, 6:28 a.m. UTC | #2
On Mon, Jul 27, 2009 at 08:33:12AM +0300, Gleb Natapov wrote:
> On Sun, Jul 26, 2009 at 07:22:25PM +0300, Michael S. Tsirkin wrote:
> > Here's an untested patch with partial support for level triggered
> > interrupts in irqfd. What this patch has: support for clearing interrupt
> > on ack. What this patch does not have: support signalling eventfd on ack
> > so that userspace can take action and e.g. reenable interrupt.
> > 
> Do we want level triggered irqfd to be used only for device assignment?

generally level triggered-not only.
But irqfd is promarily for device assignment and emulated devices.

> Because if not it is not appropriate to zero irq level on ack.

The bit that is missing in this patch, is that we'll signal eventfd
after we zero irq. Userspace polls that and re-asserts irq.

> And you
> have no other way to zero irq level?!

The reason is interrupt sharing: device can assert interrupt
but can not clear it by itself.
qemu can still use the SET_IRQ ioctl if it wants full control.


> Another note is that level
> triggered irqfd shouldn't use KVM_USERSPACE_IRQ_SOURCE_ID since it may
> be shared.

Since we zero on ack, I think this is not a problem.

> > Gleb, Marcelo, I'd like your input on the approach taken wrt locking.
> > Does it look sane?
> > 
> > Avi, how's the interface? I intend to also add an eventfd probably in
> > the padding in the irqfd struct.
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > 
> > ---
> > 
> > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > index 230a91a..8bf16af 100644
> > --- a/include/linux/kvm.h
> > +++ b/include/linux/kvm.h
> > @@ -488,6 +488,7 @@ struct kvm_x86_mce {
> >  #endif
> >  
> >  #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
> > +#define KVM_IRQFD_FLAG_LEVEL (1 << 1)
> >  
> >  struct kvm_irqfd {
> >  	__u32 fd;
> > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > index 99017e8..fcbf5b5 100644
> > --- a/virt/kvm/eventfd.c
> > +++ b/virt/kvm/eventfd.c
> > @@ -45,12 +45,14 @@ struct _irqfd {
> >  	struct kvm               *kvm;
> >  	struct eventfd_ctx       *eventfd;
> >  	int                       gsi;
> > +	int                       is_level;
> >  	struct list_head          list;
> >  	poll_table                pt;
> >  	wait_queue_head_t        *wqh;
> >  	wait_queue_t              wait;
> >  	struct work_struct        inject;
> >  	struct work_struct        shutdown;
> > +	struct kvm_irq_ack_notifier kian;
> >  };
> >  
> >  static struct workqueue_struct *irqfd_cleanup_wq;
> > @@ -63,10 +65,15 @@ irqfd_inject(struct work_struct *work)
> >  
> >  	mutex_lock(&kvm->irq_lock);
> >  	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> > -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> > +	if (!irqfd->is_level)
> > +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> >  	mutex_unlock(&kvm->irq_lock);
> >  }
> >  
> > +static void irqfd_irq_acked(struct kvm_irq_ack_notifier *kian)
> > +{
> > +	kvm_set_irq(kian->kvm, KVM_USERSPACE_IRQ_SOURCE_ID, kian->gsi, 0);
> > +}
> >  /*
> >   * Race-free decouple logic (ordering is critical)
> >   */
> > @@ -87,6 +94,9 @@ irqfd_shutdown(struct work_struct *work)
> >  	 */
> >  	flush_work(&irqfd->inject);
> >  
> > +	if (irqfd->is_level)
> > +		kvm_unregister_irq_ack_notifier(&irqfd->kian);
> > +
> >  	/*
> >  	 * It is now safe to release the object's resources
> >  	 */
> > @@ -166,7 +176,7 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> >  }
> >  
> >  static int
> > -kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> > +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi, int is_level)
> >  {
> >  	struct _irqfd *irqfd;
> >  	struct file *file = NULL;
> > @@ -180,6 +190,7 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> >  
> >  	irqfd->kvm = kvm;
> >  	irqfd->gsi = gsi;
> > +	irqfd->is_level = is_level;
> >  	INIT_LIST_HEAD(&irqfd->list);
> >  	INIT_WORK(&irqfd->inject, irqfd_inject);
> >  	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
> > @@ -198,6 +209,12 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> >  
> >  	irqfd->eventfd = eventfd;
> >  
> > +	if (is_level) {
> > +		irqfd->kian.gsi = gsi;
> > +		irqfd->kian.irq_acked = irqfd_irq_acked;
> > +		kvm_register_irq_ack_notifier(&irqfd->kian);
> > +	}
> > +
> >  	/*
> >  	 * Install our own custom wake-up handling so we are notified via
> >  	 * a callback whenever someone signals the underlying eventfd
> > @@ -281,10 +298,13 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
> >  int
> >  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> >  {
> > +	if (flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_LEVEL))
> > +		return -EINVAL;
> > +
> >  	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
> >  		return kvm_irqfd_deassign(kvm, fd, gsi);
> >  
> > -	return kvm_irqfd_assign(kvm, fd, gsi);
> > +	return kvm_irqfd_assign(kvm, fd, gsi, !!(flags & KVM_IRQFD_FLAG_LEVEL));
> >  }
> >  
> >  /*
> 
> --
> 			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov July 27, 2009, 7:31 a.m. UTC | #3
On Mon, Jul 27, 2009 at 09:28:34AM +0300, Michael S. Tsirkin wrote:
> On Mon, Jul 27, 2009 at 08:33:12AM +0300, Gleb Natapov wrote:
> > On Sun, Jul 26, 2009 at 07:22:25PM +0300, Michael S. Tsirkin wrote:
> > > Here's an untested patch with partial support for level triggered
> > > interrupts in irqfd. What this patch has: support for clearing interrupt
> > > on ack. What this patch does not have: support signalling eventfd on ack
> > > so that userspace can take action and e.g. reenable interrupt.
> > > 
> > Do we want level triggered irqfd to be used only for device assignment?
> 
> generally level triggered-not only.
> But irqfd is promarily for device assignment and emulated devices.
> 
If it is for emulated device zeroing irq level on ack is not acceptable.

> > Because if not it is not appropriate to zero irq level on ack.
> 
> The bit that is missing in this patch, is that we'll signal eventfd
> after we zero irq. Userspace polls that and re-asserts irq.
That is not needed for device emulation. Emulated device should lower
irq line by itself. It know exactly when to do it. Please don't push
broken device assignment model to emulated devices.

> 
> > And you
> > have no other way to zero irq level?!
> 
> The reason is interrupt sharing: device can assert interrupt
> but can not clear it by itself.
For interrupt sharing to work you need to allocate source id for each
irqfd.

> qemu can still use the SET_IRQ ioctl if it wants full control.
> 
> 
> > Another note is that level
> > triggered irqfd shouldn't use KVM_USERSPACE_IRQ_SOURCE_ID since it may
> > be shared.
> 
> Since we zero on ack, I think this is not a problem.
> 
We will not zero on ack for emulated devices. Please don't abuse ack
notifiers. Even if you zero on ack this is still the problem BTW since
other device may lower your line level.

> > > Gleb, Marcelo, I'd like your input on the approach taken wrt locking.
> > > Does it look sane?
> > > 
> > > Avi, how's the interface? I intend to also add an eventfd probably in
> > > the padding in the irqfd struct.
> > > 
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > 
> > > ---
> > > 
> > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > index 230a91a..8bf16af 100644
> > > --- a/include/linux/kvm.h
> > > +++ b/include/linux/kvm.h
> > > @@ -488,6 +488,7 @@ struct kvm_x86_mce {
> > >  #endif
> > >  
> > >  #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
> > > +#define KVM_IRQFD_FLAG_LEVEL (1 << 1)
> > >  
> > >  struct kvm_irqfd {
> > >  	__u32 fd;
> > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > index 99017e8..fcbf5b5 100644
> > > --- a/virt/kvm/eventfd.c
> > > +++ b/virt/kvm/eventfd.c
> > > @@ -45,12 +45,14 @@ struct _irqfd {
> > >  	struct kvm               *kvm;
> > >  	struct eventfd_ctx       *eventfd;
> > >  	int                       gsi;
> > > +	int                       is_level;
> > >  	struct list_head          list;
> > >  	poll_table                pt;
> > >  	wait_queue_head_t        *wqh;
> > >  	wait_queue_t              wait;
> > >  	struct work_struct        inject;
> > >  	struct work_struct        shutdown;
> > > +	struct kvm_irq_ack_notifier kian;
> > >  };
> > >  
> > >  static struct workqueue_struct *irqfd_cleanup_wq;
> > > @@ -63,10 +65,15 @@ irqfd_inject(struct work_struct *work)
> > >  
> > >  	mutex_lock(&kvm->irq_lock);
> > >  	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> > > -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> > > +	if (!irqfd->is_level)
> > > +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> > >  	mutex_unlock(&kvm->irq_lock);
> > >  }
> > >  
> > > +static void irqfd_irq_acked(struct kvm_irq_ack_notifier *kian)
> > > +{
> > > +	kvm_set_irq(kian->kvm, KVM_USERSPACE_IRQ_SOURCE_ID, kian->gsi, 0);
> > > +}
> > >  /*
> > >   * Race-free decouple logic (ordering is critical)
> > >   */
> > > @@ -87,6 +94,9 @@ irqfd_shutdown(struct work_struct *work)
> > >  	 */
> > >  	flush_work(&irqfd->inject);
> > >  
> > > +	if (irqfd->is_level)
> > > +		kvm_unregister_irq_ack_notifier(&irqfd->kian);
> > > +
> > >  	/*
> > >  	 * It is now safe to release the object's resources
> > >  	 */
> > > @@ -166,7 +176,7 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> > >  }
> > >  
> > >  static int
> > > -kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> > > +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi, int is_level)
> > >  {
> > >  	struct _irqfd *irqfd;
> > >  	struct file *file = NULL;
> > > @@ -180,6 +190,7 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> > >  
> > >  	irqfd->kvm = kvm;
> > >  	irqfd->gsi = gsi;
> > > +	irqfd->is_level = is_level;
> > >  	INIT_LIST_HEAD(&irqfd->list);
> > >  	INIT_WORK(&irqfd->inject, irqfd_inject);
> > >  	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
> > > @@ -198,6 +209,12 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> > >  
> > >  	irqfd->eventfd = eventfd;
> > >  
> > > +	if (is_level) {
> > > +		irqfd->kian.gsi = gsi;
> > > +		irqfd->kian.irq_acked = irqfd_irq_acked;
> > > +		kvm_register_irq_ack_notifier(&irqfd->kian);
> > > +	}
> > > +
> > >  	/*
> > >  	 * Install our own custom wake-up handling so we are notified via
> > >  	 * a callback whenever someone signals the underlying eventfd
> > > @@ -281,10 +298,13 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
> > >  int
> > >  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> > >  {
> > > +	if (flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_LEVEL))
> > > +		return -EINVAL;
> > > +
> > >  	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
> > >  		return kvm_irqfd_deassign(kvm, fd, gsi);
> > >  
> > > -	return kvm_irqfd_assign(kvm, fd, gsi);
> > > +	return kvm_irqfd_assign(kvm, fd, gsi, !!(flags & KVM_IRQFD_FLAG_LEVEL));
> > >  }
> > >  
> > >  /*
> > 
> > --
> > 			Gleb.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin July 27, 2009, 8:02 a.m. UTC | #4
On Mon, Jul 27, 2009 at 10:31:38AM +0300, Gleb Natapov wrote:
> On Mon, Jul 27, 2009 at 09:28:34AM +0300, Michael S. Tsirkin wrote:
> > On Mon, Jul 27, 2009 at 08:33:12AM +0300, Gleb Natapov wrote:
> > > On Sun, Jul 26, 2009 at 07:22:25PM +0300, Michael S. Tsirkin wrote:
> > > > Here's an untested patch with partial support for level triggered
> > > > interrupts in irqfd. What this patch has: support for clearing interrupt
> > > > on ack. What this patch does not have: support signalling eventfd on ack
> > > > so that userspace can take action and e.g. reenable interrupt.
> > > > 
> > > Do we want level triggered irqfd to be used only for device assignment?
> > 
> > generally level triggered-not only.
> > But irqfd is promarily for device assignment and emulated devices.
> > 
> If it is for emulated device zeroing irq level on ack is not acceptable.

OK, forget about emulated devices for now. Let's focus on assigned
devices.

> > > Because if not it is not appropriate to zero irq level on ack.
> > 
> > The bit that is missing in this patch, is that we'll signal eventfd
> > after we zero irq. Userspace polls that and re-asserts irq.
> That is not needed for device emulation. Emulated device should lower
> irq line by itself. It know exactly when to do it. Please don't push
> broken device assignment model to emulated devices.
> 
> > 
> > > And you
> > > have no other way to zero irq level?!
> > 
> > The reason is interrupt sharing: device can assert interrupt
> > but can not clear it by itself.
> For interrupt sharing to work you need to allocate source id for each
> irqfd.

BTW, current SET_IRQ_LINE interface uses a common source id for all
irqs. Does this mean that interrupt sharing in guest is broken now?

> > qemu can still use the SET_IRQ ioctl if it wants full control.
> > 
> > 
> > > Another note is that level
> > > triggered irqfd shouldn't use KVM_USERSPACE_IRQ_SOURCE_ID since it may
> > > be shared.
> > 
> > Since we zero on ack, I think this is not a problem.
> > 
> We will not zero on ack for emulated devices. Please don't abuse ack
> notifiers. Even if you zero on ack this is still the problem BTW since
> other device may lower your line level.
> 
> > > > Gleb, Marcelo, I'd like your input on the approach taken wrt locking.
> > > > Does it look sane?
> > > > 
> > > > Avi, how's the interface? I intend to also add an eventfd probably in
> > > > the padding in the irqfd struct.
> > > > 
> > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > 
> > > > ---
> > > > 
> > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > > index 230a91a..8bf16af 100644
> > > > --- a/include/linux/kvm.h
> > > > +++ b/include/linux/kvm.h
> > > > @@ -488,6 +488,7 @@ struct kvm_x86_mce {
> > > >  #endif
> > > >  
> > > >  #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
> > > > +#define KVM_IRQFD_FLAG_LEVEL (1 << 1)
> > > >  
> > > >  struct kvm_irqfd {
> > > >  	__u32 fd;
> > > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > > index 99017e8..fcbf5b5 100644
> > > > --- a/virt/kvm/eventfd.c
> > > > +++ b/virt/kvm/eventfd.c
> > > > @@ -45,12 +45,14 @@ struct _irqfd {
> > > >  	struct kvm               *kvm;
> > > >  	struct eventfd_ctx       *eventfd;
> > > >  	int                       gsi;
> > > > +	int                       is_level;
> > > >  	struct list_head          list;
> > > >  	poll_table                pt;
> > > >  	wait_queue_head_t        *wqh;
> > > >  	wait_queue_t              wait;
> > > >  	struct work_struct        inject;
> > > >  	struct work_struct        shutdown;
> > > > +	struct kvm_irq_ack_notifier kian;
> > > >  };
> > > >  
> > > >  static struct workqueue_struct *irqfd_cleanup_wq;
> > > > @@ -63,10 +65,15 @@ irqfd_inject(struct work_struct *work)
> > > >  
> > > >  	mutex_lock(&kvm->irq_lock);
> > > >  	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> > > > -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> > > > +	if (!irqfd->is_level)
> > > > +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> > > >  	mutex_unlock(&kvm->irq_lock);
> > > >  }
> > > >  
> > > > +static void irqfd_irq_acked(struct kvm_irq_ack_notifier *kian)
> > > > +{
> > > > +	kvm_set_irq(kian->kvm, KVM_USERSPACE_IRQ_SOURCE_ID, kian->gsi, 0);
> > > > +}
> > > >  /*
> > > >   * Race-free decouple logic (ordering is critical)
> > > >   */
> > > > @@ -87,6 +94,9 @@ irqfd_shutdown(struct work_struct *work)
> > > >  	 */
> > > >  	flush_work(&irqfd->inject);
> > > >  
> > > > +	if (irqfd->is_level)
> > > > +		kvm_unregister_irq_ack_notifier(&irqfd->kian);
> > > > +
> > > >  	/*
> > > >  	 * It is now safe to release the object's resources
> > > >  	 */
> > > > @@ -166,7 +176,7 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> > > >  }
> > > >  
> > > >  static int
> > > > -kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> > > > +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi, int is_level)
> > > >  {
> > > >  	struct _irqfd *irqfd;
> > > >  	struct file *file = NULL;
> > > > @@ -180,6 +190,7 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> > > >  
> > > >  	irqfd->kvm = kvm;
> > > >  	irqfd->gsi = gsi;
> > > > +	irqfd->is_level = is_level;
> > > >  	INIT_LIST_HEAD(&irqfd->list);
> > > >  	INIT_WORK(&irqfd->inject, irqfd_inject);
> > > >  	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
> > > > @@ -198,6 +209,12 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> > > >  
> > > >  	irqfd->eventfd = eventfd;
> > > >  
> > > > +	if (is_level) {
> > > > +		irqfd->kian.gsi = gsi;
> > > > +		irqfd->kian.irq_acked = irqfd_irq_acked;
> > > > +		kvm_register_irq_ack_notifier(&irqfd->kian);
> > > > +	}
> > > > +
> > > >  	/*
> > > >  	 * Install our own custom wake-up handling so we are notified via
> > > >  	 * a callback whenever someone signals the underlying eventfd
> > > > @@ -281,10 +298,13 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
> > > >  int
> > > >  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> > > >  {
> > > > +	if (flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_LEVEL))
> > > > +		return -EINVAL;
> > > > +
> > > >  	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
> > > >  		return kvm_irqfd_deassign(kvm, fd, gsi);
> > > >  
> > > > -	return kvm_irqfd_assign(kvm, fd, gsi);
> > > > +	return kvm_irqfd_assign(kvm, fd, gsi, !!(flags & KVM_IRQFD_FLAG_LEVEL));
> > > >  }
> > > >  
> > > >  /*
> > > 
> > > --
> > > 			Gleb.
> 
> --
> 			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov July 27, 2009, 8:13 a.m. UTC | #5
On Mon, Jul 27, 2009 at 11:02:06AM +0300, Michael S. Tsirkin wrote:
> On Mon, Jul 27, 2009 at 10:31:38AM +0300, Gleb Natapov wrote:
> > On Mon, Jul 27, 2009 at 09:28:34AM +0300, Michael S. Tsirkin wrote:
> > > On Mon, Jul 27, 2009 at 08:33:12AM +0300, Gleb Natapov wrote:
> > > > On Sun, Jul 26, 2009 at 07:22:25PM +0300, Michael S. Tsirkin wrote:
> > > > > Here's an untested patch with partial support for level triggered
> > > > > interrupts in irqfd. What this patch has: support for clearing interrupt
> > > > > on ack. What this patch does not have: support signalling eventfd on ack
> > > > > so that userspace can take action and e.g. reenable interrupt.
> > > > > 
> > > > Do we want level triggered irqfd to be used only for device assignment?
> > > 
> > > generally level triggered-not only.
> > > But irqfd is promarily for device assignment and emulated devices.
> > > 
> > If it is for emulated device zeroing irq level on ack is not acceptable.
> 
> OK, forget about emulated devices for now. Let's focus on assigned
> devices.
> 
Can't. We should design interface that works for everything.

> > > > Because if not it is not appropriate to zero irq level on ack.
> > > 
> > > The bit that is missing in this patch, is that we'll signal eventfd
> > > after we zero irq. Userspace polls that and re-asserts irq.
> > That is not needed for device emulation. Emulated device should lower
> > irq line by itself. It know exactly when to do it. Please don't push
> > broken device assignment model to emulated devices.
> > 
> > > 
> > > > And you
> > > > have no other way to zero irq level?!
> > > 
> > > The reason is interrupt sharing: device can assert interrupt
> > > but can not clear it by itself.
> > For interrupt sharing to work you need to allocate source id for each
> > irqfd.
> 
> BTW, current SET_IRQ_LINE interface uses a common source id for all
> irqs. Does this mean that interrupt sharing in guest is broken now?
No. It means that sharing in handled in userspace by chipset emulation.

> 
> > > qemu can still use the SET_IRQ ioctl if it wants full control.
> > > 
> > > 
> > > > Another note is that level
> > > > triggered irqfd shouldn't use KVM_USERSPACE_IRQ_SOURCE_ID since it may
> > > > be shared.
> > > 
> > > Since we zero on ack, I think this is not a problem.
> > > 
> > We will not zero on ack for emulated devices. Please don't abuse ack
> > notifiers. Even if you zero on ack this is still the problem BTW since
> > other device may lower your line level.
> > 
> > > > > Gleb, Marcelo, I'd like your input on the approach taken wrt locking.
> > > > > Does it look sane?
> > > > > 
> > > > > Avi, how's the interface? I intend to also add an eventfd probably in
> > > > > the padding in the irqfd struct.
> > > > > 
> > > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > > 
> > > > > ---
> > > > > 
> > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > > > index 230a91a..8bf16af 100644
> > > > > --- a/include/linux/kvm.h
> > > > > +++ b/include/linux/kvm.h
> > > > > @@ -488,6 +488,7 @@ struct kvm_x86_mce {
> > > > >  #endif
> > > > >  
> > > > >  #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
> > > > > +#define KVM_IRQFD_FLAG_LEVEL (1 << 1)
> > > > >  
> > > > >  struct kvm_irqfd {
> > > > >  	__u32 fd;
> > > > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > > > index 99017e8..fcbf5b5 100644
> > > > > --- a/virt/kvm/eventfd.c
> > > > > +++ b/virt/kvm/eventfd.c
> > > > > @@ -45,12 +45,14 @@ struct _irqfd {
> > > > >  	struct kvm               *kvm;
> > > > >  	struct eventfd_ctx       *eventfd;
> > > > >  	int                       gsi;
> > > > > +	int                       is_level;
> > > > >  	struct list_head          list;
> > > > >  	poll_table                pt;
> > > > >  	wait_queue_head_t        *wqh;
> > > > >  	wait_queue_t              wait;
> > > > >  	struct work_struct        inject;
> > > > >  	struct work_struct        shutdown;
> > > > > +	struct kvm_irq_ack_notifier kian;
> > > > >  };
> > > > >  
> > > > >  static struct workqueue_struct *irqfd_cleanup_wq;
> > > > > @@ -63,10 +65,15 @@ irqfd_inject(struct work_struct *work)
> > > > >  
> > > > >  	mutex_lock(&kvm->irq_lock);
> > > > >  	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> > > > > -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> > > > > +	if (!irqfd->is_level)
> > > > > +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> > > > >  	mutex_unlock(&kvm->irq_lock);
> > > > >  }
> > > > >  
> > > > > +static void irqfd_irq_acked(struct kvm_irq_ack_notifier *kian)
> > > > > +{
> > > > > +	kvm_set_irq(kian->kvm, KVM_USERSPACE_IRQ_SOURCE_ID, kian->gsi, 0);
> > > > > +}
> > > > >  /*
> > > > >   * Race-free decouple logic (ordering is critical)
> > > > >   */
> > > > > @@ -87,6 +94,9 @@ irqfd_shutdown(struct work_struct *work)
> > > > >  	 */
> > > > >  	flush_work(&irqfd->inject);
> > > > >  
> > > > > +	if (irqfd->is_level)
> > > > > +		kvm_unregister_irq_ack_notifier(&irqfd->kian);
> > > > > +
> > > > >  	/*
> > > > >  	 * It is now safe to release the object's resources
> > > > >  	 */
> > > > > @@ -166,7 +176,7 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> > > > >  }
> > > > >  
> > > > >  static int
> > > > > -kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> > > > > +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi, int is_level)
> > > > >  {
> > > > >  	struct _irqfd *irqfd;
> > > > >  	struct file *file = NULL;
> > > > > @@ -180,6 +190,7 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> > > > >  
> > > > >  	irqfd->kvm = kvm;
> > > > >  	irqfd->gsi = gsi;
> > > > > +	irqfd->is_level = is_level;
> > > > >  	INIT_LIST_HEAD(&irqfd->list);
> > > > >  	INIT_WORK(&irqfd->inject, irqfd_inject);
> > > > >  	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
> > > > > @@ -198,6 +209,12 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> > > > >  
> > > > >  	irqfd->eventfd = eventfd;
> > > > >  
> > > > > +	if (is_level) {
> > > > > +		irqfd->kian.gsi = gsi;
> > > > > +		irqfd->kian.irq_acked = irqfd_irq_acked;
> > > > > +		kvm_register_irq_ack_notifier(&irqfd->kian);
> > > > > +	}
> > > > > +
> > > > >  	/*
> > > > >  	 * Install our own custom wake-up handling so we are notified via
> > > > >  	 * a callback whenever someone signals the underlying eventfd
> > > > > @@ -281,10 +298,13 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
> > > > >  int
> > > > >  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> > > > >  {
> > > > > +	if (flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_LEVEL))
> > > > > +		return -EINVAL;
> > > > > +
> > > > >  	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
> > > > >  		return kvm_irqfd_deassign(kvm, fd, gsi);
> > > > >  
> > > > > -	return kvm_irqfd_assign(kvm, fd, gsi);
> > > > > +	return kvm_irqfd_assign(kvm, fd, gsi, !!(flags & KVM_IRQFD_FLAG_LEVEL));
> > > > >  }
> > > > >  
> > > > >  /*
> > > > 
> > > > --
> > > > 			Gleb.
> > 
> > --
> > 			Gleb.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin July 27, 2009, 8:21 a.m. UTC | #6
On Mon, Jul 27, 2009 at 11:13:50AM +0300, Gleb Natapov wrote:
> On Mon, Jul 27, 2009 at 11:02:06AM +0300, Michael S. Tsirkin wrote:
> > On Mon, Jul 27, 2009 at 10:31:38AM +0300, Gleb Natapov wrote:
> > > On Mon, Jul 27, 2009 at 09:28:34AM +0300, Michael S. Tsirkin wrote:
> > > > On Mon, Jul 27, 2009 at 08:33:12AM +0300, Gleb Natapov wrote:
> > > > > On Sun, Jul 26, 2009 at 07:22:25PM +0300, Michael S. Tsirkin wrote:
> > > > > > Here's an untested patch with partial support for level triggered
> > > > > > interrupts in irqfd. What this patch has: support for clearing interrupt
> > > > > > on ack. What this patch does not have: support signalling eventfd on ack
> > > > > > so that userspace can take action and e.g. reenable interrupt.
> > > > > > 
> > > > > Do we want level triggered irqfd to be used only for device assignment?
> > > > 
> > > > generally level triggered-not only.
> > > > But irqfd is promarily for device assignment and emulated devices.
> > > > 
> > > If it is for emulated device zeroing irq level on ack is not acceptable.
> > 
> > OK, forget about emulated devices for now. Let's focus on assigned
> > devices.
> > 
> Can't. We should design interface that works for everything.

We can always add the KVM_MAKE_COFFEE ioctl later.

> > > > > Because if not it is not appropriate to zero irq level on ack.
> > > > 
> > > > The bit that is missing in this patch, is that we'll signal eventfd
> > > > after we zero irq. Userspace polls that and re-asserts irq.
> > > That is not needed for device emulation. Emulated device should lower
> > > irq line by itself. It know exactly when to do it. Please don't push
> > > broken device assignment model to emulated devices.
> > > 
> > > > 
> > > > > And you
> > > > > have no other way to zero irq level?!
> > > > 
> > > > The reason is interrupt sharing: device can assert interrupt
> > > > but can not clear it by itself.
> > > For interrupt sharing to work you need to allocate source id for each
> > > irqfd.
> > 
> > BTW, current SET_IRQ_LINE interface uses a common source id for all
> > irqs. Does this mean that interrupt sharing in guest is broken now?
> No. It means that sharing in handled in userspace by chipset emulation.

Aha. Where's that code in qemu-kvm? I only see it forwarding to the
ioctl call directly. And what about assigned devices? These bypass qemu.

> > 
> > > > qemu can still use the SET_IRQ ioctl if it wants full control.
> > > > 
> > > > 
> > > > > Another note is that level
> > > > > triggered irqfd shouldn't use KVM_USERSPACE_IRQ_SOURCE_ID since it may
> > > > > be shared.
> > > > 
> > > > Since we zero on ack, I think this is not a problem.
> > > > 
> > > We will not zero on ack for emulated devices. Please don't abuse ack
> > > notifiers. Even if you zero on ack this is still the problem BTW since
> > > other device may lower your line level.
> > > 
> > > > > > Gleb, Marcelo, I'd like your input on the approach taken wrt locking.
> > > > > > Does it look sane?
> > > > > > 
> > > > > > Avi, how's the interface? I intend to also add an eventfd probably in
> > > > > > the padding in the irqfd struct.
> > > > > > 
> > > > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > > > 
> > > > > > ---
> > > > > > 
> > > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > > > > index 230a91a..8bf16af 100644
> > > > > > --- a/include/linux/kvm.h
> > > > > > +++ b/include/linux/kvm.h
> > > > > > @@ -488,6 +488,7 @@ struct kvm_x86_mce {
> > > > > >  #endif
> > > > > >  
> > > > > >  #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
> > > > > > +#define KVM_IRQFD_FLAG_LEVEL (1 << 1)
> > > > > >  
> > > > > >  struct kvm_irqfd {
> > > > > >  	__u32 fd;
> > > > > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > > > > index 99017e8..fcbf5b5 100644
> > > > > > --- a/virt/kvm/eventfd.c
> > > > > > +++ b/virt/kvm/eventfd.c
> > > > > > @@ -45,12 +45,14 @@ struct _irqfd {
> > > > > >  	struct kvm               *kvm;
> > > > > >  	struct eventfd_ctx       *eventfd;
> > > > > >  	int                       gsi;
> > > > > > +	int                       is_level;
> > > > > >  	struct list_head          list;
> > > > > >  	poll_table                pt;
> > > > > >  	wait_queue_head_t        *wqh;
> > > > > >  	wait_queue_t              wait;
> > > > > >  	struct work_struct        inject;
> > > > > >  	struct work_struct        shutdown;
> > > > > > +	struct kvm_irq_ack_notifier kian;
> > > > > >  };
> > > > > >  
> > > > > >  static struct workqueue_struct *irqfd_cleanup_wq;
> > > > > > @@ -63,10 +65,15 @@ irqfd_inject(struct work_struct *work)
> > > > > >  
> > > > > >  	mutex_lock(&kvm->irq_lock);
> > > > > >  	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> > > > > > -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> > > > > > +	if (!irqfd->is_level)
> > > > > > +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> > > > > >  	mutex_unlock(&kvm->irq_lock);
> > > > > >  }
> > > > > >  
> > > > > > +static void irqfd_irq_acked(struct kvm_irq_ack_notifier *kian)
> > > > > > +{
> > > > > > +	kvm_set_irq(kian->kvm, KVM_USERSPACE_IRQ_SOURCE_ID, kian->gsi, 0);
> > > > > > +}
> > > > > >  /*
> > > > > >   * Race-free decouple logic (ordering is critical)
> > > > > >   */
> > > > > > @@ -87,6 +94,9 @@ irqfd_shutdown(struct work_struct *work)
> > > > > >  	 */
> > > > > >  	flush_work(&irqfd->inject);
> > > > > >  
> > > > > > +	if (irqfd->is_level)
> > > > > > +		kvm_unregister_irq_ack_notifier(&irqfd->kian);
> > > > > > +
> > > > > >  	/*
> > > > > >  	 * It is now safe to release the object's resources
> > > > > >  	 */
> > > > > > @@ -166,7 +176,7 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> > > > > >  }
> > > > > >  
> > > > > >  static int
> > > > > > -kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> > > > > > +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi, int is_level)
> > > > > >  {
> > > > > >  	struct _irqfd *irqfd;
> > > > > >  	struct file *file = NULL;
> > > > > > @@ -180,6 +190,7 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> > > > > >  
> > > > > >  	irqfd->kvm = kvm;
> > > > > >  	irqfd->gsi = gsi;
> > > > > > +	irqfd->is_level = is_level;
> > > > > >  	INIT_LIST_HEAD(&irqfd->list);
> > > > > >  	INIT_WORK(&irqfd->inject, irqfd_inject);
> > > > > >  	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
> > > > > > @@ -198,6 +209,12 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> > > > > >  
> > > > > >  	irqfd->eventfd = eventfd;
> > > > > >  
> > > > > > +	if (is_level) {
> > > > > > +		irqfd->kian.gsi = gsi;
> > > > > > +		irqfd->kian.irq_acked = irqfd_irq_acked;
> > > > > > +		kvm_register_irq_ack_notifier(&irqfd->kian);
> > > > > > +	}
> > > > > > +
> > > > > >  	/*
> > > > > >  	 * Install our own custom wake-up handling so we are notified via
> > > > > >  	 * a callback whenever someone signals the underlying eventfd
> > > > > > @@ -281,10 +298,13 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
> > > > > >  int
> > > > > >  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> > > > > >  {
> > > > > > +	if (flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_LEVEL))
> > > > > > +		return -EINVAL;
> > > > > > +
> > > > > >  	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
> > > > > >  		return kvm_irqfd_deassign(kvm, fd, gsi);
> > > > > >  
> > > > > > -	return kvm_irqfd_assign(kvm, fd, gsi);
> > > > > > +	return kvm_irqfd_assign(kvm, fd, gsi, !!(flags & KVM_IRQFD_FLAG_LEVEL));
> > > > > >  }
> > > > > >  
> > > > > >  /*
> > > > > 
> > > > > --
> > > > > 			Gleb.
> > > 
> > > --
> > > 			Gleb.
> 
> --
> 			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov July 27, 2009, 8:30 a.m. UTC | #7
On Mon, Jul 27, 2009 at 11:21:39AM +0300, Michael S. Tsirkin wrote:
> On Mon, Jul 27, 2009 at 11:13:50AM +0300, Gleb Natapov wrote:
> > On Mon, Jul 27, 2009 at 11:02:06AM +0300, Michael S. Tsirkin wrote:
> > > On Mon, Jul 27, 2009 at 10:31:38AM +0300, Gleb Natapov wrote:
> > > > On Mon, Jul 27, 2009 at 09:28:34AM +0300, Michael S. Tsirkin wrote:
> > > > > On Mon, Jul 27, 2009 at 08:33:12AM +0300, Gleb Natapov wrote:
> > > > > > On Sun, Jul 26, 2009 at 07:22:25PM +0300, Michael S. Tsirkin wrote:
> > > > > > > Here's an untested patch with partial support for level triggered
> > > > > > > interrupts in irqfd. What this patch has: support for clearing interrupt
> > > > > > > on ack. What this patch does not have: support signalling eventfd on ack
> > > > > > > so that userspace can take action and e.g. reenable interrupt.
> > > > > > > 
> > > > > > Do we want level triggered irqfd to be used only for device assignment?
> > > > > 
> > > > > generally level triggered-not only.
> > > > > But irqfd is promarily for device assignment and emulated devices.
> > > > > 
> > > > If it is for emulated device zeroing irq level on ack is not acceptable.
> > > 
> > > OK, forget about emulated devices for now. Let's focus on assigned
> > > devices.
> > > 
> > Can't. We should design interface that works for everything.
> 
> We can always add the KVM_MAKE_COFFEE ioctl later.
> 
Assigned devices are non interesting case, so we can hack something
afterwards to make it work. Emulated devices is a normal case that
should work when interface is introduced. Why assigning devices through
the kernel is not good enough? What is the reason to push it to
userspace?

> > > > > > Because if not it is not appropriate to zero irq level on ack.
> > > > > 
> > > > > The bit that is missing in this patch, is that we'll signal eventfd
> > > > > after we zero irq. Userspace polls that and re-asserts irq.
> > > > That is not needed for device emulation. Emulated device should lower
> > > > irq line by itself. It know exactly when to do it. Please don't push
> > > > broken device assignment model to emulated devices.
> > > > 
> > > > > 
> > > > > > And you
> > > > > > have no other way to zero irq level?!
> > > > > 
> > > > > The reason is interrupt sharing: device can assert interrupt
> > > > > but can not clear it by itself.
> > > > For interrupt sharing to work you need to allocate source id for each
> > > > irqfd.
> > > 
> > > BTW, current SET_IRQ_LINE interface uses a common source id for all
> > > irqs. Does this mean that interrupt sharing in guest is broken now?
> > No. It means that sharing in handled in userspace by chipset emulation.
> 
> Aha. Where's that code in qemu-kvm? I only see it forwarding to the
AFAIR in hw/pci.c and/or hw/piix_pci.c.

> ioctl call directly. And what about assigned devices? These bypass qemu.
> 
Each assigned device allocates its own source id. This is the reason
source id exists at all.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins July 29, 2009, 4:01 p.m. UTC | #8
Michael S. Tsirkin wrote:
> Here's an untested patch with partial support for level triggered
> interrupts in irqfd. What this patch has: support for clearing interrupt
> on ack. What this patch does not have: support signalling eventfd on ack
> so that userspace can take action and e.g. reenable interrupt.
> 
> Gleb, Marcelo, I'd like your input on the approach taken wrt locking.
> Does it look sane?
> 
> Avi, how's the interface? I intend to also add an eventfd probably in
> the padding in the irqfd struct.
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

Patch seems reasonable to me.

Acked-by: Gregory Haskins <ghaskins@novell.com>

> 
> ---
> 
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 230a91a..8bf16af 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -488,6 +488,7 @@ struct kvm_x86_mce {
>  #endif
>  
>  #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
> +#define KVM_IRQFD_FLAG_LEVEL (1 << 1)
>  
>  struct kvm_irqfd {
>  	__u32 fd;
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 99017e8..fcbf5b5 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -45,12 +45,14 @@ struct _irqfd {
>  	struct kvm               *kvm;
>  	struct eventfd_ctx       *eventfd;
>  	int                       gsi;
> +	int                       is_level;
>  	struct list_head          list;
>  	poll_table                pt;
>  	wait_queue_head_t        *wqh;
>  	wait_queue_t              wait;
>  	struct work_struct        inject;
>  	struct work_struct        shutdown;
> +	struct kvm_irq_ack_notifier kian;
>  };
>  
>  static struct workqueue_struct *irqfd_cleanup_wq;
> @@ -63,10 +65,15 @@ irqfd_inject(struct work_struct *work)
>  
>  	mutex_lock(&kvm->irq_lock);
>  	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> +	if (!irqfd->is_level)
> +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
>  	mutex_unlock(&kvm->irq_lock);
>  }
>  
> +static void irqfd_irq_acked(struct kvm_irq_ack_notifier *kian)
> +{
> +	kvm_set_irq(kian->kvm, KVM_USERSPACE_IRQ_SOURCE_ID, kian->gsi, 0);
> +}
>  /*
>   * Race-free decouple logic (ordering is critical)
>   */
> @@ -87,6 +94,9 @@ irqfd_shutdown(struct work_struct *work)
>  	 */
>  	flush_work(&irqfd->inject);
>  
> +	if (irqfd->is_level)
> +		kvm_unregister_irq_ack_notifier(&irqfd->kian);
> +
>  	/*
>  	 * It is now safe to release the object's resources
>  	 */
> @@ -166,7 +176,7 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
>  }
>  
>  static int
> -kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi, int is_level)
>  {
>  	struct _irqfd *irqfd;
>  	struct file *file = NULL;
> @@ -180,6 +190,7 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
>  
>  	irqfd->kvm = kvm;
>  	irqfd->gsi = gsi;
> +	irqfd->is_level = is_level;
>  	INIT_LIST_HEAD(&irqfd->list);
>  	INIT_WORK(&irqfd->inject, irqfd_inject);
>  	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
> @@ -198,6 +209,12 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
>  
>  	irqfd->eventfd = eventfd;
>  
> +	if (is_level) {
> +		irqfd->kian.gsi = gsi;
> +		irqfd->kian.irq_acked = irqfd_irq_acked;
> +		kvm_register_irq_ack_notifier(&irqfd->kian);
> +	}
> +
>  	/*
>  	 * Install our own custom wake-up handling so we are notified via
>  	 * a callback whenever someone signals the underlying eventfd
> @@ -281,10 +298,13 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
>  int
>  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
>  {
> +	if (flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_LEVEL))
> +		return -EINVAL;
> +
>  	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
>  		return kvm_irqfd_deassign(kvm, fd, gsi);
>  
> -	return kvm_irqfd_assign(kvm, fd, gsi);
> +	return kvm_irqfd_assign(kvm, fd, gsi, !!(flags & KVM_IRQFD_FLAG_LEVEL));
>  }
>  
>  /*
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins July 29, 2009, 4:03 p.m. UTC | #9
Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
>> Here's an untested patch with partial support for level triggered
>> interrupts in irqfd. What this patch has: support for clearing interrupt
>> on ack. What this patch does not have: support signalling eventfd on ack
>> so that userspace can take action and e.g. reenable interrupt.
>>
>> Gleb, Marcelo, I'd like your input on the approach taken wrt locking.
>> Does it look sane?
>>
>> Avi, how's the interface? I intend to also add an eventfd probably in
>> the padding in the irqfd struct.
>>
>> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> Patch seems reasonable to me.
> 
> Acked-by: Gregory Haskins <ghaskins@novell.com>

Actually, one thing I didn't think of earlier is whether a
CAP_IRQFD_LEVEL kind of feature is required.  I would think not, since
the baseline IRQFD hasnt been officially published yet.

But its something to consider.

-Greg



> 
>> ---
>>
>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
>> index 230a91a..8bf16af 100644
>> --- a/include/linux/kvm.h
>> +++ b/include/linux/kvm.h
>> @@ -488,6 +488,7 @@ struct kvm_x86_mce {
>>  #endif
>>  
>>  #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
>> +#define KVM_IRQFD_FLAG_LEVEL (1 << 1)
>>  
>>  struct kvm_irqfd {
>>  	__u32 fd;
>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
>> index 99017e8..fcbf5b5 100644
>> --- a/virt/kvm/eventfd.c
>> +++ b/virt/kvm/eventfd.c
>> @@ -45,12 +45,14 @@ struct _irqfd {
>>  	struct kvm               *kvm;
>>  	struct eventfd_ctx       *eventfd;
>>  	int                       gsi;
>> +	int                       is_level;
>>  	struct list_head          list;
>>  	poll_table                pt;
>>  	wait_queue_head_t        *wqh;
>>  	wait_queue_t              wait;
>>  	struct work_struct        inject;
>>  	struct work_struct        shutdown;
>> +	struct kvm_irq_ack_notifier kian;
>>  };
>>  
>>  static struct workqueue_struct *irqfd_cleanup_wq;
>> @@ -63,10 +65,15 @@ irqfd_inject(struct work_struct *work)
>>  
>>  	mutex_lock(&kvm->irq_lock);
>>  	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
>> -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
>> +	if (!irqfd->is_level)
>> +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
>>  	mutex_unlock(&kvm->irq_lock);
>>  }
>>  
>> +static void irqfd_irq_acked(struct kvm_irq_ack_notifier *kian)
>> +{
>> +	kvm_set_irq(kian->kvm, KVM_USERSPACE_IRQ_SOURCE_ID, kian->gsi, 0);
>> +}
>>  /*
>>   * Race-free decouple logic (ordering is critical)
>>   */
>> @@ -87,6 +94,9 @@ irqfd_shutdown(struct work_struct *work)
>>  	 */
>>  	flush_work(&irqfd->inject);
>>  
>> +	if (irqfd->is_level)
>> +		kvm_unregister_irq_ack_notifier(&irqfd->kian);
>> +
>>  	/*
>>  	 * It is now safe to release the object's resources
>>  	 */
>> @@ -166,7 +176,7 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
>>  }
>>  
>>  static int
>> -kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
>> +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi, int is_level)
>>  {
>>  	struct _irqfd *irqfd;
>>  	struct file *file = NULL;
>> @@ -180,6 +190,7 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
>>  
>>  	irqfd->kvm = kvm;
>>  	irqfd->gsi = gsi;
>> +	irqfd->is_level = is_level;
>>  	INIT_LIST_HEAD(&irqfd->list);
>>  	INIT_WORK(&irqfd->inject, irqfd_inject);
>>  	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
>> @@ -198,6 +209,12 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
>>  
>>  	irqfd->eventfd = eventfd;
>>  
>> +	if (is_level) {
>> +		irqfd->kian.gsi = gsi;
>> +		irqfd->kian.irq_acked = irqfd_irq_acked;
>> +		kvm_register_irq_ack_notifier(&irqfd->kian);
>> +	}
>> +
>>  	/*
>>  	 * Install our own custom wake-up handling so we are notified via
>>  	 * a callback whenever someone signals the underlying eventfd
>> @@ -281,10 +298,13 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
>>  int
>>  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
>>  {
>> +	if (flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_LEVEL))
>> +		return -EINVAL;
>> +
>>  	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
>>  		return kvm_irqfd_deassign(kvm, fd, gsi);
>>  
>> -	return kvm_irqfd_assign(kvm, fd, gsi);
>> +	return kvm_irqfd_assign(kvm, fd, gsi, !!(flags & KVM_IRQFD_FLAG_LEVEL));
>>  }
>>  
>>  /*
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
>
Avi Kivity Aug. 19, 2009, 12:17 p.m. UTC | #10
On 07/27/2009 08:33 AM, Gleb Natapov wrote:
> On Sun, Jul 26, 2009 at 07:22:25PM +0300, Michael S. Tsirkin wrote:
>    
>> Here's an untested patch with partial support for level triggered
>> interrupts in irqfd. What this patch has: support for clearing interrupt
>> on ack. What this patch does not have: support signalling eventfd on ack
>> so that userspace can take action and e.g. reenable interrupt.
>>
>>      
> Do we want level triggered irqfd to be used only for device assignment?
>    

If it works, it should work everywhere.
Avi Kivity Aug. 19, 2009, 12:20 p.m. UTC | #11
On 07/26/2009 07:22 PM, Michael S. Tsirkin wrote:
> Here's an untested patch with partial support for level triggered
> interrupts in irqfd. What this patch has: support for clearing interrupt
> on ack. What this patch does not have: support signalling eventfd on ack
> so that userspace can take action and e.g. reenable interrupt.
>
> Gleb, Marcelo, I'd like your input on the approach taken wrt locking.
> Does it look sane?
>
> Avi, how's the interface? I intend to also add an eventfd probably in
> the padding in the irqfd struct.
>    

What about the state-capable eventfd?  Seems much more natural for this.

>
>   static struct workqueue_struct *irqfd_cleanup_wq;
> @@ -63,10 +65,15 @@ irqfd_inject(struct work_struct *work)
>
>   	mutex_lock(&kvm->irq_lock);
>   	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> +	if (!irqfd->is_level)
> +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
>    

e.g. inject the eventfd->state as the irq level.
Michael S. Tsirkin Aug. 19, 2009, 1:09 p.m. UTC | #12
On Wed, Aug 19, 2009 at 03:20:21PM +0300, Avi Kivity wrote:
> On 07/26/2009 07:22 PM, Michael S. Tsirkin wrote:
>> Here's an untested patch with partial support for level triggered
>> interrupts in irqfd. What this patch has: support for clearing interrupt
>> on ack. What this patch does not have: support signalling eventfd on ack
>> so that userspace can take action and e.g. reenable interrupt.
>>
>> Gleb, Marcelo, I'd like your input on the approach taken wrt locking.
>> Does it look sane?
>>
>> Avi, how's the interface? I intend to also add an eventfd probably in
>> the padding in the irqfd struct.
>>    
>
> What about the state-capable eventfd?

This patch precedes that effort.

>  Seems much more natural for this.

Good. Could you please review and ack the state-capable eventfd patch
then, so I can try pushing it upstream?


>>
>>   static struct workqueue_struct *irqfd_cleanup_wq;
>> @@ -63,10 +65,15 @@ irqfd_inject(struct work_struct *work)
>>
>>   	mutex_lock(&kvm->irq_lock);
>>   	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
>> -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
>> +	if (!irqfd->is_level)
>> +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
>>    
>
> e.g. inject the eventfd->state as the irq level.
>
> -- 
> error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 230a91a..8bf16af 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -488,6 +488,7 @@  struct kvm_x86_mce {
 #endif
 
 #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
+#define KVM_IRQFD_FLAG_LEVEL (1 << 1)
 
 struct kvm_irqfd {
 	__u32 fd;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 99017e8..fcbf5b5 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -45,12 +45,14 @@  struct _irqfd {
 	struct kvm               *kvm;
 	struct eventfd_ctx       *eventfd;
 	int                       gsi;
+	int                       is_level;
 	struct list_head          list;
 	poll_table                pt;
 	wait_queue_head_t        *wqh;
 	wait_queue_t              wait;
 	struct work_struct        inject;
 	struct work_struct        shutdown;
+	struct kvm_irq_ack_notifier kian;
 };
 
 static struct workqueue_struct *irqfd_cleanup_wq;
@@ -63,10 +65,15 @@  irqfd_inject(struct work_struct *work)
 
 	mutex_lock(&kvm->irq_lock);
 	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
-	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+	if (!irqfd->is_level)
+		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
 	mutex_unlock(&kvm->irq_lock);
 }
 
+static void irqfd_irq_acked(struct kvm_irq_ack_notifier *kian)
+{
+	kvm_set_irq(kian->kvm, KVM_USERSPACE_IRQ_SOURCE_ID, kian->gsi, 0);
+}
 /*
  * Race-free decouple logic (ordering is critical)
  */
@@ -87,6 +94,9 @@  irqfd_shutdown(struct work_struct *work)
 	 */
 	flush_work(&irqfd->inject);
 
+	if (irqfd->is_level)
+		kvm_unregister_irq_ack_notifier(&irqfd->kian);
+
 	/*
 	 * It is now safe to release the object's resources
 	 */
@@ -166,7 +176,7 @@  irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
 }
 
 static int
-kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
+kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi, int is_level)
 {
 	struct _irqfd *irqfd;
 	struct file *file = NULL;
@@ -180,6 +190,7 @@  kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
 
 	irqfd->kvm = kvm;
 	irqfd->gsi = gsi;
+	irqfd->is_level = is_level;
 	INIT_LIST_HEAD(&irqfd->list);
 	INIT_WORK(&irqfd->inject, irqfd_inject);
 	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
@@ -198,6 +209,12 @@  kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
 
 	irqfd->eventfd = eventfd;
 
+	if (is_level) {
+		irqfd->kian.gsi = gsi;
+		irqfd->kian.irq_acked = irqfd_irq_acked;
+		kvm_register_irq_ack_notifier(&irqfd->kian);
+	}
+
 	/*
 	 * Install our own custom wake-up handling so we are notified via
 	 * a callback whenever someone signals the underlying eventfd
@@ -281,10 +298,13 @@  kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
 int
 kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 {
+	if (flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_LEVEL))
+		return -EINVAL;
+
 	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
 		return kvm_irqfd_deassign(kvm, fd, gsi);
 
-	return kvm_irqfd_assign(kvm, fd, gsi);
+	return kvm_irqfd_assign(kvm, fd, gsi, !!(flags & KVM_IRQFD_FLAG_LEVEL));
 }
 
 /*