diff mbox

[v7,2/2] kvm: KVM_EOIFD, an eventfd for EOIs

Message ID 20120724204320.21081.32333.stgit@bling.home (mailing list archive)
State New, archived
Headers show

Commit Message

Alex Williamson July 24, 2012, 8:43 p.m. UTC
This new ioctl enables an eventfd to be triggered when an EOI is
written for a specified irqchip pin.  The first user of this will
be external device assignment through VFIO, using a level irqfd
for asserting a PCI INTx interrupt and this interface for de-assert
and notification once the interrupt is serviced.

Here we make use of the reference counting of the _irq_source
object allowing us to share it with an irqfd and cleanup regardless
of the release order.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 Documentation/virtual/kvm/api.txt |   21 ++
 arch/x86/kvm/x86.c                |    2 
 include/linux/kvm.h               |   15 ++
 include/linux/kvm_host.h          |   13 +
 virt/kvm/eventfd.c                |  336 +++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c               |   11 +
 6 files changed, 398 insertions(+)


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Michael S. Tsirkin July 29, 2012, 2:54 p.m. UTC | #1
On Tue, Jul 24, 2012 at 02:43:22PM -0600, Alex Williamson wrote:
> This new ioctl enables an eventfd to be triggered when an EOI is
> written for a specified irqchip pin.  The first user of this will
> be external device assignment through VFIO, using a level irqfd
> for asserting a PCI INTx interrupt and this interface for de-assert
> and notification once the interrupt is serviced.
> 
> Here we make use of the reference counting of the _irq_source
> object allowing us to share it with an irqfd and cleanup regardless
> of the release order.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

> ---
> 
>  Documentation/virtual/kvm/api.txt |   21 ++
>  arch/x86/kvm/x86.c                |    2 
>  include/linux/kvm.h               |   15 ++
>  include/linux/kvm_host.h          |   13 +
>  virt/kvm/eventfd.c                |  336 +++++++++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c               |   11 +
>  6 files changed, 398 insertions(+)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 3911e62..8cd6b36 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -1989,6 +1989,27 @@ return the hash table order in the parameter.  (If the guest is using
>  the virtualized real-mode area (VRMA) facility, the kernel will
>  re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
>  
> +4.77 KVM_EOIFD
> +
> +Capability: KVM_CAP_EOIFD
> +Architectures: x86
> +Type: vm ioctl
> +Parameters: struct kvm_eoifd (in)
> +Returns: 0 on success, < 0 on error
> +
> +KVM_EOIFD allows userspace to receive interrupt EOI notification
> +through an eventfd.

I thought about it some more, and I think it should be renamed to an
interrupt ack notification rather than an EOI notification.
For example, consider userspace that uses threaded interrupts.
Currently what will happen is that each interrupt will be injected
twice, since on EOI the device is still asserting it.
One fix would be to delay the event until the interrupt is re-enabled.
Now I am not asking you to fix this immediately,
but I think we should make the interface generic by
saying we report an ack to userspace and not specifically an EOI.


>  kvm_eoifd.fd specifies the eventfd used for
> +notification.  KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd
> +once assigned.  KVM_EOIFD also requires additional bits set in
> +kvm_eoifd.flags to bind to the proper interrupt line.  The
> +KVM_EOIFD_FLAG_LEVEL_IRQFD indicates that kvm_eoifd.key is provided
> +and is a key from a level triggered interrupt (configured from
> +KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL).  The EOI notification is bound
> +to the same GSI and irqchip input as the irqfd.  Both kvm_eoifd.key
> +and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on assignment and
> +de-assignment of KVM_EOIFD.  A level irqfd may only be bound to a
> +single eoifd.  KVM_CAP_EOIFD_LEVEL_IRQFD indicates support of
> +KVM_EOIFD_FLAG_LEVEL_IRQFD.
>  

Hmm, returning the key means we'll need to keep refcounting for source
IDs around forever. I liked passing the fd better: make the implementation
match the interface and not the other way around.

>  5. The kvm_run structure
>  ------------------------
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 9ded39d..8f3164e 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2171,6 +2171,8 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_PCI_2_3:
>  	case KVM_CAP_KVMCLOCK_CTRL:
>  	case KVM_CAP_IRQFD_LEVEL:
> +	case KVM_CAP_EOIFD:
> +	case KVM_CAP_EOIFD_LEVEL_IRQFD:
>  		r = 1;
>  		break;
>  	case KVM_CAP_COALESCED_MMIO:
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index b2e6e4f..effb916 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -619,6 +619,8 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_S390_COW 79
>  #define KVM_CAP_PPC_ALLOC_HTAB 80
>  #define KVM_CAP_IRQFD_LEVEL 81
> +#define KVM_CAP_EOIFD 82
> +#define KVM_CAP_EOIFD_LEVEL_IRQFD 83
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -694,6 +696,17 @@ struct kvm_irqfd {
>  	__u8  pad[20];
>  };
>  
> +#define KVM_EOIFD_FLAG_DEASSIGN (1 << 0)
> +/* Available with KVM_CAP_EOIFD_LEVEL_IRQFD */
> +#define KVM_EOIFD_FLAG_LEVEL_IRQFD (1 << 1)
> +
> +struct kvm_eoifd {
> +	__u32 fd;
> +	__u32 flags;
> +	__u32 key;
> +	__u8 pad[20];
> +};
> +
>  struct kvm_clock_data {
>  	__u64 clock;
>  	__u32 flags;
> @@ -834,6 +847,8 @@ struct kvm_s390_ucas_mapping {
>  #define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
>  /* Available with KVM_CAP_PPC_ALLOC_HTAB */
>  #define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
> +/* Available with KVM_CAP_EOIFD */
> +#define KVM_EOIFD                 _IOW(KVMIO,  0xa8, struct kvm_eoifd)
>  
>  /*
>   * ioctls for vcpu fds
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index c73f071..01e72a6 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -289,6 +289,10 @@ struct kvm {
>  		struct mutex lock;
>  		struct list_head items;
>  	} irqsources;
> +	struct {
> +		spinlock_t lock;
> +		struct list_head items;
> +	} eoifds;
>  #endif
>  	struct kvm_vm_stat stat;
>  	struct kvm_arch arch;
> @@ -832,6 +836,8 @@ int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
>  void kvm_irqfd_release(struct kvm *kvm);
>  void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
>  int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
> +int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args);
> +void kvm_eoifd_release(struct kvm *kvm);
>  
>  #else
>  
> @@ -857,6 +863,13 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>  	return -ENOSYS;
>  }
>  
> +static inline int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
> +{
> +	return -ENOSYS;
> +}
> +
> +static inline void kvm_eoifd_release(struct kvm *kvm) {}
> +
>  #endif /* CONFIG_HAVE_KVM_EVENTFD */
>  
>  #ifdef CONFIG_KVM_APIC_ARCHITECTURE
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 878cb52..3aa2d62 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -95,6 +95,25 @@ static struct _irq_source *_irq_source_alloc(struct kvm *kvm, int gsi)
>  	return source;
>  }
>  
> +static struct _irq_source *_irq_source_get_from_key(struct kvm *kvm, int key)
> +{
> +	struct _irq_source *tmp, *source = ERR_PTR(-ENOENT);
> +
> +	mutex_lock(&kvm->irqsources.lock);
> +
> +	list_for_each_entry(tmp, &kvm->irqsources.items, list) {
> +		if (tmp->id == key) {
> +			source = tmp;
> +			kref_get(&source->kref);
> +			break;
> +		}
> +	}
> +
> +	mutex_unlock(&kvm->irqsources.lock);
> +
> +	return source;
> +}
> +
>  /*
>   * --------------------------------------------------------------------
>   * irqfd: Allows an fd to be used to inject an interrupt to the guest
> @@ -406,6 +425,8 @@ kvm_eventfd_init(struct kvm *kvm)
>  	INIT_LIST_HEAD(&kvm->ioeventfds);
>  	mutex_init(&kvm->irqsources.lock);
>  	INIT_LIST_HEAD(&kvm->irqsources.items);
> +	spin_lock_init(&kvm->eoifds.lock);
> +	INIT_LIST_HEAD(&kvm->eoifds.items);
>  }
>  
>  /*
> @@ -772,3 +793,318 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>  
>  	return kvm_assign_ioeventfd(kvm, args);
>  }
> +
> +/*
> + * --------------------------------------------------------------------
> + *  eoifd: Translate KVM APIC/IOAPIC EOI into eventfd signal.
> + *
> + *  userspace can register with an eventfd for receiving
> + *  notification when an EOI occurs.
> + * --------------------------------------------------------------------
> + */
> +
> +struct _eoifd {
> +	/* eventfd triggered on EOI */
> +	struct eventfd_ctx *eventfd;
> +	/* irq source ID de-asserted on EOI */
> +	struct _irq_source *source;
> +	wait_queue_t wait;
> +	/* EOI notification from KVM */
> +	struct kvm_irq_ack_notifier notifier;
> +	struct list_head list;
> +	poll_table pt;
> +	struct work_struct shutdown;
> +};
> +
> +/* Called under eoifds.lock */
> +static void eoifd_shutdown(struct work_struct *work)
> +{
> +	struct _eoifd *eoifd = container_of(work, struct _eoifd, shutdown);
> +	struct kvm *kvm = eoifd->source->kvm;
> +	u64 cnt;
> +
> +	/*
> +	 * Stop EOI signaling
> +	 */
> +	kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier);
> +
> +	/*
> +	 * Synchronize with the wait-queue and unhook ourselves to prevent
> +	 * further events.
> +	 */
> +	eventfd_ctx_remove_wait_queue(eoifd->eventfd, &eoifd->wait, &cnt);
> +
> +	/*
> +	 * Release resources
> +	 */
> +	eventfd_ctx_put(eoifd->eventfd);
> +	_irq_source_put(eoifd->source);
> +	kfree(eoifd);
> +}
> +
> +/* assumes kvm->eoifds.lock is held */
> +static bool eoifd_is_active(struct _eoifd *eoifd)
> +{
> +	return list_empty(&eoifd->list) ? false : true;
> +}
> +
> +/*
> + * Mark the eoifd as inactive and schedule it for removal
> + *
> + * assumes kvm->eoifds.lock is held
> + */
> +static void eoifd_deactivate(struct _eoifd *eoifd)
> +{
> +	BUG_ON(!eoifd_is_active(eoifd));
> +
> +	list_del_init(&eoifd->list);
> +
> +	queue_work(irqfd_cleanup_wq, &eoifd->shutdown);
> +}
> +
> +/*
> + * Called with wqh->lock held and interrupts disabled
> + */
> +static int eoifd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> +{
> +	unsigned long flags = (unsigned long)key;
> +
> +	if (unlikely(flags & POLLHUP)) {
> +		/* The eventfd is closing, detach from KVM */
> +		struct _eoifd *eoifd = container_of(wait, struct _eoifd, wait);
> +		struct kvm *kvm = eoifd->source->kvm;
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&kvm->eoifds.lock, flags);
> +
> +		/*
> +		 * We must check if someone deactivated the eoifd before
> +		 * we could acquire the eoifds.lock since the item is
> +		 * deactivated from the KVM side before it is unhooked from
> +		 * the wait-queue.  If it is already deactivated, we can
> +		 * simply return knowing the other side will cleanup for us.
> +		 * We cannot race against the eoifd going away since the
> +		 * other side is required to acquire wqh->lock, which we hold
> +		 */
> +		if (eoifd_is_active(eoifd))
> +			eoifd_deactivate(eoifd);
> +
> +		spin_unlock_irqrestore(&kvm->eoifds.lock, flags);
> +	}
> +
> +	return 0;
> +}

Looks like there is a bug here: if I close the irqfd and then close the
eoifd, the key is not immediately released, so a subsequent attempt to
create an irqfd can fail to get the source ID.

Maybe we should simply document that userspace should deassign
eoifd before closing it? This is what we do for ioeventfd.
If we do this, the whole polling code can go away completely.



> +
> +static void eoifd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> +				    poll_table *pt)
> +{
> +	struct _eoifd *eoifd = container_of(pt, struct _eoifd, pt);
> +	add_wait_queue(wqh, &eoifd->wait);
> +}
> +
> +/*
> + * This function is called as the kvm VM fd is being released. Shutdown all
> + * eoifds that still remain open
> + */
> +void kvm_eoifd_release(struct kvm *kvm)
> +{
> +	struct _eoifd *tmp, *eoifd;
> +
> +	spin_lock_irq(&kvm->eoifds.lock);
> +
> +	list_for_each_entry_safe(eoifd, tmp, &kvm->eoifds.items, list)
> +		eoifd_deactivate(eoifd);
> +
> +	spin_unlock_irq(&kvm->eoifds.lock);
> +
> +	flush_workqueue(irqfd_cleanup_wq);
> +}
> +
> +static void eoifd_event(struct kvm_irq_ack_notifier *notifier)
> +{
> +	struct _eoifd *eoifd;
> +
> +	eoifd = container_of(notifier, struct _eoifd, notifier);
> +
> +	if (unlikely(!eoifd->source))
> +		return;
> +
> +	/*
> +	 * De-assert and send EOI, user needs to re-assert if
> +	 * device still requires service.
> +	 */

I'm not sure why you dropped filtering by source ID.
This means userspace gets events even if it did not send an interrupt.
So:
1. It should be documented that you can get spurious events.
2. When an interrupt is shared with an emulated device,
   and said device uses EOI, this will not
   perform well as we will wake up userspace on each EOI.
3. Just sharing an interrupt with virtio means we are polling
   the assigned device on each virtio interrupt.


> +	kvm_set_irq(eoifd->source->kvm,
> +		    eoifd->source->id, eoifd->source->gsi, 0);
> +	eventfd_signal(eoifd->eventfd, 1);
> +}
> +
> +static int kvm_assign_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
> +{
> +	struct file *file = NULL;
> +	struct eventfd_ctx *eventfd = NULL;
> +	struct _eoifd *eoifd = NULL, *tmp;
> +	struct _irq_source *source = NULL;
> +	int ret;
> +	u64 cnt;
> +
> +	if (!(args->flags & KVM_EOIFD_FLAG_LEVEL_IRQFD))
> +		return -EINVAL;
> +
> +	file = eventfd_fget(args->fd);
> +	if (IS_ERR(file)) {
> +		ret = PTR_ERR(file);
> +		goto fail;
> +	}
> +
> +	eventfd = eventfd_ctx_fileget(file);
> +	if (IS_ERR(eventfd)) {
> +		ret = PTR_ERR(eventfd);
> +		goto fail;
> +	}
> +
> +	eoifd = kzalloc(sizeof(*eoifd), GFP_KERNEL);
> +	if (!eoifd) {
> +		ret = -ENOMEM;
> +		goto fail;
> +	}
> +
> +	source = _irq_source_get_from_key(kvm, args->key);
> +	if (IS_ERR(source)) {
> +		ret = PTR_ERR(source);
> +		goto fail;
> +	}
> +
> +	INIT_LIST_HEAD(&eoifd->list);
> +	INIT_WORK(&eoifd->shutdown, eoifd_shutdown);
> +	eoifd->eventfd = eventfd;
> +	eoifd->notifier.gsi = source->gsi;
> +	eoifd->notifier.irq_acked = eoifd_event;
> +
> +	/*
> +	 * Install our own custom wake-up handling so we are notified via
> +	 * a callback whenever someone releases the underlying eventfd
> +	 */
> +	init_waitqueue_func_entry(&eoifd->wait, eoifd_wakeup);
> +	init_poll_funcptr(&eoifd->pt, eoifd_ptable_queue_proc);
> +
> +	/*
> +	 * Clear out any previously released eoifds that might conflict
> +	 */
> +	flush_workqueue(irqfd_cleanup_wq);
> +
> +	/*
> +	 * This can sleep, so register before acquiring spinlock, notifier
> +	 * becomes a nop until we finish.
> +	 */
> +	kvm_register_irq_ack_notifier(kvm, &eoifd->notifier);
> +
> +	/*
> +	 * Install the wait queue function to allow cleanup when the
> +	 * eventfd is closed by the user.  This grabs the wqh lock, so
> +	 * we do it out of spinlock, holding the file reference ensures
> +	 * we won't see a POLLHUP until setup is complete.
> +	 */
> +	file->f_op->poll(file, &eoifd->pt);
> +
> +	spin_lock_irq(&kvm->eoifds.lock);
> +
> +	/*
> +	 * Enforce a one-to-one relationship between irq source and eoifd so
> +	 * that this interface can't be used to consume all kernel memory.
> +	 * NB. single eventfd can still be used by multiple eoifds.
> +	 */
> +	list_for_each_entry(tmp, &kvm->eoifds.items, list) {
> +		if (tmp->source == source) {
> +			spin_unlock_irq(&kvm->eoifds.lock);
> +			ret = -EBUSY;
> +			goto fail_unregister;
> +		}
> +	}
> +
> +	list_add_tail(&eoifd->list, &kvm->eoifds.items);
> +	eoifd->source = source; /* Enable ack notifier */
> +
> +	spin_unlock_irq(&kvm->eoifds.lock);
> +
> +	fput(file); /* Enable POLLHUP */
> +
> +	return 0;
> +
> +fail_unregister:
> +	eventfd_ctx_remove_wait_queue(eventfd, &eoifd->wait, &cnt);
> +	kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier);
> +fail:
> +	if (source && !IS_ERR(source))
> +		_irq_source_put(source);
> +
> +	if (eventfd && !IS_ERR(eventfd))
> +		eventfd_ctx_put(eventfd);
> +
> +	if (file && !IS_ERR(file))
> +		fput(file);
> +
> +	kfree(eoifd);
> +	return ret;
> +}
> +
> +static int kvm_deassign_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
> +{
> +	struct eventfd_ctx *eventfd = NULL;
> +	struct _irq_source *source = NULL;
> +	struct _eoifd *eoifd;
> +	int ret = -ENOENT;
> +
> +	if (!(args->flags & KVM_EOIFD_FLAG_LEVEL_IRQFD))
> +		return -EINVAL;
> +
> +	eventfd = eventfd_ctx_fdget(args->fd);
> +	if (IS_ERR(eventfd)) {
> +		ret = PTR_ERR(eventfd);
> +		goto fail;
> +	}
> +
> +	source = _irq_source_get_from_key(kvm, args->key);
> +	if (IS_ERR(source)) {
> +		ret = PTR_ERR(source);
> +		goto fail;
> +	}
> +
> +	spin_lock_irq(&kvm->eoifds.lock);
> +
> +	list_for_each_entry(eoifd, &kvm->eoifds.items, list) {
> +		if (eoifd->eventfd == eventfd && eoifd->source == source) {
> +			eoifd_deactivate(eoifd);
> +			ret = 0;
> +			break;
> +		}
> +	}
> +
> +	spin_unlock_irq(&kvm->eoifds.lock);
> +
> +fail:
> +	if (source && !IS_ERR(source))
> +		_irq_source_put(source);
> +	if (eventfd && !IS_ERR(eventfd))
> +		eventfd_ctx_put(eventfd);
> +
> +	/*
> +	 * Block until we know all outstanding shutdown jobs have completed
> +	 * so that we guarantee there will not be any more EOIs signaled on
> +	 * this eventfd once this deassign function returns.
> +	 */
> +	flush_workqueue(irqfd_cleanup_wq);
> +
> +	return ret;
> +}
> +
> +int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
> +{
> +	if (args->flags & ~(KVM_EOIFD_FLAG_DEASSIGN |
> +			    KVM_EOIFD_FLAG_LEVEL_IRQFD))
> +		return -EINVAL;
> +
> +	if (args->flags & KVM_EOIFD_FLAG_DEASSIGN)
> +		return kvm_deassign_eoifd(kvm, args);
> +
> +	return kvm_assign_eoifd(kvm, args);
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 2468523..0b241bf 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -620,6 +620,8 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
>  
>  	kvm_irqfd_release(kvm);
>  
> +	kvm_eoifd_release(kvm);
> +
>  	kvm_put_kvm(kvm);
>  	return 0;
>  }
> @@ -2093,6 +2095,15 @@ static long kvm_vm_ioctl(struct file *filp,
>  		break;
>  	}
>  #endif
> +	case KVM_EOIFD: {
> +		struct kvm_eoifd data;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&data, argp, sizeof data))
> +			goto out;
> +		r = kvm_eoifd(kvm, &data);
> +		break;
> +	}
>  	default:
>  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>  		if (r == -ENOTTY)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson July 30, 2012, 4:22 p.m. UTC | #2
On Sun, 2012-07-29 at 17:54 +0300, Michael S. Tsirkin wrote:
> On Tue, Jul 24, 2012 at 02:43:22PM -0600, Alex Williamson wrote:
> > This new ioctl enables an eventfd to be triggered when an EOI is
> > written for a specified irqchip pin.  The first user of this will
> > be external device assignment through VFIO, using a level irqfd
> > for asserting a PCI INTx interrupt and this interface for de-assert
> > and notification once the interrupt is serviced.
> > 
> > Here we make use of the reference counting of the _irq_source
> > object allowing us to share it with an irqfd and cleanup regardless
> > of the release order.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> 
> > ---
> > 
> >  Documentation/virtual/kvm/api.txt |   21 ++
> >  arch/x86/kvm/x86.c                |    2 
> >  include/linux/kvm.h               |   15 ++
> >  include/linux/kvm_host.h          |   13 +
> >  virt/kvm/eventfd.c                |  336 +++++++++++++++++++++++++++++++++++++
> >  virt/kvm/kvm_main.c               |   11 +
> >  6 files changed, 398 insertions(+)
> > 
> > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > index 3911e62..8cd6b36 100644
> > --- a/Documentation/virtual/kvm/api.txt
> > +++ b/Documentation/virtual/kvm/api.txt
> > @@ -1989,6 +1989,27 @@ return the hash table order in the parameter.  (If the guest is using
> >  the virtualized real-mode area (VRMA) facility, the kernel will
> >  re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
> >  
> > +4.77 KVM_EOIFD
> > +
> > +Capability: KVM_CAP_EOIFD
> > +Architectures: x86
> > +Type: vm ioctl
> > +Parameters: struct kvm_eoifd (in)
> > +Returns: 0 on success, < 0 on error
> > +
> > +KVM_EOIFD allows userspace to receive interrupt EOI notification
> > +through an eventfd.
> 
> I thought about it some more, and I think it should be renamed to an
> interrupt ack notification than eoi notification.
> For example, consider userspace that uses threaded interrupts.
> Currently what will happen is each interrupt will be injected
> twice, since on eoi device is still asserting it.

I don't follow, why is userspace writing an eoi to the ioapic if it
hasn't handled the interrupt and why wouldn't the same happen on bare
metal?

> One fix would be to delay event until interrupt is re-enabled.
> Now I am not asking you to fix this immediately,
> but I think we should make the interface generic by
> saying we report an ack to userspace and not specifically EOI.

Using the word "delay" in the context of interrupt delivery raises all
sorts of red flags for me, but I really don't understand your argument.

> >  kvm_eoifd.fd specifies the eventfd used for
> > +notification.  KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd
> > +once assigned.  KVM_EOIFD also requires additional bits set in
> > +kvm_eoifd.flags to bind to the proper interrupt line.  The
> > +KVM_EOIFD_FLAG_LEVEL_IRQFD indicates that kvm_eoifd.key is provided
> > +and is a key from a level triggered interrupt (configured from
> > +KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL).  The EOI notification is bound
> > +to the same GSI and irqchip input as the irqfd.  Both kvm_eoifd.key
> > +and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on assignment and
> > +de-assignment of KVM_EOIFD.  A level irqfd may only be bound to a
> > +single eoifd.  KVM_CAP_EOIFD_LEVEL_IRQFD indicates support of
> > +KVM_EOIFD_FLAG_LEVEL_IRQFD.
> >  
> 
> Hmm returning the key means we'll need to keep refcounting for source
> IDs around forever. I liked passing the fd better: make implementation
> match interface and not the other way around.

False, a source ID has a finite lifecycle.  The fd approach was broken.
Holding the irqfd context imposed too many dependencies between eoifd
and irqfd necessitating things like one interface disabling another.  I
thoroughly disagree with that approach.

> >  5. The kvm_run structure
> >  ------------------------
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 9ded39d..8f3164e 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -2171,6 +2171,8 @@ int kvm_dev_ioctl_check_extension(long ext)
> >  	case KVM_CAP_PCI_2_3:
> >  	case KVM_CAP_KVMCLOCK_CTRL:
> >  	case KVM_CAP_IRQFD_LEVEL:
> > +	case KVM_CAP_EOIFD:
> > +	case KVM_CAP_EOIFD_LEVEL_IRQFD:
> >  		r = 1;
> >  		break;
> >  	case KVM_CAP_COALESCED_MMIO:
> > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > index b2e6e4f..effb916 100644
> > --- a/include/linux/kvm.h
> > +++ b/include/linux/kvm.h
> > @@ -619,6 +619,8 @@ struct kvm_ppc_smmu_info {
> >  #define KVM_CAP_S390_COW 79
> >  #define KVM_CAP_PPC_ALLOC_HTAB 80
> >  #define KVM_CAP_IRQFD_LEVEL 81
> > +#define KVM_CAP_EOIFD 82
> > +#define KVM_CAP_EOIFD_LEVEL_IRQFD 83
> >  
> >  #ifdef KVM_CAP_IRQ_ROUTING
> >  
> > @@ -694,6 +696,17 @@ struct kvm_irqfd {
> >  	__u8  pad[20];
> >  };
> >  
> > +#define KVM_EOIFD_FLAG_DEASSIGN (1 << 0)
> > +/* Available with KVM_CAP_EOIFD_LEVEL_IRQFD */
> > +#define KVM_EOIFD_FLAG_LEVEL_IRQFD (1 << 1)
> > +
> > +struct kvm_eoifd {
> > +	__u32 fd;
> > +	__u32 flags;
> > +	__u32 key;
> > +	__u8 pad[20];
> > +};
> > +
> >  struct kvm_clock_data {
> >  	__u64 clock;
> >  	__u32 flags;
> > @@ -834,6 +847,8 @@ struct kvm_s390_ucas_mapping {
> >  #define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
> >  /* Available with KVM_CAP_PPC_ALLOC_HTAB */
> >  #define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
> > +/* Available with KVM_CAP_EOIFD */
> > +#define KVM_EOIFD                 _IOW(KVMIO,  0xa8, struct kvm_eoifd)
> >  
> >  /*
> >   * ioctls for vcpu fds
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index c73f071..01e72a6 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -289,6 +289,10 @@ struct kvm {
> >  		struct mutex lock;
> >  		struct list_head items;
> >  	} irqsources;
> > +	struct {
> > +		spinlock_t lock;
> > +		struct list_head items;
> > +	} eoifds;
> >  #endif
> >  	struct kvm_vm_stat stat;
> >  	struct kvm_arch arch;
> > @@ -832,6 +836,8 @@ int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
> >  void kvm_irqfd_release(struct kvm *kvm);
> >  void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
> >  int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
> > +int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args);
> > +void kvm_eoifd_release(struct kvm *kvm);
> >  
> >  #else
> >  
> > @@ -857,6 +863,13 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> >  	return -ENOSYS;
> >  }
> >  
> > +static inline int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
> > +{
> > +	return -ENOSYS;
> > +}
> > +
> > +static inline void kvm_eoifd_release(struct kvm *kvm) {}
> > +
> >  #endif /* CONFIG_HAVE_KVM_EVENTFD */
> >  
> >  #ifdef CONFIG_KVM_APIC_ARCHITECTURE
> > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > index 878cb52..3aa2d62 100644
> > --- a/virt/kvm/eventfd.c
> > +++ b/virt/kvm/eventfd.c
> > @@ -95,6 +95,25 @@ static struct _irq_source *_irq_source_alloc(struct kvm *kvm, int gsi)
> >  	return source;
> >  }
> >  
> > +static struct _irq_source *_irq_source_get_from_key(struct kvm *kvm, int key)
> > +{
> > +	struct _irq_source *tmp, *source = ERR_PTR(-ENOENT);
> > +
> > +	mutex_lock(&kvm->irqsources.lock);
> > +
> > +	list_for_each_entry(tmp, &kvm->irqsources.items, list) {
> > +		if (tmp->id == key) {
> > +			source = tmp;
> > +			kref_get(&source->kref);
> > +			break;
> > +		}
> > +	}
> > +
> > +	mutex_unlock(&kvm->irqsources.lock);
> > +
> > +	return source;
> > +}
> > +
> >  /*
> >   * --------------------------------------------------------------------
> >   * irqfd: Allows an fd to be used to inject an interrupt to the guest
> > @@ -406,6 +425,8 @@ kvm_eventfd_init(struct kvm *kvm)
> >  	INIT_LIST_HEAD(&kvm->ioeventfds);
> >  	mutex_init(&kvm->irqsources.lock);
> >  	INIT_LIST_HEAD(&kvm->irqsources.items);
> > +	spin_lock_init(&kvm->eoifds.lock);
> > +	INIT_LIST_HEAD(&kvm->eoifds.items);
> >  }
> >  
> >  /*
> > @@ -772,3 +793,318 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> >  
> >  	return kvm_assign_ioeventfd(kvm, args);
> >  }
> > +
> > +/*
> > + * --------------------------------------------------------------------
> > + *  eoifd: Translate KVM APIC/IOAPIC EOI into eventfd signal.
> > + *
> > + *  userspace can register with an eventfd for receiving
> > + *  notification when an EOI occurs.
> > + * --------------------------------------------------------------------
> > + */
> > +
> > +struct _eoifd {
> > +	/* eventfd triggered on EOI */
> > +	struct eventfd_ctx *eventfd;
> > +	/* irq source ID de-asserted on EOI */
> > +	struct _irq_source *source;
> > +	wait_queue_t wait;
> > +	/* EOI notification from KVM */
> > +	struct kvm_irq_ack_notifier notifier;
> > +	struct list_head list;
> > +	poll_table pt;
> > +	struct work_struct shutdown;
> > +};
> > +
> > +/* Called under eoifds.lock */
> > +static void eoifd_shutdown(struct work_struct *work)
> > +{
> > +	struct _eoifd *eoifd = container_of(work, struct _eoifd, shutdown);
> > +	struct kvm *kvm = eoifd->source->kvm;
> > +	u64 cnt;
> > +
> > +	/*
> > +	 * Stop EOI signaling
> > +	 */
> > +	kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier);
> > +
> > +	/*
> > +	 * Synchronize with the wait-queue and unhook ourselves to prevent
> > +	 * further events.
> > +	 */
> > +	eventfd_ctx_remove_wait_queue(eoifd->eventfd, &eoifd->wait, &cnt);
> > +
> > +	/*
> > +	 * Release resources
> > +	 */
> > +	eventfd_ctx_put(eoifd->eventfd);
> > +	_irq_source_put(eoifd->source);
> > +	kfree(eoifd);
> > +}
> > +
> > +/* assumes kvm->eoifds.lock is held */
> > +static bool eoifd_is_active(struct _eoifd *eoifd)
> > +{
> > +	return list_empty(&eoifd->list) ? false : true;
> > +}
> > +
> > +/*
> > + * Mark the eoifd as inactive and schedule it for removal
> > + *
> > + * assumes kvm->eoifds.lock is held
> > + */
> > +static void eoifd_deactivate(struct _eoifd *eoifd)
> > +{
> > +	BUG_ON(!eoifd_is_active(eoifd));
> > +
> > +	list_del_init(&eoifd->list);
> > +
> > +	queue_work(irqfd_cleanup_wq, &eoifd->shutdown);
> > +}
> > +
> > +/*
> > + * Called with wqh->lock held and interrupts disabled
> > + */
> > +static int eoifd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> > +{
> > +	unsigned long flags = (unsigned long)key;
> > +
> > +	if (unlikely(flags & POLLHUP)) {
> > +		/* The eventfd is closing, detach from KVM */
> > +		struct _eoifd *eoifd = container_of(wait, struct _eoifd, wait);
> > +		struct kvm *kvm = eoifd->source->kvm;
> > +		unsigned long flags;
> > +
> > +		spin_lock_irqsave(&kvm->eoifds.lock, flags);
> > +
> > +		/*
> > +		 * We must check if someone deactivated the eoifd before
> > +		 * we could acquire the eoifds.lock since the item is
> > +		 * deactivated from the KVM side before it is unhooked from
> > +		 * the wait-queue.  If it is already deactivated, we can
> > +		 * simply return knowing the other side will cleanup for us.
> > +		 * We cannot race against the eoifd going away since the
> > +		 * other side is required to acquire wqh->lock, which we hold
> > +		 */
> > +		if (eoifd_is_active(eoifd))
> > +			eoifd_deactivate(eoifd);
> > +
> > +		spin_unlock_irqrestore(&kvm->eoifds.lock, flags);
> > +	}
> > +
> > +	return 0;
> > +}
> 
> Looks like there is a bug here: if I close irqfd, then close eoifd,
> the key is not immediately released so an attempt to create
> an irqfd can fail to get the source id.

Both irqfd and eoifd use the same workqueue for releasing objects and
both flush on assign.

> Maybe we should simply document that userspace should deassign
> eoifd before closing it? This is what we do for ioeventfd.
> If we do this, the whole polling code can go away completely.

You're again ignoring the close problem.  We cannot document around an
impossible requirement that fds are always deassigned before close.
IMHO ioeventfd is broken here and I don't wish to emulate its behavior.

> > +
> > +static void eoifd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> > +				    poll_table *pt)
> > +{
> > +	struct _eoifd *eoifd = container_of(pt, struct _eoifd, pt);
> > +	add_wait_queue(wqh, &eoifd->wait);
> > +}
> > +
> > +/*
> > + * This function is called as the kvm VM fd is being released. Shutdown all
> > + * eoifds that still remain open
> > + */
> > +void kvm_eoifd_release(struct kvm *kvm)
> > +{
> > +	struct _eoifd *tmp, *eoifd;
> > +
> > +	spin_lock_irq(&kvm->eoifds.lock);
> > +
> > +	list_for_each_entry_safe(eoifd, tmp, &kvm->eoifds.items, list)
> > +		eoifd_deactivate(eoifd);
> > +
> > +	spin_unlock_irq(&kvm->eoifds.lock);
> > +
> > +	flush_workqueue(irqfd_cleanup_wq);
> > +}
> > +
> > +static void eoifd_event(struct kvm_irq_ack_notifier *notifier)
> > +{
> > +	struct _eoifd *eoifd;
> > +
> > +	eoifd = container_of(notifier, struct _eoifd, notifier);
> > +
> > +	if (unlikely(!eoifd->source))
> > +		return;
> > +
> > +	/*
> > +	 * De-assert and send EOI, user needs to re-assert if
> > +	 * device still requires service.
> > +	 */
> 
> I'm not sure why did you drop filtering by source id.
> This means userspace gets events even if it did not send an interrupt.
> So
> 1. Should be documented that you can get spurious events 
> 2. when an interrupt is shared with an emulated device,
>    and said device uses EOI, this will not
>    perform well as we will wake up userspace on each EOI.
> 3. Just sharing interrupt with virtio means we are polling
>    assigned device on each virtio interrupt.

Didn't we just agree after v5 that filtering requires a spinlock around
calling kvm_irq_set or at least a new interface to setting irqs
that allows us to see the current assertion state and that neither of
those seem to be worth the effort for level irqs?  That's why I dropped
it.  Interrupts always have to support spurious events.  The comment
immediately above indicates this.  Legacy interrupts, especially shared
legacy interrupts should not be our primary performance path.  VFIO has
a very efficient path for handling spurious EOIs.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin July 31, 2012, 12:01 a.m. UTC | #3
On Mon, Jul 30, 2012 at 10:22:10AM -0600, Alex Williamson wrote:
> On Sun, 2012-07-29 at 17:54 +0300, Michael S. Tsirkin wrote:
> > On Tue, Jul 24, 2012 at 02:43:22PM -0600, Alex Williamson wrote:
> > > This new ioctl enables an eventfd to be triggered when an EOI is
> > > written for a specified irqchip pin.  The first user of this will
> > > be external device assignment through VFIO, using a level irqfd
> > > for asserting a PCI INTx interrupt and this interface for de-assert
> > > and notification once the interrupt is serviced.
> > > 
> > > Here we make use of the reference counting of the _irq_source
> > > object allowing us to share it with an irqfd and cleanup regardless
> > > of the release order.
> > > 
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > 
> > > ---
> > > 
> > >  Documentation/virtual/kvm/api.txt |   21 ++
> > >  arch/x86/kvm/x86.c                |    2 
> > >  include/linux/kvm.h               |   15 ++
> > >  include/linux/kvm_host.h          |   13 +
> > >  virt/kvm/eventfd.c                |  336 +++++++++++++++++++++++++++++++++++++
> > >  virt/kvm/kvm_main.c               |   11 +
> > >  6 files changed, 398 insertions(+)
> > > 
> > > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > > index 3911e62..8cd6b36 100644
> > > --- a/Documentation/virtual/kvm/api.txt
> > > +++ b/Documentation/virtual/kvm/api.txt
> > > @@ -1989,6 +1989,27 @@ return the hash table order in the parameter.  (If the guest is using
> > >  the virtualized real-mode area (VRMA) facility, the kernel will
> > >  re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
> > >  
> > > +4.77 KVM_EOIFD
> > > +
> > > +Capability: KVM_CAP_EOIFD
> > > +Architectures: x86
> > > +Type: vm ioctl
> > > +Parameters: struct kvm_eoifd (in)
> > > +Returns: 0 on success, < 0 on error
> > > +
> > > +KVM_EOIFD allows userspace to receive interrupt EOI notification
> > > +through an eventfd.
> > 
> > I thought about it some more, and I think it should be renamed to an
> > interrupt ack notification than eoi notification.
> > For example, consider userspace that uses threaded interrupts.
> > Currently what will happen is each interrupt will be injected
> > twice, since on eoi device is still asserting it.
> 
> I don't follow, why is userspace writing an eoi to the ioapic if it
> hasn't handled the interrupt

It has handled it - it disabled the hardware interrupt.

> and why wouldn't the same happen on bare
> metal?

On bare metal, the level does not matter as long as the interrupt
is disabled.

> > One fix would be to delay event until interrupt is re-enabled.
> > Now I am not asking you to fix this immediately,
> > but I think we should make the interface generic by
> > saying we report an ack to userspace and not specifically EOI.
> 
> Using the word "delay" in the context of interrupt delivery raises all
> sorts of red flags for me, but I really don't understand your argument.

I am saying it's an "ack" of interrupt userspace cares about.
The fact it is done by EOI is an implementation detail.

> > >  kvm_eoifd.fd specifies the eventfd used for
> > > +notification.  KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd
> > > +once assigned.  KVM_EOIFD also requires additional bits set in
> > > +kvm_eoifd.flags to bind to the proper interrupt line.  The
> > > +KVM_EOIFD_FLAG_LEVEL_IRQFD indicates that kvm_eoifd.key is provided
> > > +and is a key from a level triggered interrupt (configured from
> > > +KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL).  The EOI notification is bound
> > > +to the same GSI and irqchip input as the irqfd.  Both kvm_eoifd.key
> > > +and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on assignment and
> > > +de-assignment of KVM_EOIFD.  A level irqfd may only be bound to a
> > > +single eoifd.  KVM_CAP_EOIFD_LEVEL_IRQFD indicates support of
> > > +KVM_EOIFD_FLAG_LEVEL_IRQFD.
> > >  
> > 
> > Hmm returning the key means we'll need to keep refcounting for source
> > IDs around forever. I liked passing the fd better: make implementation
> > match interface and not the other way around.
> 
> False, a source ID has a finite lifecycle.  The fd approach was broken.
> Holding the irqfd context imposed too many dependencies between eoifd
> and irqfd necessitating things like one interface disabling another.  I
> thoroughly disagree with that approach.

You keep saying this but it is still true: once irqfd
is closed eoifd does not get any more interrupts.

> > >  5. The kvm_run structure
> > >  ------------------------
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index 9ded39d..8f3164e 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -2171,6 +2171,8 @@ int kvm_dev_ioctl_check_extension(long ext)
> > >  	case KVM_CAP_PCI_2_3:
> > >  	case KVM_CAP_KVMCLOCK_CTRL:
> > >  	case KVM_CAP_IRQFD_LEVEL:
> > > +	case KVM_CAP_EOIFD:
> > > +	case KVM_CAP_EOIFD_LEVEL_IRQFD:
> > >  		r = 1;
> > >  		break;
> > >  	case KVM_CAP_COALESCED_MMIO:
> > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > index b2e6e4f..effb916 100644
> > > --- a/include/linux/kvm.h
> > > +++ b/include/linux/kvm.h
> > > @@ -619,6 +619,8 @@ struct kvm_ppc_smmu_info {
> > >  #define KVM_CAP_S390_COW 79
> > >  #define KVM_CAP_PPC_ALLOC_HTAB 80
> > >  #define KVM_CAP_IRQFD_LEVEL 81
> > > +#define KVM_CAP_EOIFD 82
> > > +#define KVM_CAP_EOIFD_LEVEL_IRQFD 83
> > >  
> > >  #ifdef KVM_CAP_IRQ_ROUTING
> > >  
> > > @@ -694,6 +696,17 @@ struct kvm_irqfd {
> > >  	__u8  pad[20];
> > >  };
> > >  
> > > +#define KVM_EOIFD_FLAG_DEASSIGN (1 << 0)
> > > +/* Available with KVM_CAP_EOIFD_LEVEL_IRQFD */
> > > +#define KVM_EOIFD_FLAG_LEVEL_IRQFD (1 << 1)
> > > +
> > > +struct kvm_eoifd {
> > > +	__u32 fd;
> > > +	__u32 flags;
> > > +	__u32 key;
> > > +	__u8 pad[20];
> > > +};
> > > +
> > >  struct kvm_clock_data {
> > >  	__u64 clock;
> > >  	__u32 flags;
> > > @@ -834,6 +847,8 @@ struct kvm_s390_ucas_mapping {
> > >  #define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
> > >  /* Available with KVM_CAP_PPC_ALLOC_HTAB */
> > >  #define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
> > > +/* Available with KVM_CAP_EOIFD */
> > > +#define KVM_EOIFD                 _IOW(KVMIO,  0xa8, struct kvm_eoifd)
> > >  
> > >  /*
> > >   * ioctls for vcpu fds
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index c73f071..01e72a6 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -289,6 +289,10 @@ struct kvm {
> > >  		struct mutex lock;
> > >  		struct list_head items;
> > >  	} irqsources;
> > > +	struct {
> > > +		spinlock_t lock;
> > > +		struct list_head items;
> > > +	} eoifds;
> > >  #endif
> > >  	struct kvm_vm_stat stat;
> > >  	struct kvm_arch arch;
> > > @@ -832,6 +836,8 @@ int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
> > >  void kvm_irqfd_release(struct kvm *kvm);
> > >  void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
> > >  int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
> > > +int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args);
> > > +void kvm_eoifd_release(struct kvm *kvm);
> > >  
> > >  #else
> > >  
> > > @@ -857,6 +863,13 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> > >  	return -ENOSYS;
> > >  }
> > >  
> > > +static inline int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
> > > +{
> > > +	return -ENOSYS;
> > > +}
> > > +
> > > +static inline void kvm_eoifd_release(struct kvm *kvm) {}
> > > +
> > >  #endif /* CONFIG_HAVE_KVM_EVENTFD */
> > >  
> > >  #ifdef CONFIG_KVM_APIC_ARCHITECTURE
> > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > index 878cb52..3aa2d62 100644
> > > --- a/virt/kvm/eventfd.c
> > > +++ b/virt/kvm/eventfd.c
> > > @@ -95,6 +95,25 @@ static struct _irq_source *_irq_source_alloc(struct kvm *kvm, int gsi)
> > >  	return source;
> > >  }
> > >  
> > > +static struct _irq_source *_irq_source_get_from_key(struct kvm *kvm, int key)
> > > +{
> > > +	struct _irq_source *tmp, *source = ERR_PTR(-ENOENT);
> > > +
> > > +	mutex_lock(&kvm->irqsources.lock);
> > > +
> > > +	list_for_each_entry(tmp, &kvm->irqsources.items, list) {
> > > +		if (tmp->id == key) {
> > > +			source = tmp;
> > > +			kref_get(&source->kref);
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	mutex_unlock(&kvm->irqsources.lock);
> > > +
> > > +	return source;
> > > +}
> > > +
> > >  /*
> > >   * --------------------------------------------------------------------
> > >   * irqfd: Allows an fd to be used to inject an interrupt to the guest
> > > @@ -406,6 +425,8 @@ kvm_eventfd_init(struct kvm *kvm)
> > >  	INIT_LIST_HEAD(&kvm->ioeventfds);
> > >  	mutex_init(&kvm->irqsources.lock);
> > >  	INIT_LIST_HEAD(&kvm->irqsources.items);
> > > +	spin_lock_init(&kvm->eoifds.lock);
> > > +	INIT_LIST_HEAD(&kvm->eoifds.items);
> > >  }
> > >  
> > >  /*
> > > @@ -772,3 +793,318 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> > >  
> > >  	return kvm_assign_ioeventfd(kvm, args);
> > >  }
> > > +
> > > +/*
> > > + * --------------------------------------------------------------------
> > > + *  eoifd: Translate KVM APIC/IOAPIC EOI into eventfd signal.
> > > + *
> > > + *  userspace can register with an eventfd for receiving
> > > + *  notification when an EOI occurs.
> > > + * --------------------------------------------------------------------
> > > + */
> > > +
> > > +struct _eoifd {
> > > +	/* eventfd triggered on EOI */
> > > +	struct eventfd_ctx *eventfd;
> > > +	/* irq source ID de-asserted on EOI */
> > > +	struct _irq_source *source;
> > > +	wait_queue_t wait;
> > > +	/* EOI notification from KVM */
> > > +	struct kvm_irq_ack_notifier notifier;
> > > +	struct list_head list;
> > > +	poll_table pt;
> > > +	struct work_struct shutdown;
> > > +};
> > > +
> > > +/* Called under eoifds.lock */
> > > +static void eoifd_shutdown(struct work_struct *work)
> > > +{
> > > +	struct _eoifd *eoifd = container_of(work, struct _eoifd, shutdown);
> > > +	struct kvm *kvm = eoifd->source->kvm;
> > > +	u64 cnt;
> > > +
> > > +	/*
> > > +	 * Stop EOI signaling
> > > +	 */
> > > +	kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier);
> > > +
> > > +	/*
> > > +	 * Synchronize with the wait-queue and unhook ourselves to prevent
> > > +	 * further events.
> > > +	 */
> > > +	eventfd_ctx_remove_wait_queue(eoifd->eventfd, &eoifd->wait, &cnt);
> > > +
> > > +	/*
> > > +	 * Release resources
> > > +	 */
> > > +	eventfd_ctx_put(eoifd->eventfd);
> > > +	_irq_source_put(eoifd->source);
> > > +	kfree(eoifd);
> > > +}
> > > +
> > > +/* assumes kvm->eoifds.lock is held */
> > > +static bool eoifd_is_active(struct _eoifd *eoifd)
> > > +{
> > > +	return list_empty(&eoifd->list) ? false : true;
> > > +}
> > > +
> > > +/*
> > > + * Mark the eoifd as inactive and schedule it for removal
> > > + *
> > > + * assumes kvm->eoifds.lock is held
> > > + */
> > > +static void eoifd_deactivate(struct _eoifd *eoifd)
> > > +{
> > > +	BUG_ON(!eoifd_is_active(eoifd));
> > > +
> > > +	list_del_init(&eoifd->list);
> > > +
> > > +	queue_work(irqfd_cleanup_wq, &eoifd->shutdown);
> > > +}
> > > +
> > > +/*
> > > + * Called with wqh->lock held and interrupts disabled
> > > + */
> > > +static int eoifd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> > > +{
> > > +	unsigned long flags = (unsigned long)key;
> > > +
> > > +	if (unlikely(flags & POLLHUP)) {
> > > +		/* The eventfd is closing, detach from KVM */
> > > +		struct _eoifd *eoifd = container_of(wait, struct _eoifd, wait);
> > > +		struct kvm *kvm = eoifd->source->kvm;
> > > +		unsigned long flags;
> > > +
> > > +		spin_lock_irqsave(&kvm->eoifds.lock, flags);
> > > +
> > > +		/*
> > > +		 * We must check if someone deactivated the eoifd before
> > > +		 * we could acquire the eoifds.lock since the item is
> > > +		 * deactivated from the KVM side before it is unhooked from
> > > +		 * the wait-queue.  If it is already deactivated, we can
> > > +		 * simply return knowing the other side will cleanup for us.
> > > +		 * We cannot race against the eoifd going away since the
> > > +		 * other side is required to acquire wqh->lock, which we hold
> > > +		 */
> > > +		if (eoifd_is_active(eoifd))
> > > +			eoifd_deactivate(eoifd);
> > > +
> > > +		spin_unlock_irqrestore(&kvm->eoifds.lock, flags);
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > 
> > Looks like there is a bug here: if I close irqfd, then close eoifd,
> > the key is not immediately released so an attempt to create
> > an irqfd can fail to get the source id.
> 
> Both irqfd and eoifd use the same workqueue for releasing objects and
> both flush on assign.
> 
> > Maybe we should simply document that userspace should deassign
> > eoifd before closing it? This is what we do for ioeventfd.
> > If we do this, the whole polling code can go away completely.
> 
> You're again ignoring the close problem.  We cannot document around an
> impossible requirement that fds are always deassigned before close.

Well userspace can easily call a deassign ioctl. Why is it so important
that deassign is not required?

> IMHO ioeventfd is broken here and I don't wish to emulate it's behavior.

So fix ioeventfd first. Making eoifd and ioeventfd behave differently does not
make sense; they are very similar.

> > > +
> > > +static void eoifd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> > > +				    poll_table *pt)
> > > +{
> > > +	struct _eoifd *eoifd = container_of(pt, struct _eoifd, pt);
> > > +	add_wait_queue(wqh, &eoifd->wait);
> > > +}
> > > +
> > > +/*
> > > + * This function is called as the kvm VM fd is being released. Shutdown all
> > > + * eoifds that still remain open
> > > + */
> > > +void kvm_eoifd_release(struct kvm *kvm)
> > > +{
> > > +	struct _eoifd *tmp, *eoifd;
> > > +
> > > +	spin_lock_irq(&kvm->eoifds.lock);
> > > +
> > > +	list_for_each_entry_safe(eoifd, tmp, &kvm->eoifds.items, list)
> > > +		eoifd_deactivate(eoifd);
> > > +
> > > +	spin_unlock_irq(&kvm->eoifds.lock);
> > > +
> > > +	flush_workqueue(irqfd_cleanup_wq);
> > > +}
> > > +
> > > +static void eoifd_event(struct kvm_irq_ack_notifier *notifier)
> > > +{
> > > +	struct _eoifd *eoifd;
> > > +
> > > +	eoifd = container_of(notifier, struct _eoifd, notifier);
> > > +
> > > +	if (unlikely(!eoifd->source))
> > > +		return;
> > > +
> > > +	/*
> > > +	 * De-assert and send EOI, user needs to re-assert if
> > > +	 * device still requires service.
> > > +	 */
> > 
> > I'm not sure why did you drop filtering by source id.
> > This means userspace gets events even if it did not send an interrupt.
> > So
> > 1. Should be documented that you can get spurious events 
> > 2. when an interrupt is shared with an emulated device,
> >    and said device uses EOI, this will not
> >    perform well as we will wake up userspace on each EOI.
> > 3. Just sharing interrupt with virtio means we are polling
> >    assigned device on each virtio interrupt.
> 
> Didn't we just agree after v5 that filtering requires a spinlock around
> around calling kvm_irq_set or at least a new interface to setting irqs
> that allows us to see the current assertion state and that neither of
> those seem to be worth the effort for level irqs?  That's why I dropped
> it.  Interrupts always have to support spurious events.  The comment
> immediately above indicates this.  Legacy interrupts, especially shared
> legacy interrupts should not be our primary performance path.  VFIO has
> a very efficient path for handling spurious EOIs.

But it will not help that vfio does this efficiently if userspace
is woken up. You need to make it efficient for userspace consumers.
Otherwise it's a vfio specific interface.
Alex Williamson July 31, 2012, 12:26 a.m. UTC | #4
On Tue, 2012-07-31 at 03:01 +0300, Michael S. Tsirkin wrote:
> On Mon, Jul 30, 2012 at 10:22:10AM -0600, Alex Williamson wrote:
> > On Sun, 2012-07-29 at 17:54 +0300, Michael S. Tsirkin wrote:
> > > On Tue, Jul 24, 2012 at 02:43:22PM -0600, Alex Williamson wrote:
> > > > This new ioctl enables an eventfd to be triggered when an EOI is
> > > > written for a specified irqchip pin.  The first user of this will
> > > > be external device assignment through VFIO, using a level irqfd
> > > > for asserting a PCI INTx interrupt and this interface for de-assert
> > > > and notification once the interrupt is serviced.
> > > > 
> > > > Here we make use of the reference counting of the _irq_source
> > > > object allowing us to share it with an irqfd and cleanup regardless
> > > > of the release order.
> > > > 
> > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > 
> > > > ---
> > > > 
> > > >  Documentation/virtual/kvm/api.txt |   21 ++
> > > >  arch/x86/kvm/x86.c                |    2 
> > > >  include/linux/kvm.h               |   15 ++
> > > >  include/linux/kvm_host.h          |   13 +
> > > >  virt/kvm/eventfd.c                |  336 +++++++++++++++++++++++++++++++++++++
> > > >  virt/kvm/kvm_main.c               |   11 +
> > > >  6 files changed, 398 insertions(+)
> > > > 
> > > > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > > > index 3911e62..8cd6b36 100644
> > > > --- a/Documentation/virtual/kvm/api.txt
> > > > +++ b/Documentation/virtual/kvm/api.txt
> > > > @@ -1989,6 +1989,27 @@ return the hash table order in the parameter.  (If the guest is using
> > > >  the virtualized real-mode area (VRMA) facility, the kernel will
> > > >  re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
> > > >  
> > > > +4.77 KVM_EOIFD
> > > > +
> > > > +Capability: KVM_CAP_EOIFD
> > > > +Architectures: x86
> > > > +Type: vm ioctl
> > > > +Parameters: struct kvm_eoifd (in)
> > > > +Returns: 0 on success, < 0 on error
> > > > +
> > > > +KVM_EOIFD allows userspace to receive interrupt EOI notification
> > > > +through an eventfd.
> > > 
> > > I thought about it some more, and I think it should be renamed to an
> > > interrupt ack notification than eoi notification.
> > > For example, consider userspace that uses threaded interrupts.
> > > Currently what will happen is each interrupt will be injected
> > > twice, since on eoi device is still asserting it.
> > 
> > I don't follow, why is userspace writing an eoi to the ioapic if it
> > hasn't handled the interrupt
> 
> It has handled it - it disabled the hardware interrupt.

So it's not injected twice, it's held pending at the ioapic the second
time, just like hardware.  Maybe there's a future optimization there,
but I don't think it's appropriate at this time.

> > and why wouldn't the same happen on bare
> > metal?
> 
> on bare metal level does not matter as long as interrupt
> is disabled.
> 
> > > One fix would be to delay event until interrupt is re-enabled.
> > > Now I am not asking you to fix this immediately,
> > > but I think we should make the interface generic by
> > > saying we report an ack to userspace and not specifically EOI.
> > 
> > Using the word "delay" in the context of interrupt delivery raises all
> > sorts of red flags for me, but I really don't understand your argument.
> 
> I am saying it's an "ack" of interrupt userspace cares about.
> The fact it is done by EOI is an implementation detail.

The implementation is how an EOI is generated on an ioapic, not that an
EOI exists.  How do I read a hardware spec and figure out what "ack of
interrupt" means?

> > > >  kvm_eoifd.fd specifies the eventfd used for
> > > > +notification.  KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd
> > > > +once assigned.  KVM_EOIFD also requires additional bits set in
> > > > +kvm_eoifd.flags to bind to the proper interrupt line.  The
> > > > +KVM_EOIFD_FLAG_LEVEL_IRQFD indicates that kvm_eoifd.key is provided
> > > > +and is a key from a level triggered interrupt (configured from
> > > > +KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL).  The EOI notification is bound
> > > > +to the same GSI and irqchip input as the irqfd.  Both kvm_eoifd.key
> > > > +and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on assignment and
> > > > +de-assignment of KVM_EOIFD.  A level irqfd may only be bound to a
> > > > +single eoifd.  KVM_CAP_EOIFD_LEVEL_IRQFD indicates support of
> > > > +KVM_EOIFD_FLAG_LEVEL_IRQFD.
> > > >  
> > > 
> > > Hmm returning the key means we'll need to keep refcounting for source
> > > IDs around forever. I liked passing the fd better: make implementation
> > > match interface and not the other way around.
> > 
> > False, a source ID has a finite lifecycle.  The fd approach was broken.
> > Holding the irqfd context imposed too many dependencies between eoifd
> > and irqfd necessitating things like one interface disabling another.  I
> > thoroughly disagree with that approach.
> 
> You keep saying this but it is still true: once irqfd
> is closed eoifd does not get any more interrupts.

How does that matter?

> > > >  5. The kvm_run structure
> > > >  ------------------------
> > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > index 9ded39d..8f3164e 100644
> > > > --- a/arch/x86/kvm/x86.c
> > > > +++ b/arch/x86/kvm/x86.c
> > > > @@ -2171,6 +2171,8 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > >  	case KVM_CAP_PCI_2_3:
> > > >  	case KVM_CAP_KVMCLOCK_CTRL:
> > > >  	case KVM_CAP_IRQFD_LEVEL:
> > > > +	case KVM_CAP_EOIFD:
> > > > +	case KVM_CAP_EOIFD_LEVEL_IRQFD:
> > > >  		r = 1;
> > > >  		break;
> > > >  	case KVM_CAP_COALESCED_MMIO:
> > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > > index b2e6e4f..effb916 100644
> > > > --- a/include/linux/kvm.h
> > > > +++ b/include/linux/kvm.h
> > > > @@ -619,6 +619,8 @@ struct kvm_ppc_smmu_info {
> > > >  #define KVM_CAP_S390_COW 79
> > > >  #define KVM_CAP_PPC_ALLOC_HTAB 80
> > > >  #define KVM_CAP_IRQFD_LEVEL 81
> > > > +#define KVM_CAP_EOIFD 82
> > > > +#define KVM_CAP_EOIFD_LEVEL_IRQFD 83
> > > >  
> > > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > >  
> > > > @@ -694,6 +696,17 @@ struct kvm_irqfd {
> > > >  	__u8  pad[20];
> > > >  };
> > > >  
> > > > +#define KVM_EOIFD_FLAG_DEASSIGN (1 << 0)
> > > > +/* Available with KVM_CAP_EOIFD_LEVEL_IRQFD */
> > > > +#define KVM_EOIFD_FLAG_LEVEL_IRQFD (1 << 1)
> > > > +
> > > > +struct kvm_eoifd {
> > > > +	__u32 fd;
> > > > +	__u32 flags;
> > > > +	__u32 key;
> > > > +	__u8 pad[20];
> > > > +};
> > > > +
> > > >  struct kvm_clock_data {
> > > >  	__u64 clock;
> > > >  	__u32 flags;
> > > > @@ -834,6 +847,8 @@ struct kvm_s390_ucas_mapping {
> > > >  #define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
> > > >  /* Available with KVM_CAP_PPC_ALLOC_HTAB */
> > > >  #define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
> > > > +/* Available with KVM_CAP_EOIFD */
> > > > +#define KVM_EOIFD                 _IOW(KVMIO,  0xa8, struct kvm_eoifd)
> > > >  
> > > >  /*
> > > >   * ioctls for vcpu fds
> > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > index c73f071..01e72a6 100644
> > > > --- a/include/linux/kvm_host.h
> > > > +++ b/include/linux/kvm_host.h
> > > > @@ -289,6 +289,10 @@ struct kvm {
> > > >  		struct mutex lock;
> > > >  		struct list_head items;
> > > >  	} irqsources;
> > > > +	struct {
> > > > +		spinlock_t lock;
> > > > +		struct list_head items;
> > > > +	} eoifds;
> > > >  #endif
> > > >  	struct kvm_vm_stat stat;
> > > >  	struct kvm_arch arch;
> > > > @@ -832,6 +836,8 @@ int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
> > > >  void kvm_irqfd_release(struct kvm *kvm);
> > > >  void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
> > > >  int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
> > > > +int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args);
> > > > +void kvm_eoifd_release(struct kvm *kvm);
> > > >  
> > > >  #else
> > > >  
> > > > @@ -857,6 +863,13 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> > > >  	return -ENOSYS;
> > > >  }
> > > >  
> > > > +static inline int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
> > > > +{
> > > > +	return -ENOSYS;
> > > > +}
> > > > +
> > > > +static inline void kvm_eoifd_release(struct kvm *kvm) {}
> > > > +
> > > >  #endif /* CONFIG_HAVE_KVM_EVENTFD */
> > > >  
> > > >  #ifdef CONFIG_KVM_APIC_ARCHITECTURE
> > > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > > index 878cb52..3aa2d62 100644
> > > > --- a/virt/kvm/eventfd.c
> > > > +++ b/virt/kvm/eventfd.c
> > > > @@ -95,6 +95,25 @@ static struct _irq_source *_irq_source_alloc(struct kvm *kvm, int gsi)
> > > >  	return source;
> > > >  }
> > > >  
> > > > +static struct _irq_source *_irq_source_get_from_key(struct kvm *kvm, int key)
> > > > +{
> > > > +	struct _irq_source *tmp, *source = ERR_PTR(-ENOENT);
> > > > +
> > > > +	mutex_lock(&kvm->irqsources.lock);
> > > > +
> > > > +	list_for_each_entry(tmp, &kvm->irqsources.items, list) {
> > > > +		if (tmp->id == key) {
> > > > +			source = tmp;
> > > > +			kref_get(&source->kref);
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	mutex_unlock(&kvm->irqsources.lock);
> > > > +
> > > > +	return source;
> > > > +}
> > > > +
> > > >  /*
> > > >   * --------------------------------------------------------------------
> > > >   * irqfd: Allows an fd to be used to inject an interrupt to the guest
> > > > @@ -406,6 +425,8 @@ kvm_eventfd_init(struct kvm *kvm)
> > > >  	INIT_LIST_HEAD(&kvm->ioeventfds);
> > > >  	mutex_init(&kvm->irqsources.lock);
> > > >  	INIT_LIST_HEAD(&kvm->irqsources.items);
> > > > +	spin_lock_init(&kvm->eoifds.lock);
> > > > +	INIT_LIST_HEAD(&kvm->eoifds.items);
> > > >  }
> > > >  
> > > >  /*
> > > > @@ -772,3 +793,318 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> > > >  
> > > >  	return kvm_assign_ioeventfd(kvm, args);
> > > >  }
> > > > +
> > > > +/*
> > > > + * --------------------------------------------------------------------
> > > > + *  eoifd: Translate KVM APIC/IOAPIC EOI into eventfd signal.
> > > > + *
> > > > + *  userspace can register with an eventfd for receiving
> > > > + *  notification when an EOI occurs.
> > > > + * --------------------------------------------------------------------
> > > > + */
> > > > +
> > > > +struct _eoifd {
> > > > +	/* eventfd triggered on EOI */
> > > > +	struct eventfd_ctx *eventfd;
> > > > +	/* irq source ID de-asserted on EOI */
> > > > +	struct _irq_source *source;
> > > > +	wait_queue_t wait;
> > > > +	/* EOI notification from KVM */
> > > > +	struct kvm_irq_ack_notifier notifier;
> > > > +	struct list_head list;
> > > > +	poll_table pt;
> > > > +	struct work_struct shutdown;
> > > > +};
> > > > +
> > > > +/* Called under eoifds.lock */
> > > > +static void eoifd_shutdown(struct work_struct *work)
> > > > +{
> > > > +	struct _eoifd *eoifd = container_of(work, struct _eoifd, shutdown);
> > > > +	struct kvm *kvm = eoifd->source->kvm;
> > > > +	u64 cnt;
> > > > +
> > > > +	/*
> > > > +	 * Stop EOI signaling
> > > > +	 */
> > > > +	kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier);
> > > > +
> > > > +	/*
> > > > +	 * Synchronize with the wait-queue and unhook ourselves to prevent
> > > > +	 * further events.
> > > > +	 */
> > > > +	eventfd_ctx_remove_wait_queue(eoifd->eventfd, &eoifd->wait, &cnt);
> > > > +
> > > > +	/*
> > > > +	 * Release resources
> > > > +	 */
> > > > +	eventfd_ctx_put(eoifd->eventfd);
> > > > +	_irq_source_put(eoifd->source);
> > > > +	kfree(eoifd);
> > > > +}
> > > > +
> > > > +/* assumes kvm->eoifds.lock is held */
> > > > +static bool eoifd_is_active(struct _eoifd *eoifd)
> > > > +{
> > > > +	return list_empty(&eoifd->list) ? false : true;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Mark the eoifd as inactive and schedule it for removal
> > > > + *
> > > > + * assumes kvm->eoifds.lock is held
> > > > + */
> > > > +static void eoifd_deactivate(struct _eoifd *eoifd)
> > > > +{
> > > > +	BUG_ON(!eoifd_is_active(eoifd));
> > > > +
> > > > +	list_del_init(&eoifd->list);
> > > > +
> > > > +	queue_work(irqfd_cleanup_wq, &eoifd->shutdown);
> > > > +}
> > > > +
> > > > +/*
> > > > + * Called with wqh->lock held and interrupts disabled
> > > > + */
> > > > +static int eoifd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> > > > +{
> > > > +	unsigned long flags = (unsigned long)key;
> > > > +
> > > > +	if (unlikely(flags & POLLHUP)) {
> > > > +		/* The eventfd is closing, detach from KVM */
> > > > +		struct _eoifd *eoifd = container_of(wait, struct _eoifd, wait);
> > > > +		struct kvm *kvm = eoifd->source->kvm;
> > > > +		unsigned long flags;
> > > > +
> > > > +		spin_lock_irqsave(&kvm->eoifds.lock, flags);
> > > > +
> > > > +		/*
> > > > +		 * We must check if someone deactivated the eoifd before
> > > > +		 * we could acquire the eoifds.lock since the item is
> > > > +		 * deactivated from the KVM side before it is unhooked from
> > > > +		 * the wait-queue.  If it is already deactivated, we can
> > > > +		 * simply return knowing the other side will cleanup for us.
> > > > +		 * We cannot race against the eoifd going away since the
> > > > +		 * other side is required to acquire wqh->lock, which we hold
> > > > +		 */
> > > > +		if (eoifd_is_active(eoifd))
> > > > +			eoifd_deactivate(eoifd);
> > > > +
> > > > +		spin_unlock_irqrestore(&kvm->eoifds.lock, flags);
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > 
> > > Looks like there is a bug here: if I close irqfd, then close eoifd,
> > > the key is not immediately released so an attempt to create
> > > an irqfd can fail to get the source id.
> > 
> > Both irqfd and eoifd use the same workqueue for releasing objects and
> > both flush on assign.
> > 
> > > Maybe we should simply document that userspace should deassign
> > > eoifd before closing it? This is what we do for ioeventfd.
> > > If we do this, the whole polling code can go away completely.
> > 
> > You're again ignoring the close problem.  We cannot document around an
> > impossible requirement that fds are always deassigned before close.
> 
> Well userspace can easily call a deassign ioctl. Why is it so important
> that deassign is not required?

Because everything allocated through a file descriptor, specific to that
file descriptor, should be freed when the file descriptor is closed.
That's what people expect.

> > IMHO ioeventfd is broken here and I don't wish to emulate it's behavior.
> 
> So fix ioeventfd first. Making eoifd and ioeventfd behave differently does not
> make sense they are very similar.

One at a time.  eoifd and ioeventfd have different requirements.
ioeventfd is just wasting memory, eoifd can potentially exhaust irq
source IDs.  Besides, you still defend ioeventfd as correct.

> > > > +
> > > > +static void eoifd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> > > > +				    poll_table *pt)
> > > > +{
> > > > +	struct _eoifd *eoifd = container_of(pt, struct _eoifd, pt);
> > > > +	add_wait_queue(wqh, &eoifd->wait);
> > > > +}
> > > > +
> > > > +/*
> > > > + * This function is called as the kvm VM fd is being released. Shutdown all
> > > > + * eoifds that still remain open
> > > > + */
> > > > +void kvm_eoifd_release(struct kvm *kvm)
> > > > +{
> > > > +	struct _eoifd *tmp, *eoifd;
> > > > +
> > > > +	spin_lock_irq(&kvm->eoifds.lock);
> > > > +
> > > > +	list_for_each_entry_safe(eoifd, tmp, &kvm->eoifds.items, list)
> > > > +		eoifd_deactivate(eoifd);
> > > > +
> > > > +	spin_unlock_irq(&kvm->eoifds.lock);
> > > > +
> > > > +	flush_workqueue(irqfd_cleanup_wq);
> > > > +}
> > > > +
> > > > +static void eoifd_event(struct kvm_irq_ack_notifier *notifier)
> > > > +{
> > > > +	struct _eoifd *eoifd;
> > > > +
> > > > +	eoifd = container_of(notifier, struct _eoifd, notifier);
> > > > +
> > > > +	if (unlikely(!eoifd->source))
> > > > +		return;
> > > > +
> > > > +	/*
> > > > +	 * De-assert and send EOI, user needs to re-assert if
> > > > +	 * device still requires service.
> > > > +	 */
> > > 
> > > I'm not sure why did you drop filtering by source id.
> > > This means userspace gets events even if it did not send an interrupt.
> > > So
> > > 1. Should be documented that you can get spurious events 
> > > 2. when an interrupt is shared with an emulated device,
> > >    and said device uses EOI, this will not
> > >    perform well as we will wake up userspace on each EOI.
> > > 3. Just sharing interrupt with virtio means we are polling
> > >    assigned device on each virtio interrupt.
> > 
> > Didn't we just agree after v5 that filtering requires a spinlock around
> > > calling kvm_irq_set or at least a new interface to setting irqs
> > that allows us to see the current assertion state and that neither of
> > those seem to be worth the effort for level irqs?  That's why I dropped
> > it.  Interrupts always have to support spurious events.  The comment
> > immediately above indicates this.  Legacy interrupts, especially shared
> > legacy interrupts should not be our primary performance path.  VFIO has
> > a very efficient path for handling spurious EOIs.
> 
> But it will not help that vfio does this efficiently if userspace
> is woken up. You need to make it efficient for userspace consumers.
> Otherwise it's a vfio specific interface.

Does this affect the design of this interface or is this a potential
future optimization?



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin July 31, 2012, 12:36 a.m. UTC | #5
On Mon, Jul 30, 2012 at 06:26:31PM -0600, Alex Williamson wrote:
> On Tue, 2012-07-31 at 03:01 +0300, Michael S. Tsirkin wrote:
> > On Mon, Jul 30, 2012 at 10:22:10AM -0600, Alex Williamson wrote:
> > > On Sun, 2012-07-29 at 17:54 +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Jul 24, 2012 at 02:43:22PM -0600, Alex Williamson wrote:
> > > > > This new ioctl enables an eventfd to be triggered when an EOI is
> > > > > written for a specified irqchip pin.  The first user of this will
> > > > > be external device assignment through VFIO, using a level irqfd
> > > > > for asserting a PCI INTx interrupt and this interface for de-assert
> > > > > and notification once the interrupt is serviced.
> > > > > 
> > > > > Here we make use of the reference counting of the _irq_source
> > > > > object allowing us to share it with an irqfd and cleanup regardless
> > > > > of the release order.
> > > > > 
> > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > 
> > > > > ---
> > > > > 
> > > > >  Documentation/virtual/kvm/api.txt |   21 ++
> > > > >  arch/x86/kvm/x86.c                |    2 
> > > > >  include/linux/kvm.h               |   15 ++
> > > > >  include/linux/kvm_host.h          |   13 +
> > > > >  virt/kvm/eventfd.c                |  336 +++++++++++++++++++++++++++++++++++++
> > > > >  virt/kvm/kvm_main.c               |   11 +
> > > > >  6 files changed, 398 insertions(+)
> > > > > 
> > > > > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > > > > index 3911e62..8cd6b36 100644
> > > > > --- a/Documentation/virtual/kvm/api.txt
> > > > > +++ b/Documentation/virtual/kvm/api.txt
> > > > > @@ -1989,6 +1989,27 @@ return the hash table order in the parameter.  (If the guest is using
> > > > >  the virtualized real-mode area (VRMA) facility, the kernel will
> > > > >  re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
> > > > >  
> > > > > +4.77 KVM_EOIFD
> > > > > +
> > > > > +Capability: KVM_CAP_EOIFD
> > > > > +Architectures: x86
> > > > > +Type: vm ioctl
> > > > > +Parameters: struct kvm_eoifd (in)
> > > > > +Returns: 0 on success, < 0 on error
> > > > > +
> > > > > +KVM_EOIFD allows userspace to receive interrupt EOI notification
> > > > > +through an eventfd.
> > > > 
> > > > I thought about it some more, and I think it should be renamed to an
> > > > interrupt ack notification than eoi notification.
> > > > For example, consider userspace that uses threaded interrupts.
> > > > Currently what will happen is each interrupt will be injected
> > > > twice, since on eoi device is still asserting it.
> > > 
> > > I don't follow, why is userspace writing an eoi to the ioapic if it
> > > hasn't handled the interrupt
> > 
> > It has handled it - it disabled the hardware interrupt.
> 
> So it's not injected twice, it's held pending at the ioapic the second
> time, just like hardware.

It is not like hardware at all. in hardware there is no overhead
here you interrupt the guest to run handler in host.

>  Maybe there's a future optimization there,
> but I don't think it's appropriate at this time.

Yes. But to make it *possible* in future we must remove
the requirement to signal fd immediately on EOI.
So rename it ackfd.

> > > and why wouldn't the same happen on bare
> > > metal?
> > 
> > on bare metal level does not matter as long as interrupt
> > is disabled.
> > 
> > > > One fix would be to delay event until interrupt is re-enabled.
> > > > Now I am not asking you to fix this immediately,
> > > > but I think we should make the interface generic by
> > > > saying we report an ack to userspace and not specifically EOI.
> > > 
> > > Using the word "delay" in the context of interrupt delivery raises all
> > > sorts of red flags for me, but I really don't understand your argument.
> > 
> > I am saying it's an "ack" of interrupt userspace cares about.
> > The fact it is done by EOI is an implementation detail.
> 
> The implementation is how an EOI is generated on an ioapic, not that an
> EOI exists.  How do I read a hardware spec and figure out what "ack of
> interrupt" means?

It just means it will be called after guest has completed handling
interrupt. How we detect that is our problem.

> > > > >  kvm_eoifd.fd specifies the eventfd used for
> > > > > +notification.  KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd
> > > > > +once assigned.  KVM_EOIFD also requires additional bits set in
> > > > > +kvm_eoifd.flags to bind to the proper interrupt line.  The
> > > > > +KVM_EOIFD_FLAG_LEVEL_IRQFD indicates that kvm_eoifd.key is provided
> > > > > +and is a key from a level triggered interrupt (configured from
> > > > > +KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL).  The EOI notification is bound
> > > > > +to the same GSI and irqchip input as the irqfd.  Both kvm_eoifd.key
> > > > > +and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on assignment and
> > > > > +de-assignment of KVM_EOIFD.  A level irqfd may only be bound to a
> > > > > +single eoifd.  KVM_CAP_EOIFD_LEVEL_IRQFD indicates support of
> > > > > +KVM_EOIFD_FLAG_LEVEL_IRQFD.
> > > > >  
> > > > 
> > > > Hmm returning the key means we'll need to keep refcounting for source
> > > > IDs around forever. I liked passing the fd better: make implementation
> > > > match interface and not the other way around.
> > > 
> > > False, a source ID has a finite lifecycle.  The fd approach was broken.
> > > Holding the irqfd context imposed too many dependencies between eoifd
> > > and irqfd necessitating things like one interface disabling another.  I
> > > thoroughly disagree with that approach.
> > 
> > You keep saying this but it is still true: once irqfd
> > is closed eoifd does not get any more interrupts.
> 
> How does that matter?

Well if it does not get events it is disabled.
so you have one ifc disabling another, anyway.

> > > > >  5. The kvm_run structure
> > > > >  ------------------------
> > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > > index 9ded39d..8f3164e 100644
> > > > > --- a/arch/x86/kvm/x86.c
> > > > > +++ b/arch/x86/kvm/x86.c
> > > > > @@ -2171,6 +2171,8 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > > >  	case KVM_CAP_PCI_2_3:
> > > > >  	case KVM_CAP_KVMCLOCK_CTRL:
> > > > >  	case KVM_CAP_IRQFD_LEVEL:
> > > > > +	case KVM_CAP_EOIFD:
> > > > > +	case KVM_CAP_EOIFD_LEVEL_IRQFD:
> > > > >  		r = 1;
> > > > >  		break;
> > > > >  	case KVM_CAP_COALESCED_MMIO:
> > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > > > index b2e6e4f..effb916 100644
> > > > > --- a/include/linux/kvm.h
> > > > > +++ b/include/linux/kvm.h
> > > > > @@ -619,6 +619,8 @@ struct kvm_ppc_smmu_info {
> > > > >  #define KVM_CAP_S390_COW 79
> > > > >  #define KVM_CAP_PPC_ALLOC_HTAB 80
> > > > >  #define KVM_CAP_IRQFD_LEVEL 81
> > > > > +#define KVM_CAP_EOIFD 82
> > > > > +#define KVM_CAP_EOIFD_LEVEL_IRQFD 83
> > > > >  
> > > > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > > >  
> > > > > @@ -694,6 +696,17 @@ struct kvm_irqfd {
> > > > >  	__u8  pad[20];
> > > > >  };
> > > > >  
> > > > > +#define KVM_EOIFD_FLAG_DEASSIGN (1 << 0)
> > > > > +/* Available with KVM_CAP_EOIFD_LEVEL_IRQFD */
> > > > > +#define KVM_EOIFD_FLAG_LEVEL_IRQFD (1 << 1)
> > > > > +
> > > > > +struct kvm_eoifd {
> > > > > +	__u32 fd;
> > > > > +	__u32 flags;
> > > > > +	__u32 key;
> > > > > +	__u8 pad[20];
> > > > > +};
> > > > > +
> > > > >  struct kvm_clock_data {
> > > > >  	__u64 clock;
> > > > >  	__u32 flags;
> > > > > @@ -834,6 +847,8 @@ struct kvm_s390_ucas_mapping {
> > > > >  #define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
> > > > >  /* Available with KVM_CAP_PPC_ALLOC_HTAB */
> > > > >  #define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
> > > > > +/* Available with KVM_CAP_EOIFD */
> > > > > +#define KVM_EOIFD                 _IOW(KVMIO,  0xa8, struct kvm_eoifd)
> > > > >  
> > > > >  /*
> > > > >   * ioctls for vcpu fds
> > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > index c73f071..01e72a6 100644
> > > > > --- a/include/linux/kvm_host.h
> > > > > +++ b/include/linux/kvm_host.h
> > > > > @@ -289,6 +289,10 @@ struct kvm {
> > > > >  		struct mutex lock;
> > > > >  		struct list_head items;
> > > > >  	} irqsources;
> > > > > +	struct {
> > > > > +		spinlock_t lock;
> > > > > +		struct list_head items;
> > > > > +	} eoifds;
> > > > >  #endif
> > > > >  	struct kvm_vm_stat stat;
> > > > >  	struct kvm_arch arch;
> > > > > @@ -832,6 +836,8 @@ int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
> > > > >  void kvm_irqfd_release(struct kvm *kvm);
> > > > >  void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
> > > > >  int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
> > > > > +int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args);
> > > > > +void kvm_eoifd_release(struct kvm *kvm);
> > > > >  
> > > > >  #else
> > > > >  
> > > > > @@ -857,6 +863,13 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> > > > >  	return -ENOSYS;
> > > > >  }
> > > > >  
> > > > > +static inline int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
> > > > > +{
> > > > > +	return -ENOSYS;
> > > > > +}
> > > > > +
> > > > > +static inline void kvm_eoifd_release(struct kvm *kvm) {}
> > > > > +
> > > > >  #endif /* CONFIG_HAVE_KVM_EVENTFD */
> > > > >  
> > > > >  #ifdef CONFIG_KVM_APIC_ARCHITECTURE
> > > > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > > > index 878cb52..3aa2d62 100644
> > > > > --- a/virt/kvm/eventfd.c
> > > > > +++ b/virt/kvm/eventfd.c
> > > > > @@ -95,6 +95,25 @@ static struct _irq_source *_irq_source_alloc(struct kvm *kvm, int gsi)
> > > > >  	return source;
> > > > >  }
> > > > >  
> > > > > +static struct _irq_source *_irq_source_get_from_key(struct kvm *kvm, int key)
> > > > > +{
> > > > > +	struct _irq_source *tmp, *source = ERR_PTR(-ENOENT);
> > > > > +
> > > > > +	mutex_lock(&kvm->irqsources.lock);
> > > > > +
> > > > > +	list_for_each_entry(tmp, &kvm->irqsources.items, list) {
> > > > > +		if (tmp->id == key) {
> > > > > +			source = tmp;
> > > > > +			kref_get(&source->kref);
> > > > > +			break;
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	mutex_unlock(&kvm->irqsources.lock);
> > > > > +
> > > > > +	return source;
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * --------------------------------------------------------------------
> > > > >   * irqfd: Allows an fd to be used to inject an interrupt to the guest
> > > > > @@ -406,6 +425,8 @@ kvm_eventfd_init(struct kvm *kvm)
> > > > >  	INIT_LIST_HEAD(&kvm->ioeventfds);
> > > > >  	mutex_init(&kvm->irqsources.lock);
> > > > >  	INIT_LIST_HEAD(&kvm->irqsources.items);
> > > > > +	spin_lock_init(&kvm->eoifds.lock);
> > > > > +	INIT_LIST_HEAD(&kvm->eoifds.items);
> > > > >  }
> > > > >  
> > > > >  /*
> > > > > @@ -772,3 +793,318 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> > > > >  
> > > > >  	return kvm_assign_ioeventfd(kvm, args);
> > > > >  }
> > > > > +
> > > > > +/*
> > > > > + * --------------------------------------------------------------------
> > > > > + *  eoifd: Translate KVM APIC/IOAPIC EOI into eventfd signal.
> > > > > + *
> > > > > + *  userspace can register with an eventfd for receiving
> > > > > + *  notification when an EOI occurs.
> > > > > + * --------------------------------------------------------------------
> > > > > + */
> > > > > +
> > > > > +struct _eoifd {
> > > > > +	/* eventfd triggered on EOI */
> > > > > +	struct eventfd_ctx *eventfd;
> > > > > +	/* irq source ID de-asserted on EOI */
> > > > > +	struct _irq_source *source;
> > > > > +	wait_queue_t wait;
> > > > > +	/* EOI notification from KVM */
> > > > > +	struct kvm_irq_ack_notifier notifier;
> > > > > +	struct list_head list;
> > > > > +	poll_table pt;
> > > > > +	struct work_struct shutdown;
> > > > > +};
> > > > > +
> > > > > +/* Called under eoifds.lock */
> > > > > +static void eoifd_shutdown(struct work_struct *work)
> > > > > +{
> > > > > +	struct _eoifd *eoifd = container_of(work, struct _eoifd, shutdown);
> > > > > +	struct kvm *kvm = eoifd->source->kvm;
> > > > > +	u64 cnt;
> > > > > +
> > > > > +	/*
> > > > > +	 * Stop EOI signaling
> > > > > +	 */
> > > > > +	kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier);
> > > > > +
> > > > > +	/*
> > > > > +	 * Synchronize with the wait-queue and unhook ourselves to prevent
> > > > > +	 * further events.
> > > > > +	 */
> > > > > +	eventfd_ctx_remove_wait_queue(eoifd->eventfd, &eoifd->wait, &cnt);
> > > > > +
> > > > > +	/*
> > > > > +	 * Release resources
> > > > > +	 */
> > > > > +	eventfd_ctx_put(eoifd->eventfd);
> > > > > +	_irq_source_put(eoifd->source);
> > > > > +	kfree(eoifd);
> > > > > +}
> > > > > +
> > > > > +/* assumes kvm->eoifds.lock is held */
> > > > > +static bool eoifd_is_active(struct _eoifd *eoifd)
> > > > > +{
> > > > > +	return list_empty(&eoifd->list) ? false : true;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Mark the eoifd as inactive and schedule it for removal
> > > > > + *
> > > > > + * assumes kvm->eoifds.lock is held
> > > > > + */
> > > > > +static void eoifd_deactivate(struct _eoifd *eoifd)
> > > > > +{
> > > > > +	BUG_ON(!eoifd_is_active(eoifd));
> > > > > +
> > > > > +	list_del_init(&eoifd->list);
> > > > > +
> > > > > +	queue_work(irqfd_cleanup_wq, &eoifd->shutdown);
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Called with wqh->lock held and interrupts disabled
> > > > > + */
> > > > > +static int eoifd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> > > > > +{
> > > > > +	unsigned long flags = (unsigned long)key;
> > > > > +
> > > > > +	if (unlikely(flags & POLLHUP)) {
> > > > > +		/* The eventfd is closing, detach from KVM */
> > > > > +		struct _eoifd *eoifd = container_of(wait, struct _eoifd, wait);
> > > > > +		struct kvm *kvm = eoifd->source->kvm;
> > > > > +		unsigned long flags;
> > > > > +
> > > > > +		spin_lock_irqsave(&kvm->eoifds.lock, flags);
> > > > > +
> > > > > +		/*
> > > > > +		 * We must check if someone deactivated the eoifd before
> > > > > +		 * we could acquire the eoifds.lock since the item is
> > > > > +		 * deactivated from the KVM side before it is unhooked from
> > > > > +		 * the wait-queue.  If it is already deactivated, we can
> > > > > +		 * simply return knowing the other side will cleanup for us.
> > > > > +		 * We cannot race against the eoifd going away since the
> > > > > +		 * other side is required to acquire wqh->lock, which we hold
> > > > > +		 */
> > > > > +		if (eoifd_is_active(eoifd))
> > > > > +			eoifd_deactivate(eoifd);
> > > > > +
> > > > > +		spin_unlock_irqrestore(&kvm->eoifds.lock, flags);
> > > > > +	}
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > 
> > > > Looks like there is a bug here: if I close irqfd, then close eoifd,
> > > > the key is not immediately released so an attempt to create
> > > > an irqfd can fail to get the source id.
> > > 
> > > Both irqfd and eoifd use the same workqueue for releasing objects and
> > > both flush on assign.
> > > 
> > > > Maybe we should simply document that userspace should deassign
> > > > eoifd before closing it? This is what we do for ioeventfd.
> > > > If we do this, the whole polling code can go away completely.
> > > 
> > > You're again ignoring the close problem.  We cannot document around an
> > > impossible requirement that fds are always deassigned before close.
> > 
> > Well userspace can easily call a deassign ioctl. Why is it so important
> > that deassign is not required?
> 
> Because everything allocated through a file descriptor, specific to that
> file descriptor, should be freed when the file descriptor is closed.
> That's what people expect.

That's what documentation is for.

> > > > IMHO ioeventfd is broken here and I don't wish to emulate its behavior.
> > 
> > So fix ioeventfd first. Making eoifd and ioeventfd behave differently does not
> > make sense they are very similar.
> 
> One at a time.  eoifd and ioeventfd have different requirements.
> ioeventfd is just wasting memory, eoifd can potentially exhaust irq
> source IDs.  Besides, you still defend ioeventfd as correct.

same as eoifd.

> > > > > +
> > > > > +static void eoifd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> > > > > +				    poll_table *pt)
> > > > > +{
> > > > > +	struct _eoifd *eoifd = container_of(pt, struct _eoifd, pt);
> > > > > +	add_wait_queue(wqh, &eoifd->wait);
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * This function is called as the kvm VM fd is being released. Shutdown all
> > > > > + * eoifds that still remain open
> > > > > + */
> > > > > +void kvm_eoifd_release(struct kvm *kvm)
> > > > > +{
> > > > > +	struct _eoifd *tmp, *eoifd;
> > > > > +
> > > > > +	spin_lock_irq(&kvm->eoifds.lock);
> > > > > +
> > > > > +	list_for_each_entry_safe(eoifd, tmp, &kvm->eoifds.items, list)
> > > > > +		eoifd_deactivate(eoifd);
> > > > > +
> > > > > +	spin_unlock_irq(&kvm->eoifds.lock);
> > > > > +
> > > > > +	flush_workqueue(irqfd_cleanup_wq);
> > > > > +}
> > > > > +
> > > > > +static void eoifd_event(struct kvm_irq_ack_notifier *notifier)
> > > > > +{
> > > > > +	struct _eoifd *eoifd;
> > > > > +
> > > > > +	eoifd = container_of(notifier, struct _eoifd, notifier);
> > > > > +
> > > > > +	if (unlikely(!eoifd->source))
> > > > > +		return;
> > > > > +
> > > > > +	/*
> > > > > +	 * De-assert and send EOI, user needs to re-assert if
> > > > > +	 * device still requires service.
> > > > > +	 */
> > > > 
> > > > I'm not sure why did you drop filtering by source id.
> > > > This means userspace gets events even if it did not send an interrupt.
> > > > So
> > > > 1. Should be documented that you can get spurious events 
> > > > 2. when an interrupt is shared with an emulated device,
> > > >    and said device uses EOI, this will not
> > > >    perform well as we will wake up userspace on each EOI.
> > > > 3. Just sharing interrupt with virtio means we are polling
> > > >    assigned device on each virtio interrupt.
> > > 
> > > Didn't we just agree after v5 that filtering requires a spinlock around
> > > > calling kvm_irq_set

this is already the case with your patchset. to avoid this,
I am working on caching for interrupts, when ready
you should probably rebase on top of that.

> or at least a new interface to setting irqs
> > > that allows us to see the current assertion state and that neither of
> > > those seem to be worth the effort for level irqs?  That's why I dropped
> > > it.  Interrupts always have to support spurious events.  The comment
> > > immediately above indicates this.  Legacy interrupts, especially shared
> > > legacy interrupts should not be our primary performance path.  VFIO has
> > > a very efficient path for handling spurious EOIs.
> > 
> > But it will not help that vfio does this efficiently if userspace
> > is woken up. You need to make it efficient for userspace consumers.
> > Otherwise it's a vfio specific interface.
> 
> > Does this affect the design of this interface or is this a potential
> future optimization?
> 

Not interface, implementation.  We just need to make it fast for all
users not just inkernel ones.
Alex Williamson July 31, 2012, 1:12 a.m. UTC | #6
On Tue, 2012-07-31 at 03:36 +0300, Michael S. Tsirkin wrote:
> On Mon, Jul 30, 2012 at 06:26:31PM -0600, Alex Williamson wrote:
> > On Tue, 2012-07-31 at 03:01 +0300, Michael S. Tsirkin wrote:
> > > On Mon, Jul 30, 2012 at 10:22:10AM -0600, Alex Williamson wrote:
> > > > On Sun, 2012-07-29 at 17:54 +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Jul 24, 2012 at 02:43:22PM -0600, Alex Williamson wrote:
> > > > > > This new ioctl enables an eventfd to be triggered when an EOI is
> > > > > > written for a specified irqchip pin.  The first user of this will
> > > > > > be external device assignment through VFIO, using a level irqfd
> > > > > > for asserting a PCI INTx interrupt and this interface for de-assert
> > > > > > and notification once the interrupt is serviced.
> > > > > > 
> > > > > > Here we make use of the reference counting of the _irq_source
> > > > > > object allowing us to share it with an irqfd and cleanup regardless
> > > > > > of the release order.
> > > > > > 
> > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > 
> > > > > > ---
> > > > > > 
> > > > > >  Documentation/virtual/kvm/api.txt |   21 ++
> > > > > >  arch/x86/kvm/x86.c                |    2 
> > > > > >  include/linux/kvm.h               |   15 ++
> > > > > >  include/linux/kvm_host.h          |   13 +
> > > > > >  virt/kvm/eventfd.c                |  336 +++++++++++++++++++++++++++++++++++++
> > > > > >  virt/kvm/kvm_main.c               |   11 +
> > > > > >  6 files changed, 398 insertions(+)
> > > > > > 
> > > > > > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > > > > > index 3911e62..8cd6b36 100644
> > > > > > --- a/Documentation/virtual/kvm/api.txt
> > > > > > +++ b/Documentation/virtual/kvm/api.txt
> > > > > > @@ -1989,6 +1989,27 @@ return the hash table order in the parameter.  (If the guest is using
> > > > > >  the virtualized real-mode area (VRMA) facility, the kernel will
> > > > > >  re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
> > > > > >  
> > > > > > +4.77 KVM_EOIFD
> > > > > > +
> > > > > > +Capability: KVM_CAP_EOIFD
> > > > > > +Architectures: x86
> > > > > > +Type: vm ioctl
> > > > > > +Parameters: struct kvm_eoifd (in)
> > > > > > +Returns: 0 on success, < 0 on error
> > > > > > +
> > > > > > +KVM_EOIFD allows userspace to receive interrupt EOI notification
> > > > > > +through an eventfd.
> > > > > 
> > > > > I thought about it some more, and I think it should be renamed to an
> > > > > interrupt ack notification than eoi notification.
> > > > > > For example, consider userspace that uses threaded interrupts.
> > > > > Currently what will happen is each interrupt will be injected
> > > > > twice, since on eoi device is still asserting it.
> > > > 
> > > > I don't follow, why is userspace writing an eoi to the ioapic if it
> > > > hasn't handled the interrupt
> > > 
> > > It has handled it - it disabled the hardware interrupt.
> > 
> > So it's not injected twice, it's held pending at the ioapic the second
> > time, just like hardware.
> 
> It is not like hardware at all. in hardware there is no overhead
> > here you interrupt the guest to run handler in host.

Obviously we have some overhead, we're emulating the guest hardware.
That doesn't make the behavior unlike hardware.

> >  Maybe there's a future optimization there,
> > but I don't think it's appropriate at this time.
> 
> Yes. But to make it *possible* in future we must remove
> the requirement to signal fd immediately on EOI.
> So rename it ackfd.

How does the name make that possible?  We can easily add a flag
EOIFD_FLAG_EOI_ON_REENABLE, or whatever.

> > > > and why wouldn't the same happen on bare
> > > > metal?
> > > 
> > > on bare metal level does not matter as long as interrupt
> > > is disabled.
> > > 
> > > > > One fix would be to delay event until interrupt is re-enabled.
> > > > > Now I am not asking you to fix this immediately,
> > > > > but I think we should make the interface generic by
> > > > > saying we report an ack to userspace and not specifically EOI.
> > > > 
> > > > Using the word "delay" in the context of interrupt delivery raises all
> > > > sorts of red flags for me, but I really don't understand your argument.
> > > 
> > > I am saying it's an "ack" of interrupt userspace cares about.
> > > The fact it is done by EOI is an implementation detail.
> > 
> > The implementation is how an EOI is generated on an ioapic, not that an
> > EOI exists.  How do I read a hardware spec and figure out what "ack of
> > interrupt" means?
> 
> It just means it will be called after guest has completed handling
> interrupt. How we detect that is our problem.

Conceptually, we're still looking for the EOI, we may just be able to
optimize to EOI && irqchip pin unmasked.  The name doesn't prohibit
anything here and eoifd is more descriptive that it relates to the end
of an interrupt where ackfd means nothing (what got acked?).

> > > > > >  kvm_eoifd.fd specifies the eventfd used for
> > > > > > +notification.  KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd
> > > > > > +once assigned.  KVM_EOIFD also requires additional bits set in
> > > > > > +kvm_eoifd.flags to bind to the proper interrupt line.  The
> > > > > > +KVM_EOIFD_FLAG_LEVEL_IRQFD indicates that kvm_eoifd.key is provided
> > > > > > +and is a key from a level triggered interrupt (configured from
> > > > > > +KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL).  The EOI notification is bound
> > > > > > +to the same GSI and irqchip input as the irqfd.  Both kvm_eoifd.key
> > > > > > +and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on assignment and
> > > > > > +de-assignment of KVM_EOIFD.  A level irqfd may only be bound to a
> > > > > > +single eoifd.  KVM_CAP_EOIFD_LEVEL_IRQFD indicates support of
> > > > > > +KVM_EOIFD_FLAG_LEVEL_IRQFD.
> > > > > >  
> > > > > 
> > > > > Hmm returning the key means we'll need to keep refcounting for source
> > > > > IDs around forever. I liked passing the fd better: make implementation
> > > > > match interface and not the other way around.
> > > > 
> > > > False, a source ID has a finite lifecycle.  The fd approach was broken.
> > > > Holding the irqfd context imposed too many dependencies between eoifd
> > > > and irqfd necessitating things like one interface disabling another.  I
> > > > thoroughly disagree with that approach.
> > > 
> > > You keep saying this but it is still true: once irqfd
> > > is closed eoifd does not get any more interrupts.
> > 
> > How does that matter?
> 
> Well if it does not get events it is disabled.
> so you have one ifc disabling another, anyway.

And a level irqfd without an eoifd can never be de-asserted.  Either we
make modular components, assemble them to do useful work, and
disassemble them independently so they can be used by future interfaces
or we bundle eoifd as just an option of irqfd.  Which is it gonna be?

> > > > > >  5. The kvm_run structure
> > > > > >  ------------------------
> > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > > > index 9ded39d..8f3164e 100644
> > > > > > --- a/arch/x86/kvm/x86.c
> > > > > > +++ b/arch/x86/kvm/x86.c
> > > > > > @@ -2171,6 +2171,8 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > > > >  	case KVM_CAP_PCI_2_3:
> > > > > >  	case KVM_CAP_KVMCLOCK_CTRL:
> > > > > >  	case KVM_CAP_IRQFD_LEVEL:
> > > > > > +	case KVM_CAP_EOIFD:
> > > > > > +	case KVM_CAP_EOIFD_LEVEL_IRQFD:
> > > > > >  		r = 1;
> > > > > >  		break;
> > > > > >  	case KVM_CAP_COALESCED_MMIO:
> > > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > > > > index b2e6e4f..effb916 100644
> > > > > > --- a/include/linux/kvm.h
> > > > > > +++ b/include/linux/kvm.h
> > > > > > @@ -619,6 +619,8 @@ struct kvm_ppc_smmu_info {
> > > > > >  #define KVM_CAP_S390_COW 79
> > > > > >  #define KVM_CAP_PPC_ALLOC_HTAB 80
> > > > > >  #define KVM_CAP_IRQFD_LEVEL 81
> > > > > > +#define KVM_CAP_EOIFD 82
> > > > > > +#define KVM_CAP_EOIFD_LEVEL_IRQFD 83
> > > > > >  
> > > > > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > > > >  
> > > > > > @@ -694,6 +696,17 @@ struct kvm_irqfd {
> > > > > >  	__u8  pad[20];
> > > > > >  };
> > > > > >  
> > > > > > +#define KVM_EOIFD_FLAG_DEASSIGN (1 << 0)
> > > > > > +/* Available with KVM_CAP_EOIFD_LEVEL_IRQFD */
> > > > > > +#define KVM_EOIFD_FLAG_LEVEL_IRQFD (1 << 1)
> > > > > > +
> > > > > > +struct kvm_eoifd {
> > > > > > +	__u32 fd;
> > > > > > +	__u32 flags;
> > > > > > +	__u32 key;
> > > > > > +	__u8 pad[20];
> > > > > > +};
> > > > > > +
> > > > > >  struct kvm_clock_data {
> > > > > >  	__u64 clock;
> > > > > >  	__u32 flags;
> > > > > > @@ -834,6 +847,8 @@ struct kvm_s390_ucas_mapping {
> > > > > >  #define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
> > > > > >  /* Available with KVM_CAP_PPC_ALLOC_HTAB */
> > > > > >  #define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
> > > > > > +/* Available with KVM_CAP_EOIFD */
> > > > > > +#define KVM_EOIFD                 _IOW(KVMIO,  0xa8, struct kvm_eoifd)
> > > > > >  
> > > > > >  /*
> > > > > >   * ioctls for vcpu fds
> > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > > index c73f071..01e72a6 100644
> > > > > > --- a/include/linux/kvm_host.h
> > > > > > +++ b/include/linux/kvm_host.h
> > > > > > @@ -289,6 +289,10 @@ struct kvm {
> > > > > >  		struct mutex lock;
> > > > > >  		struct list_head items;
> > > > > >  	} irqsources;
> > > > > > +	struct {
> > > > > > +		spinlock_t lock;
> > > > > > +		struct list_head items;
> > > > > > +	} eoifds;
> > > > > >  #endif
> > > > > >  	struct kvm_vm_stat stat;
> > > > > >  	struct kvm_arch arch;
> > > > > > @@ -832,6 +836,8 @@ int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
> > > > > >  void kvm_irqfd_release(struct kvm *kvm);
> > > > > >  void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
> > > > > >  int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
> > > > > > +int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args);
> > > > > > +void kvm_eoifd_release(struct kvm *kvm);
> > > > > >  
> > > > > >  #else
> > > > > >  
> > > > > > @@ -857,6 +863,13 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> > > > > >  	return -ENOSYS;
> > > > > >  }
> > > > > >  
> > > > > > +static inline int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
> > > > > > +{
> > > > > > +	return -ENOSYS;
> > > > > > +}
> > > > > > +
> > > > > > +static inline void kvm_eoifd_release(struct kvm *kvm) {}
> > > > > > +
> > > > > >  #endif /* CONFIG_HAVE_KVM_EVENTFD */
> > > > > >  
> > > > > >  #ifdef CONFIG_KVM_APIC_ARCHITECTURE
> > > > > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > > > > index 878cb52..3aa2d62 100644
> > > > > > --- a/virt/kvm/eventfd.c
> > > > > > +++ b/virt/kvm/eventfd.c
> > > > > > @@ -95,6 +95,25 @@ static struct _irq_source *_irq_source_alloc(struct kvm *kvm, int gsi)
> > > > > >  	return source;
> > > > > >  }
> > > > > >  
> > > > > > +static struct _irq_source *_irq_source_get_from_key(struct kvm *kvm, int key)
> > > > > > +{
> > > > > > +	struct _irq_source *tmp, *source = ERR_PTR(-ENOENT);
> > > > > > +
> > > > > > +	mutex_lock(&kvm->irqsources.lock);
> > > > > > +
> > > > > > +	list_for_each_entry(tmp, &kvm->irqsources.items, list) {
> > > > > > +		if (tmp->id == key) {
> > > > > > +			source = tmp;
> > > > > > +			kref_get(&source->kref);
> > > > > > +			break;
> > > > > > +		}
> > > > > > +	}
> > > > > > +
> > > > > > +	mutex_unlock(&kvm->irqsources.lock);
> > > > > > +
> > > > > > +	return source;
> > > > > > +}
> > > > > > +
> > > > > >  /*
> > > > > >   * --------------------------------------------------------------------
> > > > > >   * irqfd: Allows an fd to be used to inject an interrupt to the guest
> > > > > > @@ -406,6 +425,8 @@ kvm_eventfd_init(struct kvm *kvm)
> > > > > >  	INIT_LIST_HEAD(&kvm->ioeventfds);
> > > > > >  	mutex_init(&kvm->irqsources.lock);
> > > > > >  	INIT_LIST_HEAD(&kvm->irqsources.items);
> > > > > > +	spin_lock_init(&kvm->eoifds.lock);
> > > > > > +	INIT_LIST_HEAD(&kvm->eoifds.items);
> > > > > >  }
> > > > > >  
> > > > > >  /*
> > > > > > @@ -772,3 +793,318 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> > > > > >  
> > > > > >  	return kvm_assign_ioeventfd(kvm, args);
> > > > > >  }
> > > > > > +
> > > > > > +/*
> > > > > > + * --------------------------------------------------------------------
> > > > > > + *  eoifd: Translate KVM APIC/IOAPIC EOI into eventfd signal.
> > > > > > + *
> > > > > > + *  userspace can register with an eventfd for receiving
> > > > > > + *  notification when an EOI occurs.
> > > > > > + * --------------------------------------------------------------------
> > > > > > + */
> > > > > > +
> > > > > > +struct _eoifd {
> > > > > > +	/* eventfd triggered on EOI */
> > > > > > +	struct eventfd_ctx *eventfd;
> > > > > > +	/* irq source ID de-asserted on EOI */
> > > > > > +	struct _irq_source *source;
> > > > > > +	wait_queue_t wait;
> > > > > > +	/* EOI notification from KVM */
> > > > > > +	struct kvm_irq_ack_notifier notifier;
> > > > > > +	struct list_head list;
> > > > > > +	poll_table pt;
> > > > > > +	struct work_struct shutdown;
> > > > > > +};
> > > > > > +
> > > > > > +/* Called under eoifds.lock */
> > > > > > +static void eoifd_shutdown(struct work_struct *work)
> > > > > > +{
> > > > > > +	struct _eoifd *eoifd = container_of(work, struct _eoifd, shutdown);
> > > > > > +	struct kvm *kvm = eoifd->source->kvm;
> > > > > > +	u64 cnt;
> > > > > > +
> > > > > > +	/*
> > > > > > +	 * Stop EOI signaling
> > > > > > +	 */
> > > > > > +	kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier);
> > > > > > +
> > > > > > +	/*
> > > > > > +	 * Synchronize with the wait-queue and unhook ourselves to prevent
> > > > > > +	 * further events.
> > > > > > +	 */
> > > > > > +	eventfd_ctx_remove_wait_queue(eoifd->eventfd, &eoifd->wait, &cnt);
> > > > > > +
> > > > > > +	/*
> > > > > > +	 * Release resources
> > > > > > +	 */
> > > > > > +	eventfd_ctx_put(eoifd->eventfd);
> > > > > > +	_irq_source_put(eoifd->source);
> > > > > > +	kfree(eoifd);
> > > > > > +}
> > > > > > +
> > > > > > +/* assumes kvm->eoifds.lock is held */
> > > > > > +static bool eoifd_is_active(struct _eoifd *eoifd)
> > > > > > +{
> > > > > > +	return list_empty(&eoifd->list) ? false : true;
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > + * Mark the eoifd as inactive and schedule it for removal
> > > > > > + *
> > > > > > + * assumes kvm->eoifds.lock is held
> > > > > > + */
> > > > > > +static void eoifd_deactivate(struct _eoifd *eoifd)
> > > > > > +{
> > > > > > +	BUG_ON(!eoifd_is_active(eoifd));
> > > > > > +
> > > > > > +	list_del_init(&eoifd->list);
> > > > > > +
> > > > > > +	queue_work(irqfd_cleanup_wq, &eoifd->shutdown);
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > + * Called with wqh->lock held and interrupts disabled
> > > > > > + */
> > > > > > +static int eoifd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> > > > > > +{
> > > > > > +	unsigned long flags = (unsigned long)key;
> > > > > > +
> > > > > > +	if (unlikely(flags & POLLHUP)) {
> > > > > > +		/* The eventfd is closing, detach from KVM */
> > > > > > +		struct _eoifd *eoifd = container_of(wait, struct _eoifd, wait);
> > > > > > +		struct kvm *kvm = eoifd->source->kvm;
> > > > > > +		unsigned long flags;
> > > > > > +
> > > > > > +		spin_lock_irqsave(&kvm->eoifds.lock, flags);
> > > > > > +
> > > > > > +		/*
> > > > > > +		 * We must check if someone deactivated the eoifd before
> > > > > > +		 * we could acquire the eoifds.lock since the item is
> > > > > > +		 * deactivated from the KVM side before it is unhooked from
> > > > > > +		 * the wait-queue.  If it is already deactivated, we can
> > > > > > +		 * simply return knowing the other side will cleanup for us.
> > > > > > +		 * We cannot race against the eoifd going away since the
> > > > > > +		 * other side is required to acquire wqh->lock, which we hold
> > > > > > +		 */
> > > > > > +		if (eoifd_is_active(eoifd))
> > > > > > +			eoifd_deactivate(eoifd);
> > > > > > +
> > > > > > +		spin_unlock_irqrestore(&kvm->eoifds.lock, flags);
> > > > > > +	}
> > > > > > +
> > > > > > +	return 0;
> > > > > > +}
> > > > > 
> > > > > Looks like there is a bug here: if I close irqfd, then close eoifd,
> > > > > the key is not immediately released so an attempt to create
> > > > > an irqfd can fail to get the source id.
> > > > 
> > > > Both irqfd and eoifd use the same workqueue for releasing objects and
> > > > both flush on assign.
> > > > 
> > > > > Maybe we should simply document that userspace should deassign
> > > > > eoifd before closing it? This is what we do for ioeventfd.
> > > > > If we do this, the whole polling code can go away completely.
> > > > 
> > > > You're again ignoring the close problem.  We cannot document around an
> > > > impossible requirement that fds are always deassigned before close.
> > > 
> > > Well userspace can easily call a deassign ioctl. Why is it so important
> > > that deassign is not required?
> > 
> > Because everything allocated through a file descriptor, specific to that
> > file descriptor, should be freed when the file descriptor is closed.
> > That's what people expect.
> 
> That's what documentation is for.

No, documentation does not fix poor, non-intuitive design.  Gleb has
already voiced his agreement to cleanup on close.

> > > > IMHO ioeventfd is broken here and I don't wish to emulate it's behavior.
> > > 
> > > So fix ioeventfd first. Making eoifd and ioeventfd behave differently does not
> > > make sense they are very similar.
> > 
> > One at a time.  eoifd and ioeventfd have different requirements.
> > ioeventfd is just wasting memory, eoifd can potentially exhaust irq
> > source IDs.  Besides, you still defend ioeventfd as correct.
> 
> same as eoifd.

So why would I take up arms for an interface I don't even use?

> > > > > > +
> > > > > > +static void eoifd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> > > > > > +				    poll_table *pt)
> > > > > > +{
> > > > > > +	struct _eoifd *eoifd = container_of(pt, struct _eoifd, pt);
> > > > > > +	add_wait_queue(wqh, &eoifd->wait);
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > + * This function is called as the kvm VM fd is being released. Shutdown all
> > > > > > + * eoifds that still remain open
> > > > > > + */
> > > > > > +void kvm_eoifd_release(struct kvm *kvm)
> > > > > > +{
> > > > > > +	struct _eoifd *tmp, *eoifd;
> > > > > > +
> > > > > > +	spin_lock_irq(&kvm->eoifds.lock);
> > > > > > +
> > > > > > +	list_for_each_entry_safe(eoifd, tmp, &kvm->eoifds.items, list)
> > > > > > +		eoifd_deactivate(eoifd);
> > > > > > +
> > > > > > +	spin_unlock_irq(&kvm->eoifds.lock);
> > > > > > +
> > > > > > +	flush_workqueue(irqfd_cleanup_wq);
> > > > > > +}
> > > > > > +
> > > > > > +static void eoifd_event(struct kvm_irq_ack_notifier *notifier)
> > > > > > +{
> > > > > > +	struct _eoifd *eoifd;
> > > > > > +
> > > > > > +	eoifd = container_of(notifier, struct _eoifd, notifier);
> > > > > > +
> > > > > > +	if (unlikely(!eoifd->source))
> > > > > > +		return;
> > > > > > +
> > > > > > +	/*
> > > > > > +	 * De-assert and send EOI, user needs to re-assert if
> > > > > > +	 * device still requires service.
> > > > > > +	 */
> > > > > 
> > > > > I'm not sure why did you drop filtering by source id.
> > > > > This means userspace gets events even if it did not send an interrupt.
> > > > > So
> > > > > 1. Should be documented that you can get spurious events 
> > > > > 2. when an interrupt is shared with an emulated device,
> > > > >    and said device uses EOI, this will not
> > > > >    perform well as we will wake up userspace on each EOI.
> > > > > 3. Just sharing interrupt with virtio means we are polling
> > > > >    assigned device on each virtio interrupt.
> > > > 
> > > > Didn't we just agree after v5 that filtering requires a spinlock around
> > > > around calling kvm_irq_set
> 
> this is already the case with your patchset. to avoid this,
> I am working on caching for interrupts, when ready
> you should probably rebase on top of that.

Sounds like a parallel effort, not a serialization.  This is an
optimization that does not affect the API.

> > or at least a new interface to setting irqs
> > > > that allows us to see the current assertion state and that neither of
> > > > those seem to be worth the effort for level irqs?  That's why I dropped
> > > > it.  Interrupts always have to support spurious events.  The comment
> > > > immediately above indicates this.  Legacy interrupts, especially shared
> > > > legacy interrupts should not be our primary performance path.  VFIO has
> > > > a very efficient path for handling spurious EOIs.
> > > 
> > > But it will not help that vfio does this efficiently if userspace
> > > is woken up. You need to make it efficient for userspace consumers.
> > > Otherwise it's a vfio specific interface.
> > 
> > Does this effect the design of this interface or is this a potential
> > future optimization?
> > 
> 
> Not interface, implementation.  We just need to make it fast for all
> users not just inkernel ones.

Exactly, an optimization that doesn't change the API, not a blocking
issue.



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Aug. 1, 2012, 7:06 p.m. UTC | #7
On Mon, 2012-07-30 at 19:12 -0600, Alex Williamson wrote:
> On Tue, 2012-07-31 at 03:36 +0300, Michael S. Tsirkin wrote:
> > On Mon, Jul 30, 2012 at 06:26:31PM -0600, Alex Williamson wrote:
> > > On Tue, 2012-07-31 at 03:01 +0300, Michael S. Tsirkin wrote:
> > > > You keep saying this but it is still true: once irqfd
> > > > is closed eoifd does not get any more interrupts.
> > > 
> > > How does that matter?
> > 
> > Well if it does not get events it is disabled.
> > so you have one ifc disabling another, anyway.
> 
> And a level irqfd without an eoifd can never be de-asserted.  Either we
> make modular components, assemble them to do useful work, and
> disassemble them independently so they can be used by future interfaces
> or we bundle eoifd as just an option of irqfd.  Which is it gonna be?

I don't think I've been successful at explaining my reasoning for making
EOI notification a separate interface, so let me try again...

When kvm is not enabled, the qemu vfio driver still needs to know about
EOIs to re-enable the physical interrupt.  Since the ioapic is emulated
in qemu, we can setup a notifier for this and create abstraction to make
it non-x86 specific, etc.  We just need to come up with a design and
implement it.  But what happens when kvm is then enabled?  ioapic
emulation moves to the kernel (assume kvm includes irqchip for this
argument even though it doesn't for POWER), qemu no longer knows about
EOIs, and the interface we just created to handle the non-kvm case stops
working.  Is anyone going to accept adding a qemu EOI notification
interface that only works when kvm is not enabled?

I suspect we therefore need a notification mechanism between kvm and
qemu to make it possible for that interface to continue working.  An
eventfd also seems like the right mechanism there.  A simple
modification to the proposed KVM_EOIFD here would allow it to trigger an
eventfd when an EOI is written to a specific gsi on
KVM_USERSPACE_IRQ_SOURCE_ID (define a flag and pass gsi in place of
key).

The split proposed here does require some assembly, but KVM_EOIFD is
re-usable as either a de-assert and notify mechanism tied to an irqfd or
a notify-only mechanism allowing users of a qemu EOI notification
infrastructure to continue working.  vfio doesn't necessarily need this
middle ground, but can easily be used to test it.

The alternative is that we pull eoifd into KVM_IRQFD and invent some
other new EOI interface for qemu.  That means we get EOIs tied to an
irqfd via one path and other EOIs via another ioctl.  Personally that
seems less desirable, but I'm willing to explore that route if
necessary.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Aug. 2, 2012, 8:42 a.m. UTC | #8
On Mon, Jul 30, 2012 at 07:12:15PM -0600, Alex Williamson wrote:
> > > > > > >  kvm_eoifd.fd specifies the eventfd used for
> > > > > > > +notification.  KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd
> > > > > > > +once assigned.  KVM_EOIFD also requires additional bits set in
> > > > > > > +kvm_eoifd.flags to bind to the proper interrupt line.  The
> > > > > > > +KVM_EOIFD_FLAG_LEVEL_IRQFD indicates that kvm_eoifd.key is provided
> > > > > > > +and is a key from a level triggered interrupt (configured from
> > > > > > > +KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL).  The EOI notification is bound
> > > > > > > +to the same GSI and irqchip input as the irqfd.  Both kvm_eoifd.key
> > > > > > > +and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on assignment and
> > > > > > > +de-assignment of KVM_EOIFD.  A level irqfd may only be bound to a
> > > > > > > +single eoifd.  KVM_CAP_EOIFD_LEVEL_IRQFD indicates support of
> > > > > > > +KVM_EOIFD_FLAG_LEVEL_IRQFD.
> > > > > > >  
> > > > > > 
> > > > > > Hmm returning the key means we'll need to keep refcounting for source
> > > > > > IDs around forever. I liked passing the fd better: make implementation
> > > > > > match interface and not the other way around.
> > > > > 
> > > > > False, a source ID has a finite lifecycle.  The fd approach was broken.
> > > > > Holding the irqfd context imposed too many dependencies between eoifd
> > > > > and irqfd necessitating things like one interface disabling another.  I
> > > > > thoroughly disagree with that approach.
> > > > 
> > > > You keep saying this but it is still true: once irqfd
> > > > is closed eoifd does not get any more interrupts.
> > > 
> > > How does that matter?
> > 
> > Well if it does not get events it is disabled.
> > so you have one ifc disabling another, anyway.
> 
> And a level irqfd without an eoifd can never be de-asserted.  Either we
> make modular components, assemble them to do useful work, and
> disassemble them independently so they can be used by future interfaces
> or we bundle eoifd as just an option of irqfd.  Which is it gonna be?

I'm fine just making it an option. I think Gleb wanted a separate
EOIFD to handle timedrift but it later seemed that eventfd is not
suitable for that?
Avi Kivity Aug. 6, 2012, 10:17 a.m. UTC | #9
On 07/24/2012 11:43 PM, Alex Williamson wrote:
> This new ioctl enables an eventfd to be triggered when an EOI is
> written for a specified irqchip pin.  The first user of this will
> be external device assignment through VFIO, using a level irqfd
> for asserting a PCI INTx interrupt and this interface for de-assert
> and notification once the interrupt is serviced.
> 
> Here we make use of the reference counting of the _irq_source
> object allowing us to share it with an irqfd and cleanup regardless
> of the release order.

The name is slightly misleading.  eoifd doesn't trigger on EOI (which is
an APIC->IOAPIC interface) but rather when an interrupt controller
resamples an input line.  This happens for the IOAPIC when an EOI is
received for a vector that is configured for level interrupts and not
masked, or similarly for a PIC (but this is not triggered by an APIC EOI).

It's not a huge difference, but let's document it.

>  
> +4.77 KVM_EOIFD
> +
> +Capability: KVM_CAP_EOIFD
> +Architectures: x86
> +Type: vm ioctl
> +Parameters: struct kvm_eoifd (in)
> +Returns: 0 on success, < 0 on error
> +
> +KVM_EOIFD allows userspace to receive interrupt EOI notification
> +through an eventfd.  kvm_eoifd.fd specifies the eventfd used for
> +notification.  KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd
> +once assigned.  KVM_EOIFD also requires additional bits set in
> +kvm_eoifd.flags to bind to the proper interrupt line.  The
> +KVM_EOIFD_FLAG_LEVEL_IRQFD indicates that kvm_eoifd.key is provided
> +and is a key from a level triggered interrupt (configured from
> +KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL).  The EOI notification is bound
> +to the same GSI and irqchip input as the irqfd.  Both kvm_eoifd.key
> +and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on assignment and
> +de-assignment of KVM_EOIFD.  A level irqfd may only be bound to a
> +single eoifd.  KVM_CAP_EOIFD_LEVEL_IRQFD indicates support of
> +KVM_EOIFD_FLAG_LEVEL_IRQFD.

Why do we need to couple eoifd and irqfd?
Avi Kivity Aug. 6, 2012, 10:38 a.m. UTC | #10
On 08/06/2012 01:17 PM, Avi Kivity wrote:
> 
>>  
>> +4.77 KVM_EOIFD
>> +
>> +Capability: KVM_CAP_EOIFD
>> +Architectures: x86
>> +Type: vm ioctl
>> +Parameters: struct kvm_eoifd (in)
>> +Returns: 0 on success, < 0 on error
>> +
>> +KVM_EOIFD allows userspace to receive interrupt EOI notification
>> +through an eventfd.  kvm_eoifd.fd specifies the eventfd used for
>> +notification.  KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd
>> +once assigned.  KVM_EOIFD also requires additional bits set in
>> +kvm_eoifd.flags to bind to the proper interrupt line.  The
>> +KVM_EOIFD_FLAG_LEVEL_IRQFD indicates that kvm_eoifd.key is provided
>> +and is a key from a level triggered interrupt (configured from
>> +KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL).  The EOI notification is bound
>> +to the same GSI and irqchip input as the irqfd.  Both kvm_eoifd.key
>> +and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on assignment and
>> +de-assignment of KVM_EOIFD.  A level irqfd may only be bound to a
>> +single eoifd.  KVM_CAP_EOIFD_LEVEL_IRQFD indicates support of
>> +KVM_EOIFD_FLAG_LEVEL_IRQFD.
> 
> Why do we need to couple eoifd and irqfd?

Oh, it's to auto-deassert the line.

Regarding the implementation, instead of a linked list, would an array
of counters parallel to the bitmap make it simpler?
Avi Kivity Aug. 6, 2012, 10:40 a.m. UTC | #11
On 08/06/2012 01:38 PM, Avi Kivity wrote:

> Regarding the implementation, instead of a linked list, would an array
> of counters parallel to the bitmap make it simpler?

Or even, replace the bitmap with an array of counters.
Alex Williamson Aug. 9, 2012, 7:26 p.m. UTC | #12
On Mon, 2012-08-06 at 13:40 +0300, Avi Kivity wrote:
> On 08/06/2012 01:38 PM, Avi Kivity wrote:
> 
> > Regarding the implementation, instead of a linked list, would an array
> > of counters parallel to the bitmap make it simpler?
> 
> Or even, replace the bitmap with an array of counters.

I'm not sure a counter array is what we're really after.  That gives us
reference counting for the irq source IDs, but not the key->gsi lookup.
It also highlights another issue, that we have a limited set of source
IDs.  Looks like we have BITS_PER_LONG IDs, with two already used, one
for the shared userspace ID and another for the PIT.  How happy are we
going to be with a limit of 62 level interrupts in use at one time?

It's arguably a reasonable number since the most virtualization friendly
devices (sr-iov VFs) don't even support this kind of interrupt.  It's
also very wasteful allocating an entire source ID for a single GSI
within that source ID.  PCI supports interrupts A, B, C, and D, which,
in the most optimal config, each go to different GSIs.  So we could
theoretically be more efficient in our use and allocation of irq source
IDs if we tracked use by the source ID, gsi pair.

That probably makes it less practical to replace anything at the top
level with a counter array.  The key that we pass back is currently the
actual source ID, but we don't specify what it is, so we could split it
and have it encode a 16bit source ID plus 16 bit GSI.  It could also be
an idr entry.

Michael, would the interface be more acceptable to you if we added
separate ioctls to allocate and free some representation of an irq
source ID, gsi pair?  For instance, an ioctl might return an idr entry
for an irq source ID/gsi object which would then be passed as a
parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
representing the source id/gsi isn't magically freed on its own.  This
would also allow us to deassign/close one end and reconfigure it later.
Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Aug. 12, 2012, 7:49 a.m. UTC | #13
On Wed, Aug 01, 2012 at 01:06:42PM -0600, Alex Williamson wrote:
> On Mon, 2012-07-30 at 19:12 -0600, Alex Williamson wrote:
> > On Tue, 2012-07-31 at 03:36 +0300, Michael S. Tsirkin wrote:
> > > On Mon, Jul 30, 2012 at 06:26:31PM -0600, Alex Williamson wrote:
> > > > On Tue, 2012-07-31 at 03:01 +0300, Michael S. Tsirkin wrote:
> > > > > You keep saying this but it is still true: once irqfd
> > > > > is closed eoifd does not get any more interrupts.
> > > > 
> > > > How does that matter?
> > > 
> > > Well if it does not get events it is disabled.
> > > so you have one ifc disabling another, anyway.
> > 
> > And a level irqfd without an eoifd can never be de-asserted.  Either we
> > make modular components, assemble them to do useful work, and
> > disassemble them independently so they can be used by future interfaces
> > or we bundle eoifd as just an option of irqfd.  Which is it gonna be?
> 
> I don't think I've been successful at explaining my reasoning for making
> EOI notification a separate interface, so let me try again...
> 
> When kvm is not enabled, the qemu vfio driver still needs to know about
> EOIs to re-enable the physical interrupt.  Since the ioapic is emulated
> in qemu, we can setup a notifier for this and create abstraction to make
> it non-x86 specific, etc.  We just need to come up with a design and
> implement it.  But what happens when kvm is then enabled?  ioapic
> emulation moves to the kernel (assume kvm includes irqchip for this
> argument even though it doesn't for POWER), qemu no longer knows about
> EOIs, and the interface we just created to handle the non-kvm case stops
> working.  Is anyone going to accept adding a qemu EOI notification
> interface that only works when kvm is not enabled?

Yes, it's only a question of abstracting it at the right level.

For example, if as you suggest below kvm gives you an eventfd that
asserts an irq, later automatically deasserts it and notifies another
eventfd, we can do exactly this in both tcg and kvm:

setup_level_irq(int gsi, int assert_eventfd, int deassert_eventfd)

I'm not advocating this interface, just pointing out that to make the
same abstraction work in both tcg and kvm, we should first look at what
it does in each of them.

> I suspect we therefore need a notification mechanism between kvm and
> qemu to make it possible for that interface to continue working.

Even though no one is actually using it. IMHO, this is a maintenance
problem.

> An
> eventfd also seems like the right mechanism there.  A simple
> modification to the proposed KVM_EOIFD here would allow it to trigger an
> eventfd when an EOI is written to a specific gsi on
> KVM_USERSPACE_IRQ_SOURCE_ID (define a flag and pass gsi in place of
> key).
> 
> The split proposed here does require some assembly, but KVM_EOIFD is
> re-usable as either a de-assert and notify mechanism tied to an irqfd or
> a notify-only mechanism allowing users of a qemu EOI notification
> infrastructure to continue working.  vfio doesn't necessarily need this
> middle ground, but can easily be used to test it.
> 
> The alternative is that we pull eoifd into KVM_IRQFD and invent some
> other new EOI interface for qemu.  That means we get EOIs tied to an
> irqfd via one path and other EOIs via another ioctl.  Personally that
> seems less desirable, but I'm willing to explore that route if
> necessary.  Thanks,
> 
> Alex

Maybe we should focus on the fact that we notify userspace that we
deasserted interrupt instead of EOI.
Michael S. Tsirkin Aug. 12, 2012, 7:53 a.m. UTC | #14
On Mon, Aug 06, 2012 at 01:17:12PM +0300, Avi Kivity wrote:
> On 07/24/2012 11:43 PM, Alex Williamson wrote:
> > This new ioctl enables an eventfd to be triggered when an EOI is
> > written for a specified irqchip pin.  The first user of this will
> > be external device assignment through VFIO, using a level irqfd
> > for asserting a PCI INTx interrupt and this interface for de-assert
> > and notification once the interrupt is serviced.
> > 
> > Here we make use of the reference counting of the _irq_source
> > object allowing us to share it with an irqfd and cleanup regardless
> > of the release order.
> 
> The name is slightly misleading.  eoifd doesn't trigger on EOI (which is
> an APIC->IOAPIC interface) but rather when an interrupt controller
> resamples an input line.  This happens for the IOAPIC when an EOI is
> received for a vector that is configured for level interrupts and not
> masked, or similarly for a PIC (but this is not triggered by an APIC EOI).
> 
> It's not a huge difference, but let's document it.

In fact, the point at which we really need to notify userspace is
after we auto-deassert an interrupt: userspace does not
need an EOI notification as such.


> >  
> > +4.77 KVM_EOIFD
> > +
> > +Capability: KVM_CAP_EOIFD
> > +Architectures: x86
> > +Type: vm ioctl
> > +Parameters: struct kvm_eoifd (in)
> > +Returns: 0 on success, < 0 on error
> > +
> > +KVM_EOIFD allows userspace to receive interrupt EOI notification
> > +through an eventfd.  kvm_eoifd.fd specifies the eventfd used for
> > +notification.  KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd
> > +once assigned.  KVM_EOIFD also requires additional bits set in
> > +kvm_eoifd.flags to bind to the proper interrupt line.  The
> > +KVM_EOIFD_FLAG_LEVEL_IRQFD indicates that kvm_eoifd.key is provided
> > +and is a key from a level triggered interrupt (configured from
> > +KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL).  The EOI notification is bound
> > +to the same GSI and irqchip input as the irqfd.  Both kvm_eoifd.key
> > +and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on assignment and
> > +de-assignment of KVM_EOIFD.  A level irqfd may only be bound to a
> > +single eoifd.  KVM_CAP_EOIFD_LEVEL_IRQFD indicates support of
> > +KVM_EOIFD_FLAG_LEVEL_IRQFD.
> 
> Why do we need to couple eoifd and irqfd?
> 
> 
> 
> -- 
> error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 12, 2012, 8:36 a.m. UTC | #15
On 08/09/2012 10:26 PM, Alex Williamson wrote:
> On Mon, 2012-08-06 at 13:40 +0300, Avi Kivity wrote:
>> On 08/06/2012 01:38 PM, Avi Kivity wrote:
>> 
>> > Regarding the implementation, instead of a linked list, would an array
>> > of counters parallel to the bitmap make it simpler?
>> 
>> Or even, replace the bitmap with an array of counters.
> 
> I'm not sure a counter array is what we're really after.  That gives us
> reference counting for the irq source IDs, but not the key->gsi lookup.

You can look up the gsi while registering the eoifd, so it's accessible
as eoifd->gsi instead of eoifd->source->gsi.  The irqfd can go away
while the eoifd is still active, but is this a problem?


> It also highlights another issue, that we have a limited set of source
> IDs.  Looks like we have BITS_PER_LONG IDs, with two already used, one
> for the shared userspace ID and another for the PIT.  How happy are we
> going to be with a limit of 62 level interrupts in use at one time?

When we start being unhappy we can increase that number.  On the other
hand more locks and lists makes me unhappy now.

> 
> It's arguably a reasonable number since the most virtualization friendly
> devices (sr-iov VFs) don't even support this kind of interrupt.  It's
> also very wasteful allocating an entire source ID for a single GSI
> within that source ID.  PCI supports interrupts A, B, C, and D, which,
> in the most optimal config, each go to different GSIs.  So we could
> theoretically be more efficient in our use and allocation of irq source
> IDs if we tracked use by the source ID, gsi pair.

There are, in one userspace, just three gsis available for PCI links, so
you're compressing the source id space by 3.

> That probably makes it less practical to replace anything at the top
> level with a counter array.  The key that we pass back is currently the
> actual source ID, but we don't specify what it is, so we could split it
> and have it encode a 16bit source ID plus 16 bit GSI.  It could also be
> an idr entry.

We can fix those kinds of problems by adding another layer of
indirection.  But I doubt they will be needed.  I don't see people
assigning 60 legacy devices to one guest.

> Michael, would the interface be more acceptable to you if we added
> separate ioctls to allocate and free some representation of an irq
> source ID, gsi pair?  For instance, an ioctl might return an idr entry
> for an irq source ID/gsi object which would then be passed as a
> parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> representing the source id/gsi isn't magically freed on it's own.  This
> would also allow us to deassign/close one end and reconfigure it later.
> Thanks,

Another option is to push the responsibility for allocating IDs for the
association to userspace.  Let userspace both create the irqfd and the
eoifd with the same ID, the kernel matches them at registration time and
copies the gsi/sourceid from the first to the second eventfd.
Michael S. Tsirkin Aug. 12, 2012, 9:33 a.m. UTC | #16
On Thu, Aug 09, 2012 at 01:26:15PM -0600, Alex Williamson wrote:
> On Mon, 2012-08-06 at 13:40 +0300, Avi Kivity wrote:
> > On 08/06/2012 01:38 PM, Avi Kivity wrote:
> > 
> > > Regarding the implementation, instead of a linked list, would an array
> > > of counters parallel to the bitmap make it simpler?
> > 
> > Or even, replace the bitmap with an array of counters.
> 
> I'm not sure a counter array is what we're really after.  That gives us
> reference counting for the irq source IDs, but not the key->gsi lookup.
> It also highlights another issue, that we have a limited set of source
> IDs.  Looks like we have BITS_PER_LONG IDs, with two already used, one
> for the shared userspace ID and another for the PIT.  How happy are we
> going to be with a limit of 62 level interrupts in use at one time?
> 
> It's arguably a reasonable number since the most virtualization friendly
> devices (sr-iov VFs) don't even support this kind of interrupt.  It's
> also very wasteful allocating an entire source ID for a single GSI
> within that source ID.  PCI supports interrupts A, B, C, and D, which,
> in the most optimal config, each go to different GSIs.  So we could
> theoretically be more efficient in our use and allocation of irq source
> IDs if we tracked use by the source ID, gsi pair.
> 
> That probably makes it less practical to replace anything at the top
> level with a counter array.  The key that we pass back is currently the
> actual source ID, but we don't specify what it is, so we could split it
> and have it encode a 16bit source ID plus 16 bit GSI.  It could also be
> an idr entry.
> 
> Michael, would the interface be more acceptable to you if we added
> separate ioctls to allocate and free some representation of an irq
> source ID, gsi pair?  For instance, an ioctl might return an idr entry
> for an irq source ID/gsi object which would then be passed as a
> parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> representing the source id/gsi isn't magically freed on it's own.  This
> would also allow us to deassign/close one end and reconfigure it later.
> Thanks,
> 
> Alex

It's acceptable to me either way. I was only pointing out that as
designed, the interface looks simple at first but then you find out some
subtle limitations which are implementation driven. This gives
an overall feeling the abstraction is too low level.

If we compare to the existing irqfd, isn't the difference
simply that irqfd deasserts immediately ATM, while we
want to delay this until later?

If yes, then along the lines that you proposed, and combining with my
idea of tracking deasserts, how do you like the following:

/* Keep line asserted until guest has handled the interrupt. */
#define KVM_IRQFD_FLAG_DEASSERT_ON_ACK (1 << 1)
/* Notify after line is deasserted. */
#define KVM_IRQFD_FLAG_DEASSERT_EVENTFD (2 << 1)

	struct kvm_irqfd {
		__u32 fd;
		__u32 gsi;
		__u32 flags;
		/* eventfd to notify when line is deasserted */
		__u32 deassert_eventfd;
		__u8  pad[16];
	};

now the only limitation is that KVM_IRQFD_FLAG_DEASSERT_ON_ACK is only
effective for level interrupts.

Notes about lifetime of objects:
	- closing deassert_eventfd does nothing (we can keep a
	  reference to it from irqfd, so there is no need for a
	  complex polling/flushing scheme)
	- closing irqfd or deasserting dis-associates
	  deassert_eventfd automatically
	- source id is internal to irqfd and goes away with it

it looks harder to misuse and fits what we want to do nicely,
and needs less code to implement.

Avi, what do you think?
Alex Williamson Aug. 13, 2012, 4:48 p.m. UTC | #17
On Sun, 2012-08-12 at 10:49 +0300, Michael S. Tsirkin wrote:
> On Wed, Aug 01, 2012 at 01:06:42PM -0600, Alex Williamson wrote:
> > On Mon, 2012-07-30 at 19:12 -0600, Alex Williamson wrote:
> > > On Tue, 2012-07-31 at 03:36 +0300, Michael S. Tsirkin wrote:
> > > > On Mon, Jul 30, 2012 at 06:26:31PM -0600, Alex Williamson wrote:
> > > > > On Tue, 2012-07-31 at 03:01 +0300, Michael S. Tsirkin wrote:
> > > > > > You keep saying this but it is still true: once irqfd
> > > > > > is closed eoifd does not get any more interrupts.
> > > > > 
> > > > > How does that matter?
> > > > 
> > > > Well if it does not get events it is disabled.
> > > > so you have one ifc disabling another, anyway.
> > > 
> > > And a level irqfd without an eoifd can never be de-asserted.  Either we
> > > make modular components, assemble them to do useful work, and
> > > disassemble them independently so they can be used by future interfaces
> > > or we bundle eoifd as just an option of irqfd.  Which is it gonna be?
> > 
> > I don't think I've been successful at explaining my reasoning for making
> > EOI notification a separate interface, so let me try again...
> > 
> > When kvm is not enabled, the qemu vfio driver still needs to know about
> > EOIs to re-enable the physical interrupt.  Since the ioapic is emulated
> > in qemu, we can setup a notifier for this and create abstraction to make
> > it non-x86 specific, etc.  We just need to come up with a design and
> > implement it.  But what happens when kvm is then enabled?  ioapic
> > emulation moves to the kernel (assume kvm includes irqchip for this
> > argument even though it doesn't for POWER), qemu no longer knows about
> > EOIs, and the interface we just created to handle the non-kvm case stops
> > working.  Is anyone going to accept adding a qemu EOI notification
> > interface that only works when kvm is not enabled?
> 
> Yes, it's only a question of abstracting it at the right level.
> 
> For example, if as you suggest below kvm gives you an eventfd that
> asserts an irq, later automatically deasserts it and notifies another
> eventfd, we can do exactly this in both tcg and kvm:
> 
> setup_level_irq(int gsi, int assert_eventfd, int deassert_eventfd)
> 
> Not advocating this interface but pointing out that to make
> same abstraction to work in tcg and kvm, see what it does in
> each of them first.

The tcg model I was thinking of is that we continue to use qemu_set_irq
to assert and de-assert the interrupt and add an eoi/ack notification
mechanism, much like the ack notifier that already exists in kvm.  There
doesn't seem to be much advantage to creating a new interrupt
infrastructure in tcg that can trigger interrupts by eventfds, so I
assume VFIO is always going to be responsible for the translation of an
eventfd to an irq assertion, get some kind of notification through qemu,
de-assert the interrupt and unmask the device.  With that model in mind,
perhaps it makes more sense why I've been keeping the eoi/ack separate
from irqfd.

> > I suspect we therefore need a notification mechanism between kvm and
> > qemu to make it possible for that interface to continue working.
> 
> Even though no one is actually using it. IMHO, this is a maintenance
> problem.

That's why I'm designing it the way I am.  VFIO will make use of it.  It
will just be using the de-assert and notify mode vs a notify-only mode
that tcg would use.  It would also be easy to add an option to vfio so
that we could fully test both modes.

> > An
> > eventfd also seems like the right mechanism there.  A simple
> > modification to the proposed KVM_EOIFD here would allow it to trigger an
> > eventfd when an EOI is written to a specific gsi on
> > KVM_USERSPACE_IRQ_SOURCE_ID (define a flag and pass gsi in place of
> > key).
> > 
> > The split proposed here does require some assembly, but KVM_EOIFD is
> > re-usable as either a de-assert and notify mechanism tied to an irqfd or
> > a notify-only mechanism allowing users of a qemu EOI notification
> > infrastructure to continue working.  vfio doesn't necessarily need this
> > middle ground, but can easily be used to test it.
> > 
> > The alternative is that we pull eoifd into KVM_IRQFD and invent some
> > other new EOI interface for qemu.  That means we get EOIs tied to an
> > irqfd via one path and other EOIs via another ioctl.  Personally that
> > seems less desirable, but I'm willing to explore that route if
> > necessary.  Thanks,
> > 
> > Alex
> 
> Maybe we should focus on the fact that we notify userspace that we
> deasserted interrupt instead of EOI.

But will a tcg user want the de-assert?  I assume not.  The de-assert is
an optimization to allow us to bypass evaluation in userspace.  In tcg
we're already there.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Aug. 13, 2012, 4:59 p.m. UTC | #18
On Mon, Aug 13, 2012 at 10:48:15AM -0600, Alex Williamson wrote:
> On Sun, 2012-08-12 at 10:49 +0300, Michael S. Tsirkin wrote:
> > On Wed, Aug 01, 2012 at 01:06:42PM -0600, Alex Williamson wrote:
> > > On Mon, 2012-07-30 at 19:12 -0600, Alex Williamson wrote:
> > > > On Tue, 2012-07-31 at 03:36 +0300, Michael S. Tsirkin wrote:
> > > > > On Mon, Jul 30, 2012 at 06:26:31PM -0600, Alex Williamson wrote:
> > > > > > On Tue, 2012-07-31 at 03:01 +0300, Michael S. Tsirkin wrote:
> > > > > > > You keep saying this but it is still true: once irqfd
> > > > > > > is closed eoifd does not get any more interrupts.
> > > > > > 
> > > > > > How does that matter?
> > > > > 
> > > > > Well if it does not get events it is disabled.
> > > > > so you have one ifc disabling another, anyway.
> > > > 
> > > > And a level irqfd without an eoifd can never be de-asserted.  Either we
> > > > make modular components, assemble them to do useful work, and
> > > > disassemble them independently so they can be used by future interfaces
> > > > or we bundle eoifd as just an option of irqfd.  Which is it gonna be?
> > > 
> > > I don't think I've been successful at explaining my reasoning for making
> > > EOI notification a separate interface, so let me try again...
> > > 
> > > When kvm is not enabled, the qemu vfio driver still needs to know about
> > > EOIs to re-enable the physical interrupt.  Since the ioapic is emulated
> > > in qemu, we can setup a notifier for this and create abstraction to make
> > > it non-x86 specific, etc.  We just need to come up with a design and
> > > implement it.  But what happens when kvm is then enabled?  ioapic
> > > emulation moves to the kernel (assume kvm includes irqchip for this
> > > argument even though it doesn't for POWER), qemu no longer knows about
> > > EOIs, and the interface we just created to handle the non-kvm case stops
> > > working.  Is anyone going to accept adding a qemu EOI notification
> > > interface that only works when kvm is not enabled?
> > 
> > Yes, it's only a question of abstracting it at the right level.
> > 
> > For example, if as you suggest below kvm gives you an eventfd that
> > asserts an irq, later automatically deasserts it and notifies another
> > eventfd, we can do exactly this in both tcg and kvm:
> > 
> > setup_level_irq(int gsi, int assert_eventfd, int deassert_eventfd)
> > 
> > Not advocating this interface but pointing out that to make
> > same abstraction to work in tcg and kvm, see what it does in
> > each of them first.
> 
> The tcg model I was thinking of is that we continue to use qemu_set_irq
> to assert and de-assert the interrupt and add an eoi/ack notification
> mechanism, much like the ack notifier that already exists in kvm.  There
> doesn't seem to be much advantage to creating a new interrupt
> infrastructure in tcg that can trigger interrupts by eventfds, so I
> assume VFIO is always going to be responsible for the translation of an
> eventfd to an irq assertion, get some kind of notification through qemu,
> de-assert the interrupt and unmask the device.  With that model in mind,
> perhaps it makes more sense why I've been keeping the eoi/ack separate
> from irqfd.
> 
> > > I suspect we therefore need a notification mechanism between kvm and
> > > qemu to make it possible for that interface to continue working.
> > 
> > Even though no one is actually using it. IMHO, this is a maintenance
> > problem.
> 
> That's why I'm designing it the way I am.  VFIO will make use of it.  It
> will just be using the de-assert and notify mode vs a notify-only mode
> that tcg would use.  It would also be easy to add an option to vfio so
> that we could fully test both modes.
> 
> > > An
> > > eventfd also seems like the right mechanism there.  A simple
> > > modification to the proposed KVM_EOIFD here would allow it to trigger an
> > > eventfd when an EOI is written to a specific gsi on
> > > KVM_USERSPACE_IRQ_SOURCE_ID (define a flag and pass gsi in place of
> > > key).
> > > 
> > > The split proposed here does require some assembly, but KVM_EOIFD is
> > > re-usable as either a de-assert and notify mechanism tied to an irqfd or
> > > a notify-only mechanism allowing users of a qemu EOI notification
> > > infrastructure to continue working.  vfio doesn't necessarily need this
> > > middle ground, but can easily be used to test it.
> > > 
> > > The alternative is that we pull eoifd into KVM_IRQFD and invent some
> > > other new EOI interface for qemu.  That means we get EOIs tied to an
> > > irqfd via one path and other EOIs via another ioctl.  Personally that
> > > seems less desirable, but I'm willing to explore that route if
> > > necessary.  Thanks,
> > > 
> > > Alex
> > 
> > Maybe we should focus on the fact that we notify userspace that we
> > deasserted interrupt instead of EOI.
> 
> But will a tcg user want the de-assert?  I assume not.  The de-assert is
> an optimization to allow us to bypass evaluation in userspace.  In tcg
> we're already there.  Thanks,
> 
> Alex

Look, what I am saying is: forget tcg and APIs. Build a kernel interface that
makes sense. Then in qemu look at kvm and tcg and build an abstraction for
it.  Building a kernel interface so you can make nice abstractions in tcg
is backwards.
Alex Williamson Aug. 13, 2012, 6:17 p.m. UTC | #19
On Mon, 2012-08-13 at 19:59 +0300, Michael S. Tsirkin wrote:
> On Mon, Aug 13, 2012 at 10:48:15AM -0600, Alex Williamson wrote:
> > On Sun, 2012-08-12 at 10:49 +0300, Michael S. Tsirkin wrote:
> > > On Wed, Aug 01, 2012 at 01:06:42PM -0600, Alex Williamson wrote:
> > > > On Mon, 2012-07-30 at 19:12 -0600, Alex Williamson wrote:
> > > > > On Tue, 2012-07-31 at 03:36 +0300, Michael S. Tsirkin wrote:
> > > > > > On Mon, Jul 30, 2012 at 06:26:31PM -0600, Alex Williamson wrote:
> > > > > > > On Tue, 2012-07-31 at 03:01 +0300, Michael S. Tsirkin wrote:
> > > > > > > > You keep saying this but it is still true: once irqfd
> > > > > > > > is closed eoifd does not get any more interrupts.
> > > > > > > 
> > > > > > > How does that matter?
> > > > > > 
> > > > > > Well if it does not get events it is disabled.
> > > > > > so you have one ifc disabling another, anyway.
> > > > > 
> > > > > And a level irqfd without an eoifd can never be de-asserted.  Either we
> > > > > make modular components, assemble them to do useful work, and
> > > > > disassemble them independently so they can be used by future interfaces
> > > > > or we bundle eoifd as just an option of irqfd.  Which is it gonna be?
> > > > 
> > > > I don't think I've been successful at explaining my reasoning for making
> > > > EOI notification a separate interface, so let me try again...
> > > > 
> > > > When kvm is not enabled, the qemu vfio driver still needs to know about
> > > > EOIs to re-enable the physical interrupt.  Since the ioapic is emulated
> > > > in qemu, we can setup a notifier for this and create abstraction to make
> > > > it non-x86 specific, etc.  We just need to come up with a design and
> > > > implement it.  But what happens when kvm is then enabled?  ioapic
> > > > emulation moves to the kernel (assume kvm includes irqchip for this
> > > > argument even though it doesn't for POWER), qemu no longer knows about
> > > > EOIs, and the interface we just created to handle the non-kvm case stops
> > > > working.  Is anyone going to accept adding a qemu EOI notification
> > > > interface that only works when kvm is not enabled?
> > > 
> > > Yes, it's only a question of abstracting it at the right level.
> > > 
> > > For example, if as you suggest below kvm gives you an eventfd that
> > > asserts an irq, later automatically deasserts it and notifies another
> > > eventfd, we can do exactly this in both tcg and kvm:
> > > 
> > > setup_level_irq(int gsi, int assert_eventfd, int deassert_eventfd)
> > > 
> > > Not advocating this interface but pointing out that to make
> > > same abstraction to work in tcg and kvm, see what it does in
> > > each of them first.
> > 
> > The tcg model I was thinking of is that we continue to use qemu_set_irq
> > to assert and de-assert the interrupt and add an eoi/ack notification
> > mechanism, much like the ack notifier that already exists in kvm.  There
> > doesn't seem to be much advantage to creating a new interrupt
> > infrastructure in tcg that can trigger interrupts by eventfds, so I
> > assume VFIO is always going to be responsible for the translation of an
> > eventfd to an irq assertion, get some kind of notification through qemu,
> > de-assert the interrupt and unmask the device.  With that model in mind,
> > perhaps it makes more sense why I've been keeping the eoi/ack separate
> > from irqfd.
> > 
> > > > I suspect we therefore need a notification mechanism between kvm and
> > > > qemu to make it possible for that interface to continue working.
> > > 
> > > Even though no one is actually using it. IMHO, this is a maintenance
> > > problem.
> > 
> > That's why I'm designing it the way I am.  VFIO will make use of it.  It
> > will just be using the de-assert and notify mode vs a notify-only mode
> > that tcg would use.  It would also be easy to add an option to vfio so
> > that we could fully test both modes.
> > 
> > > > An
> > > > eventfd also seems like the right mechanism there.  A simple
> > > > modification to the proposed KVM_EOIFD here would allow it to trigger an
> > > > eventfd when an EOI is written to a specific gsi on
> > > > KVM_USERSPACE_IRQ_SOURCE_ID (define a flag and pass gsi in place of
> > > > key).
> > > > 
> > > > The split proposed here does require some assembly, but KVM_EOIFD is
> > > > re-usable as either a de-assert and notify mechanism tied to an irqfd or
> > > > a notify-only mechanism allowing users of a qemu EOI notification
> > > > infrastructure to continue working.  vfio doesn't necessarily need this
> > > > middle ground, but can easily be used to test it.
> > > > 
> > > > The alternative is that we pull eoifd into KVM_IRQFD and invent some
> > > > other new EOI interface for qemu.  That means we get EOIs tied to an
> > > > irqfd via one path and other EOIs via another ioctl.  Personally that
> > > > seems less desirable, but I'm willing to explore that route if
> > > > necessary.  Thanks,
> > > > 
> > > > Alex
> > > 
> > > Maybe we should focus on the fact that we notify userspace that we
> > > deasserted interrupt instead of EOI.
> > 
> > But will a tcg user want the de-assert?  I assume not.  The de-assert is
> > an optimization to allow us to bypass evaluation in userspace.  In tcg
> > we're already there.  Thanks,
> > 
> > Alex
> 
> Look what I am saying forget tcg and APIs. Build a kernel interface that
> makes sense. Then in qemu look at kvm and tcg and build abstraction for
> it.  Building kernel interface so you can make nice abstractions in tcg
> is backwards.

Can you suggest specifically what doesn't make sense?  For legacy
interrupts VFIO needs to:

- Assert an interrupt

        Eventfds seem to be the most efficient way to signal when to
        assert an interrupt and give us the flexibility that we can
        send that signal to either another kernel module or to
        userspace.  KVM_IRQFD is designed for exactly this, but needs
        modifications for level triggered interrupts.  These include:
        
        - Using a different IRQ source ID
        
                GSIs are not exclusive, multiple devices may assert the
                same GSI.  IRQ source IDs are how KVM handles multiple
                inputs.
                
        - Assert-only
        
                KVM_IRQFD currently does assert->deassert to emulate an
                edge triggered interrupt.  For level, we need to be able
                to signal a discrete assertion and de-assertion event.
        
        This results in the modifications I've proposed to KVM_IRQFD.
                
- Know when to de-assert an interrupt

        Servicing an interrupt is device specific, we can't know for any
        random device what interactions with the device indicate service
        of an interrupt.  We therefore look to the underlying hardware
        support where a vCPU writes an End Of Interrupt to the APIC to
        indicate the chip should re-sample its inputs and either
        de-assert or continue asserting the interrupt level.  Our device
        may still require service at this point, but this mechanism has
        proven effective with KVM assignment.
        
        This results in the notify-only portion of the EOIFD/IRQ_ACKFD.
        
- Deassert an interrupt

        Now that we have an interrupt that's been asserted and we
        suspect that we should re-evaluate the interrupt signal due to
        activity possibly related to an EOI, we need a mechanism to
        de-assert the interrupt.  There are two possibilities here:
        
        - Test and de-assert
        
                Depending on hardware support for INTxDisable, we may be
                able to poll whether the hardware is still asserting
                its interrupt and de-assert if quiesced.  This
                optimizes for the case where the interrupt is still
                asserting as we avoid re-assertion and avoid unmasking
                the device.
        
        - De-assert, test, (re-assert)
        
                Not all hardware supports INTxDisable, so we may have no
                way to test whether the device is still asserting an
                interrupt other than to unmask and see if it re-fires.
                This not only supports the most hardware, but also
                optimizes for the case where the device is quiesced.
                
        Taking the latter path results in the de-assert and notify
        interface to the above EOIFD/IRQ_ACKFD.  This reduces the number
        of signals between components and supports the most hardware.
        
That leaves dealing with the IRQ source ID.  Initially I tried to hide
this from userspace as it's more of an implementation detail of KVM, but
in v8 I expose it as it offers more flexibility and (I hope) removes
some of the odd dependencies between interfaces imposed by previous
versions.

If you have specific suggestions how else to approach this, I welcome
the feedback.  It would be backwards to design an interface exclusively
around a single user, but it would be just as backwards to not envision
how an interface would be used in advance.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Aug. 13, 2012, 7:50 p.m. UTC | #20
On Mon, Aug 13, 2012 at 12:17:25PM -0600, Alex Williamson wrote:
> On Mon, 2012-08-13 at 19:59 +0300, Michael S. Tsirkin wrote:
> > On Mon, Aug 13, 2012 at 10:48:15AM -0600, Alex Williamson wrote:
> > > On Sun, 2012-08-12 at 10:49 +0300, Michael S. Tsirkin wrote:
> > > > On Wed, Aug 01, 2012 at 01:06:42PM -0600, Alex Williamson wrote:
> > > > > On Mon, 2012-07-30 at 19:12 -0600, Alex Williamson wrote:
> > > > > > On Tue, 2012-07-31 at 03:36 +0300, Michael S. Tsirkin wrote:
> > > > > > > On Mon, Jul 30, 2012 at 06:26:31PM -0600, Alex Williamson wrote:
> > > > > > > > On Tue, 2012-07-31 at 03:01 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > You keep saying this but it is still true: once irqfd
> > > > > > > > > is closed eoifd does not get any more interrupts.
> > > > > > > > 
> > > > > > > > How does that matter?
> > > > > > > 
> > > > > > > Well if it does not get events it is disabled.
> > > > > > > so you have one ifc disabling another, anyway.
> > > > > > 
> > > > > > And a level irqfd without an eoifd can never be de-asserted.  Either we
> > > > > > make modular components, assemble them to do useful work, and
> > > > > > disassemble them independently so they can be used by future interfaces
> > > > > > or we bundle eoifd as just an option of irqfd.  Which is it gonna be?
> > > > > 
> > > > > I don't think I've been successful at explaining my reasoning for making
> > > > > EOI notification a separate interface, so let me try again...
> > > > > 
> > > > > When kvm is not enabled, the qemu vfio driver still needs to know about
> > > > > EOIs to re-enable the physical interrupt.  Since the ioapic is emulated
> > > > > in qemu, we can setup a notifier for this and create abstraction to make
> > > > > it non-x86 specific, etc.  We just need to come up with a design and
> > > > > implement it.  But what happens when kvm is then enabled?  ioapic
> > > > > emulation moves to the kernel (assume kvm includes irqchip for this
> > > > > argument even though it doesn't for POWER), qemu no longer knows about
> > > > > EOIs, and the interface we just created to handle the non-kvm case stops
> > > > > working.  Is anyone going to accept adding a qemu EOI notification
> > > > > interface that only works when kvm is not enabled?
> > > > 
> > > > Yes, it's only a question of abstracting it at the right level.
> > > > 
> > > > For example, if as you suggest below kvm gives you an eventfd that
> > > > asserts an irq, later automatically deasserts it and notifies another
> > > > eventfd, we can do exactly this in both tcg and kvm:
> > > > 
> > > > setup_level_irq(int gsi, int assert_eventfd, int deassert_eventfd)
> > > > 
> > > > Not advocating this interface but pointing out that to make
> > > > same abstraction to work in tcg and kvm, see what it does in
> > > > each of them first.
> > > 
> > > The tcg model I was thinking of is that we continue to use qemu_set_irq
> > > to assert and de-assert the interrupt and add an eoi/ack notification
> > > mechanism, much like the ack notifier that already exists in kvm.  There
> > > doesn't seem to be much advantage to creating a new interrupt
> > > infrastructure in tcg that can trigger interrupts by eventfds, so I
> > > assume VFIO is always going to be responsible for the translation of an
> > > eventfd to an irq assertion, get some kind of notification through qemu,
> > > de-assert the interrupt and unmask the device.  With that model in mind,
> > > perhaps it makes more sense why I've been keeping the eoi/ack separate
> > > from irqfd.
> > > 
> > > > > I suspect we therefore need a notification mechanism between kvm and
> > > > > qemu to make it possible for that interface to continue working.
> > > > 
> > > > Even though no one is actually using it. IMHO, this is a maintenance
> > > > problem.
> > > 
> > > That's why I'm designing it the way I am.  VFIO will make use of it.  It
> > > will just be using the de-assert and notify mode vs a notify-only mode
> > > that tcg would use.  It would also be easy to add an option to vfio so
> > > that we could fully test both modes.
> > > 
> > > > > An
> > > > > eventfd also seems like the right mechanism there.  A simple
> > > > > modification to the proposed KVM_EOIFD here would allow it to trigger an
> > > > > eventfd when an EOI is written to a specific gsi on
> > > > > KVM_USERSPACE_IRQ_SOURCE_ID (define a flag and pass gsi in place of
> > > > > key).
> > > > > 
> > > > > The split proposed here does require some assembly, but KVM_EOIFD is
> > > > > re-usable as either a de-assert and notify mechanism tied to an irqfd or
> > > > > a notify-only mechanism allowing users of a qemu EOI notification
> > > > > infrastructure to continue working.  vfio doesn't necessarily need this
> > > > > middle ground, but can easily be used to test it.
> > > > > 
> > > > > The alternative is that we pull eoifd into KVM_IRQFD and invent some
> > > > > other new EOI interface for qemu.  That means we get EOIs tied to an
> > > > > irqfd via one path and other EOIs via another ioctl.  Personally that
> > > > > seems less desirable, but I'm willing to explore that route if
> > > > > necessary.  Thanks,
> > > > > 
> > > > > Alex
> > > > 
> > > > Maybe we should focus on the fact that we notify userspace that we
> > > > deasserted interrupt instead of EOI.
> > > 
> > > But will a tcg user want the de-assert?  I assume not.  The de-assert is
> > > an optimization to allow us to bypass evaluation in userspace.  In tcg
> > > we're already there.  Thanks,
> > > 
> > > Alex
> > 
> > Look what I am saying forget tcg and APIs. Build a kernel interface that
> > makes sense. Then in qemu look at kvm and tcg and build abstraction for
> > it.  Building kernel interface so you can make nice abstractions in tcg
> > is backwards.
> 
> Can you suggest specifically what doesn't make sense?

The interface is just very easy to misuse. Here are the things
you expose that, to me, do not seem to make sense:

- ability to create irqfd that by default can not be deasserted
  (you need eoifd to deassert)
- interface to create eventfd that by default never gets events
  (you need irqfd to assert)
- creating ack eventfd requires level irqfd but you won't
  know it unless you read documentation
- duplicating level/edge information that we already have in GSI

Knowing all these quirks is a must if you want things to
work, but you do not know them until you read the documentation.
This is not a good interface; a good interface is
hard to misuse and self-documenting.


> For legacy interrupts VFIO needs to:
> 
> - Assert an interrupt
> 
>         Eventfds seem to be the most efficient way to signal when to
>         assert an interrupt and give us the flexibility that we can
>         send that signal to either another kernel module or to
>         userspace.  KVM_IRQFD is designed for exactly this, but needs
>         modifications for level triggered interrupts.  These include:
>         
>         - Using a different IRQ source ID
>         
>                 GSIs are not exclusive, multiple devices may assert the
>                 same GSI.  IRQ source IDs are how KVM handles multiple
>                 inputs.

Actually, thinking about it some more, all assigned
device interrupts are deasserted on ack, so they are deasserted together.
And userspace already does the OR itself.

So why is it not enough to give IRQFDs a single separate
source ID, distinct from userspace but shared by all devices?


>         - Assert-only
>         
>                 KVM_IRQFD currently does assert->deassert to emulate an
>                 edge triggered interrupt.  For level, we need to be able
>                 to signal a discrete assertion and de-assertion event.
>         This results in the modifications I've proposed to KVM_IRQFD.

Actually is it really necessary at all?  What happens if we assert and
deassert immediately?  If guest lost the interrupt, on EOI device will
reassert resulting in another interrupt.

> - Know when to de-assert an interrupt
> 
>         Servicing an interrupt is device specific, we can't know for any
>         random device what interactions with the device indicate service
>         of an interrupt.  We therefore look to the underlying hardware
>         support where a vCPU writes an End Of Interrupt to the APIC to
>         indicate the chip should re-sample its inputs and either
>         de-assert or continue asserting the interrupt level.  Our device
>         may still require service at this point, but this mechanism has
>         proven effective with KVM assignment.
>         
>         This results in the notify-only portion of the EOIFD/IRQ_ACKFD.
>         
> - Deassert an interrupt
> 
>         Now that we have an interrupt that's been asserted and we
>         suspect that we should re-evaluate the interrupt signal due to
>         activity possibly related to an EOI, we need a mechanism to
>         de-assert the interrupt.  There are two possibilities here:
>         
>         - Test and de-assert
>         
>                 Depending on hardware support for INTxDisable, we may be
>                 able to poll whether the hardware is still asserting
>                 its interrupt and de-assert if quiesced.  This
>                 optimizes for the case where the interrupt is still
>                 asserting as we avoid re-assertion and avoid unmasking
>                 the device.
>         
>         - De-assert, test, (re-assert)
>         
>                 Not all hardware supports INTxDisable, so we may have no
>                 way to test whether the device is still asserting an
>                 interrupt other than to unmask and see if it re-fires.
>                 This not only supports the most hardware, but also
>                 optimizes for the case where the device is quiesced.
>                 
>         Taking the latter path results in the de-assert and notify
>         interface to the above EOIFD/IRQ_ACKFD.  This reduces the number
>         of signals between components and supports the most hardware.
>         
> That leaves dealing with the IRQ source ID.  Initially I tried to hide
> this from userspace as it's more of an implementation detail of KVM, but
> in v8 I expose it as it offers more flexibility and (I hope) removes
> some of the odd dependencies between interfaces imposed by previous
> versions.
> 
> If you have specific suggestions how else to approach this, I welcome
> the feedback.
> It would be backwards to design an interface exclusively around a
> single user, but it would be just as backwards to not envision how an
> interface would be used in advance.  Thanks,
> 
> Alex

Could you please address the two questions I asked above?
If we really can use the same source ID for all irqfds,
and if it's ok to deassert immediately after all,
then large parts of the code can go away.

Or maybe I was away for too long and forgot
what the problems were ...
Alex Williamson Aug. 13, 2012, 8:48 p.m. UTC | #21
On Mon, 2012-08-13 at 22:50 +0300, Michael S. Tsirkin wrote:
> On Mon, Aug 13, 2012 at 12:17:25PM -0600, Alex Williamson wrote:
> > On Mon, 2012-08-13 at 19:59 +0300, Michael S. Tsirkin wrote:
> > > On Mon, Aug 13, 2012 at 10:48:15AM -0600, Alex Williamson wrote:
> > > > On Sun, 2012-08-12 at 10:49 +0300, Michael S. Tsirkin wrote:
> > > > > On Wed, Aug 01, 2012 at 01:06:42PM -0600, Alex Williamson wrote:
> > > > > > On Mon, 2012-07-30 at 19:12 -0600, Alex Williamson wrote:
> > > > > > > On Tue, 2012-07-31 at 03:36 +0300, Michael S. Tsirkin wrote:
> > > > > > > > On Mon, Jul 30, 2012 at 06:26:31PM -0600, Alex Williamson wrote:
> > > > > > > > > On Tue, 2012-07-31 at 03:01 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > > You keep saying this but it is still true: once irqfd
> > > > > > > > > > is closed eoifd does not get any more interrupts.
> > > > > > > > > 
> > > > > > > > > How does that matter?
> > > > > > > > 
> > > > > > > > Well if it does not get events it is disabled.
> > > > > > > > so you have one ifc disabling another, anyway.
> > > > > > > 
> > > > > > > And a level irqfd without an eoifd can never be de-asserted.  Either we
> > > > > > > make modular components, assemble them to do useful work, and
> > > > > > > disassemble them independently so they can be used by future interfaces
> > > > > > > or we bundle eoifd as just an option of irqfd.  Which is it gonna be?
> > > > > > 
> > > > > > I don't think I've been successful at explaining my reasoning for making
> > > > > > EOI notification a separate interface, so let me try again...
> > > > > > 
> > > > > > When kvm is not enabled, the qemu vfio driver still needs to know about
> > > > > > EOIs to re-enable the physical interrupt.  Since the ioapic is emulated
> > > > > > in qemu, we can setup a notifier for this and create abstraction to make
> > > > > > it non-x86 specific, etc.  We just need to come up with a design and
> > > > > > implement it.  But what happens when kvm is then enabled?  ioapic
> > > > > > emulation moves to the kernel (assume kvm includes irqchip for this
> > > > > > argument even though it doesn't for POWER), qemu no longer knows about
> > > > > > EOIs, and the interface we just created to handle the non-kvm case stops
> > > > > > working.  Is anyone going to accept adding a qemu EOI notification
> > > > > > interface that only works when kvm is not enabled?
> > > > > 
> > > > > Yes, it's only a question of abstracting it at the right level.
> > > > > 
> > > > > For example, if as you suggest below kvm gives you an eventfd that
> > > > > asserts an irq, laters automatically deasserts it and notifies another
> > > > > eventfd, we can do exactly this in both tcg and kvm:
> > > > > 
> > > > > setup_level_irq(int gsi, int assert_eventfd, int deassert_eventfd)
> > > > > 
> > > > > Not advocating this interface but pointing out that to make
> > > > > same abstraction to work in tcg and kvm, see what it does in
> > > > > each of them first.
> > > > 
> > > > The tcg model I was thinking of is that we continue to use qemu_set_irq
> > > > to assert and de-assert the interrupt and add an eoi/ack notification
> > > > mechanism, much like the ack notifier that already exists in kvm.  There
> > > > doesn't seem to be much advantage to creating a new interrupt
> > > > infrastructure in tcg that can trigger interrupts by eventfds, so I
> > > > assume VFIO is always going to be responsible for the translation of an
> > > > eventfd to an irq assertion, get some kind of notification through qemu,
> > > > de-assert the interrupt and unmask the device.  With that model in mind,
> > > > perhaps it makes more sense why I've been keeping the eoi/ack separate
> > > > from irqfd.
> > > > 
> > > > > > I suspect we therefore need a notification mechanism between kvm and
> > > > > > qemu to make it possible for that interface to continue working.
> > > > > 
> > > > > Even though no one is actually using it. IMHO, this is a maintainance
> > > > > problem.
> > > > 
> > > > That's why I'm designing it the way I am.  VFIO will make use of it.  It
> > > > will just be using the de-assert and notify mode vs a notify-only mode
> > > > that tcg would use.  It would also be easy to add an option to vfio so
> > > > that we could fully test both modes.
> > > > 
> > > > > > An
> > > > > > eventfd also seems like the right mechanism there.  A simple
> > > > > > modification to the proposed KVM_EOIFD here would allow it to trigger an
> > > > > > eventfd when an EOI is written to a specific gsi on
> > > > > > KVM_USERSPACE_IRQ_SOURCE_ID (define a flag and pass gsi in place of
> > > > > > key).
> > > > > > 
> > > > > > The split proposed here does require some assembly, but KVM_EOIFD is
> > > > > > re-usable as either a de-assert and notify mechanism tied to an irqfd or
> > > > > > a notify-only mechanism allowing users of a qemu EOI notification
> > > > > > infrastructure to continue working.  vfio doesn't necessarily need this
> > > > > > middle ground, but can easily be used to test it.
> > > > > > 
> > > > > > The alternative is that we pull eoifd into KVM_IRQFD and invent some
> > > > > > other new EOI interface for qemu.  That means we get EOIs tied to an
> > > > > > irqfd via one path and other EOIs via another ioctl.  Personally that
> > > > > > seems less desirable, but I'm willing to explore that route if
> > > > > > necessary.  Thanks,
> > > > > > 
> > > > > > Alex
> > > > > 
> > > > > Maybe we should focus on the fact that we notify userspace that we
> > > > > deasserted interrupt instead of EOI.
> > > > 
> > > > But will a tcg user want the de-assert?  I assume not.  The de-assert is
> > > > an optimization to allow us to bypass evaluation in userspace.  In tcg
> > > > we're already there.  Thanks,
> > > > 
> > > > Alex
> > > 
> > > Look what I am saying forget tcg and APIs. Build a kernel interface that
> > > makes sense. Then in qemu look at kvm and tcg and build abstraction for
> > > it.  Building kernel interface so you can make nice abstractions in tcg
> > > is backwards.
> > 
> > Can you suggest specifically what doesn't make sense?
> 
> Interface is just very easy to misuse. Here are things that
> you expose that to me do not seem to make sense:
> 
> - ability to create irqfd that by default can not be deasserted
>   (you need eoifd to deassert)

Well, it's not really the default, a user has to add a flag to get this
ability.

> - interface to create eventfd that by default never gets events
>   (you need irqfd to assert)

In v8, this isn't the default, the user has to specify that they want to
use it to de-assert.

> - creating ack eventfd requires level irqfd but you won't
>   know it unless you read documentation

This is also fixed in v8, you get a source ID, then hook it up to an
irqfd/irq ackfd any way you want.

> - duplicating level/edge information that we already have in GSI

Not really duplication, the edge/level information is several layers of
indirection away from this interface.  As we've discussed in the past,
relying on that information also means that the behavior of an ioctl
depends on the state of another piece of emulated hardware which is
controlled by the guest at the time the ioctl is called.  Personally, I
don't think that's a good characteristic.

> Knowing all these quirks is a must if you want things to
> work, but you do not know them until you read documentation.
> This is not good interface, a good interface is
> hard to misuse and self-documenting.

I think v8 makes improvements here, I'd be happy to hear your feedback
on it.

> > For legacy interrupts VFIO needs to:
> > 
> > - Assert an interrupt
> > 
> >         Eventfds seem to be the most efficient way to signal when to
> >         assert an interrupt and gives us the flexibility that we can
> >         send that signal to either another kernel module or to
> >         userspace.  KVM_IRQFD is designed for exactly this, but needs
> >         modifications for level triggered interrupts.  These include:
> >         
> >         - Using a different IRQ source ID
> >         
> >                 GSIs are not exclusive, multiple devices may assert the
> >                 same GSI.  IRQ source IDs are how KVM handles multiple
> >                 inputs.
> 
> Actually, thinking about it some more, all assigned
> device interrupts are deasserted on ack, so together.
> And userspace does the OR in userspace already.
> 
> So why is it not enough to give IRQFDs a single separate
> source ID, distinct from userspace but shared by all devices?

We could do that, but then we lose any ability to filter the KVM irq ack
notifier based on whether a given IRQ source ID is asserted.  This is
something you've been pushing for.  Note that patch 1/6 of the v8 series
adds this generically for all irq ack notifier users.  That's of course
just an optimization, we could have IRQ source IDs re-used and that
might be a good solution if we ever start exhausting them.  v8 allows
userspace to do this if it wants.

> >         - Assert-only
> >         
> >                 KVM_IRQFD currently does assert->deassert to emulate an
> >                 edge triggered interrupt.  For level, we need to be able
> >                 to signal a discrete assertion and de-assertion event.
> >         This results in the modifications I've proposed to KVM_IRQFD.
> 
> Actually is it really necessary at all?  What happens if we assert and
> deassert immediately?  If guest lost the interrupt, on EOI device will
> reassert resulting in another interrupt.

It's been a while since I've tried, but I recall I used this as a
workaround early on in development and it did work.  I don't feel it's a
proper representation of the hardware we're trying to emulate though and
I seem to recall that Avi wasn't too fond of it either.

> > - Know when to de-assert an interrupt
> > 
> >         Servicing an interrupt is device specific, we can't know for any
> >         random device what interactions with the device indicate service
> >         of an interrupt.  We therefore look to the underlying hardware
> >         support where a vCPU writes an End Of Interrupt to the APIC to
> >         indicate the chip should re-sample it's inputs and either
> >         de-assert or continue asserting the interrupt level.  Our device
> >         may still require service at this point, but this mechanism has
> >         proven effective with KVM assignment.
> >         
> >         This results in the notify-only portion of the EOIFD/IRQ_ACKFD.
> >         
> > - Deassert an interrupt
> > 
> >         Now that we have an interrupt that's been asserted and we
> >         suspect that we should re-evaluate the interrupt signal due to
> >         activity possibly related to an EOI, we need a mechanism to
> >         de-assert the interrupt.  There are two possibilities here:
> >         
> >         - Test and de-assert
> >         
> >                 Depending on hardware support for INTxDisable, we may be
> >                 able to poll whether the hardware is still asserting
> >                 it's interrupt and de-assert if quiesced.  This
> >                 optimizes for the case where the interrupt is still
> >                 asserting as we avoid re-assertion and avoid unmasking
> >                 the device.
> >         
> >         - De-assert, test, (re-assert)
> >         
> >                 Not all hardware supports INTxDisable, so we may have no
> >                 way to test whether the device is still asserting an
> >                 interrupt other than to unmask and see if it re-fires.
> >                 This not only supports the most hardware, but also
> >                 optimizes for the case where the device is quiesced.
> >                 
> >         Taking the latter path results in the de-assert and notify
> >         interface to the above EOIFD/IRQ_ACKFD.  This reduces the number
> >         of signals between components and supports the most hardware.
> >         
> > That leaves dealing with the IRQ source ID.  Initially I tried to hide
> > this from userspace as it's more of an implementation detail of KVM, but
> > in v8 I expose it as it offers more flexibility and (I hope) removes
> > some of the odd dependencies between interfaces imposed by previous
> > version.
> > 
> > If you have specific suggestions how else to approach this, I welcome
> > the feedback.
> > It would be backwards to design an interface exclusively around a
> > single user, but it would be just as backwards to not envision how an
> > interface would be used in advance.  Thanks,
> > 
> > Alex
> 
> Could you address two questions I ask above pls?
> If we really can use the same source ID for all irqfds,
> and if it's ok to deassert immediately after all,
> then large parts of code can go away.
> 
> Or maybe I was away for too long and forgot
> what the problem were ...

So if we de-assert immediately and remove the notify on de-assert, then
irq_ackfd becomes a notify-only interface.  In v8 that's what it is at
its base, with an option to de-assert.  That option (patch 6/6) is a
tiny bit of code.

Removing the irq source ID isn't a clear win to me either.  I'm becoming
a broken record, but v8 already simplifies the irq source ID allocation
and preserves the ability to filter irq ack notifications and targeted
re-use of irq source IDs if userspace decides to support that.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Aug. 13, 2012, 9:23 p.m. UTC | #22
On Sun, 2012-08-12 at 12:33 +0300, Michael S. Tsirkin wrote:
> On Thu, Aug 09, 2012 at 01:26:15PM -0600, Alex Williamson wrote:
> > On Mon, 2012-08-06 at 13:40 +0300, Avi Kivity wrote:
> > > On 08/06/2012 01:38 PM, Avi Kivity wrote:
> > > 
> > > > Regarding the implementation, instead of a linked list, would an array
> > > > of counters parallel to the bitmap make it simpler?
> > > 
> > > Or even, replace the bitmap with an array of counters.
> > 
> > I'm not sure a counter array is what we're really after.  That gives us
> > reference counting for the irq source IDs, but not the key->gsi lookup.
> > It also highlights another issue, that we have a limited set of source
> > IDs.  Looks like we have BITS_PER_LONG IDs, with two already used, one
> > for the shared userspace ID and another for the PIT.  How happy are we
> > going to be with a limit of 62 level interrupts in use at one time?
> > 
> > It's arguably a reasonable number since the most virtualization friendly
> > devices (sr-iov VFs) don't even support this kind of interrupt.  It's
> > also very wasteful allocating an entire source ID for a single GSI
> > within that source ID.  PCI supports interrupts A, B, C, and D, which,
> > in the most optimal config, each go to different GSIs.  So we could
> > theoretically be more efficient in our use and allocation of irq source
> > IDs if we tracked use by the source ID, gsi pair.
> > 
> > That probably makes it less practical to replace anything at the top
> > level with a counter array.  The key that we pass back is currently the
> > actual source ID, but we don't specify what it is, so we could split it
> > and have it encode a 16bit source ID plus 16 bit GSI.  It could also be
> > an idr entry.
> > 
> > Michael, would the interface be more acceptable to you if we added
> > separate ioctls to allocate and free some representation of an irq
> > source ID, gsi pair?  For instance, an ioctl might return an idr entry
> > for an irq source ID/gsi object which would then be passed as a
> > parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> > representing the source id/gsi isn't magically freed on it's own.  This
> > would also allow us to deassign/close one end and reconfigure it later.
> > Thanks,
> > 
> > Alex
> 
> It's acceptable to me either way. I was only pointing out that as
> designed, the interface looks simple at first but then you find out some
> subtle limitations which are implementation driven. This gives
> an overall feeling the abstraction is too low level.
> 
> If we compare to the existing irqfd, isn't the difference
> simply that irqfd deasserts immediately ATM, while we
> want to delay this until later?
> 
> If yes, then along the lines that you proposed, and combining with my
> idea of tracking deasserts, how do you like the following:
> 
> /* Keep line asserted until guest has handled the interrupt. */
> #define KVM_IRQFD_FLAG_DEASSERT_ON_ACK (1 << 1)
> /* Notify after line is deasserted. */
> #define KVM_IRQFD_FLAG_DEASSERT_EVENTFD (2 << 1)
> 
> 	struct kvm_irqfd {
> 		__u32 fd;
> 		__u32 gsi;
> 		__u32 flags;
> 		/* eventfd to notify when line is deasserted */
> 		__u32 deassert_eventfd;
> 		__u8  pad[16];
> 	};
> 
> now the only limitation is that KVM_IRQFD_FLAG_DEASSERT_ON_ACK is only
> effective for level interrupts.
> 
> Notes about lifetime of objects:
> 	- closing deassert_eventfd does nothing (we can keep
> 	  reference to it from irqfd so no need for
>           complex polling/flushing scheme)
> 	- closing irqfd or deasserting dis-associates
> 	  deassert_eventfd automatically
> 	- source id is internal to irqfd and goes away with it
> 
> it looks harder to misuse and fits what we want to do nicely,
> and needs less code to implement.

This is effectively what I meant when I suggested we either need to a)
pull eoifd into irqfd or b) implement them as modular components.  I
chose to implement b) because I think that non-irqfd related ack
notification to userspace will be useful and a) does not provide that.
So this interface enables exactly the use case for device assignment and
no more.  I feel like this is the start of an ioctl that will be quickly
deprecated, but if that's the direction we want to go, I'll write the
code.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Aug. 13, 2012, 9:34 p.m. UTC | #23
On Sun, 2012-08-12 at 11:36 +0300, Avi Kivity wrote:
> On 08/09/2012 10:26 PM, Alex Williamson wrote:
> > On Mon, 2012-08-06 at 13:40 +0300, Avi Kivity wrote:
> >> On 08/06/2012 01:38 PM, Avi Kivity wrote:
> >> 
> >> > Regarding the implementation, instead of a linked list, would an array
> >> > of counters parallel to the bitmap make it simpler?
> >> 
> >> Or even, replace the bitmap with an array of counters.
> > 
> > I'm not sure a counter array is what we're really after.  That gives us
> > reference counting for the irq source IDs, but not the key->gsi lookup.
> 
> You can look up the gsi while registering the eoifd, so it's accessible
> as eoifd->gsi instead of eoifd->source->gsi.  The irqfd can go away
> while the eoifd is still active, but is this a problem?

In my opinion, no, but Michael disagrees.

> > It also highlights another issue, that we have a limited set of source
> > IDs.  Looks like we have BITS_PER_LONG IDs, with two already used, one
> > for the shared userspace ID and another for the PIT.  How happy are we
> > going to be with a limit of 62 level interrupts in use at one time?
> 
> When we start being unhappy we can increase that number.  On the other
> hand more locks and lists makes me unhappy now.

Yep, good point.  My latest version removes the source ID object lock
and list (and objects).  I still have a lock and list for the ack
notification, but it's hard not to unless we combine them into one
mega-irqfd ioctl as Michael suggests.

> > It's arguably a reasonable number since the most virtualization friendly
> > devices (sr-iov VFs) don't even support this kind of interrupt.  It's
> > also very wasteful allocating an entire source ID for a single GSI
> > within that source ID.  PCI supports interrupts A, B, C, and D, which,
> > in the most optimal config, each go to different GSIs.  So we could
> > theoretically be more efficient in our use and allocation of irq source
> > IDs if we tracked use by the source ID, gsi pair.
> 
> There are, in one userspace, just three gsis available for PCI links, so
> you're compressing the source id space by 3.

I imagine there's a way to put each PCI interrupt pin on a GSI, but
still only 4, not a great expansion of source ID space.  I like
Michael's idea of re-using source IDs if we run out better.

> > That probably makes it less practical to replace anything at the top
> > level with a counter array.  The key that we pass back is currently the
> > actual source ID, but we don't specify what it is, so we could split it
> > and have it encode a 16bit source ID plus 16 bit GSI.  It could also be
> > an idr entry.
> 
> We can fix those kinds of problems by adding another layer of
> indirection.  But I doubt they will be needed.  I don't see people
> assigning 60 legacy devices to one guest.

Yep, we can ignore it for now and put it in the hands of userspace to
re-use IDs if needed.

> > Michael, would the interface be more acceptable to you if we added
> > separate ioctls to allocate and free some representation of an irq
> > source ID, gsi pair?  For instance, an ioctl might return an idr entry
> > for an irq source ID/gsi object which would then be passed as a
> > parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> > representing the source id/gsi isn't magically freed on it's own.  This
> > would also allow us to deassign/close one end and reconfigure it later.
> > Thanks,
> 
> Another option is to push the responsibility for allocating IDs for the
> association to userspace.  Let userspace both create the irqfd and the
> eoifd with the same ID, the kernel matches them at registration time and
> copies the gsi/sourceid from the first to the second eventfd.

Aside from the copying gsi/sourceid bit, you've just described my latest
attempt at this series.  Specifying both a sourceid and gsi also allows
userspace to make better use of the sourceid address space (use more
than one gsi if userspace wants the complexity of managing them).
Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Aug. 13, 2012, 9:50 p.m. UTC | #24
On Mon, Aug 13, 2012 at 02:48:25PM -0600, Alex Williamson wrote:
> On Mon, 2012-08-13 at 22:50 +0300, Michael S. Tsirkin wrote:
> > On Mon, Aug 13, 2012 at 12:17:25PM -0600, Alex Williamson wrote:
> > > On Mon, 2012-08-13 at 19:59 +0300, Michael S. Tsirkin wrote:
> > > > On Mon, Aug 13, 2012 at 10:48:15AM -0600, Alex Williamson wrote:
> > > > > On Sun, 2012-08-12 at 10:49 +0300, Michael S. Tsirkin wrote:
> > > > > > On Wed, Aug 01, 2012 at 01:06:42PM -0600, Alex Williamson wrote:
> > > > > > > On Mon, 2012-07-30 at 19:12 -0600, Alex Williamson wrote:
> > > > > > > > On Tue, 2012-07-31 at 03:36 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > On Mon, Jul 30, 2012 at 06:26:31PM -0600, Alex Williamson wrote:
> > > > > > > > > > On Tue, 2012-07-31 at 03:01 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > > > You keep saying this but it is still true: once irqfd
> > > > > > > > > > > is closed eoifd does not get any more interrupts.
> > > > > > > > > > 
> > > > > > > > > > How does that matter?
> > > > > > > > > 
> > > > > > > > > Well if it does not get events it is disabled.
> > > > > > > > > so you have one ifc disabling another, anyway.
> > > > > > > > 
> > > > > > > > And a level irqfd without an eoifd can never be de-asserted.  Either we
> > > > > > > > make modular components, assemble them to do useful work, and
> > > > > > > > disassemble them independently so they can be used by future interfaces
> > > > > > > > or we bundle eoifd as just an option of irqfd.  Which is it gonna be?
> > > > > > > 
> > > > > > > I don't think I've been successful at explaining my reasoning for making
> > > > > > > EOI notification a separate interface, so let me try again...
> > > > > > > 
> > > > > > > When kvm is not enabled, the qemu vfio driver still needs to know about
> > > > > > > EOIs to re-enable the physical interrupt.  Since the ioapic is emulated
> > > > > > > in qemu, we can setup a notifier for this and create abstraction to make
> > > > > > > it non-x86 specific, etc.  We just need to come up with a design and
> > > > > > > implement it.  But what happens when kvm is then enabled?  ioapic
> > > > > > > emulation moves to the kernel (assume kvm includes irqchip for this
> > > > > > > argument even though it doesn't for POWER), qemu no longer knows about
> > > > > > > EOIs, and the interface we just created to handle the non-kvm case stops
> > > > > > > working.  Is anyone going to accept adding a qemu EOI notification
> > > > > > > interface that only works when kvm is not enabled?
> > > > > > 
> > > > > > Yes, it's only a question of abstracting it at the right level.
> > > > > > 
> > > > > > For example, if as you suggest below kvm gives you an eventfd that
> > > > > > asserts an irq, laters automatically deasserts it and notifies another
> > > > > > eventfd, we can do exactly this in both tcg and kvm:
> > > > > > 
> > > > > > setup_level_irq(int gsi, int assert_eventfd, int deassert_eventfd)
> > > > > > 
> > > > > > Not advocating this interface but pointing out that to make
> > > > > > same abstraction to work in tcg and kvm, see what it does in
> > > > > > each of them first.
> > > > > 
> > > > > The tcg model I was thinking of is that we continue to use qemu_set_irq
> > > > > to assert and de-assert the interrupt and add an eoi/ack notification
> > > > > mechanism, much like the ack notifier that already exists in kvm.  There
> > > > > doesn't seem to be much advantage to creating a new interrupt
> > > > > infrastructure in tcg that can trigger interrupts by eventfds, so I
> > > > > assume VFIO is always going to be responsible for the translation of an
> > > > > eventfd to an irq assertion, get some kind of notification through qemu,
> > > > > de-assert the interrupt and unmask the device.  With that model in mind,
> > > > > perhaps it makes more sense why I've been keeping the eoi/ack separate
> > > > > from irqfd.
> > > > > 
> > > > > > > I suspect we therefore need a notification mechanism between kvm and
> > > > > > > qemu to make it possible for that interface to continue working.
> > > > > > 
> > > > > > Even though no one is actually using it. IMHO, this is a maintainance
> > > > > > problem.
> > > > > 
> > > > > That's why I'm designing it the way I am.  VFIO will make use of it.  It
> > > > > will just be using the de-assert and notify mode vs a notify-only mode
> > > > > that tcg would use.  It would also be easy to add an option to vfio so
> > > > > that we could fully test both modes.
> > > > > 
> > > > > > > An
> > > > > > > eventfd also seems like the right mechanism there.  A simple
> > > > > > > modification to the proposed KVM_EOIFD here would allow it to trigger an
> > > > > > > eventfd when an EOI is written to a specific gsi on
> > > > > > > KVM_USERSPACE_IRQ_SOURCE_ID (define a flag and pass gsi in place of
> > > > > > > key).
> > > > > > > 
> > > > > > > The split proposed here does require some assembly, but KVM_EOIFD is
> > > > > > > re-usable as either a de-assert and notify mechanism tied to an irqfd or
> > > > > > > a notify-only mechanism allowing users of a qemu EOI notification
> > > > > > > infrastructure to continue working.  vfio doesn't necessarily need this
> > > > > > > middle ground, but can easily be used to test it.
> > > > > > > 
> > > > > > > The alternative is that we pull eoifd into KVM_IRQFD and invent some
> > > > > > > other new EOI interface for qemu.  That means we get EOIs tied to an
> > > > > > > irqfd via one path and other EOIs via another ioctl.  Personally that
> > > > > > > seems less desirable, but I'm willing to explore that route if
> > > > > > > necessary.  Thanks,
> > > > > > > 
> > > > > > > Alex
> > > > > > 
> > > > > > Maybe we should focus on the fact that we notify userspace that we
> > > > > > deasserted interrupt instead of EOI.
> > > > > 
> > > > > But will a tcg user want the de-assert?  I assume not.  The de-assert is
> > > > > an optimization to allow us to bypass evaluation in userspace.  In tcg
> > > > > we're already there.  Thanks,
> > > > > 
> > > > > Alex
> > > > 
> > > > Look what I am saying forget tcg and APIs. Build a kernel interface that
> > > > makes sense. Then in qemu look at kvm and tcg and build abstraction for
> > > > it.  Building kernel interface so you can make nice abstractions in tcg
> > > > is backwards.
> > > 
> > > Can you suggest specifically what doesn't make sense?
> > 
> > Interface is just very easy to misuse. Here are things that
> > you expose that to me do not seem to make sense:
> > 
> > - ability to create irqfd that by default can not be deasserted
> >   (you need eoifd to deassert)
> 
> Well, it's not really the default, a user has to add a flag to get this
> ability.
> 
> > - interface to create eventfd that by default never gets events
> >   (you need irqfd to assert)
> 
> In v8, this isn't the default, the user has to specify that they want to
> use it to de-assert.
> 
> > - creating ack eventfd requires level irqfd but you won't
> >   know it unless you read documentation
> 
> This is also fixed in v8, you get a source ID, then hook it up to an
> irqfd/irq ackfd any way you want.
> 
> > - duplicating level/edge information that we already have in GSI
> 
> Not really duplication, the edge/level information is several layers of
> indirection away from this interface.  As we've discussed in the past,
> relying on that information also means that the behavior of an ioctl
> depends on the state of another piece of emulated hardware which is
> controlled by the guest at the time the ioctl is called.  Personally, I
> don't think that's a good characteristic.
> 
> > Knowing all these quirks is a must if you want things to
> > work, but you do not know them until you read documentation.
> > This is not good interface, a good interface is
> > hard to misuse and self-documenting.
> 
> I think v8 makes improvements here, I'd be happy to hear your feedback
> on it.
> 
> > > For legacy interrupts VFIO needs to:
> > > 
> > > - Assert an interrupt
> > > 
> > >         Eventfds seem to be the most efficient way to signal when to
> > >         assert an interrupt and gives us the flexibility that we can
> > >         send that signal to either another kernel module or to
> > >         userspace.  KVM_IRQFD is designed for exactly this, but needs
> > >         modifications for level triggered interrupts.  These include:
> > >         
> > >         - Using a different IRQ source ID
> > >         
> > >                 GSIs are not exclusive, multiple devices may assert the
> > >                 same GSI.  IRQ source IDs are how KVM handles multiple
> > >                 inputs.
> > 
> > Actually, thinking about it some more, all assigned
> > device interrupts are deasserted on ack, so together.
> > And userspace does the OR in userspace already.
> > 
> > So why is it not enough to give IRQFDs a single separate
> > source ID, distinct from userspace but shared by all devices?
> 
> We could do that, but then we lose any ability to filter the KVM irq ack
> notifier based on whether a given IRQ source ID is asserted.  This is
> something you've been pushing for.

We ended up tracking it in irqfd, no?

> Note that patch 1/6 of the v8 series
> adds this generically for all irq ack notifier users.  That's of course
> just an optimization,

How is it an optimization?

> we could have IRQ source IDs re-used and that
> might be a good solution if we ever start exhausting them.  v8 allows
> userspace to do this if it wants.

How does userspace know whether it should do it or not?

> > >         - Assert-only
> > >         
> > >                 KVM_IRQFD currently does assert->deassert to emulate an
> > >                 edge triggered interrupt.  For level, we need to be able
> > >                 to signal a discrete assertion and de-assertion event.
> > >         This results in the modifications I've proposed to KVM_IRQFD.
> > 
> > Actually is it really necessary at all?  What happens if we assert and
> > deassert immediately?  If guest lost the interrupt, on EOI device will
> > reassert resulting in another interrupt.
> 
> It's been a while since I've tried, but I recall I used this as a
> workaround early on in development and it did work.  I don't feel it's a
> proper representation of the hardware we're trying to emulate though and
> istr that Avi wasn't too fond of it either.

EOI hack is not a proper representation either.
I think we were just confused and thought there's a race.

> > > - Know when to de-assert an interrupt
> > > 
> > >         Servicing an interrupt is device specific, we can't know for any
> > >         random device what interactions with the device indicate service
> > >         of an interrupt.  We therefore look to the underlying hardware
> > >         support where a vCPU writes an End Of Interrupt to the APIC to
> > >         indicate the chip should re-sample its inputs and either
> > >         de-assert or continue asserting the interrupt level.  Our device
> > >         may still require service at this point, but this mechanism has
> > >         proven effective with KVM assignment.
> > >         
> > >         This results in the notify-only portion of the EOIFD/IRQ_ACKFD.
> > >         
> > > - Deassert an interrupt
> > > 
> > >         Now that we have an interrupt that's been asserted and we
> > >         suspect that we should re-evaluate the interrupt signal due to
> > >         activity possibly related to an EOI, we need a mechanism to
> > >         de-assert the interrupt.  There are two possibilities here:
> > >         
> > >         - Test and de-assert
> > >         
> > >                 Depending on hardware support for INTxDisable, we may be
> > >                 able to poll whether the hardware is still asserting
> > >                 its interrupt and de-assert if quiesced.  This
> > >                 optimizes for the case where the interrupt is still
> > >                 asserting as we avoid re-assertion and avoid unmasking
> > >                 the device.
> > >         
> > >         - De-assert, test, (re-assert)
> > >         
> > >                 Not all hardware supports INTxDisable, so we may have no
> > >                 way to test whether the device is still asserting an
> > >                 interrupt other than to unmask and see if it re-fires.
> > >                 This not only supports the most hardware, but also
> > >                 optimizes for the case where the device is quiesced.
> > >                 
> > >         Taking the latter path results in the de-assert and notify
> > >         interface to the above EOIFD/IRQ_ACKFD.  This reduces the number
> > >         of signals between components and supports the most hardware.
> > >         
> > > That leaves dealing with the IRQ source ID.  Initially I tried to hide
> > > this from userspace as it's more of an implementation detail of KVM, but
> > > in v8 I expose it as it offers more flexibility and (I hope) removes
> > > some of the odd dependencies between interfaces imposed by previous
> > > versions.
> > > 
> > > If you have specific suggestions how else to approach this, I welcome
> > > the feedback.
> > > It would be backwards to design an interface exclusively around a
> > > single user, but it would be just as backwards to not envision how an
> > > interface would be used in advance.  Thanks,
> > > 
> > > Alex
> > 
> > Could you address two questions I ask above pls?
> > If we really can use the same source ID for all irqfds,
> > and if it's ok to deassert immediately after all,
> > then large parts of code can go away.
> > 
> > Or maybe I was away for too long and forgot
> > what the problems were ...
> 
> So if we de-assert immediately and remove the notify on de-assert, then
> irq_ackfd becomes a notify-only interface.  In v8 that's what it is at
> its base, with an option to de-assert.  That option (patch 6/6) is a
> tiny bit of code.

But it is an interface that at least makes some sense.
And it is also an existing one.

> Removing the irq source ID isn't a clear win to me either.

It removes the limitation on number of ackfd/irqfd that there is.

> I'm becoming
> a broken record, but v8 already simplifies the irq source ID allocation
> and preserves the ability to filter irq ack notifications and targeted
> re-use of irq source IDs if userspace decides to support that.  Thanks,
> 
> Alex

I will look at v8.
Michael S. Tsirkin Aug. 13, 2012, 10 p.m. UTC | #25
On Mon, Aug 13, 2012 at 03:23:24PM -0600, Alex Williamson wrote:
> On Sun, 2012-08-12 at 12:33 +0300, Michael S. Tsirkin wrote:
> > On Thu, Aug 09, 2012 at 01:26:15PM -0600, Alex Williamson wrote:
> > > On Mon, 2012-08-06 at 13:40 +0300, Avi Kivity wrote:
> > > > On 08/06/2012 01:38 PM, Avi Kivity wrote:
> > > > 
> > > > > Regarding the implementation, instead of a linked list, would an array
> > > > > of counters parallel to the bitmap make it simpler?
> > > > 
> > > > Or even, replace the bitmap with an array of counters.
> > > 
> > > I'm not sure a counter array is what we're really after.  That gives us
> > > reference counting for the irq source IDs, but not the key->gsi lookup.
> > > It also highlights another issue, that we have a limited set of source
> > > IDs.  Looks like we have BITS_PER_LONG IDs, with two already used, one
> > > for the shared userspace ID and another for the PIT.  How happy are we
> > > going to be with a limit of 62 level interrupts in use at one time?
> > > 
> > > It's arguably a reasonable number since the most virtualization friendly
> > > devices (sr-iov VFs) don't even support this kind of interrupt.  It's
> > > also very wasteful allocating an entire source ID for a single GSI
> > > within that source ID.  PCI supports interrupts A, B, C, and D, which,
> > > in the most optimal config, each go to different GSIs.  So we could
> > > theoretically be more efficient in our use and allocation of irq source
> > > IDs if we tracked use by the source ID, gsi pair.
> > > 
> > > That probably makes it less practical to replace anything at the top
> > > level with a counter array.  The key that we pass back is currently the
> > > actual source ID, but we don't specify what it is, so we could split it
> > > and have it encode a 16bit source ID plus 16 bit GSI.  It could also be
> > > an idr entry.
> > > 
> > > Michael, would the interface be more acceptable to you if we added
> > > separate ioctls to allocate and free some representation of an irq
> > > source ID, gsi pair?  For instance, an ioctl might return an idr entry
> > > for an irq source ID/gsi object which would then be passed as a
> > > parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> > > > representing the source id/gsi isn't magically freed on its own.  This
> > > would also allow us to deassign/close one end and reconfigure it later.
> > > Thanks,
> > > 
> > > Alex
> > 
> > It's acceptable to me either way. I was only pointing out that as
> > designed, the interface looks simple at first but then you find out some
> > subtle limitations which are implementation driven. This gives
> > an overall feeling the abstraction is too low level.
> > 
> > If we compare to the existing irqfd, isn't the difference
> > simply that irqfd deasserts immediately ATM, while we
> > want to delay this until later?
> > 
> > If yes, then along the lines that you proposed, and combining with my
> > idea of tracking deasserts, how do you like the following:
> > 
> > /* Keep line asserted until guest has handled the interrupt. */
> > #define KVM_IRQFD_FLAG_DEASSERT_ON_ACK (1 << 1)
> > /* Notify after line is deasserted. */
> > #define KVM_IRQFD_FLAG_DEASSERT_EVENTFD (2 << 1)
> > 
> > 	struct kvm_irqfd {
> > 		__u32 fd;
> > 		__u32 gsi;
> > 		__u32 flags;
> > 		/* eventfd to notify when line is deasserted */
> > 		__u32 deassert_eventfd;
> > 		__u8  pad[16];
> > 	};
> > 
> > now the only limitation is that KVM_IRQFD_FLAG_DEASSERT_ON_ACK is only
> > effective for level interrupts.
> > 
> > Notes about lifetime of objects:
> > 	- closing deassert_eventfd does nothing (we can keep
> > 	  reference to it from irqfd so no need for
> >           complex polling/flushing scheme)
> > 	- closing irqfd or deasserting dis-associates
> > 	  deassert_eventfd automatically
> > 	- source id is internal to irqfd and goes away with it
> > 
> > it looks harder to misuse and fits what we want to do nicely,
> > and needs less code to implement.
> 
> This is effectively what I meant when I suggested we either need to a)
> pull eoifd into irqfd or b) implement them as modular components.  I
> chose to implement b) because I think that non-irqfd related ack
> notification to userspace will be useful and a) does not provide that.
> So this interface enables exactly the use case for device assignment and
> no more.  I feel like this is the start of an ioctl that will be quickly
> deprecated, but if that's the direction we want to go, I'll write the
> code.  Thanks,
> 
> Alex

Sorry I wrote this before I knew we really do not need
the deassert on ack at all, existing irqfd is fine for level.
Michael S. Tsirkin Aug. 13, 2012, 10:06 p.m. UTC | #26
On Mon, Aug 13, 2012 at 03:34:01PM -0600, Alex Williamson wrote:
> On Sun, 2012-08-12 at 11:36 +0300, Avi Kivity wrote:
> > On 08/09/2012 10:26 PM, Alex Williamson wrote:
> > > On Mon, 2012-08-06 at 13:40 +0300, Avi Kivity wrote:
> > >> On 08/06/2012 01:38 PM, Avi Kivity wrote:
> > >> 
> > >> > Regarding the implementation, instead of a linked list, would an array
> > >> > of counters parallel to the bitmap make it simpler?
> > >> 
> > >> Or even, replace the bitmap with an array of counters.
> > > 
> > > I'm not sure a counter array is what we're really after.  That gives us
> > > reference counting for the irq source IDs, but not the key->gsi lookup.
> > 
> > You can look up the gsi while registering the eoifd, so it's accessible
> > as eoifd->gsi instead of eoifd->source->gsi.  The irqfd can go away
> > while the eoifd is still active, but is this a problem?
> 
> In my opinion, no, but Michael disagrees.
> 
> > > It also highlights another issue, that we have a limited set of source
> > > IDs.  Looks like we have BITS_PER_LONG IDs, with two already used, one
> > > for the shared userspace ID and another for the PIT.  How happy are we
> > > going to be with a limit of 62 level interrupts in use at one time?
> > 
> > When we start being unhappy we can increase that number.  On the other
> > hand more locks and lists makes me unhappy now.
> 
> Yep, good point.  My latest version removes the source ID object lock
> and list (and objects).  I still have a lock and list for the ack
> notification, but it's hard not to unless we combine them into one
> mega-irqfd ioctl as Michael suggests.
>
> > > It's arguably a reasonable number since the most virtualization friendly
> > > devices (sr-iov VFs) don't even support this kind of interrupt.  It's
> > > also very wasteful allocating an entire source ID for a single GSI
> > > within that source ID.  PCI supports interrupts A, B, C, and D, which,
> > > in the most optimal config, each go to different GSIs.  So we could
> > > theoretically be more efficient in our use and allocation of irq source
> > > IDs if we tracked use by the source ID, gsi pair.
> > 
> > There are, in one userspace, just three gsis available for PCI links, so
> > you're compressing the source id space by 3.
> 
> I imagine there's a way to put each PCI interrupt pin on a GSI, but
> still only 4, not a great expansion of source ID space.  I like
> Michael's idea of re-using source IDs if we run out better.
> 
> > > That probably makes it less practical to replace anything at the top
> > > level with a counter array.  The key that we pass back is currently the
> > > actual source ID, but we don't specify what it is, so we could split it
> > > and have it encode a 16bit source ID plus 16 bit GSI.  It could also be
> > > an idr entry.
> > 
> > We can fix those kinds of problems by adding another layer of
> > indirection.  But I doubt they will be needed.  I don't see people
> > assigning 60 legacy devices to one guest.
> 
> Yep, we can ignore it for now and put it in the hands of userspace to
> re-use IDs if needed.
> 
> > > Michael, would the interface be more acceptable to you if we added
> > > separate ioctls to allocate and free some representation of an irq
> > > source ID, gsi pair?  For instance, an ioctl might return an idr entry
> > > for an irq source ID/gsi object which would then be passed as a
> > > parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> > > representing the source id/gsi isn't magically freed on its own.  This
> > > would also allow us to deassign/close one end and reconfigure it later.
> > > Thanks,
> > 
> > Another option is to push the responsibility for allocating IDs for the
> > association to userspace.  Let userspace both create the irqfd and the
> > eoifd with the same ID, the kernel matches them at registration time and
> > copies the gsi/sourceid from the first to the second eventfd.
> 
> Aside from the copying gsi/sourceid bit, you've just described my latest
> attempt at this series.  Specifying both a sourceid and gsi also allows
> userspace to make better use of the sourceid address space (use more
> than one gsi if userspace wants the complexity of managing them).
> Thanks,
> 
> Alex

Turns out per device source ID is a bug copied from existing
device assignment. I am amazed we did not notice before.
There we have a small # of devices so it's not a problem, but there's no
reason not to just have a single source ID for all irqfds.
So the problem goes away, and there is no limit on # of level irqfds,
and no need to manage IDs in userspace at all.
You can still have cookies in userspace if you like but do not map them
to source IDs.
Alex Williamson Aug. 13, 2012, 10:22 p.m. UTC | #27
On Tue, 2012-08-14 at 00:50 +0300, Michael S. Tsirkin wrote:
> On Mon, Aug 13, 2012 at 02:48:25PM -0600, Alex Williamson wrote:
> > On Mon, 2012-08-13 at 22:50 +0300, Michael S. Tsirkin wrote:
> > > On Mon, Aug 13, 2012 at 12:17:25PM -0600, Alex Williamson wrote:
> > > > On Mon, 2012-08-13 at 19:59 +0300, Michael S. Tsirkin wrote:
> > > > > On Mon, Aug 13, 2012 at 10:48:15AM -0600, Alex Williamson wrote:
> > > > > > On Sun, 2012-08-12 at 10:49 +0300, Michael S. Tsirkin wrote:
> > > > > > > On Wed, Aug 01, 2012 at 01:06:42PM -0600, Alex Williamson wrote:
> > > > > > > > On Mon, 2012-07-30 at 19:12 -0600, Alex Williamson wrote:
> > > > > > > > > On Tue, 2012-07-31 at 03:36 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > > On Mon, Jul 30, 2012 at 06:26:31PM -0600, Alex Williamson wrote:
> > > > > > > > > > > On Tue, 2012-07-31 at 03:01 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > > > > You keep saying this but it is still true: once irqfd
> > > > > > > > > > > > is closed eoifd does not get any more interrupts.
> > > > > > > > > > > 
> > > > > > > > > > > How does that matter?
> > > > > > > > > > 
> > > > > > > > > > Well if it does not get events it is disabled.
> > > > > > > > > > so you have one ifc disabling another, anyway.
> > > > > > > > > 
> > > > > > > > > And a level irqfd without an eoifd can never be de-asserted.  Either we
> > > > > > > > > make modular components, assemble them to do useful work, and
> > > > > > > > > disassemble them independently so they can be used by future interfaces
> > > > > > > > > or we bundle eoifd as just an option of irqfd.  Which is it gonna be?
> > > > > > > > 
> > > > > > > > I don't think I've been successful at explaining my reasoning for making
> > > > > > > > EOI notification a separate interface, so let me try again...
> > > > > > > > 
> > > > > > > > When kvm is not enabled, the qemu vfio driver still needs to know about
> > > > > > > > EOIs to re-enable the physical interrupt.  Since the ioapic is emulated
> > > > > > > > in qemu, we can setup a notifier for this and create abstraction to make
> > > > > > > > it non-x86 specific, etc.  We just need to come up with a design and
> > > > > > > > implement it.  But what happens when kvm is then enabled?  ioapic
> > > > > > > > emulation moves to the kernel (assume kvm includes irqchip for this
> > > > > > > > argument even though it doesn't for POWER), qemu no longer knows about
> > > > > > > > EOIs, and the interface we just created to handle the non-kvm case stops
> > > > > > > > working.  Is anyone going to accept adding a qemu EOI notification
> > > > > > > > interface that only works when kvm is not enabled?
> > > > > > > 
> > > > > > > Yes, it's only a question of abstracting it at the right level.
> > > > > > > 
> > > > > > > For example, if as you suggest below kvm gives you an eventfd that
> > > > > > > asserts an irq, later automatically deasserts it and notifies another
> > > > > > > eventfd, we can do exactly this in both tcg and kvm:
> > > > > > > 
> > > > > > > setup_level_irq(int gsi, int assert_eventfd, int deassert_eventfd)
> > > > > > > 
> > > > > > > Not advocating this interface but pointing out that to make
> > > > > > > same abstraction to work in tcg and kvm, see what it does in
> > > > > > > each of them first.
> > > > > > 
> > > > > > The tcg model I was thinking of is that we continue to use qemu_set_irq
> > > > > > to assert and de-assert the interrupt and add an eoi/ack notification
> > > > > > mechanism, much like the ack notifier that already exists in kvm.  There
> > > > > > doesn't seem to be much advantage to creating a new interrupt
> > > > > > infrastructure in tcg that can trigger interrupts by eventfds, so I
> > > > > > assume VFIO is always going to be responsible for the translation of an
> > > > > > eventfd to an irq assertion, get some kind of notification through qemu,
> > > > > > de-assert the interrupt and unmask the device.  With that model in mind,
> > > > > > perhaps it makes more sense why I've been keeping the eoi/ack separate
> > > > > > from irqfd.
> > > > > > 
> > > > > > > > I suspect we therefore need a notification mechanism between kvm and
> > > > > > > > qemu to make it possible for that interface to continue working.
> > > > > > > 
> > > > > > > Even though no one is actually using it. IMHO, this is a maintenance
> > > > > > > problem.
> > > > > > 
> > > > > > That's why I'm designing it the way I am.  VFIO will make use of it.  It
> > > > > > will just be using the de-assert and notify mode vs a notify-only mode
> > > > > > that tcg would use.  It would also be easy to add an option to vfio so
> > > > > > that we could fully test both modes.
> > > > > > 
> > > > > > > > An
> > > > > > > > eventfd also seems like the right mechanism there.  A simple
> > > > > > > > modification to the proposed KVM_EOIFD here would allow it to trigger an
> > > > > > > > eventfd when an EOI is written to a specific gsi on
> > > > > > > > KVM_USERSPACE_IRQ_SOURCE_ID (define a flag and pass gsi in place of
> > > > > > > > key).
> > > > > > > > 
> > > > > > > > The split proposed here does require some assembly, but KVM_EOIFD is
> > > > > > > > re-usable as either a de-assert and notify mechanism tied to an irqfd or
> > > > > > > > a notify-only mechanism allowing users of a qemu EOI notification
> > > > > > > > infrastructure to continue working.  vfio doesn't necessarily need this
> > > > > > > > middle ground, but can easily be used to test it.
> > > > > > > > 
> > > > > > > > The alternative is that we pull eoifd into KVM_IRQFD and invent some
> > > > > > > > other new EOI interface for qemu.  That means we get EOIs tied to an
> > > > > > > > irqfd via one path and other EOIs via another ioctl.  Personally that
> > > > > > > > seems less desirable, but I'm willing to explore that route if
> > > > > > > > necessary.  Thanks,
> > > > > > > > 
> > > > > > > > Alex
> > > > > > > 
> > > > > > > Maybe we should focus on the fact that we notify userspace that we
> > > > > > > deasserted interrupt instead of EOI.
> > > > > > 
> > > > > > But will a tcg user want the de-assert?  I assume not.  The de-assert is
> > > > > > an optimization to allow us to bypass evaluation in userspace.  In tcg
> > > > > > we're already there.  Thanks,
> > > > > > 
> > > > > > Alex
> > > > > 
> > > > > Look, what I am saying is: forget tcg and APIs. Build a kernel interface that
> > > > > makes sense. Then in qemu look at kvm and tcg and build abstraction for
> > > > > it.  Building kernel interface so you can make nice abstractions in tcg
> > > > > is backwards.
> > > > 
> > > > Can you suggest specifically what doesn't make sense?
> > > 
> > > Interface is just very easy to misuse. Here are things that
> > > you expose that to me do not seem to make sense:
> > > 
> > > - ability to create irqfd that by default can not be deasserted
> > >   (you need eoifd to deassert)
> > 
> > Well, it's not really the default, a user has to add a flag to get this
> > ability.
> > 
> > > - interface to create eventfd that by default never gets events
> > >   (you need irqfd to assert)
> > 
> > In v8, this isn't the default, the user has to specify that they want to
> > use it to de-assert.
> > 
> > > - creating ack eventfd requires level irqfd but you won't
> > >   know it unless you read documentation
> > 
> > This is also fixed in v8, you get a source ID, then hook it up to an
> > irqfd/irq ackfd any way you want.
> > 
> > > - duplicating level/edge information that we already have in GSI
> > 
> > Not really duplication, the edge/level information is several layers of
> > indirection away from this interface.  As we've discussed in the past,
> > relying on that information also means that the behavior of an ioctl
> > depends on the state of another piece of emulated hardware which is
> > controlled by the guest at the time the ioctl is called.  Personally, I
> > don't think that's a good characteristic.
> > 
> > > Knowing all these quirks is a must if you want things to
> > > work, but you do not know them until you read documentation.
> > > This is not good interface, a good interface is
> > > hard to misuse and self-documenting.
> > 
> > I think v8 makes improvements here, I'd be happy to hear your feedback
> > on it.
> > 
> > > > For legacy interrupts VFIO needs to:
> > > > 
> > > > - Assert an interrupt
> > > > 
> > > >         Eventfds seem to be the most efficient way to signal when to
> > > >         assert an interrupt and gives us the flexibility that we can
> > > >         send that signal to either another kernel module or to
> > > >         userspace.  KVM_IRQFD is designed for exactly this, but needs
> > > >         modifications for level triggered interrupts.  These include:
> > > >         
> > > >         - Using a different IRQ source ID
> > > >         
> > > >                 GSIs are not exclusive, multiple devices may assert the
> > > >                 same GSI.  IRQ source IDs are how KVM handles multiple
> > > >                 inputs.
> > > 
> > > Actually, thinking about it some more, all assigned
> > > device interrupts are deasserted on ack, so together.
> > > And userspace does the OR in userspace already.
> > > 
> > > So why is it not enough to give IRQFDs a single separate
> > > source ID, distinct from userspace but shared by all devices?
> > 
> > We could do that, but then we lose any ability to filter the KVM irq ack
> > notifier based on whether a given IRQ source ID is asserted.  This is
> > something you've been pushing for.
> 
> We ended tracking it in irqfd, no?

We could do it there, but as we've seen, tracking such at the point
where we do the deassert and notify requires fairly extensive locking to
prevent races that could cause the device to get stuck.

> > Note that patch 1/6 of the v8 series
> > adds this generically for all irq ack notifier users.  That's of course
> > just an optimization,
> 
> How is it an optimization?

We only fire ack notifiers for source IDs that are asserted, if the ack
notification user opts in to the filtering.  Hopefully resulting in
fewer spurious callbacks.

> > we could have IRQ source IDs re-used and that
> > might be a good solution if we ever start exhausting them.  v8 allows
> > userspace to do this if it wants.
> 
> How does userspace know whether it should do it or not?

When it runs out.  Maybe use a single one for all of them.  The
KVM_IRQ_SOURCE_ID ioctl in v8 tells userspace how many are available.
Userspace can create different strategies based on how many are
available and number of devices.  For the vast majority of use cases,
getting a new source id for each device is probably fine.  If sourceids
run out, userspace has the option of creating a strategy to re-use them.

> > > >         - Assert-only
> > > >         
> > > >                 KVM_IRQFD currently does assert->deassert to emulate an
> > > >                 edge triggered interrupt.  For level, we need to be able
> > > >                 to signal a discrete assertion and de-assertion event.
> > > >         This results in the modifications I've proposed to KVM_IRQFD.
> > > 
> > > Actually is it really necessary at all?  What happens if we assert and
> > > deassert immediately?  If guest lost the interrupt, on EOI device will
> > > reassert resulting in another interrupt.
> > 
> > It's been a while since I've tried, but I recall I used this as a
> > workaround early on in development and it did work.  I don't feel it's a
> > proper representation of the hardware we're trying to emulate though and
> > istr that Avi wasn't too fond of it either.
> 
> EOI hack is not a proper representation either.
> I think we were just confused and thought there's a race.

Using the EOI as a trigger to de-assert and potentially re-assert may be
a hack, but it's about as close as we can come to following the behavior
of hardware.  It's actually quite similar to an apic re-sampling inputs
except we don't have a physical line to read and see that it's still
asserted.  We emulate this by de-asserting it and letting it re-assert
if necessary.  The emulation to the guest isn't perfect, but it's a lot
closer than immediately de-asserting the pin.  I think the discussion
below describes why I do this versus something that might be even closer
to actual hardware.

> > > > - Know when to de-assert an interrupt
> > > > 
> > > >         Servicing an interrupt is device specific, we can't know for any
> > > >         random device what interactions with the device indicate service
> > > >         of an interrupt.  We therefore look to the underlying hardware
> > > >         support where a vCPU writes an End Of Interrupt to the APIC to
> > > >         indicate the chip should re-sample its inputs and either
> > > >         de-assert or continue asserting the interrupt level.  Our device
> > > >         may still require service at this point, but this mechanism has
> > > >         proven effective with KVM assignment.
> > > >         
> > > >         This results in the notify-only portion of the EOIFD/IRQ_ACKFD.
> > > >         
> > > > - Deassert an interrupt
> > > > 
> > > >         Now that we have an interrupt that's been asserted and we
> > > >         suspect that we should re-evaluate the interrupt signal due to
> > > >         activity possibly related to an EOI, we need a mechanism to
> > > >         de-assert the interrupt.  There are two possibilities here:
> > > >         
> > > >         - Test and de-assert
> > > >         
> > > >                 Depending on hardware support for INTxDisable, we may be
> > > >                 able to poll whether the hardware is still asserting
> > > >                 its interrupt and de-assert if quiesced.  This
> > > >                 optimizes for the case where the interrupt is still
> > > >                 asserting as we avoid re-assertion and avoid unmasking
> > > >                 the device.
> > > >         
> > > >         - De-assert, test, (re-assert)
> > > >         
> > > >                 Not all hardware supports INTxDisable, so we may have no
> > > >                 way to test whether the device is still asserting an
> > > >                 interrupt other than to unmask and see if it re-fires.
> > > >                 This not only supports the most hardware, but also
> > > >                 optimizes for the case where the device is quiesced.
> > > >                 
> > > >         Taking the latter path results in the de-assert and notify
> > > >         interface to the above EOIFD/IRQ_ACKFD.  This reduces the number
> > > >         of signals between components and supports the most hardware.
> > > >         
> > > > That leaves dealing with the IRQ source ID.  Initially I tried to hide
> > > > this from userspace as it's more of an implementation detail of KVM, but
> > > > in v8 I expose it as it offers more flexibility and (I hope) removes
> > > > some of the odd dependencies between interfaces imposed by previous
> > > > version.
> > > > 
> > > > If you have specific suggestions how else to approach this, I welcome
> > > > the feedback.
> > > > It would be backwards to design an interface exclusively around a
> > > > single user, but it would be just as backwards to not envision how an
> > > > interface would be used in advance.  Thanks,
> > > > 
> > > > Alex
> > > 
> > > Could you address two questions I ask above pls?
> > > If we really can use the same source ID for all irqfds,
> > > and if it's ok to deassert immediately after all,
> > > then large parts of code can go away.
> > > 
> > > Or maybe I was away for too long and forgot
> > > what the problem were ...
> > 
> > So if we de-assert immediately and remove the notify on de-assert, then
> > irq_ackfd becomes a notify-only interface.  In v8 that's what it is at
> > its base, with an option to de-assert.  That option (patch 6/6) is a
> > tiny bit of code.
> 
> But it is an interface that at least makes some sense.
> And it is also an existing one.

You often argue that debugging is an important consideration in
designing and using an interface.  Doesn't improperly representing the
interrupt state make debugging harder?  If the irq_state bit is clear we
don't know if the assigned device is masked waiting for an EOI or quiesced.

> > Removing the irq source ID isn't a clear win to me either.
> 
> It removes the limitation on number of ackfd/irqfd that there is.

The irqfd still has to use a sourceid and that has to be specified
either by flag or flag and passed value.  If we make a flag for
"USE_ASSIGNED_DEVICE_SOURCE_ID", that again seems like a very narrowly
focused extension.

> > I'm becoming
> > a broken record, but v8 already simplifies the irq source ID allocation
> > and preserves the ability to filter irq ack notifications and targeted
> > re-use of irq source IDs if userspace decides to support that.  Thanks,
> > 
> > Alex
> 
> I will look at v8.

Thanks!

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Aug. 13, 2012, 10:41 p.m. UTC | #28
On Tue, 2012-08-14 at 01:06 +0300, Michael S. Tsirkin wrote:
> On Mon, Aug 13, 2012 at 03:34:01PM -0600, Alex Williamson wrote:
> > On Sun, 2012-08-12 at 11:36 +0300, Avi Kivity wrote:
> > > On 08/09/2012 10:26 PM, Alex Williamson wrote:
> > > > On Mon, 2012-08-06 at 13:40 +0300, Avi Kivity wrote:
> > > >> On 08/06/2012 01:38 PM, Avi Kivity wrote:
> > > >> 
> > > >> > Regarding the implementation, instead of a linked list, would an array
> > > >> > of counters parallel to the bitmap make it simpler?
> > > >> 
> > > >> Or even, replace the bitmap with an array of counters.
> > > > 
> > > > I'm not sure a counter array is what we're really after.  That gives us
> > > > reference counting for the irq source IDs, but not the key->gsi lookup.
> > > 
> > > You can look up the gsi while registering the eoifd, so it's accessible
> > > as eoifd->gsi instead of eoifd->source->gsi.  The irqfd can go away
> > > while the eoifd is still active, but is this a problem?
> > 
> > In my opinion, no, but Michael disagrees.
> > 
> > > > It also highlights another issue, that we have a limited set of source
> > > > IDs.  Looks like we have BITS_PER_LONG IDs, with two already used, one
> > > > for the shared userspace ID and another for the PIT.  How happy are we
> > > > going to be with a limit of 62 level interrupts in use at one time?
> > > 
> > > When we start being unhappy we can increase that number.  On the other
> > > hand more locks and lists makes me unhappy now.
> > 
> > Yep, good point.  My latest version removes the source ID object lock
> > and list (and objects).  I still have a lock and list for the ack
> > notification, but it's hard not to unless we combine them into one
> > mega-irqfd ioctl as Michael suggests.
> >
> > > > It's arguably a reasonable number since the most virtualization friendly
> > > > devices (sr-iov VFs) don't even support this kind of interrupt.  It's
> > > > also very wasteful allocating an entire source ID for a single GSI
> > > > within that source ID.  PCI supports interrupts A, B, C, and D, which,
> > > > in the most optimal config, each go to different GSIs.  So we could
> > > > theoretically be more efficient in our use and allocation of irq source
> > > > IDs if we tracked use by the source ID, gsi pair.
> > > 
> > > There are, in one userspace, just three gsis available for PCI links, so
> > > you're compressing the source id space by 3.
> > 
> > I imagine there's a way to put each PCI interrupt pin on a GSI, but
> > still only 4, not a great expansion of source ID space.  I like
> > Michael's idea of re-using source IDs if we run out better.
> > 
> > > > That probably makes it less practical to replace anything at the top
> > > > level with a counter array.  The key that we pass back is currently the
> > > > actual source ID, but we don't specify what it is, so we could split it
> > > > and have it encode a 16bit source ID plus 16 bit GSI.  It could also be
> > > > an idr entry.
> > > 
> > > We can fix those kinds of problems by adding another layer of
> > > indirection.  But I doubt they will be needed.  I don't see people
> > > assigning 60 legacy devices to one guest.
> > 
> > Yep, we can ignore it for now and put it in the hands of userspace to
> > re-use IDs if needed.
> > 
> > > > Michael, would the interface be more acceptable to you if we added
> > > > separate ioctls to allocate and free some representation of an irq
> > > > source ID, gsi pair?  For instance, an ioctl might return an idr entry
> > > > for an irq source ID/gsi object which would then be passed as a
> > > > parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> > > > > representing the source id/gsi isn't magically freed on its own.  This
> > > > would also allow us to deassign/close one end and reconfigure it later.
> > > > Thanks,
> > > 
> > > Another option is to push the responsibility for allocating IDs for the
> > > association to userspace.  Let userspace both create the irqfd and the
> > > eoifd with the same ID, the kernel matches them at registration time and
> > > copies the gsi/sourceid from the first to the second eventfd.
> > 
> > Aside from the copying gsi/sourceid bit, you've just described my latest
> > attempt at this series.  Specifying both a sourceid and gsi also allows
> > userspace to make better use of the sourceid address space (use more
> > than one gsi if userspace wants the complexity of managing them).
> > Thanks,
> > 
> > Alex
> 
> Turns out per device source ID is a bug copied from existing
> device assignment. I am amazed we did not notice before.
> There we have small # of devices so it's not a problem but there's no
> reason just not to have a source ID for all irqfds.
> So the problem goes away, and there is no limit on # of level irqfds,
> and no need to manage IDs in userspace at all.
> You can still have cookies in userspace if you like but do not map them
> to source IDs.

IMHO it's not a bug, it's an implementation decision.  They could be
shared, but that doesn't make it wrong to not share them.  Given that we
have 32 memory slots, the only way you could hit this would be to have a
lot of really slow devices that don't direct-map any BARs.  A reason to
not have the same source id for everything is that I think we can do ack
notification filtering more easily using separate source ids (as is done
in the first patch of the v8 series).  As the code is today, I agree,
there's probably no advantage to using multiple source IDs.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Aug. 13, 2012, 10:52 p.m. UTC | #29
On Mon, Aug 13, 2012 at 04:22:12PM -0600, Alex Williamson wrote:
> On Tue, 2012-08-14 at 00:50 +0300, Michael S. Tsirkin wrote:
> > On Mon, Aug 13, 2012 at 02:48:25PM -0600, Alex Williamson wrote:
> > > On Mon, 2012-08-13 at 22:50 +0300, Michael S. Tsirkin wrote:
> > > > On Mon, Aug 13, 2012 at 12:17:25PM -0600, Alex Williamson wrote:
> > > > > On Mon, 2012-08-13 at 19:59 +0300, Michael S. Tsirkin wrote:
> > > > > > On Mon, Aug 13, 2012 at 10:48:15AM -0600, Alex Williamson wrote:
> > > > > > > On Sun, 2012-08-12 at 10:49 +0300, Michael S. Tsirkin wrote:
> > > > > > > > On Wed, Aug 01, 2012 at 01:06:42PM -0600, Alex Williamson wrote:
> > > > > > > > > On Mon, 2012-07-30 at 19:12 -0600, Alex Williamson wrote:
> > > > > > > > > > On Tue, 2012-07-31 at 03:36 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > > > On Mon, Jul 30, 2012 at 06:26:31PM -0600, Alex Williamson wrote:
> > > > > > > > > > > > On Tue, 2012-07-31 at 03:01 +0300, Michael S. Tsirkin wrote:
> > > > > > > > > > > > > You keep saying this but it is still true: once irqfd
> > > > > > > > > > > > > is closed eoifd does not get any more interrupts.
> > > > > > > > > > > > 
> > > > > > > > > > > > How does that matter?
> > > > > > > > > > > 
> > > > > > > > > > > Well if it does not get events it is disabled.
> > > > > > > > > > > so you have one ifc disabling another, anyway.
> > > > > > > > > > 
> > > > > > > > > > And a level irqfd without an eoifd can never be de-asserted.  Either we
> > > > > > > > > > make modular components, assemble them to do useful work, and
> > > > > > > > > > disassemble them independently so they can be used by future interfaces
> > > > > > > > > > or we bundle eoifd as just an option of irqfd.  Which is it gonna be?
> > > > > > > > > 
> > > > > > > > > I don't think I've been successful at explaining my reasoning for making
> > > > > > > > > EOI notification a separate interface, so let me try again...
> > > > > > > > > 
> > > > > > > > > When kvm is not enabled, the qemu vfio driver still needs to know about
> > > > > > > > > EOIs to re-enable the physical interrupt.  Since the ioapic is emulated
> > > > > > > > > in qemu, we can setup a notifier for this and create abstraction to make
> > > > > > > > > it non-x86 specific, etc.  We just need to come up with a design and
> > > > > > > > > implement it.  But what happens when kvm is then enabled?  ioapic
> > > > > > > > > emulation moves to the kernel (assume kvm includes irqchip for this
> > > > > > > > > argument even though it doesn't for POWER), qemu no longer knows about
> > > > > > > > > EOIs, and the interface we just created to handle the non-kvm case stops
> > > > > > > > > working.  Is anyone going to accept adding a qemu EOI notification
> > > > > > > > > interface that only works when kvm is not enabled?
> > > > > > > > 
> > > > > > > > Yes, it's only a question of abstracting it at the right level.
> > > > > > > > 
> > > > > > > > For example, if as you suggest below kvm gives you an eventfd that
> > > > > > > > > asserts an irq, later automatically deasserts it and notifies another
> > > > > > > > eventfd, we can do exactly this in both tcg and kvm:
> > > > > > > > 
> > > > > > > > setup_level_irq(int gsi, int assert_eventfd, int deassert_eventfd)
> > > > > > > > 
> > > > > > > > Not advocating this interface but pointing out that to make
> > > > > > > > same abstraction to work in tcg and kvm, see what it does in
> > > > > > > > each of them first.
> > > > > > > 
> > > > > > > The tcg model I was thinking of is that we continue to use qemu_set_irq
> > > > > > > to assert and de-assert the interrupt and add an eoi/ack notification
> > > > > > > mechanism, much like the ack notifier that already exists in kvm.  There
> > > > > > > doesn't seem to be much advantage to creating a new interrupt
> > > > > > > infrastructure in tcg that can trigger interrupts by eventfds, so I
> > > > > > > assume VFIO is always going to be responsible for the translation of an
> > > > > > > eventfd to an irq assertion, get some kind of notification through qemu,
> > > > > > > de-assert the interrupt and unmask the device.  With that model in mind,
> > > > > > > perhaps it makes more sense why I've been keeping the eoi/ack separate
> > > > > > > from irqfd.
> > > > > > > 
> > > > > > > > > I suspect we therefore need a notification mechanism between kvm and
> > > > > > > > > qemu to make it possible for that interface to continue working.
> > > > > > > > 
> > > > > > > > > Even though no one is actually using it. IMHO, this is a maintenance
> > > > > > > > problem.
> > > > > > > 
> > > > > > > That's why I'm designing it the way I am.  VFIO will make use of it.  It
> > > > > > > will just be using the de-assert and notify mode vs a notify-only mode
> > > > > > > that tcg would use.  It would also be easy to add an option to vfio so
> > > > > > > that we could fully test both modes.
> > > > > > > 
> > > > > > > > > An
> > > > > > > > > eventfd also seems like the right mechanism there.  A simple
> > > > > > > > > modification to the proposed KVM_EOIFD here would allow it to trigger an
> > > > > > > > > eventfd when an EOI is written to a specific gsi on
> > > > > > > > > KVM_USERSPACE_IRQ_SOURCE_ID (define a flag and pass gsi in place of
> > > > > > > > > key).
> > > > > > > > > 
> > > > > > > > > The split proposed here does require some assembly, but KVM_EOIFD is
> > > > > > > > > re-usable as either a de-assert and notify mechanism tied to an irqfd or
> > > > > > > > > a notify-only mechanism allowing users of a qemu EOI notification
> > > > > > > > > infrastructure to continue working.  vfio doesn't necessarily need this
> > > > > > > > > middle ground, but can easily be used to test it.
> > > > > > > > > 
> > > > > > > > > The alternative is that we pull eoifd into KVM_IRQFD and invent some
> > > > > > > > > other new EOI interface for qemu.  That means we get EOIs tied to an
> > > > > > > > > irqfd via one path and other EOIs via another ioctl.  Personally that
> > > > > > > > > seems less desirable, but I'm willing to explore that route if
> > > > > > > > > necessary.  Thanks,
> > > > > > > > > 
> > > > > > > > > Alex
> > > > > > > > 
> > > > > > > > Maybe we should focus on the fact that we notify userspace that we
> > > > > > > > deasserted interrupt instead of EOI.
> > > > > > > 
> > > > > > > But will a tcg user want the de-assert?  I assume not.  The de-assert is
> > > > > > > an optimization to allow us to bypass evaluation in userspace.  In tcg
> > > > > > > we're already there.  Thanks,
> > > > > > > 
> > > > > > > Alex
> > > > > > 
> > > > > > > Look, what I am saying is: forget tcg and APIs. Build a kernel interface that
> > > > > > makes sense. Then in qemu look at kvm and tcg and build abstraction for
> > > > > > it.  Building kernel interface so you can make nice abstractions in tcg
> > > > > > is backwards.
> > > > > 
> > > > > Can you suggest specifically what doesn't make sense?
> > > > 
> > > > Interface is just very easy to misuse. Here are things that
> > > > you expose that to me do not seem to make sense:
> > > > 
> > > > - ability to create irqfd that by default can not be deasserted
> > > >   (you need eoifd to deassert)
> > > 
> > > Well, it's not really the default, a user has to add a flag to get this
> > > ability.
> > > 
> > > > - interface to create eventfd that by default never gets events
> > > >   (you need irqfd to assert)
> > > 
> > > In v8, this isn't the default, the user has to specify that they want to
> > > use it to de-assert.
> > > 
> > > > - creating ack eventfd requires level irqfd but you won't
> > > >   know it unless you read documentation
> > > 
> > > This is also fixed in v8, you get a source ID, then hook it up to an
> > > irqfd/irq ackfd any way you want.
> > > 
> > > > - duplicating level/edge information that we already have in GSI
> > > 
> > > Not really duplication, the edge/level information is several layers of
> > > indirection away from this interface.  As we've discussed in the past,
> > > relying on that information also means that the behavior of an ioctl
> > > depends on the state of another piece of emulated hardware which is
> > > controlled by the guest at the time the ioctl is called.  Personally, I
> > > don't think that's a good characteristic.
> > > 
> > > > Knowing all these quirks is a must if you want things to
> > > > work, but you do not know them until you read documentation.
> > > > This is not good interface, a good interface is
> > > > hard to misuse and self-documenting.
> > > 
> > > I think v8 makes improvements here, I'd be happy to hear your feedback
> > > on it.
> > > 
> > > > > For legacy interrupts VFIO needs to:
> > > > > 
> > > > > - Assert an interrupt
> > > > > 
> > > > >         Eventfds seem to be the most efficient way to signal when to
> > > > >         assert an interrupt and gives us the flexibility that we can
> > > > >         send that signal to either another kernel module or to
> > > > >         userspace.  KVM_IRQFD is designed for exactly this, but needs
> > > > >         modifications for level triggered interrupts.  These include:
> > > > >         
> > > > >         - Using a different IRQ source ID
> > > > >         
> > > > >                 GSIs are not exclusive, multiple devices may assert the
> > > > >                 same GSI.  IRQ source IDs are how KVM handles multiple
> > > > >                 inputs.
> > > > 
> > > > Actually, thinking about it some more, all assigned
> > > > device interrupts are deasserted on ack, so together.
> > > > And userspace does the OR in userspace already.
> > > > 
> > > > So why is it not enough to give IRQFDs a single separate
> > > > source ID, distinct from userspace but shared by all devices?
> > > 
> > > We could do that, but then we lose any ability to filter the KVM irq ack
> > > notifier based on whether a given IRQ source ID is asserted.  This is
> > > something you've been pushing for.
> > 
> > > We ended up tracking it in irqfd, no?
> 
> We could do it there, but as we've seen, tracking such at the point
> where we do the deassert and notify requires fairly extensive locking to
> prevent races that could cause the device to get stuck.
> 
> > > Note that patch 1/6 of the v8 series
> > > adds this generically for all irq ack notifier users.  That's of course
> > > just an optimization,
> > 
> > How is it an optimization?
> 
> We only fire ack notifiers for source IDs that are asserted, if the ack
> notification user opts in to the filtering.  Hopefully resulting in
> fewer spurious callbacks.
> 
> > > we could have IRQ source IDs re-used and that
> > > might be a good solution if we ever start exhausting them.  v8 allows
> > > userspace to do this if it wants.
> > 
> > How does userspace know whether it should do it or not?
> 
> When it runs out.  Maybe use a single one for all of them.  The
> KVM_IRQ_SOURCE_ID ioctl in v8 tells userspace how many are available.
> Userspace can create different strategies based on how many are
> available and number of devices.  For the vast majority of use cases,
> getting a new source id for each device is probably fine.  If sourceids
> run out, userspace has the option of creating a strategy to re-use them.

Source id is an implementation detail. If you expose them to
userspace we end up with silly limitations e.g. if we want
to add another source we need to extend source id to 64 bit.

> > > > >         - Assert-only
> > > > >         
> > > > >                 KVM_IRQFD currently does assert->deassert to emulate an
> > > > >                 edge triggered interrupt.  For level, we need to be able
> > > > >                 to signal a discrete assertion and de-assertion event.
> > > > >         This results in the modifications I've proposed to KVM_IRQFD.
> > > > 
> > > > Actually is it really necessary at all?  What happens if we assert and
> > > > deassert immediately?  If guest lost the interrupt, on EOI device will
> > > > reassert resulting in another interrupt.
> > > 
> > > It's been a while since I've tried, but I recall I used this as a
> > > workaround early on in development and it did work.  I don't feel it's a
> > > proper representation of the hardware we're trying to emulate though and
> > > istr that Avi wasn't too fond of it either.
> > 
> > EOI hack is not a proper representation either.
> > I think we were just confused and thought there's a race.
> 
> Using the EOI as a trigger to de-assert and potentially re-assert may be
> a hack, but it's about as close as we can come to following the behavior
> of hardware.
> It's actually quite similar to an apic re-sampling inputs
> except we don't have a physical line to read and see that it's still
> asserted.  We emulate this by de-asserting it and letting it re-assert
> if necessary.  The emulation to the guest isn't perfect, but it's a lot
> closer than immediately de-asserting the pin.
> I think the discussion
> below describes why I do this versus something that might be even closer
> to actual hardware.

Sorry I don't understand what "quite similar" means.  If deassert on ack
is "closer" somehow show me some software that needs it.


> > > > > - Know when to de-assert an interrupt
> > > > > 
> > > > >         Servicing an interrupt is device specific, we can't know for any
> > > > >         random device what interactions with the device indicate service
> > > > >         of an interrupt.  We therefore look to the underlying hardware
> > > > >         support where a vCPU writes an End Of Interrupt to the APIC to
> > > > > >         indicate the chip should re-sample its inputs and either
> > > > >         de-assert or continue asserting the interrupt level.  Our device
> > > > >         may still require service at this point, but this mechanism has
> > > > >         proven effective with KVM assignment.
> > > > >         
> > > > >         This results in the notify-only portion of the EOIFD/IRQ_ACKFD.
> > > > >         
> > > > > - Deassert an interrupt
> > > > > 
> > > > >         Now that we have an interrupt that's been asserted and we
> > > > >         suspect that we should re-evaluate the interrupt signal due to
> > > > >         activity possibly related to an EOI, we need a mechanism to
> > > > >         de-assert the interrupt.  There are two possibilities here:
> > > > >         
> > > > >         - Test and de-assert
> > > > >         
> > > > >                 Depending on hardware support for INTxDisable, we may be
> > > > >                 able to poll whether the hardware is still asserting
> > > > > >                 its interrupt and de-assert if quiesced.  This
> > > > >                 optimizes for the case where the interrupt is still
> > > > >                 asserting as we avoid re-assertion and avoid unmasking
> > > > >                 the device.
> > > > >         
> > > > >         - De-assert, test, (re-assert)
> > > > >         
> > > > >                 Not all hardware supports INTxDisable, so we may have no
> > > > >                 way to test whether the device is still asserting an
> > > > >                 interrupt other than to unmask and see if it re-fires.
> > > > >                 This not only supports the most hardware, but also
> > > > >                 optimizes for the case where the device is quiesced.
> > > > >                 
> > > > >         Taking the latter path results in the de-assert and notify
> > > > >         interface to the above EOIFD/IRQ_ACKFD.  This reduces the number
> > > > >         of signals between components and supports the most hardware.
> > > > >         
> > > > > That leaves dealing with the IRQ source ID.  Initially I tried to hide
> > > > > this from userspace as it's more of an implementation detail of KVM, but
> > > > > in v8 I expose it as it offers more flexibility and (I hope) removes
> > > > > some of the odd dependencies between interfaces imposed by previous
> > > > > version.
> > > > > 
> > > > > If you have specific suggestions how else to approach this, I welcome
> > > > > the feedback.
> > > > > It would be backwards to design an interface exclusively around a
> > > > > single user, but it would be just as backwards to not envision how an
> > > > > interface would be used in advance.  Thanks,
> > > > > 
> > > > > Alex
> > > > 
> > > > Could you address two questions I ask above pls?
> > > > If we really can use the same source ID for all irqfds,
> > > > and if it's ok to deassert immediately after all,
> > > > then large parts of code can go away.
> > > > 
> > > > Or maybe I was away for too long and forgot
> > > > > what the problems were ...
> > > 
> > > So if we de-assert immediately and remove the notify on de-assert, then
> > > irq_ackfd becomes a notify-only interface.  In v8 that's what it is at
> > > > its base, with an option to de-assert.  That option (patch 6/6) is a
> > > tiny bit of code.
> > 
> > But it is an interface that at least makes some sense.
> > And it is also an existing one.
> 
> You often argue that debugging is an important consideration in
> designing and using an interface.  Doesn't improperly representing the
> interrupt state make debugging harder?  If the irq_state bit is clear we
> don't know if assigned device is masked waiting for an EOI or quiesced.

VFIO knows it.

> > > Removing the irq source ID isn't a clear win to me either.
> > 
> > It removes the limitation on number of ackfd/irqfd that there is.
> 
> The irqfd still has to use a sourceid and that has to be specified
> either by flag or flag and passed value.

Neither. irqfd uses KVM_USERSPACE_IRQ_SOURCE_ID now.  It's a bug, it
breaks sharing. Define KVM_IRQFD_SOURCE_ID and make all irqfds use it.
Add a capability so userspace knows the bug is fixed.

>  If we make a flag for
> "USE_ASSIGNED_DEVICE_SOURCE_ID", that again seems like a very narrowly
> focused extension.
>
> > > I'm becoming
> > > a broken record, but v8 already simplifies the irq source ID allocation
> > > and preserves the ability to filter irq ack notifications and targeted
> > > re-use of irq source IDs if userspace decides to support that.  Thanks,
> > > 
> > > Alex
> > 
> > I will look at v8.
> 
> Thanks!
> 
> Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Aug. 13, 2012, 11 p.m. UTC | #30
On Mon, Aug 13, 2012 at 04:41:05PM -0600, Alex Williamson wrote:
> On Tue, 2012-08-14 at 01:06 +0300, Michael S. Tsirkin wrote:
> > On Mon, Aug 13, 2012 at 03:34:01PM -0600, Alex Williamson wrote:
> > > On Sun, 2012-08-12 at 11:36 +0300, Avi Kivity wrote:
> > > > On 08/09/2012 10:26 PM, Alex Williamson wrote:
> > > > > On Mon, 2012-08-06 at 13:40 +0300, Avi Kivity wrote:
> > > > >> On 08/06/2012 01:38 PM, Avi Kivity wrote:
> > > > >> 
> > > > >> > Regarding the implementation, instead of a linked list, would an array
> > > > >> > of counters parallel to the bitmap make it simpler?
> > > > >> 
> > > > >> Or even, replace the bitmap with an array of counters.
> > > > > 
> > > > > I'm not sure a counter array is what we're really after.  That gives us
> > > > > reference counting for the irq source IDs, but not the key->gsi lookup.
> > > > 
> > > > You can look up the gsi while registering the eoifd, so it's accessible
> > > > as eoifd->gsi instead of eoifd->source->gsi.  The irqfd can go away
> > > > while the eoifd is still active, but is this a problem?
> > > 
> > > In my opinion, no, but Michael disagrees.
> > > 
> > > > > It also highlights another issue, that we have a limited set of source
> > > > > IDs.  Looks like we have BITS_PER_LONG IDs, with two already used, one
> > > > > for the shared userspace ID and another for the PIT.  How happy are we
> > > > > going to be with a limit of 62 level interrupts in use at one time?
> > > > 
> > > > When we start being unhappy we can increase that number.  On the other
> > > > hand more locks and lists makes me unhappy now.
> > > 
> > > Yep, good point.  My latest version removes the source ID object lock
> > > and list (and objects).  I still have a lock and list for the ack
> > > notification, but it's hard not to unless we combine them into one
> > > mega-irqfd ioctl as Michael suggests.
> > >
> > > > > It's arguably a reasonable number since the most virtualization friendly
> > > > > devices (sr-iov VFs) don't even support this kind of interrupt.  It's
> > > > > also very wasteful allocating an entire source ID for a single GSI
> > > > > within that source ID.  PCI supports interrupts A, B, C, and D, which,
> > > > > in the most optimal config, each go to different GSIs.  So we could
> > > > > theoretically be more efficient in our use and allocation of irq source
> > > > > IDs if we tracked use by the source ID, gsi pair.
> > > > 
> > > > There are, in one userspace, just three gsis available for PCI links, so
> > > > you're compressing the source id space by 3.
> > > 
> > > I imagine there's a way to put each PCI interrupt pin on a GSI, but
> > > still only 4, not a great expansion of source ID space.  I like
> > > Michael's idea of re-using source IDs if we run out better.
> > > 
> > > > > That probably makes it less practical to replace anything at the top
> > > > > level with a counter array.  The key that we pass back is currently the
> > > > > actual source ID, but we don't specify what it is, so we could split it
> > > > > and have it encode a 16bit source ID plus 16 bit GSI.  It could also be
> > > > > an idr entry.
> > > > 
> > > > We can fix those kinds of problems by adding another layer of
> > > > indirection.  But I doubt they will be needed.  I don't see people
> > > > assigning 60 legacy devices to one guest.
> > > 
> > > Yep, we can ignore it for now and put it in the hands of userspace to
> > > re-use IDs if needed.
> > > 
> > > > > Michael, would the interface be more acceptable to you if we added
> > > > > separate ioctls to allocate and free some representation of an irq
> > > > > source ID, gsi pair?  For instance, an ioctl might return an idr entry
> > > > > for an irq source ID/gsi object which would then be passed as a
> > > > > parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> > > > > > representing the source id/gsi isn't magically freed on its own.  This
> > > > > would also allow us to deassign/close one end and reconfigure it later.
> > > > > Thanks,
> > > > 
> > > > Another option is to push the responsibility for allocating IDs for the
> > > > association to userspace.  Let userspace both create the irqfd and the
> > > > eoifd with the same ID, the kernel matches them at registration time and
> > > > copies the gsi/sourceid from the first to the second eventfd.
> > > 
> > > Aside from the copying gsi/sourceid bit, you've just described my latest
> > > attempt at this series.  Specifying both a sourceid and gsi also allows
> > > userspace to make better use of the sourceid address space (use more
> > > than one gsi if userspace wants the complexity of managing them).
> > > Thanks,
> > > 
> > > Alex
> > 
> > Turns out per device source ID is a bug copied from existing
> > device assignment. I am amazed we did not notice before.
> > There we have small # of devices so it's not a problem but there's no
> > reason just not to have a source ID for all irqfds.
> > So the problem goes away, and there is no limit on # of level irqfds,
> > and no need to manage IDs in userspace at all.
> > You can still have cookies in userspace if you like but do not map them
> > to source IDs.
> 
> IMHO it's not a bug, it's an implementation decision.  They could be
> shared, but that doesn't make it wrong to not share them.  Given that we
> have 32 memory slots, the only way you could hit this would be to have a
> lot of really slow devices that don't direct-map any BARs.  A reason to
> not have the same source id for everything is that I think we can do ack
> notification filtering more easily using separate source ids (as is done
> in the first patch of the v8 series).

Just a thought: can filtering read and clear the irqfd counter?

>  As the code is today, I agree,
> there's probably no advantage to using multiple source IDs.  Thanks,
> 
> Alex

I think one point worth addressing is, Gleb wanted
to get eoifd without irqfd at all and that works for
timer interrupt.
Alex Williamson Aug. 14, 2012, 3:09 a.m. UTC | #31
On Tue, 2012-08-14 at 02:00 +0300, Michael S. Tsirkin wrote:
> On Mon, Aug 13, 2012 at 04:41:05PM -0600, Alex Williamson wrote:
> > On Tue, 2012-08-14 at 01:06 +0300, Michael S. Tsirkin wrote:
> > > On Mon, Aug 13, 2012 at 03:34:01PM -0600, Alex Williamson wrote:
> > > > On Sun, 2012-08-12 at 11:36 +0300, Avi Kivity wrote:
> > > > > On 08/09/2012 10:26 PM, Alex Williamson wrote:
> > > > > > On Mon, 2012-08-06 at 13:40 +0300, Avi Kivity wrote:
> > > > > >> On 08/06/2012 01:38 PM, Avi Kivity wrote:
> > > > > >> 
> > > > > >> > Regarding the implementation, instead of a linked list, would an array
> > > > > >> > of counters parallel to the bitmap make it simpler?
> > > > > >> 
> > > > > >> Or even, replace the bitmap with an array of counters.
> > > > > > 
> > > > > > I'm not sure a counter array is what we're really after.  That gives us
> > > > > > reference counting for the irq source IDs, but not the key->gsi lookup.
> > > > > 
> > > > > You can look up the gsi while registering the eoifd, so it's accessible
> > > > > as eoifd->gsi instead of eoifd->source->gsi.  The irqfd can go away
> > > > > while the eoifd is still active, but is this a problem?
> > > > 
> > > > In my opinion, no, but Michael disagrees.
> > > > 
> > > > > > It also highlights another issue, that we have a limited set of source
> > > > > > IDs.  Looks like we have BITS_PER_LONG IDs, with two already used, one
> > > > > > for the shared userspace ID and another for the PIT.  How happy are we
> > > > > > going to be with a limit of 62 level interrupts in use at one time?
> > > > > 
> > > > > When we start being unhappy we can increase that number.  On the other
> > > > > hand more locks and lists makes me unhappy now.
> > > > 
> > > > Yep, good point.  My latest version removes the source ID object lock
> > > > and list (and objects).  I still have a lock and list for the ack
> > > > notification, but it's hard not to unless we combine them into one
> > > > mega-irqfd ioctl as Michael suggests.
> > > >
> > > > > > It's arguably a reasonable number since the most virtualization friendly
> > > > > > devices (sr-iov VFs) don't even support this kind of interrupt.  It's
> > > > > > also very wasteful allocating an entire source ID for a single GSI
> > > > > > within that source ID.  PCI supports interrupts A, B, C, and D, which,
> > > > > > in the most optimal config, each go to different GSIs.  So we could
> > > > > > theoretically be more efficient in our use and allocation of irq source
> > > > > > IDs if we tracked use by the source ID, gsi pair.
> > > > > 
> > > > > There are, in one userspace, just three gsis available for PCI links, so
> > > > > you're compressing the source id space by 3.
> > > > 
> > > > I imagine there's a way to put each PCI interrupt pin on a GSI, but
> > > > still only 4, not a great expansion of source ID space.  I like
> > > > Michael's idea of re-using source IDs if we run out better.
> > > > 
> > > > > > That probably makes it less practical to replace anything at the top
> > > > > > level with a counter array.  The key that we pass back is currently the
> > > > > > actual source ID, but we don't specify what it is, so we could split it
> > > > > > and have it encode a 16bit source ID plus 16 bit GSI.  It could also be
> > > > > > an idr entry.
> > > > > 
> > > > > We can fix those kinds of problems by adding another layer of
> > > > > indirection.  But I doubt they will be needed.  I don't see people
> > > > > assigning 60 legacy devices to one guest.
> > > > 
> > > > Yep, we can ignore it for now and put it in the hands of userspace to
> > > > re-use IDs if needed.
> > > > 
> > > > > > Michael, would the interface be more acceptable to you if we added
> > > > > > separate ioctls to allocate and free some representation of an irq
> > > > > > source ID, gsi pair?  For instance, an ioctl might return an idr entry
> > > > > > for an irq source ID/gsi object which would then be passed as a
> > > > > > parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> > > > > > representing the source id/gsi isn't magically freed on its own.  This
> > > > > > would also allow us to deassign/close one end and reconfigure it later.
> > > > > > Thanks,
> > > > > 
> > > > > Another option is to push the responsibility for allocating IDs for the
> > > > > association to userspace.  Let userspace both create the irqfd and the
> > > > > eoifd with the same ID, the kernel matches them at registration time and
> > > > > copies the gsi/sourceid from the first to the second eventfd.
> > > > 
> > > > Aside from the copying gsi/sourceid bit, you've just described my latest
> > > > attempt at this series.  Specifying both a sourceid and gsi also allows
> > > > userspace to make better use of the sourceid address space (use more
> > > > than one gsi if userspace wants the complexity of managing them).
> > > > Thanks,
> > > > 
> > > > Alex
> > > 
> > > Turns out per device source ID is a bug copied from existing
> > > device assignment. I am amazed we did not notice before.
> > > There we have small # of devices so it's not a problem but there's no
> > > reason just not to have a source ID for all irqfds.
> > > So the problem goes away, and there is no limit on # of level irqfds,
> > > and no need to manage IDs in userspace at all.
> > > You can still have cookies in userspace if you like but do not map them
> > > to source IDs.
> > 
> > IMHO it's not a bug, it's an implementation decision.  They could be
> > shared, but that doesn't make it wrong to not share them.  Given that we
> > have 32 memory slots, the only way you could hit this would be to have a
> > lot of really slow devices that don't direct-map any BARs.  A reason to
> > not have the same source id for everything is that I think we can do ack
> > notification filtering more easily using separate source ids (as is done
> > in the first patch of the v8 series).
> 
> Just a thought: can filtering read and clear the irqfd counter?

Sorry, what's "the irqfd counter"?  The eventfd counter?  As I have it
in the patch series, the filtering happens where the irq ack notifier
calls the individual notifier callbacks.  That's not irqfd/eventfd
specific, so it doesn't have access to the eventfd counter there.
Taking the filtering into the actual callbacks seems to require
locking or maybe your proposed test and clear interface (which still
requires locking).

> >  As the code is today, I agree,
> > there's probably no advantage to using multiple source IDs.  Thanks,
> > 
> > Alex
> 
> I think one point worth addressing is, Gleb wanted
> to get eoifd without irqfd at all and that works for
> timer interrupt.

Right, that's what I'm referring to with the modular components vs
pulling eoifd into irqfd.  One gives us interfaces that can easily be
extended or already supports a more generic eoifd, the other gives us a
very specific use case and we'll have to come up with something else for
non-irqfd related eois.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Aug. 14, 2012, 8:35 a.m. UTC | #32
On Mon, Aug 13, 2012 at 09:09:43PM -0600, Alex Williamson wrote:
> On Tue, 2012-08-14 at 02:00 +0300, Michael S. Tsirkin wrote:
> > On Mon, Aug 13, 2012 at 04:41:05PM -0600, Alex Williamson wrote:
> > > On Tue, 2012-08-14 at 01:06 +0300, Michael S. Tsirkin wrote:
> > > > On Mon, Aug 13, 2012 at 03:34:01PM -0600, Alex Williamson wrote:
> > > > > On Sun, 2012-08-12 at 11:36 +0300, Avi Kivity wrote:
> > > > > > On 08/09/2012 10:26 PM, Alex Williamson wrote:
> > > > > > > On Mon, 2012-08-06 at 13:40 +0300, Avi Kivity wrote:
> > > > > > >> On 08/06/2012 01:38 PM, Avi Kivity wrote:
> > > > > > >> 
> > > > > > >> > Regarding the implementation, instead of a linked list, would an array
> > > > > > >> > of counters parallel to the bitmap make it simpler?
> > > > > > >> 
> > > > > > >> Or even, replace the bitmap with an array of counters.
> > > > > > > 
> > > > > > > I'm not sure a counter array is what we're really after.  That gives us
> > > > > > > reference counting for the irq source IDs, but not the key->gsi lookup.
> > > > > > 
> > > > > > You can look up the gsi while registering the eoifd, so it's accessible
> > > > > > as eoifd->gsi instead of eoifd->source->gsi.  The irqfd can go away
> > > > > > while the eoifd is still active, but is this a problem?
> > > > > 
> > > > > In my opinion, no, but Michael disagrees.
> > > > > 
> > > > > > > It also highlights another issue, that we have a limited set of source
> > > > > > > IDs.  Looks like we have BITS_PER_LONG IDs, with two already used, one
> > > > > > > for the shared userspace ID and another for the PIT.  How happy are we
> > > > > > > going to be with a limit of 62 level interrupts in use at one time?
> > > > > > 
> > > > > > When we start being unhappy we can increase that number.  On the other
> > > > > > hand more locks and lists makes me unhappy now.
> > > > > 
> > > > > Yep, good point.  My latest version removes the source ID object lock
> > > > > and list (and objects).  I still have a lock and list for the ack
> > > > > notification, but it's hard not to unless we combine them into one
> > > > > mega-irqfd ioctl as Michael suggests.
> > > > >
> > > > > > > It's arguably a reasonable number since the most virtualization friendly
> > > > > > > devices (sr-iov VFs) don't even support this kind of interrupt.  It's
> > > > > > > also very wasteful allocating an entire source ID for a single GSI
> > > > > > > within that source ID.  PCI supports interrupts A, B, C, and D, which,
> > > > > > > in the most optimal config, each go to different GSIs.  So we could
> > > > > > > theoretically be more efficient in our use and allocation of irq source
> > > > > > > IDs if we tracked use by the source ID, gsi pair.
> > > > > > 
> > > > > > There are, in one userspace, just three gsis available for PCI links, so
> > > > > > you're compressing the source id space by 3.
> > > > > 
> > > > > I imagine there's a way to put each PCI interrupt pin on a GSI, but
> > > > > still only 4, not a great expansion of source ID space.  I like
> > > > > Michael's idea of re-using source IDs if we run out better.
> > > > > 
> > > > > > > That probably makes it less practical to replace anything at the top
> > > > > > > level with a counter array.  The key that we pass back is currently the
> > > > > > > actual source ID, but we don't specify what it is, so we could split it
> > > > > > > and have it encode a 16bit source ID plus 16 bit GSI.  It could also be
> > > > > > > an idr entry.
> > > > > > 
> > > > > > We can fix those kinds of problems by adding another layer of
> > > > > > indirection.  But I doubt they will be needed.  I don't see people
> > > > > > assigning 60 legacy devices to one guest.
> > > > > 
> > > > > Yep, we can ignore it for now and put it in the hands of userspace to
> > > > > re-use IDs if needed.
> > > > > 
> > > > > > > Michael, would the interface be more acceptable to you if we added
> > > > > > > separate ioctls to allocate and free some representation of an irq
> > > > > > > source ID, gsi pair?  For instance, an ioctl might return an idr entry
> > > > > > > for an irq source ID/gsi object which would then be passed as a
> > > > > > > parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> > > > > > > representing the source id/gsi isn't magically freed on its own.  This
> > > > > > > would also allow us to deassign/close one end and reconfigure it later.
> > > > > > > Thanks,
> > > > > > 
> > > > > > Another option is to push the responsibility for allocating IDs for the
> > > > > > association to userspace.  Let userspace both create the irqfd and the
> > > > > > eoifd with the same ID, the kernel matches them at registration time and
> > > > > > copies the gsi/sourceid from the first to the second eventfd.
> > > > > 
> > > > > Aside from the copying gsi/sourceid bit, you've just described my latest
> > > > > attempt at this series.  Specifying both a sourceid and gsi also allows
> > > > > userspace to make better use of the sourceid address space (use more
> > > > > than one gsi if userspace wants the complexity of managing them).
> > > > > Thanks,
> > > > > 
> > > > > Alex
> > > > 
> > > > Turns out per device source ID is a bug copied from existing
> > > > device assignment. I am amazed we did not notice before.
> > > > There we have small # of devices so it's not a problem but there's no
> > > > reason just not to have a source ID for all irqfds.
> > > > So the problem goes away, and there is no limit on # of level irqfds,
> > > > and no need to manage IDs in userspace at all.
> > > > You can still have cookies in userspace if you like but do not map them
> > > > to source IDs.
> > > 
> > > IMHO it's not a bug, it's an implementation decision.  They could be
> > > shared, but that doesn't make it wrong to not share them.  Given that we
> > > have 32 memory slots, the only way you could hit this would be to have a
> > > lot of really slow devices that don't direct-map any BARs.  A reason to
> > > not have the same source id for everything is that I think we can do ack
> > > notification filtering more easily using separate source ids (as is done
> > > in the first patch of the v8 series).
> > 
> > Just a thought: can filtering read and clear the irqfd counter?
> 
> Sorry, what's "the irqfd counter"?  The eventfd counter?  As I have it
> in the patch series, the filtering happens where the irq ack notifier
> calls the individual notifier callbacks.  That's not irqfd/eventfd
> specific, so it doesn't have access to the eventfd counter there.
> Taking the filtering into the actual callbacks seems to require
> locking or maybe your proposed test and clear interface (which still
> requires locking).
> 
> > >  As the code is today, I agree,
> > > there's probably no advantage to using multiple source IDs.  Thanks,
> > > 
> > > Alex
> > 
> > I think one point worth addressing is, Gleb wanted
> > to get eoifd without irqfd at all and that works for
> > timer interrupt.
> 
> Right, that's what I'm referring to with the modular components vs
> pulling eoifd into irqfd.  One gives us interfaces that can easily be
> extended or already supports a more generic eoifd, the other gives us a
> very specific use case and we'll have to come up with something else for
> non-irqfd related eois.  Thanks,
> 
> Alex

Yes, that is fine, but previous versions tied eoifd to irqfd,
so they were not useful alone anyway. Will look at v8.
Gleb Natapov Aug. 14, 2012, 10:10 a.m. UTC | #33
On Tue, Aug 14, 2012 at 01:52:13AM +0300, Michael S. Tsirkin wrote:
> > Using the EOI as a trigger to de-assert and potentially re-assert may be
> > a hack, but it's about as close as we can come to following the behavior
> > of hardware.
> > It's actually quite similar to an apic re-sampling inputs
> > except we don't have a physical line to read and see that it's still
> > asserted.  We emulate this by de-asserting it and letting it re-assert
> > if necessary.  The emulation to the guest isn't perfect, but it's a lot
> > closer than immediately de-asserting the pin.
> > I think the discussion
> > below describes why I do this versus something that might be even closer
> > to actual hardware.
> 
> Sorry I don't understand what "quite similar" means.  If deassert on ack
> is "closer" somehow show me some software that needs it.
> 
This is the incorrect question to ask. The correct one is "is there a guest
visible effect" and the answer is yes. If the guest reads ioapic irr before
eoi it will incorrectly read zero. Now that we know what the guest visible
effect is, we can think about whether we can live with it. But it looks like
we can't, since this has more serious implications. If the interrupt is masked
in the ioapic during irq delivery, it will never be delivered after unmask.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Aug. 14, 2012, 10:13 a.m. UTC | #34
On Tue, Aug 14, 2012 at 01:10:15PM +0300, Gleb Natapov wrote:
> On Tue, Aug 14, 2012 at 01:52:13AM +0300, Michael S. Tsirkin wrote:
> > > Using the EOI as a trigger to de-assert and potentially re-assert may be
> > > a hack, but it's about as close as we can come to following the behavior
> > > of hardware.
> > > It's actually quite similar to an apic re-sampling inputs
> > > except we don't have a physical line to read and see that it's still
> > > asserted.  We emulate this by de-asserting it and letting it re-assert
> > > if necessary.  The emulation to the guest isn't perfect, but it's a lot
> > > closer than immediately de-asserting the pin.
> > > I think the discussion
> > > below describes why I do this versus something that might be even closer
> > > to actual hardware.
> > 
> > Sorry I don't understand what "quite similar" means.  If deassert on ack
> > is "closer" somehow show me some software that needs it.
> > 
> This is incorrect question to ask. The correct one is "is there guest
> visible effect" and the answer is yes. If guest reads ioapic irr before
> eoi it will incorrectly read zero. Now when we know what is guest visible
> effect we can think about whether we can live with it. But it looks like
> we can't since this have more serious implications. If interrupt is masked
> in ioapic during irq delivery interrupt will be never delivered after unmask.
> 
This can probably be solved using mask notifiers, although I'd rather delete
them than add new users.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 14, 2012, 12:35 p.m. UTC | #35
On 08/12/2012 12:33 PM, Michael S. Tsirkin wrote:
>> 
>> Michael, would the interface be more acceptable to you if we added
>> separate ioctls to allocate and free some representation of an irq
>> source ID, gsi pair?  For instance, an ioctl might return an idr entry
>> for an irq source ID/gsi object which would then be passed as a
>> parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
>> representing the source id/gsi isn't magically freed on its own.  This
>> would also allow us to deassign/close one end and reconfigure it later.
>> Thanks,
>> 
>> Alex
> 
> It's acceptable to me either way. I was only pointing out that as
> designed, the interface looks simple at first but then you find out some
> subtle limitations which are implementation driven. This gives
> an overall feeling the abstraction is too low level.
> 
> If we compare to the existing irqfd, isn't the difference
> simply that irqfd deasserts immediately ATM, while we
> want to delay this until later?
> 
> If yes, then along the lines that you proposed, and combining with my
> idea of tracking deasserts, how do you like the following:
> 
> /* Keep line asserted until guest has handled the interrupt. */
> #define KVM_IRQFD_FLAG_DEASSERT_ON_ACK (1 << 1)
> /* Notify after line is deasserted. */
> #define KVM_IRQFD_FLAG_DEASSERT_EVENTFD (2 << 1)
> 
> 	struct kvm_irqfd {
> 		__u32 fd;
> 		__u32 gsi;
> 		__u32 flags;
> 		/* eventfd to notify when line is deasserted */
> 		__u32 deassert_eventfd;
> 		__u8  pad[16];
> 	};
> 
> now the only limitation is that KVM_IRQFD_FLAG_DEASSERT_ON_ACK is only
> effective for level interrupts.
> 
> Notes about lifetime of objects:
> 	- closing deassert_eventfd does nothing (we can keep
> 	  reference to it from irqfd so no need for
>           complex polling/flushing scheme)
> 	- closing irqfd or deasserting dis-associates
> 	  deassert_eventfd automatically
> 	- source id is internal to irqfd and goes away with it
> 
> it looks harder to misuse and fits what we want to do nicely,
> and needs less code to implement.
> 
> Avi, what do you think?

I think given all the complexity in the separate ioctl approach that
this makes sense.  There are no lifetime issues or code to match the two
eventfds.  Alex, would this API simplify the code?

Yet another option was raised in the past, and that was exiling ioapic
and pic to userspace.  This moves the entire issue to userspace.  The
cost is a new interface that implements the APIC bus (between APIC and
IOAPIC) and the INTACK sequence (between APIC and PIC), and potential
for performance regressions due to the PIC, IOAPIC, and PIT being in
userspace.  We would still have to keep the IOAPIC/PIC in the kernel,
but no new features would be added.

However, this is a huge job.  We could discuss this to death too but I
have the feeling the end result will be to choose the shorter path --
adding irqackfd/deassertfd/whateverwecallitfd.
Michael S. Tsirkin Aug. 14, 2012, 2:50 p.m. UTC | #36
On Tue, Aug 14, 2012 at 03:35:54PM +0300, Avi Kivity wrote:
> On 08/12/2012 12:33 PM, Michael S. Tsirkin wrote:
> >> 
> >> Michael, would the interface be more acceptable to you if we added
> >> separate ioctls to allocate and free some representation of an irq
> >> source ID, gsi pair?  For instance, an ioctl might return an idr entry
> >> for an irq source ID/gsi object which would then be passed as a
> >> parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> >> representing the source id/gsi isn't magically freed on its own.  This
> >> would also allow us to deassign/close one end and reconfigure it later.
> >> Thanks,
> >> 
> >> Alex
> > 
> > It's acceptable to me either way. I was only pointing out that as
> > designed, the interface looks simple at first but then you find out some
> > subtle limitations which are implementation driven. This gives
> > an overall feeling the abstraction is too low level.
> > 
> > If we compare to the existing irqfd, isn't the difference
> > simply that irqfd deasserts immediately ATM, while we
> > want to delay this until later?
> > 
> > If yes, then along the lines that you proposed, and combining with my
> > idea of tracking deasserts, how do you like the following:
> > 
> > /* Keep line asserted until guest has handled the interrupt. */
> > #define KVM_IRQFD_FLAG_DEASSERT_ON_ACK (1 << 1)
> > /* Notify after line is deasserted. */
> > #define KVM_IRQFD_FLAG_DEASSERT_EVENTFD (2 << 1)
> > 
> > 	struct kvm_irqfd {
> > 		__u32 fd;
> > 		__u32 gsi;
> > 		__u32 flags;
> > 		/* eventfd to notify when line is deasserted */
> > 		__u32 deassert_eventfd;
> > 		__u8  pad[16];
> > 	};
> > 
> > now the only limitation is that KVM_IRQFD_FLAG_DEASSERT_ON_ACK is only
> > effective for level interrupts.
> > 
> > Notes about lifetime of objects:
> > 	- closing deassert_eventfd does nothing (we can keep
> > 	  reference to it from irqfd so no need for
> >           complex polling/flushing scheme)
> > 	- closing irqfd or deasserting dis-associates
> > 	  deassert_eventfd automatically
> > 	- source id is internal to irqfd and goes away with it
> > 
> > it looks harder to misuse and fits what we want to do nicely,
> > and needs less code to implement.
> > 
> > Avi, what do you think?
> 
> I think given all the complexity in the separate ioctl approach that
> this makes sense.  There are no lifetime issues or code to match the two
> eventfds.

OK, it's fine with me too then. Please disregard my earlier proposal to
deassert immediately; Gleb showed me it does not work.

> Alex, would this API simplify the code?
> 
> Yet another option was raised in the past, and that was exiling ioapic
> and pic to userspace.  This moves the entire issue to userspace.  The
> cost is a new interface that implements the APIC bus (between APIC and
> IOAPIC) and the INTACK sequence (between APIC and PIC), and potential
> for performance regressions due to the PIC, IOAPIC, and PIT being in
> userspace.  We would still have to keep the IOAPIC/PIC in the kernel,
> but no new features would be added.
> 
> However, this is a huge job.  We could discuss this to death too but I
> have the feeling the end result will be to choose the shorter path --
> adding irqackfd/deassertfd/whateverwecallitfd.
> 
> 
> -- 
> error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Aug. 14, 2012, 9:28 p.m. UTC | #37
On Tue, 2012-08-14 at 11:35 +0300, Michael S. Tsirkin wrote:
> On Mon, Aug 13, 2012 at 09:09:43PM -0600, Alex Williamson wrote:
> > On Tue, 2012-08-14 at 02:00 +0300, Michael S. Tsirkin wrote:
> > > On Mon, Aug 13, 2012 at 04:41:05PM -0600, Alex Williamson wrote:
> > > > On Tue, 2012-08-14 at 01:06 +0300, Michael S. Tsirkin wrote:
> > > > > On Mon, Aug 13, 2012 at 03:34:01PM -0600, Alex Williamson wrote:
> > > > > > On Sun, 2012-08-12 at 11:36 +0300, Avi Kivity wrote:
> > > > > > > On 08/09/2012 10:26 PM, Alex Williamson wrote:
> > > > > > > > On Mon, 2012-08-06 at 13:40 +0300, Avi Kivity wrote:
> > > > > > > >> On 08/06/2012 01:38 PM, Avi Kivity wrote:
> > > > > > > >> 
> > > > > > > >> > Regarding the implementation, instead of a linked list, would an array
> > > > > > > >> > of counters parallel to the bitmap make it simpler?
> > > > > > > >> 
> > > > > > > >> Or even, replace the bitmap with an array of counters.
> > > > > > > > 
> > > > > > > > I'm not sure a counter array is what we're really after.  That gives us
> > > > > > > > reference counting for the irq source IDs, but not the key->gsi lookup.
> > > > > > > 
> > > > > > > You can look up the gsi while registering the eoifd, so it's accessible
> > > > > > > as eoifd->gsi instead of eoifd->source->gsi.  The irqfd can go away
> > > > > > > while the eoifd is still active, but is this a problem?
> > > > > > 
> > > > > > In my opinion, no, but Michael disagrees.
> > > > > > 
> > > > > > > > It also highlights another issue, that we have a limited set of source
> > > > > > > > IDs.  Looks like we have BITS_PER_LONG IDs, with two already used, one
> > > > > > > > for the shared userspace ID and another for the PIT.  How happy are we
> > > > > > > > going to be with a limit of 62 level interrupts in use at one time?
> > > > > > > 
> > > > > > > When we start being unhappy we can increase that number.  On the other
> > > > > > > hand more locks and lists makes me unhappy now.
> > > > > > 
> > > > > > Yep, good point.  My latest version removes the source ID object lock
> > > > > > and list (and objects).  I still have a lock and list for the ack
> > > > > > notification, but it's hard not to unless we combine them into one
> > > > > > mega-irqfd ioctl as Michael suggests.
> > > > > >
> > > > > > > > It's arguably a reasonable number since the most virtualization friendly
> > > > > > > > devices (sr-iov VFs) don't even support this kind of interrupt.  It's
> > > > > > > > also very wasteful allocating an entire source ID for a single GSI
> > > > > > > > within that source ID.  PCI supports interrupts A, B, C, and D, which,
> > > > > > > > in the most optimal config, each go to different GSIs.  So we could
> > > > > > > > theoretically be more efficient in our use and allocation of irq source
> > > > > > > > IDs if we tracked use by the source ID, gsi pair.
> > > > > > > 
> > > > > > > There are, in one userspace, just three gsis available for PCI links, so
> > > > > > > you're compressing the source id space by 3.
> > > > > > 
> > > > > > I imagine there's a way to put each PCI interrupt pin on a GSI, but
> > > > > > still only 4, not a great expansion of source ID space.  I like
> > > > > > Michael's idea of re-using source IDs if we run out better.
> > > > > > 
> > > > > > > > That probably makes it less practical to replace anything at the top
> > > > > > > > level with a counter array.  The key that we pass back is currently the
> > > > > > > > actual source ID, but we don't specify what it is, so we could split it
> > > > > > > > and have it encode a 16bit source ID plus 16 bit GSI.  It could also be
> > > > > > > > an idr entry.
> > > > > > > 
> > > > > > > We can fix those kinds of problems by adding another layer of
> > > > > > > indirection.  But I doubt they will be needed.  I don't see people
> > > > > > > assigning 60 legacy devices to one guest.
> > > > > > 
> > > > > > Yep, we can ignore it for now and put it in the hands of userspace to
> > > > > > re-use IDs if needed.
> > > > > > 
> > > > > > > > Michael, would the interface be more acceptable to you if we added
> > > > > > > > separate ioctls to allocate and free some representation of an irq
> > > > > > > > source ID, gsi pair?  For instance, an ioctl might return an idr entry
> > > > > > > > for an irq source ID/gsi object which would then be passed as a
> > > > > > > > parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> > > > > > > > representing the source id/gsi isn't magically freed on its own.  This
> > > > > > > > would also allow us to deassign/close one end and reconfigure it later.
> > > > > > > > Thanks,
> > > > > > > 
> > > > > > > Another option is to push the responsibility for allocating IDs for the
> > > > > > > association to userspace.  Let userspace both create the irqfd and the
> > > > > > > eoifd with the same ID, the kernel matches them at registration time and
> > > > > > > copies the gsi/sourceid from the first to the second eventfd.
> > > > > > 
> > > > > > Aside from the copying gsi/sourceid bit, you've just described my latest
> > > > > > attempt at this series.  Specifying both a sourceid and gsi also allows
> > > > > > userspace to make better use of the sourceid address space (use more
> > > > > > than one gsi if userspace wants the complexity of managing them).
> > > > > > Thanks,
> > > > > > 
> > > > > > Alex
> > > > > 
> > > > > Turns out per device source ID is a bug copied from existing
> > > > > device assignment. I am amazed we did not notice before.
> > > > > There we have small # of devices so it's not a problem but there's no
> > > > > reason just not to have a source ID for all irqfds.
> > > > > So the problem goes away, and there is no limit on # of level irqfds,
> > > > > and no need to manage IDs in userspace at all.
> > > > > You can still have cookies in userspace if you like but do not map them
> > > > > to source IDs.
> > > > 
> > > > IMHO it's not a bug, it's an implementation decision.  They could be
> > > > shared, but that doesn't make it wrong to not share them.  Given that we
> > > > have 32 memory slots, the only way you could hit this would be to have a
> > > > lot of really slow devices that don't direct-map any BARs.  A reason to
> > > > not have the same source id for everything is that I think we can do ack
> > > > notification filtering more easily using separate source ids (as is done
> > > > in the first patch of the v8 series).
> > > 
> > > Just a thought: can filtering read and clear the irqfd counter?
> > 
> > Sorry, what's "the irqfd counter"?  The eventfd counter?  As I have it
> > in the patch series, the filtering happens where the irq ack notifier
> > calls the individual notifier callbacks.  That's not irqfd/eventfd
> > specific, so it doesn't have access to the eventfd counter there.
> > Taking the filtering into the actual callbacks seems to require
> > locking or maybe your proposed test and clear interface (which still
> > requires locking).
> > 
> > > >  As the code is today, I agree,
> > > > there's probably no advantage to using multiple source IDs.  Thanks,
> > > > 
> > > > Alex
> > > 
> > > I think one point worth addressing is, Gleb wanted
> > > to get eoifd without irqfd at all and that works for
> > > timer interrupt.
> > 
> > Right, that's what I'm referring to with the modular components vs
> > pulling eoifd into irqfd.  One gives us interfaces that can easily be
> > extended or already supports a more generic eoifd, the other gives us a
> > very specific use case and we'll have to come up with something else for
> > non-irqfd related eois.  Thanks,
> > 
> > Alex
> 
> Yes that is fine but previous versions tied eoifd to irqfd
> so were not useful alone anyway. Will look at v8.

This is because earlier feedback rejected creating a version of the
ioctl that had no users.  It was only a matter of adding a flag to
indicate kvm_eoifd.key was actually a gsi and some trivial code changes
to enable such an interface.  v8 builds the interface in the other
direction, so I left the notify-only, untied version as the base.
Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Aug. 14, 2012, 10:01 p.m. UTC | #38
On Tue, 2012-08-14 at 15:35 +0300, Avi Kivity wrote:
> On 08/12/2012 12:33 PM, Michael S. Tsirkin wrote:
> >> 
> >> Michael, would the interface be more acceptable to you if we added
> >> separate ioctls to allocate and free some representation of an irq
> >> source ID, gsi pair?  For instance, an ioctl might return an idr entry
> >> for an irq source ID/gsi object which would then be passed as a
> >> parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> >> representing the source id/gsi isn't magically freed on its own.  This
> >> would also allow us to deassign/close one end and reconfigure it later.
> >> Thanks,
> >> 
> >> Alex
> > 
> > It's acceptable to me either way. I was only pointing out that as
> > designed, the interface looks simple at first but then you find out some
> > subtle limitations which are implementation driven. This gives
> > an overall feeling the abstraction is too low level.
> > 
> > If we compare to the existing irqfd, isn't the difference
> > simply that irqfd deasserts immediately ATM, while we
> > want to delay this until later?
> > 
> > If yes, then along the lines that you proposed, and combining with my
> > idea of tracking deasserts, how do you like the following:
> > 
> > /* Keep line asserted until guest has handled the interrupt. */
> > #define KVM_IRQFD_FLAG_DEASSERT_ON_ACK (1 << 1)
> > /* Notify after line is deasserted. */
> > #define KVM_IRQFD_FLAG_DEASSERT_EVENTFD (2 << 1)
> > 
> > 	struct kvm_irqfd {
> > 		__u32 fd;
> > 		__u32 gsi;
> > 		__u32 flags;
> > 		/* eventfd to notify when line is deasserted */
> > 		__u32 deassert_eventfd;
> > 		__u8  pad[16];
> > 	};
> > 
> > now the only limitation is that KVM_IRQFD_FLAG_DEASSERT_ON_ACK is only
> > effective for level interrupts.
> > 
> > Notes about lifetime of objects:
> > 	- closing deassert_eventfd does nothing (we can keep
> > 	  reference to it from irqfd so no need for
> >           complex polling/flushing scheme)
> > 	- closing irqfd or deasserting dis-associates
> > 	  deassert_eventfd automatically
> > 	- source id is internal to irqfd and goes away with it
> > 
> > it looks harder to misuse and fits what we want to do nicely,
> > and needs less code to implement.
> > 
> > Avi, what do you think?
> 
> I think given all the complexity in the separate ioctl approach that
> this makes sense.  There are no lifetime issues or code to match the two
> eventfds.  Alex, would this API simplify the code?

It does though I'm concerned that it's a very specific solution that
only addresses this problem.  Generic userspace eoi/ack is not
addressed.  The latest version using separate ioctls does a lot of
simplification by exposing irq sourceids.  The bulk of the code there is
duplicating what irqfd does just so we can catch the POLLHUP for
cleanup.  If there were an easier way to do that (we don't care about
POLLIN/POLLOUT), much of the code could be removed.  Alternatively we
could make some common infrastructure to simplify both irqfd and
irq_ackfd, but how to frame the helpers isn't easy.

> Yet another option was raised in the past, and that was exiling ioapic
> and pic to userspace.  This moves the entire issue to userspace.  The
> cost is a new interface that implements the APIC bus (between APIC and
> IOAPIC) and the INTACK sequence (between APIC and PIC), and potential
> for performance regressions due to the PIC, IOAPIC, and PIT being in
> userspace.  We would still have to keep the IOAPIC/PIC in the kernel,
> but no new features would be added.

Doesn't this assure a performance regression or are we assuming anywhere
we care about performance we're using MSI?  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Aug. 14, 2012, 11:04 p.m. UTC | #39
On Tue, Aug 14, 2012 at 04:01:15PM -0600, Alex Williamson wrote:
> On Tue, 2012-08-14 at 15:35 +0300, Avi Kivity wrote:
> > On 08/12/2012 12:33 PM, Michael S. Tsirkin wrote:
> > >> 
> > >> Michael, would the interface be more acceptable to you if we added
> > >> separate ioctls to allocate and free some representation of an irq
> > >> source ID, gsi pair?  For instance, an ioctl might return an idr entry
> > >> for an irq source ID/gsi object which would then be passed as a
> > >> parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> > >> representing the source id/gsi isn't magically freed on its own.  This
> > >> would also allow us to deassign/close one end and reconfigure it later.
> > >> Thanks,
> > >> 
> > >> Alex
> > > 
> > > It's acceptable to me either way. I was only pointing out that as
> > > designed, the interface looks simple at first but then you find out some
> > > subtle limitations which are implementation driven. This gives
> > > an overall feeling the abstraction is too low level.
> > > 
> > > If we compare to the existing irqfd, isn't the difference
> > > simply that irqfd deasserts immediately ATM, while we
> > > want to delay this until later?
> > > 
> > > If yes, then along the lines that you proposed, and combining with my
> > > idea of tracking deasserts, how do you like the following:
> > > 
> > > /* Keep line asserted until guest has handled the interrupt. */
> > > #define KVM_IRQFD_FLAG_DEASSERT_ON_ACK (1 << 1)
> > > /* Notify after line is deasserted. */
> > > #define KVM_IRQFD_FLAG_DEASSERT_EVENTFD (2 << 1)
> > > 
> > > 	struct kvm_irqfd {
> > > 		__u32 fd;
> > > 		__u32 gsi;
> > > 		__u32 flags;
> > > 		/* eventfd to notify when line is deasserted */
> > > 		__u32 deassert_eventfd;
> > > 		__u8  pad[16];
> > > 	};
> > > 
> > > now the only limitation is that KVM_IRQFD_FLAG_DEASSERT_ON_ACK is only
> > > effective for level interrupts.
> > > 
> > > Notes about lifetime of objects:
> > > 	- closing deassert_eventfd does nothing (we can keep
> > > 	  reference to it from irqfd so no need for
> > >           complex polling/flushing scheme)
> > > 	- closing irqfd or deasserting dis-associates
> > > 	  deassert_eventfd automatically
> > > 	- source id is internal to irqfd and goes away with it
> > > 
> > > it looks harder to misuse and fits what we want to do nicely,
> > > and needs less code to implement.
> > > 
> > > Avi, what do you think?
> > 
> > I think given all the complexity in the separate ioctl approach that
> > this makes sense.  There are no lifetime issues or code to match the two
> > eventfds.  Alex, would this API simplify the code?
> 
> It does though I'm concerned that it's a very specific solution that
> only addresses this problem.  Generic userspace eoi/ack is not
> addressed.  The latest version using separate ioctls does a lot of
> simplification by exposing irq sourceids.  The bulk of the code there is
> duplicating what irqfd does just so we can catch the POLLHUP for
> cleanup.  If there was an easier way to do that, we don't care about
> POLLIN/POLLOUT, much of the code could be removed.  Alternatively we
> could make some common infrastructure to simplify both irqfd and
> irq_ackfd, but how to frame the helpers isn't easy.

There is a much easier way with a single ioctl.  Don't you see?

As ack_eventfd pointer becomes part of the irqfd structure now, you
simply drop the reference together with irqfd.
In other words you do not care that ack eventfd goes
away anymore. So no need for POLLHUP handlers, no
separate DEASSERT that can race with that, etc.

So all this code just goes away, and it goes away completely, together
with managing source IDs (source ID becomes an internal optimization to
avoid spurious EOIs, so no need to expose it to userspace anymore).

So all we are left with is minimal:
1. change irqfds to use a separate source id (can do this
   unconditionally for all irqfds)
2. check deassert on ack, if set register ack notifier
3. in ack notifier check deassert eventfd, if set signal it
4. (optionally) add a flag in irqfd, set on assert, test and clear
   on deassert, and only signal eventfd if it was set

on top of that we could try to do
5. allocate some more source IDs and if they are free try to use them as
   an optimization to avoid atomics


> > Yet another option was raised in the past, and that was exiling ioapic
> > and pic to userspace.  This moves the entire issue to userspace.  The
> > cost is a new interface that implements the APIC bus (between APIC and
> > IOAPIC) and the INTACK sequence (between APIC and PIC), and potential
> > for performance regressions due to the PIC, IOAPIC, and PIT being in
> > userspace.  We would still have to keep the IOAPIC/PIC in the kernel,
> > but no new features would be added.
> 
> Doesn't this assure a performance regression or are we assuming anywhere
> we care about performance we're using MSI?  Thanks,
> 
> Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Aug. 14, 2012, 11:26 p.m. UTC | #40
On Wed, 2012-08-15 at 02:04 +0300, Michael S. Tsirkin wrote:
> On Tue, Aug 14, 2012 at 04:01:15PM -0600, Alex Williamson wrote:
> > On Tue, 2012-08-14 at 15:35 +0300, Avi Kivity wrote:
> > > On 08/12/2012 12:33 PM, Michael S. Tsirkin wrote:
> > > >> 
> > > >> Michael, would the interface be more acceptable to you if we added
> > > >> separate ioctls to allocate and free some representation of an irq
> > > >> source ID, gsi pair?  For instance, an ioctl might return an idr entry
> > > >> for an irq source ID/gsi object which would then be passed as a
> > > >> parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> > > >> representing the source id/gsi isn't magically freed on its own.  This
> > > >> would also allow us to deassign/close one end and reconfigure it later.
> > > >> Thanks,
> > > >> 
> > > >> Alex
> > > > 
> > > > It's acceptable to me either way. I was only pointing out that as
> > > > designed, the interface looks simple at first but then you find out some
> > > > subtle limitations which are implementation driven. This gives
> > > > an overall feeling the abstraction is too low level.
> > > > 
> > > > If we compare to the existing irqfd, isn't the difference
> > > > simply that irqfd deasserts immediately ATM, while we
> > > > want to delay this until later?
> > > > 
> > > > If yes, then along the lines that you proposed, and combining with my
> > > > idea of tracking deasserts, how do you like the following:
> > > > 
> > > > /* Keep line asserted until guest has handled the interrupt. */
> > > > #define KVM_IRQFD_FLAG_DEASSERT_ON_ACK (1 << 1)
> > > > /* Notify after line is deasserted. */
> > > > #define KVM_IRQFD_FLAG_DEASSERT_EVENTFD (2 << 1)
> > > > 
> > > > 	struct kvm_irqfd {
> > > > 		__u32 fd;
> > > > 		__u32 gsi;
> > > > 		__u32 flags;
> > > > 		/* eventfd to notify when line is deasserted */
> > > > 		__u32 deassert_eventfd;
> > > > 		__u8  pad[16];
> > > > 	};
> > > > 
> > > > now the only limitation is that KVM_IRQFD_FLAG_DEASSERT_ON_ACK is only
> > > > effective for level interrupts.
> > > > 
> > > > Notes about lifetime of objects:
> > > > 	- closing deassert_eventfd does nothing (we can keep
> > > > 	  reference to it from irqfd so no need for
> > > >           complex polling/flushing scheme)
> > > > 	- closing irqfd or deasserting dis-associates
> > > > 	  deassert_eventfd automatically
> > > > 	- source id is internal to irqfd and goes away with it
> > > > 
> > > > it looks harder to misuse and fits what we want to do nicely,
> > > > and needs less code to implement.
> > > > 
> > > > Avi, what do you think?
> > > 
> > > I think given all the complexity in the separate ioctl approach that
> > > this makes sense.  There are no lifetime issues or code to match the two
> > > eventfds.  Alex, would this API simplify the code?
> > 
> > It does though I'm concerned that it's a very specific solution that
> > only addresses this problem.  Generic userspace eoi/ack is not
> > addressed.  The latest version using separate ioctls does a lot of
> > simplification by exposing irq sourceids.  The bulk of the code there is
> > duplicating what irqfd does just so we can catch the POLLHUP for
> > cleanup.  If there was an easier way to do that, we don't care about
> > POLLIN/POLLOUT, much of the code could be removed.  Alternatively we
> > could make some common infrastructure to simplify both irqfd and
> > irq_ackfd, but how to frame the helpers isn't easy.
> 
> There is a much easier way with a single ioctl.  Don't you see?
> 
> As ack_eventfd pointer becomes part of the irqfd structure now, you
> simply drop the reference together with irqfd.
> In other words you do not care that ack eventfd goes
> away anymore. So no need for POLLHUP handlers, no
> separate DEASSERT that can race with that, etc.
> 
> So all this code just goes away, and it goes away completely, together
> with managing source IDs (source ID becomes an internal optimization to
> avoid spurious EOIs, so no need to expose it to userspace anymore).
> 
> So all we are left with is minimal:
> 1. change irqfds to use a separate source id (can do this
>    unconditionally for all irqfds)
> 2. check deassert on ack, if set register ack notifier
> 3. in ack notifier check deassert eventfd, if set signal it
> 4. (optionally) add a flag in irqfd, set on assert, test and clear
>    on deassert, and only signal eventfd if it was set
> 
> on top of that we could try to do
> 5. allocate some more source IDs and if they are free try to use them as
>    an optimization to avoid atomics

Yes, I understand.  It's simple, it's also very specific to this
problem, and doesn't address generic ack notification.  All of which
I've noted before and I continue to note that v8 offers simplifications
while retaining flexibility.  Least amount of code doesn't really buy us
much if we end up needing to invent new interfaces down the road because
we've created such a specific solution here.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 15, 2012, 1:09 p.m. UTC | #41
On 08/15/2012 02:26 AM, Alex Williamson wrote:
> 
> Yes, I understand.  It's simple, it's also very specific to this
> problem, and doesn't address generic ack notification.  All of which
> I've noted before and I continue to note that v8 offers simplifications
> while retaining flexibility.  Least amount of code doesn't really buy us
> much if we end up needing to invent new interfaces down the road because
> we've created such a specific solution here.  Thanks,
> 

One side of the coin is trying to create one generic interface instead
of multiple specific interfaces.  The other side is that by providing a
generic interface, you sometimes expose internal implementation details,
or you constrain future development in order to preserve that interface.
 If the generic interface is not actually exploited, you get pain for no
gain.

This tradeoff is different for every feature.  Right now I'm leaning
towards specialized interfaces here, because we expose quite a lot of
low-level detail.  However I'll review v8 soon and see.
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 3911e62..8cd6b36 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1989,6 +1989,27 @@  return the hash table order in the parameter.  (If the guest is using
 the virtualized real-mode area (VRMA) facility, the kernel will
 re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
 
+4.77 KVM_EOIFD
+
+Capability: KVM_CAP_EOIFD
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_eoifd (in)
+Returns: 0 on success, < 0 on error
+
+KVM_EOIFD allows userspace to receive interrupt EOI notification
+through an eventfd.  kvm_eoifd.fd specifies the eventfd used for
+notification.  KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd
+once assigned.  KVM_EOIFD also requires additional bits set in
+kvm_eoifd.flags to bind to the proper interrupt line.  The
+KVM_EOIFD_FLAG_LEVEL_IRQFD indicates that kvm_eoifd.key is provided
+and is a key from a level triggered interrupt (configured from
+KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL).  The EOI notification is bound
+to the same GSI and irqchip input as the irqfd.  Both kvm_eoifd.key
+and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on assignment and
+de-assignment of KVM_EOIFD.  A level irqfd may only be bound to a
+single eoifd.  KVM_CAP_EOIFD_LEVEL_IRQFD indicates support of
+KVM_EOIFD_FLAG_LEVEL_IRQFD.
 
 5. The kvm_run structure
 ------------------------
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9ded39d..8f3164e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2171,6 +2171,8 @@  int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PCI_2_3:
 	case KVM_CAP_KVMCLOCK_CTRL:
 	case KVM_CAP_IRQFD_LEVEL:
+	case KVM_CAP_EOIFD:
+	case KVM_CAP_EOIFD_LEVEL_IRQFD:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index b2e6e4f..effb916 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -619,6 +619,8 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_COW 79
 #define KVM_CAP_PPC_ALLOC_HTAB 80
 #define KVM_CAP_IRQFD_LEVEL 81
+#define KVM_CAP_EOIFD 82
+#define KVM_CAP_EOIFD_LEVEL_IRQFD 83
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -694,6 +696,17 @@  struct kvm_irqfd {
 	__u8  pad[20];
 };
 
+#define KVM_EOIFD_FLAG_DEASSIGN (1 << 0)
+/* Available with KVM_CAP_EOIFD_LEVEL_IRQFD */
+#define KVM_EOIFD_FLAG_LEVEL_IRQFD (1 << 1)
+
+struct kvm_eoifd {
+	__u32 fd;
+	__u32 flags;
+	__u32 key;
+	__u8 pad[20];
+};
+
 struct kvm_clock_data {
 	__u64 clock;
 	__u32 flags;
@@ -834,6 +847,8 @@  struct kvm_s390_ucas_mapping {
 #define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
 /* Available with KVM_CAP_PPC_ALLOC_HTAB */
 #define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
+/* Available with KVM_CAP_EOIFD */
+#define KVM_EOIFD                 _IOW(KVMIO,  0xa8, struct kvm_eoifd)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c73f071..01e72a6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -289,6 +289,10 @@  struct kvm {
 		struct mutex lock;
 		struct list_head items;
 	} irqsources;
+	struct {
+		spinlock_t lock;
+		struct list_head items;
+	} eoifds;
 #endif
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
@@ -832,6 +836,8 @@  int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
 void kvm_irqfd_release(struct kvm *kvm);
 void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
 int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
+int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args);
+void kvm_eoifd_release(struct kvm *kvm);
 
 #else
 
@@ -857,6 +863,13 @@  static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 	return -ENOSYS;
 }
 
+static inline int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
+{
+	return -ENOSYS;
+}
+
+static inline void kvm_eoifd_release(struct kvm *kvm) {}
+
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 878cb52..3aa2d62 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -95,6 +95,25 @@  static struct _irq_source *_irq_source_alloc(struct kvm *kvm, int gsi)
 	return source;
 }
 
+static struct _irq_source *_irq_source_get_from_key(struct kvm *kvm, int key)
+{
+	struct _irq_source *tmp, *source = ERR_PTR(-ENOENT);
+
+	mutex_lock(&kvm->irqsources.lock);
+
+	list_for_each_entry(tmp, &kvm->irqsources.items, list) {
+		if (tmp->id == key) {
+			source = tmp;
+			kref_get(&source->kref);
+			break;
+		}
+	}
+
+	mutex_unlock(&kvm->irqsources.lock);
+
+	return source;
+}
+
 /*
  * --------------------------------------------------------------------
  * irqfd: Allows an fd to be used to inject an interrupt to the guest
@@ -406,6 +425,8 @@  kvm_eventfd_init(struct kvm *kvm)
 	INIT_LIST_HEAD(&kvm->ioeventfds);
 	mutex_init(&kvm->irqsources.lock);
 	INIT_LIST_HEAD(&kvm->irqsources.items);
+	spin_lock_init(&kvm->eoifds.lock);
+	INIT_LIST_HEAD(&kvm->eoifds.items);
 }
 
 /*
@@ -772,3 +793,318 @@  kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 	return kvm_assign_ioeventfd(kvm, args);
 }
+
+/*
+ * --------------------------------------------------------------------
+ *  eoifd: Translate KVM APIC/IOAPIC EOI into eventfd signal.
+ *
+ *  userspace can register with an eventfd for receiving
+ *  notification when an EOI occurs.
+ * --------------------------------------------------------------------
+ */
+
+struct _eoifd {
+	/* eventfd triggered on EOI */
+	struct eventfd_ctx *eventfd;
+	/* irq source ID de-asserted on EOI */
+	struct _irq_source *source;
+	wait_queue_t wait;
+	/* EOI notification from KVM */
+	struct kvm_irq_ack_notifier notifier;
+	struct list_head list;
+	poll_table pt;
+	struct work_struct shutdown;
+};
+
+/* Called under eoifds.lock */
+static void eoifd_shutdown(struct work_struct *work)
+{
+	struct _eoifd *eoifd = container_of(work, struct _eoifd, shutdown);
+	struct kvm *kvm = eoifd->source->kvm;
+	u64 cnt;
+
+	/*
+	 * Stop EOI signaling
+	 */
+	kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier);
+
+	/*
+	 * Synchronize with the wait-queue and unhook ourselves to prevent
+	 * further events.
+	 */
+	eventfd_ctx_remove_wait_queue(eoifd->eventfd, &eoifd->wait, &cnt);
+
+	/*
+	 * Release resources
+	 */
+	eventfd_ctx_put(eoifd->eventfd);
+	_irq_source_put(eoifd->source);
+	kfree(eoifd);
+}
+
+/* assumes kvm->eoifds.lock is held */
+static bool eoifd_is_active(struct _eoifd *eoifd)
+{
+	return list_empty(&eoifd->list) ? false : true;
+}
+
+/*
+ * Mark the eoifd as inactive and schedule it for removal
+ *
+ * assumes kvm->eoifds.lock is held
+ */
+static void eoifd_deactivate(struct _eoifd *eoifd)
+{
+	BUG_ON(!eoifd_is_active(eoifd));
+
+	list_del_init(&eoifd->list);
+
+	queue_work(irqfd_cleanup_wq, &eoifd->shutdown);
+}
+
+/*
+ * Called with wqh->lock held and interrupts disabled
+ */
+static int eoifd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	unsigned long flags = (unsigned long)key;
+
+	if (unlikely(flags & POLLHUP)) {
+		/* The eventfd is closing, detach from KVM */
+		struct _eoifd *eoifd = container_of(wait, struct _eoifd, wait);
+		struct kvm *kvm = eoifd->source->kvm;
+		unsigned long flags;
+
+		spin_lock_irqsave(&kvm->eoifds.lock, flags);
+
+		/*
+		 * We must check if someone deactivated the eoifd before
+		 * we could acquire the eoifds.lock since the item is
+		 * deactivated from the KVM side before it is unhooked from
+		 * the wait-queue.  If it is already deactivated, we can
+		 * simply return knowing the other side will cleanup for us.
+		 * We cannot race against the eoifd going away since the
+		 * other side is required to acquire wqh->lock, which we hold
+		 */
+		if (eoifd_is_active(eoifd))
+			eoifd_deactivate(eoifd);
+
+		spin_unlock_irqrestore(&kvm->eoifds.lock, flags);
+	}
+
+	return 0;
+}
+
+static void eoifd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
+				    poll_table *pt)
+{
+	struct _eoifd *eoifd = container_of(pt, struct _eoifd, pt);
+	add_wait_queue(wqh, &eoifd->wait);
+}
+
+/*
+ * This function is called as the kvm VM fd is being released. Shutdown all
+ * eoifds that still remain open
+ */
+void kvm_eoifd_release(struct kvm *kvm)
+{
+	struct _eoifd *tmp, *eoifd;
+
+	spin_lock_irq(&kvm->eoifds.lock);
+
+	list_for_each_entry_safe(eoifd, tmp, &kvm->eoifds.items, list)
+		eoifd_deactivate(eoifd);
+
+	spin_unlock_irq(&kvm->eoifds.lock);
+
+	flush_workqueue(irqfd_cleanup_wq);
+}
+
+static void eoifd_event(struct kvm_irq_ack_notifier *notifier)
+{
+	struct _eoifd *eoifd;
+
+	eoifd = container_of(notifier, struct _eoifd, notifier);
+
+	if (unlikely(!eoifd->source))
+		return;
+
+	/*
+	 * De-assert and send EOI, user needs to re-assert if
+	 * device still requires service.
+	 */
+	kvm_set_irq(eoifd->source->kvm,
+		    eoifd->source->id, eoifd->source->gsi, 0);
+	eventfd_signal(eoifd->eventfd, 1);
+}
+
+static int kvm_assign_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
+{
+	struct file *file = NULL;
+	struct eventfd_ctx *eventfd = NULL;
+	struct _eoifd *eoifd = NULL, *tmp;
+	struct _irq_source *source = NULL;
+	int ret;
+	u64 cnt;
+
+	if (!(args->flags & KVM_EOIFD_FLAG_LEVEL_IRQFD))
+		return -EINVAL;
+
+	file = eventfd_fget(args->fd);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto fail;
+	}
+
+	eventfd = eventfd_ctx_fileget(file);
+	if (IS_ERR(eventfd)) {
+		ret = PTR_ERR(eventfd);
+		goto fail;
+	}
+
+	eoifd = kzalloc(sizeof(*eoifd), GFP_KERNEL);
+	if (!eoifd) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	source = _irq_source_get_from_key(kvm, args->key);
+	if (IS_ERR(source)) {
+		ret = PTR_ERR(source);
+		goto fail;
+	}
+
+	INIT_LIST_HEAD(&eoifd->list);
+	INIT_WORK(&eoifd->shutdown, eoifd_shutdown);
+	eoifd->eventfd = eventfd;
+	eoifd->notifier.gsi = source->gsi;
+	eoifd->notifier.irq_acked = eoifd_event;
+
+	/*
+	 * Install our own custom wake-up handling so we are notified via
+	 * a callback whenever someone releases the underlying eventfd
+	 */
+	init_waitqueue_func_entry(&eoifd->wait, eoifd_wakeup);
+	init_poll_funcptr(&eoifd->pt, eoifd_ptable_queue_proc);
+
+	/*
+	 * Clear out any previously released eoifds that might conflict
+	 */
+	flush_workqueue(irqfd_cleanup_wq);
+
+	/*
+	 * This can sleep, so register before acquiring spinlock, notifier
+	 * becomes a nop until we finish.
+	 */
+	kvm_register_irq_ack_notifier(kvm, &eoifd->notifier);
+
+	/*
+	 * Install the wait queue function to allow cleanup when the
+	 * eventfd is closed by the user.  This grabs the wqh lock, so
+	 * we do it out of spinlock, holding the file reference ensures
+	 * we won't see a POLLHUP until setup is complete.
+	 */
+	file->f_op->poll(file, &eoifd->pt);
+
+	spin_lock_irq(&kvm->eoifds.lock);
+
+	/*
+	 * Enforce a one-to-one relationship between irq source and eoifd so
+	 * that this interface can't be used to consume all kernel memory.
+	 * NB. single eventfd can still be used by multiple eoifds.
+	 */
+	list_for_each_entry(tmp, &kvm->eoifds.items, list) {
+		if (tmp->source == source) {
+			spin_unlock_irq(&kvm->eoifds.lock);
+			ret = -EBUSY;
+			goto fail_unregister;
+		}
+	}
+
+	list_add_tail(&eoifd->list, &kvm->eoifds.items);
+	eoifd->source = source; /* Enable ack notifier */
+
+	spin_unlock_irq(&kvm->eoifds.lock);
+
+	fput(file); /* Enable POLLHUP */
+
+	return 0;
+
+fail_unregister:
+	eventfd_ctx_remove_wait_queue(eventfd, &eoifd->wait, &cnt);
+	kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier);
+fail:
+	if (source && !IS_ERR(source))
+		_irq_source_put(source);
+
+	if (eventfd && !IS_ERR(eventfd))
+		eventfd_ctx_put(eventfd);
+
+	if (file && !IS_ERR(file))
+		fput(file);
+
+	kfree(eoifd);
+	return ret;
+}
+
+static int kvm_deassign_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
+{
+	struct eventfd_ctx *eventfd = NULL;
+	struct _irq_source *source = NULL;
+	struct _eoifd *eoifd;
+	int ret = -ENOENT;
+
+	if (!(args->flags & KVM_EOIFD_FLAG_LEVEL_IRQFD))
+		return -EINVAL;
+
+	eventfd = eventfd_ctx_fdget(args->fd);
+	if (IS_ERR(eventfd)) {
+		ret = PTR_ERR(eventfd);
+		goto fail;
+	}
+
+	source = _irq_source_get_from_key(kvm, args->key);
+	if (IS_ERR(source)) {
+		ret = PTR_ERR(source);
+		goto fail;
+	}
+
+	spin_lock_irq(&kvm->eoifds.lock);
+
+	list_for_each_entry(eoifd, &kvm->eoifds.items, list) {
+		if (eoifd->eventfd == eventfd && eoifd->source == source) {
+			eoifd_deactivate(eoifd);
+			ret = 0;
+			break;
+		}
+	}
+
+	spin_unlock_irq(&kvm->eoifds.lock);
+
+fail:
+	if (source && !IS_ERR(source))
+		_irq_source_put(source);
+	if (eventfd && !IS_ERR(eventfd))
+		eventfd_ctx_put(eventfd);
+
+	/*
+	 * Block until we know all outstanding shutdown jobs have completed
+	 * so that we guarantee there will not be any more EOIs signaled on
+	 * this eventfd once this deassign function returns.
+	 */
+	flush_workqueue(irqfd_cleanup_wq);
+
+	return ret;
+}
+
+int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
+{
+	if (args->flags & ~(KVM_EOIFD_FLAG_DEASSIGN |
+			    KVM_EOIFD_FLAG_LEVEL_IRQFD))
+		return -EINVAL;
+
+	if (args->flags & KVM_EOIFD_FLAG_DEASSIGN)
+		return kvm_deassign_eoifd(kvm, args);
+
+	return kvm_assign_eoifd(kvm, args);
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2468523..0b241bf 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -620,6 +620,8 @@  static int kvm_vm_release(struct inode *inode, struct file *filp)
 
 	kvm_irqfd_release(kvm);
 
+	kvm_eoifd_release(kvm);
+
 	kvm_put_kvm(kvm);
 	return 0;
 }
@@ -2093,6 +2095,15 @@  static long kvm_vm_ioctl(struct file *filp,
 		break;
 	}
 #endif
+	case KVM_EOIFD: {
+		struct kvm_eoifd data;
+
+		r = -EFAULT;
+		if (copy_from_user(&data, argp, sizeof data))
+			goto out;
+		r = kvm_eoifd(kvm, &data);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)